From fb51a8b55f90f79046acfbe2876cedc86a6799df Mon Sep 17 00:00:00 2001
From: Per Stark <per@stark.pub>
Date: Wed, 17 Jun 2026 19:23:11 +0200
Subject: [PATCH] evals: eval crate overhaul, simplification and performance
 improvements

---
 .cargo/config.toml                            |   2 +-
 CHANGELOG.md                                  |   1 +
 Cargo.lock                                    |  94 ----
 Cargo.toml                                    |   2 +-
 devenv.nix                                    |   2 +
 evaluations/Cargo.toml                        |   2 -
 evaluations/README.md                         | 252 +++-------
 evaluations/REFACTOR.md                       |  98 ++++
 evaluations/manifest.yaml                     |   3 +-
 evaluations/src/args.rs                       |  84 +++-
 evaluations/src/cache.rs                      |  88 ----
 evaluations/src/cases.rs                      |   1 +
 evaluations/src/cli/mod.rs                    |   3 +
 evaluations/src/cli/status.rs                 | 316 +++++++++++++
 evaluations/src/context_stats.rs              | 177 +++++++
 evaluations/src/corpus/config.rs              |  32 +-
 evaluations/src/corpus/mod.rs                 |   6 +-
 evaluations/src/corpus/orchestrator.rs        |  58 +--
 evaluations/src/corpus/store.rs               |  13 +-
 evaluations/src/datasets/beir.rs              | 132 +++++-
 evaluations/src/datasets/beir_mix.rs          | 262 +++++++++++
 evaluations/src/datasets/checksum.rs          | 216 +++++++++
 evaluations/src/datasets/loader.rs            | 197 ++++++++
 evaluations/src/datasets/mod.rs               | 181 ++-----
 evaluations/src/datasets/nq.rs                |   6 +-
 evaluations/src/datasets/store.rs             | 410 ++++++++++++++++
 .../src/{namespace.rs => db/connect.rs}       | 105 +++--
 .../src/{db_helpers.rs => db/lifecycle.rs}    |  48 +-
 evaluations/src/db/mod.rs                     |   9 +
 evaluations/src/eval.rs                       | 128 -----
 evaluations/src/inspection.rs                 |  70 +--
 evaluations/src/main.rs                       |  95 ++--
 evaluations/src/openai.rs                     |  20 +-
 evaluations/src/perf.rs                       |  15 +-
 evaluations/src/pipeline/context.rs           |  28 +-
 evaluations/src/pipeline/diagnostics.rs       |  20 +
 evaluations/src/pipeline/mod.rs               |  53 ++-
 evaluations/src/pipeline/stages/finalize.rs   |  21 +-
 evaluations/src/pipeline/stages/mod.rs        |  11 -
 .../src/pipeline/stages/prepare_corpus.rs     |  55 +--
 evaluations/src/pipeline/stages/prepare_db.rs |  52 +-
 .../src/pipeline/stages/prepare_namespace.rs  |  79 +---
 .../src/pipeline/stages/prepare_slice.rs      |  28 +-
 .../src/pipeline/stages/run_queries.rs        |  29 +-
 evaluations/src/pipeline/stages/summarize.rs  |  29 +-
 evaluations/src/pipeline/state.rs             |  31 --
 evaluations/src/report.rs                     | 293 ++++--------
 evaluations/src/slice/beir.rs                 | 174 +++++++
 evaluations/src/slice/build.rs                |  19 +
 evaluations/src/{slice.rs => slice/mod.rs}    | 443 ++++++++----------
 evaluations/src/snapshot.rs                   | 179 -------
 evaluations/src/types.rs                      |  10 +-
 html-router/assets/style.css                  |   1 -
 53 files changed, 2852 insertions(+), 1831 deletions(-)
 create mode 100644 evaluations/REFACTOR.md
 delete mode 100644 evaluations/src/cache.rs
 create mode 100644 evaluations/src/cli/mod.rs
 create mode 100644 evaluations/src/cli/status.rs
 create mode 100644 evaluations/src/context_stats.rs
 create mode 100644 evaluations/src/datasets/beir_mix.rs
 create mode 100644 evaluations/src/datasets/checksum.rs
 create mode 100644 evaluations/src/datasets/loader.rs
 create mode 100644 evaluations/src/datasets/store.rs
 rename evaluations/src/{namespace.rs => db/connect.rs} (67%)
 rename evaluations/src/{db_helpers.rs => db/lifecycle.rs} (75%)
 create mode 100644 evaluations/src/db/mod.rs
 delete mode 100644 evaluations/src/eval.rs
 create mode 100644 evaluations/src/pipeline/diagnostics.rs
 delete mode 100644 evaluations/src/pipeline/state.rs
 create mode 100644 evaluations/src/slice/beir.rs
 create mode 100644 evaluations/src/slice/build.rs
 rename evaluations/src/{slice.rs => slice/mod.rs} (83%)
 delete mode 100644 evaluations/src/snapshot.rs

diff --git a/.cargo/config.toml b/.cargo/config.toml
index 61f4796..8522063 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -1,2 +1,2 @@
 [alias]
-eval = "run -p evaluations --"
+eval = "run -p evaluations --release --"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 32a6e7a..6278ba5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,6 @@
 # Changelog
 ## Unreleased
+- Evaluations: simplified crate layout — linear pipeline, sharded-only converted store, in-memory ingestion, `db/` and `cli/` modules; namespace reuse state in corpus manifest (removed `cache/snapshots/`); no legacy JSON/history compatibility (re-run `--warm` after upgrade)
 - Performance: ingestion skips per-task index rebuild; worker runs scheduled `REBUILD INDEX` (default every 24h via `index_rebuild_interval_secs`, `0` disables)
 - Performance: ingestion persists all artifacts in a single SurrealDB transaction per task (atomic replace by task id)
 - Performance: entity embeddings during ingestion use batched `embed_batch`, matching chunk embedding
diff --git a/Cargo.lock b/Cargo.lock
index 5c7a3ae..1c2c9be 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -165,12 +165,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "anes"
-version = "0.1.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
-
 [[package]]
 name = "anstream"
 version = "0.6.21"
@@ -1071,12 +1065,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "cast"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
-
 [[package]]
 name = "castaway"
 version = "0.2.4"
@@ -1582,42 +1570,6 @@ dependencies = [
  "cfg-if",
 ]
 
-[[package]]
-name = "criterion"
-version = "0.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
-dependencies = [
- "anes",
- "cast",
- "ciborium",
- "clap",
- "criterion-plot",
- "is-terminal",
- "itertools 0.10.5",
- "num-traits",
- "once_cell",
- "oorandom",
- "plotters",
- "rayon",
- "regex",
- "serde",
- "serde_derive",
- "serde_json",
- "tinytemplate",
- "walkdir",
-]
-
-[[package]]
-name = "criterion-plot"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
-dependencies = [
- "cast",
- "itertools 0.10.5",
-]
-
 [[package]]
 name = "critical-section"
 version = "1.2.0"
@@ -2238,7 +2190,6 @@ dependencies = [
  "chrono",
  "clap",
  "common",
- "criterion",
  "fastembed",
  "futures",
  "ingestion-pipeline",
@@ -2250,7 +2201,6 @@ dependencies = [
  "serde_json",
  "serde_yaml",
  "sha2",
- "state-machines",
  "surrealdb",
  "tempfile",
  "text-splitter",
@@ -4438,12 +4388,6 @@ dependencies = [
  "pkg-config",
 ]
 
-[[package]]
-name = "oorandom"
-version = "11.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
-
 [[package]]
 name = "opaque-debug"
 version = "0.3.1"
@@ -4836,34 +4780,6 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
 
-[[package]]
-name = "plotters"
-version = "0.3.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
-dependencies = [
- "num-traits",
- "plotters-backend",
- "plotters-svg",
- "wasm-bindgen",
- "web-sys",
-]
-
-[[package]]
-name = "plotters-backend"
-version = "0.3.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
-
-[[package]]
-name = "plotters-svg"
-version = "0.3.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
-dependencies = [
- "plotters-backend",
-]
-
 [[package]]
 name = "polling"
 version = "3.11.0"
@@ -6940,16 +6856,6 @@ dependencies = [
  "zerovec",
 ]
 
-[[package]]
-name = "tinytemplate"
-version = "1.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
-dependencies = [
- "serde",
- "serde_json",
-]
-
 [[package]]
 name = "tinyvec"
 version = "1.10.0"
diff --git a/Cargo.toml b/Cargo.toml
index e5be857..ca8c315 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,7 +9,7 @@ members = [
   "json-stream-parser",
   "evaluations"
 ]
-resolver = "2"
+resolver = "3"
 
 [workspace.dependencies]
 anyhow = "1.0.94"
diff --git a/devenv.nix b/devenv.nix
index b5b49bf..5594b24 100644
--- a/devenv.nix
+++ b/devenv.nix
@@ -13,6 +13,8 @@ let
     else
       throw "pkgs.onnxruntime.version (${pkgs.onnxruntime.version}) must match ort-version (${ortVersion})";
 in {
+  devenv.warnOnNewVersion = false;
+
   cachix.enable = false;
 
   packages = [
diff --git a/evaluations/Cargo.toml b/evaluations/Cargo.toml
index 34d36f7..b68e061 100644
--- a/evaluations/Cargo.toml
+++ b/evaluations/Cargo.toml
@@ -30,8 +30,6 @@ serde_json = { workspace = true }
 async-trait = { workspace = true }
 once_cell = "1.19"
 serde_yaml = "0.9"
-criterion = "0.5"
-state-machines = { workspace = true }
 clap = { version = "4.4", features = ["derive", "env"] }
 
 [dev-dependencies]
diff --git a/evaluations/README.md b/evaluations/README.md
index b150a0e..5df7b7d 100644
--- a/evaluations/README.md
+++ b/evaluations/README.md
@@ -1,212 +1,102 @@
 # Evaluations
 
-The `evaluations` crate provides a retrieval evaluation framework for benchmarking Minne's information retrieval pipeline against standard datasets.
+The `evaluations` crate benchmarks Minne's retrieval pipeline against standard datasets.
 
 ## Quick Start
 
 ```bash
-# Run SQuAD v2.0 evaluation (vector-only, recommended)
-cargo run --package evaluations -- --ingest-chunks-only
+# One-time prep (convert, slice ledger, corpus cache, DB seed)
+cargo eval --warm --dataset beir --slice beir-mix-600
 
-# Run a specific dataset
-cargo run --package evaluations -- --dataset fiqa --ingest-chunks-only
+# Check readiness
+cargo eval --status --dataset beir --slice beir-mix-600
 
-# Convert dataset only (no evaluation)
-cargo run --package evaluations -- --convert-only
+# Run benchmark (steady state after warm)
+cargo eval --dataset beir --slice beir-mix-600 --require-ready
 ```
 
+Default dataset is `beir`. When `--slice` is omitted, the first catalog slice for the dataset is applied automatically (e.g. `beir-mix-600`).
+
+Chunk-only ingestion is the default. Pass `--include-entities` to opt into entity extraction during ingestion (requires `OPENAI_API_KEY`).
+
+### Custom slice sizes
+
+`--slice` is a ledger id, not only a catalog name. You can use any id; `--limit` controls how many questions the ledger contains:
+
+```bash
+# 200-case BEIR mix (default --limit is 200)
+cargo eval --warm --dataset beir --slice beir-mix-200
+cargo eval --dataset beir --slice beir-mix-200 --require-ready
+```
+
+The catalog slice `beir-mix-600` in `manifest.yaml` is a preset with `limit: 600` and `negative_multiplier: 9.0`.
+
+### BEIR mix layout
+
+`beir` is a **virtual mix** across eight subset datasets (FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID, SciFact, NQ-BEIR). There is no monolithic `beir-minne/` store.
+
+1. Build an in-memory qrels-world mix from raw subset data
+2. Resolve the slice ledger (`cache/slices/beir/<slice-id>.json`)
+3. Materialize only ledger paragraph ids into per-subset stores (`fever-minne/`, `fiqa-minne/`, …)
+4. Ingest the slice corpus and seed SurrealDB
+
+Conversion is **qrels-closed**: only documents that appear in qrels are exported, not the full BEIR corpus.
+
+Chunk-only mode may evaluate fewer cases than the slice ledger size when some questions are impossible or lack verifiable answer chunks.
+
+Reports include a **Retrieved Context Volume** section: total characters and estimated tokens (`~chars/4`) across all chunks returned per query, so runs are comparable across `--chunk-result-cap` sweeps. Use this to compare the cost of raising `--chunk-result-cap`.
+
 ## Prerequisites
 
-### 1. SurrealDB
-
-Start a SurrealDB instance before running evaluations:
+### SurrealDB
 
 ```bash
 docker-compose up -d surrealdb
 ```
 
-Or using the default endpoint configuration:
+### Raw datasets
 
-```bash
-surreal start --user root_user --pass root_password
-```
+Place raw datasets under `evaluations/data/raw/`. See [manifest.yaml](./manifest.yaml) for paths.
 
-### 2. Download Raw Datasets
+BEIR subsets live in sibling directories (`data/raw/fever`, `data/raw/fiqa`, …). The `data/raw/beir` entry is a virtual catalog placeholder; `--warm` uses the subset paths instead.
 
-Raw datasets must be downloaded manually and placed in `evaluations/data/raw/`. See [Dataset Sources](#dataset-sources) below for links and formats.
-
-## Directory Structure
+## Directory structure
 
 ```
 evaluations/
 ├── data/
-│   ├── raw/          # Downloaded raw datasets (manual)
-│   │   ├── squad/    # SQuAD v2.0
-│   │   ├── nq-dev/   # Natural Questions
-│   │   ├── fiqa/     # BEIR: FiQA-2018
-│   │   ├── fever/    # BEIR: FEVER
-│   │   ├── hotpotqa/ # BEIR: HotpotQA
-│   │   └── ...       # Other BEIR subsets
-│   └── converted/    # Auto-generated (Minne JSON format)
-├── cache/            # Ingestion and embedding caches
-├── reports/          # Evaluation output (JSON + Markdown)
-├── manifest.yaml     # Dataset and slice definitions
-└── src/              # Evaluation source code
+│   ├── raw/           # Downloaded datasets (manual)
+│   │   ├── fever/     # BEIR subset raw dirs (corpus.jsonl, queries.jsonl, qrels/)
+│   │   ├── fiqa/
+│   │   └── …
+│   └── converted/     # Sharded stores (auto-generated)
+│       ├── fever-minne/  # per-BEIR-subset stores
+│       ├── fiqa-minne/
+│       └── …             # BEIR mix loads from subset stores (no monolithic beir-minne/)
+├── cache/
+│   ├── slices/        # Slice ledgers
+│   └── ingested/      # Corpus ingestion caches (manifest includes namespace seed)
+├── reports/           # JSON + Markdown output from benchmark runs
+├── manifest.yaml
+└── src/
 ```
 
-## Dataset Sources
+**After upgrading:** delete old monolithic `*-minne.json` files, any legacy `beir-minne/` merged store, `cache/snapshots/` directories, and stale `reports/history/` artifacts, then re-run `--warm`.
 
-### SQuAD v2.0
-
-Download and place at `data/raw/squad/dev-v2.0.json`:
-
-```bash
-mkdir -p evaluations/data/raw/squad
-curl -L https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json \
-  -o evaluations/data/raw/squad/dev-v2.0.json
-```
-
-### Natural Questions (NQ)
-
-Download and place at `data/raw/nq-dev/dev-all.jsonl`:
-
-```bash
-mkdir -p evaluations/data/raw/nq-dev
-# Download from Google's Natural Questions page or HuggingFace
-# File: dev-all.jsonl (simplified JSONL format)
-```
-
-Source: [Google Natural Questions](https://ai.google.com/research/NaturalQuestions)
-
-### BEIR Datasets
-
-All BEIR datasets follow the same format structure:
-
-```
-data/raw/<dataset>/
-├── corpus.jsonl      # Document corpus
-├── queries.jsonl     # Query set
-└── qrels/
-    └── test.tsv      # Relevance judgments (or dev.tsv)
-```
-
-Download datasets from the [BEIR Benchmark repository](https://github.com/beir-cellar/beir). Each dataset zip extracts to the required directory structure.
-
-| Dataset    | Directory     |
-|------------|---------------|
-| FEVER      | `fever/`      |
-| FiQA-2018  | `fiqa/`       |
-| HotpotQA   | `hotpotqa/`   |
-| NFCorpus   | `nfcorpus/`   |
-| Quora      | `quora/`      |
-| TREC-COVID | `trec-covid/` |
-| SciFact    | `scifact/`    |
-| NQ (BEIR)  | `nq/`         |
-
-Example download:
-
-```bash
-cd evaluations/data/raw
-curl -L https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip -o fiqa.zip
-unzip fiqa.zip && rm fiqa.zip
-```
-
-## Dataset Conversion
-
-Raw datasets are automatically converted to Minne's internal JSON format on first run. To force reconversion:
-
-```bash
-cargo run --package evaluations -- --force-convert
-```
-
-Converted files are saved to `data/converted/` and cached for subsequent runs.
-
-## CLI Reference
-
-### Common Options
+## Common flags
 
 | Flag | Description | Default |
 |------|-------------|---------|
-| `--dataset <NAME>` | Dataset to evaluate | `squad-v2` |
-| `--limit <N>` | Max questions to evaluate (0 = all) | `200` |
-| `--k <N>` | Precision@k cutoff | `5` |
-| `--slice <ID>` | Use a predefined slice from manifest | — |
-| `--rerank` | Enable FastEmbed reranking stage | disabled |
-| `--embedding-backend <BE>` | `fastembed` or `hashed` | `fastembed` |
-| `--ingest-chunks-only` | Skip entity extraction, ingest only text chunks | disabled |
+| `--dataset` | Dataset to evaluate | `beir` |
+| `--slice` | Slice ledger id (catalog or custom) | first catalog slice |
+| `--limit` | Max questions in the slice ledger | `200` |
+| `--warm` | Prepare without running queries | — |
+| `--status` | Print readiness | — |
+| `--require-ready` | Fail if not warmed | — |
+| `--include-entities` | Entity extraction during ingestion | off |
+| `--force-convert` | Rebuild converted store | — |
+| `--chunk-result-cap` | Max chunks returned per query (raise together with `--k`) | `5` |
+| `--perf-log-console` | Print per-stage timings after a run | off |
+| `--label` | Label stored in JSON/Markdown reports | — |
 
-> [!TIP]
-> Use `--ingest-chunks-only` when evaluating vector-only retrieval strategies. This skips the LLM-based entity extraction and graph generation, significantly speeding up ingestion while focusing on pure chunk-based vector search.
-
-### Available Datasets
-
-```
-squad-v2, natural-questions, beir, fever, fiqa, hotpotqa, 
-nfcorpus, quora, trec-covid, scifact, nq-beir
-```
-
-### Database Configuration
-
-| Flag | Environment | Default |
-|------|-------------|---------|
-| `--db-endpoint` | `EVAL_DB_ENDPOINT` | `ws://127.0.0.1:8000` |
-| `--db-username` | `EVAL_DB_USERNAME` | `root_user` |
-| `--db-password` | `EVAL_DB_PASSWORD` | `root_password` |
-| `--db-namespace` | `EVAL_DB_NAMESPACE` | auto-generated |
-| `--db-database` | `EVAL_DB_DATABASE` | auto-generated |
-
-### Example Runs
-
-```bash
-# Vector-only evaluation (recommended for benchmarking)
-cargo run --package evaluations -- \
-  --dataset fiqa \
-  --ingest-chunks-only \
-  --limit 200
-
-# Full FiQA evaluation with reranking
-cargo run --package evaluations -- \
-  --dataset fiqa \
-  --ingest-chunks-only \
-  --limit 500 \
-  --rerank \
-  --k 10
-
-# Use a predefined slice for reproducibility
-cargo run --package evaluations -- --slice fiqa-test-200 --ingest-chunks-only
-
-# Run the mixed BEIR benchmark
-cargo run --package evaluations -- --dataset beir --slice beir-mix-600 --ingest-chunks-only
-```
-
-## Slices
-
-Slices are predefined, reproducible subsets defined in `manifest.yaml`. Each slice specifies:
-
-- **limit**: Number of questions
-- **corpus_limit**: Maximum corpus size
-- **seed**: Fixed RNG seed for reproducibility
-
-View available slices in [manifest.yaml](./manifest.yaml).
-
-## Reports
-
-Evaluations generate reports in `reports/`:
-
-- **JSON**: Full structured results (`*-report.json`)
-- **Markdown**: Human-readable summary with sample mismatches (`*-report.md`)
-- **History**: Timestamped run history (`history/`)
-
-## Performance Tuning
-
-```bash
-# Log per-stage performance timings
-cargo run --package evaluations -- --perf-log-console
-
-# Save telemetry to file
-cargo run --package evaluations -- --perf-log-json ./perf.json
-```
-
-## License
-
-See [../LICENSE](../LICENSE).
+See [REFACTOR.md](./REFACTOR.md) for architecture notes.
diff --git a/evaluations/REFACTOR.md b/evaluations/REFACTOR.md
new file mode 100644
index 0000000..1b61575
--- /dev/null
+++ b/evaluations/REFACTOR.md
@@ -0,0 +1,98 @@
+# Evaluations crate refactor notes
+
+This document records the architecture review and the simplification work applied to the
+`evaluations` crate. **No backwards compatibility** is maintained for converted JSON layouts,
+legacy report history, or old cache artifact formats.
+
+## Goals
+
+- Smaller, linear pipeline (no state machine ceremony)
+- Sharded converted store for **all** datasets (memory-efficient partial loading)
+- Slice-first loading when a catalog slice is selected
+- In-memory SurrealDB for ingestion (no ephemeral server namespaces)
+- Single DB lifecycle module (`db/`)
+- CLI helpers under `cli/`
+
+## Primary workflow
+
+```bash
+# One-time prep (converts raw data if needed, builds slice ledger, corpus cache, DB seed)
+cargo eval --warm --dataset beir --slice beir-mix-600
+
+# Check readiness
+cargo eval --status --dataset beir --slice beir-mix-600
+
+# Steady-state benchmark
+cargo eval --dataset beir --slice beir-mix-600 --require-ready
+```
+
+Default dataset is `beir`. Chunk-only ingestion is the default; pass `--include-entities` to
+opt into entity extraction (requires `OPENAI_API_KEY`). Slice tuning such as
+`negative_multiplier` lives in `manifest.yaml` (e.g. `beir-mix-600` uses `9.0`).
+
+## Cache layers (after refactor)
+
+| Layer | Location | Purpose |
+|-------|----------|---------|
+| Converted store | `data/converted/<name>/` | Sharded paragraphs + question catalog |
+| Slice ledger | `cache/slices/<dataset>/<slice-id>.json` | Deterministic questions + paragraph set |
+| Corpus cache | `cache/ingested/<dataset>/<slice-id>/` | Ingestion paragraph shards, manifest, and namespace reuse seed |
+
+Namespace reuse state lives in the corpus manifest (`metadata.namespace_seed`), not a separate
+`snapshots/` tree. After upgrading, delete old monolithic `*-minne.json` files and any
+`cache/snapshots/` directories, then re-run `--warm`.
+
+## Phases applied
+
+### Phase 0 — dead code
+
+- Removed unused `criterion` dependency
+- Removed unused `EmbeddingCache`
+- Updated README for current CLI
+
+### Phase 1 — structure
+
+- Flattened pipeline to linear `async fn` stages
+- Removed `eval.rs` hub; imports go to owning modules
+- Merged `namespace.rs`, `db_helpers.rs` → `db/`; dropped standalone `snapshot.rs`
+- Moved `status.rs` → `cli/status.rs`
+- Fixed catalog slice bootstrap (build ledger when explicit slice manifest is missing)
+
+### Phase 2 — no legacy paths
+
+- All datasets use sharded converted store only
+- Removed legacy JSON layout and migration
+- Removed legacy report history format
+- Auto-apply first catalog slice when `--slice` omitted
+- Namespace seed folded into corpus manifest (removed `cache/snapshots/`)
+
+### Phase 3 — performance
+
+- Ingestion always uses in-memory SurrealDB
+- Slice-first partial load when ledger is complete
+- Default catalog slice for dataset when `--slice` not passed
+- Split `slice/` into `mod.rs`, `build.rs`, and `beir.rs`
+
+### Phase 4 — BEIR mix slice-first
+
+- `beir` is a virtual mix: slice ledger references prefixed ids (`fever-…`, `fiqa-…`, …)
+- Conversion is **qrels-closed** per subset (only documents appearing in qrels, not full corpus)
+- Slice ledger is resolved for the requested `--slice` (catalog preset or custom id + `--limit`)
+- Only ledger paragraph ids are materialized into per-subset stores (`fever-minne/`, `fiqa-minne/`, …)
+- No monolithic `beir-minne/` merged store
+- Raw BEIR data lives in per-subset dirs under `data/raw/`; `data/raw/beir` is a catalog placeholder
+
+## Do not re-introduce
+
+- Monolithic `*-minne.json` converted files
+- Monolithic `beir-minne/` merged converted store (use per-subset stores + virtual mix loader)
+- `state-machines` pipeline for this linear flow
+- `eval.rs` re-export hub
+- Legacy history migration in reports
+- Ephemeral `ingest_eval_*` namespaces on the shared SurrealDB server
+- Separate `cache/snapshots/` namespace state files
+
+## Open follow-ups
+
+- Generate `DatasetKind` from `manifest.yaml` at build time
+- Split `report.rs` when touching reporting again
diff --git a/evaluations/manifest.yaml b/evaluations/manifest.yaml
index 28c32ed..55a62f4 100644
--- a/evaluations/manifest.yaml
+++ b/evaluations/manifest.yaml
@@ -1,4 +1,4 @@
-default_dataset: squad-v2
+default_dataset: beir
 datasets:
   - id: squad-v2
     label: "SQuAD v2.0"
@@ -45,6 +45,7 @@ datasets:
         description: "Balanced slice across FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID, SciFact, NQ-BEIR"
         limit: 600
         corpus_limit: 6000
+        negative_multiplier: 9.0
         seed: 0x5eed2025
   - id: fever
     label: "FEVER (BEIR)"
diff --git a/evaluations/src/args.rs b/evaluations/src/args.rs
index 1600305..b7d7f7d 100644
--- a/evaluations/src/args.rs
+++ b/evaluations/src/args.rs
@@ -137,9 +137,9 @@ pub struct IngestConfig {
     #[arg(long, default_value_t = 50)]
     pub ingest_chunk_overlap_tokens: usize,
 
-    /// Run ingestion in chunk-only mode (skip analyzer/graph generation)
+    /// Include entity extraction and graph generation during ingestion (uses LLM tokens)
     #[arg(long)]
-    pub ingest_chunks_only: bool,
+    pub include_entities: bool,
 
     /// Number of paragraphs to ingest concurrently
     #[arg(long, default_value_t = 10)]
@@ -159,6 +159,7 @@ pub struct IngestConfig {
 }
 
 #[derive(Debug, Clone, Args)]
+#[allow(clippy::struct_field_names)]
 pub struct DatabaseArgs {
     /// `SurrealDB` server endpoint
     #[arg(long, default_value = "ws://127.0.0.1:8000", env = "EVAL_DB_ENDPOINT")]
@@ -179,10 +180,6 @@ pub struct DatabaseArgs {
     /// Override the database used on the `SurrealDB` server
     #[arg(long, env = "EVAL_DB_DATABASE")]
     pub db_database: Option<String>,
-
-    /// Path to inspect DB state
-    #[arg(long)]
-    pub inspect_db_state: Option<PathBuf>,
 }
 
 #[derive(Parser, Debug, Clone)]
@@ -233,10 +230,6 @@ pub struct Config {
     #[arg(long, default_value_t = 5)]
     pub sample: usize,
 
-    /// Disable context cropping when converting datasets (ingest entire documents)
-    #[arg(long)]
-    pub full_context: bool,
-
     #[command(flatten)]
     pub retrieval: RetrievalSettings,
 
@@ -322,6 +315,18 @@ pub struct Config {
     #[command(flatten)]
     pub database: DatabaseArgs,
 
+    /// Require warmed corpus/namespace before running queries
+    #[arg(long)]
+    pub require_ready: bool,
+
+    /// Prepare converted data, slice, corpus, and namespace without running queries
+    #[arg(long, conflicts_with = "status")]
+    pub warm: bool,
+
+    /// Print readiness of converted data, slice, corpus, and namespace
+    #[arg(long, conflicts_with = "warm")]
+    pub status: bool,
+
     // Computed fields (not arguments)
     #[arg(skip)]
     pub raw_dataset_path: PathBuf,
@@ -334,11 +339,6 @@ pub struct Config {
 }
 
 impl Config {
-    #[allow(clippy::unused_self)]
-    pub fn context_token_limit(&self) -> Option<usize> {
-        None
-    }
-
     #[allow(clippy::too_many_lines)]
     pub fn finalize(&mut self) -> Result<()> {
         // Handle dataset paths
@@ -367,9 +367,7 @@ impl Config {
         // Handle retrieval settings
         self.retrieval.require_verified_chunks = !self.llm_mode;
 
-        if self.dataset == DatasetKind::Beir {
-            self.negative_multiplier = 9.0;
-        }
+        self.apply_catalog_slice_defaults()?;
 
         // Validations
         if self.ingest.ingest_chunk_min_tokens == 0
@@ -477,6 +475,56 @@ impl Config {
 
         Ok(())
     }
+
+    fn apply_catalog_slice_defaults(&mut self) -> Result<()> {
+        let catalog = crate::datasets::catalog()?;
+        let entry = catalog.dataset(self.dataset.id())?;
+
+        if self.slice.is_none() {
+            if let Some(default_slice) = entry.slices.first() {
+                self.slice = Some(default_slice.id.clone());
+            }
+        }
+
+        let Some(slice_id) = self.slice.as_deref() else {
+            return Ok(());
+        };
+
+        let Ok((_, slice)) = catalog.slice(slice_id) else {
+            return Ok(());
+        };
+
+        if slice.dataset_id != self.dataset.id() {
+            return Ok(());
+        }
+
+        if let Some(limit) = slice.limit {
+            if self.limit_arg == 200 {
+                self.limit_arg = limit;
+                self.limit = Some(limit);
+            }
+        }
+        if self.corpus_limit.is_none() {
+            self.corpus_limit = slice.corpus_limit;
+        }
+        if let Some(seed) = slice.seed {
+            self.slice_seed = seed;
+        }
+        if let Some(include_unanswerable) = slice.include_unanswerable {
+            self.llm_mode = include_unanswerable;
+            self.retrieval.require_verified_chunks = !include_unanswerable;
+        }
+        if let Some(multiplier) = slice.negative_multiplier {
+            if negative_multiplier_is_default(self.negative_multiplier) {
+                self.negative_multiplier = multiplier;
+            }
+        }
+        Ok(())
+    }
+}
+
+fn negative_multiplier_is_default(value: f32) -> bool {
+    (value - crate::slice::DEFAULT_NEGATIVE_MULTIPLIER).abs() < f32::EPSILON
 }
 
 pub struct ParsedArgs {
diff --git a/evaluations/src/cache.rs b/evaluations/src/cache.rs
deleted file mode 100644
index 31a8594..0000000
--- a/evaluations/src/cache.rs
+++ /dev/null
@@ -1,88 +0,0 @@
-use std::{
-    collections::HashMap,
-    path::Path,
-    sync::{
-        atomic::{AtomicBool, Ordering},
-        Arc,
-    },
-};
-
-use anyhow::{Context, Result};
-use serde::{Deserialize, Serialize};
-use tokio::sync::Mutex;
-
-#[derive(Debug, Default, Serialize, Deserialize)]
-struct EmbeddingCacheData {
-    entities: HashMap<String, Vec<f32>>,
-    chunks: HashMap<String, Vec<f32>>,
-}
-
-#[derive(Clone)]
-pub struct EmbeddingCache {
-    path: Arc<Path>,
-    data: Arc<Mutex<EmbeddingCacheData>>,
-    dirty: Arc<AtomicBool>,
-}
-
-#[allow(dead_code)]
-impl EmbeddingCache {
-    pub async fn load(path: impl AsRef<Path>) -> Result<Self> {
-        let path = path.as_ref().to_path_buf();
-        let data = if path.exists() {
-            let raw = tokio::fs::read(&path)
-                .await
-                .with_context(|| format!("reading embedding cache {}", path.display()))?;
-            serde_json::from_slice(&raw)
-                .with_context(|| format!("parsing embedding cache {}", path.display()))?
-        } else {
-            EmbeddingCacheData::default()
-        };
-
-        Ok(Self {
-            path: Arc::from(path.as_path()),
-            data: Arc::new(Mutex::new(data)),
-            dirty: Arc::new(AtomicBool::new(false)),
-        })
-    }
-
-    pub async fn get_entity(&self, id: &str) -> Option<Vec<f32>> {
-        let guard = self.data.lock().await;
-        guard.entities.get(id).cloned()
-    }
-
-    pub async fn insert_entity(&self, id: String, embedding: Vec<f32>) {
-        let mut guard = self.data.lock().await;
-        guard.entities.insert(id, embedding);
-        self.dirty.store(true, Ordering::Relaxed);
-    }
-
-    pub async fn get_chunk(&self, id: &str) -> Option<Vec<f32>> {
-        let guard = self.data.lock().await;
-        guard.chunks.get(id).cloned()
-    }
-
-    pub async fn insert_chunk(&self, id: String, embedding: Vec<f32>) {
-        let mut guard = self.data.lock().await;
-        guard.chunks.insert(id, embedding);
-        self.dirty.store(true, Ordering::Relaxed);
-    }
-
-    pub async fn persist(&self) -> Result<()> {
-        if !self.dirty.load(Ordering::Relaxed) {
-            return Ok(());
-        }
-
-        let guard = self.data.lock().await;
-        let body = serde_json::to_vec_pretty(&*guard).context("serialising embedding cache")?;
-        if let Some(parent) = self.path.parent() {
-            tokio::fs::create_dir_all(parent)
-                .await
-                .with_context(|| format!("creating cache directory {}", parent.display()))?;
-        }
-        tokio::fs::write(&*self.path, body)
-            .await
-            .with_context(|| format!("writing embedding cache {}", self.path.display()))?;
-        self.dirty.store(false, Ordering::Relaxed);
-        Ok(())
-    }
-}
diff --git a/evaluations/src/cases.rs b/evaluations/src/cases.rs
index 1b20a39..c1fe2f0 100644
--- a/evaluations/src/cases.rs
+++ b/evaluations/src/cases.rs
@@ -156,6 +156,7 @@ mod tests {
                 chunk_min_tokens: 1,
                 chunk_max_tokens: 10,
                 chunk_only: false,
+                namespace_seed: None,
             },
             paragraphs,
             questions,
diff --git a/evaluations/src/cli/mod.rs b/evaluations/src/cli/mod.rs
new file mode 100644
index 0000000..0bac432
--- /dev/null
+++ b/evaluations/src/cli/mod.rs
@@ -0,0 +1,3 @@
+pub mod status;
+
+pub use status::{collect_status, ensure_query_ready, print_status, warm};
diff --git a/evaluations/src/cli/status.rs b/evaluations/src/cli/status.rs
new file mode 100644
index 0000000..b65a5f5
--- /dev/null
+++ b/evaluations/src/cli/status.rs
@@ -0,0 +1,316 @@
+#![allow(clippy::module_name_repetitions)]
+
+use std::path::Path;
+
+use anyhow::{Context, Result};
+use serde::Serialize;
+
+use crate::{
+    args::Config,
+    corpus::{self, CorpusCacheConfig},
+    datasets::{
+        beir_subset_store_summary, beir_subset_stores_ready, content_checksum_for_layout,
+        detect_layout, mix_content_checksum, store_dir_for, ConvertedLayout, DatasetKind,
+    },
+    db::{connect_eval_db, default_database, default_namespace, namespace_has_corpus},
+    slice::{self, ledger_target},
+};
+
+#[derive(Debug, Clone, Serialize)]
+pub struct EvalStatus { // readiness report assembled by `collect_status`
+    pub dataset: String, // `config.dataset.id()`
+    pub slice: Option<String>, // copy of `config.slice`
+    pub converted: ConvertedStatus,
+    pub slice_ledger: SliceLedgerStatus,
+    pub corpus_cache: CorpusCacheStatus,
+    pub namespace: NamespaceStatus,
+    pub query_ready: bool, // converted, ledger, corpus cache, seeding and seed record all ready
+    pub notes: Vec<String>, // hints that `print_status` emits as `Note: ...` lines
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct ConvertedStatus { // state of the converted dataset store(s)
+    pub layout: String, // "sharded-store", "missing", or "beir-mix-subset-stores"
+    pub path: String, // store directory, or a per-subset summary for the BEIR mix
+    pub ready: bool,
+    pub partial_load_eligible: bool, // slice ledger is ready and `config.slice` is set
+    pub checksum: Option<String>, // content checksum; `None` when it could not be computed
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct SliceLedgerStatus { // state of the cached slice manifest
+    pub ready: bool, // manifest was read and `slice::manifest_is_complete` accepted it
+    pub path: Option<String>, // from `slice::cached_manifest_path`
+    pub cases: Option<usize>, // `manifest.case_count`
+    pub positives: Option<usize>, // `manifest.positive_paragraphs`
+    pub negatives: Option<usize>, // `manifest.negative_paragraphs`
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct CorpusCacheStatus { // state of the corpus cache directory
+    pub ready: bool, // `collect_status` sets this equal to `manifest_present`
+    pub path: Option<String>, // cache directory; `None` when no slice manifest was found
+    pub manifest_present: bool, // `corpus::load_cached_manifest` returned a manifest
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct NamespaceStatus { // state of the target database namespace
+    pub namespace: String, // explicit `db_namespace`, else `default_namespace(..)`
+    pub database: String, // explicit `db_database`, else `default_database()`
+    pub seeded: bool, // DB reachable and `namespace_has_corpus` returned true
+    pub namespace_seed_recorded: bool, // DB reachable and cached manifest has a `namespace_seed`
+}
+
+#[allow(clippy::too_many_lines)]
+pub async fn collect_status(config: &Config) -> Result<EvalStatus> { // gathers per-stage readiness
+    let mut notes = Vec::new();
+    let is_beir_mix = config.dataset == DatasetKind::Beir; // BEIR mix uses per-subset stores
+    let converted_path = &config.converted_dataset_path;
+    let layout = if is_beir_mix {
+        ConvertedLayout::Missing // placeholder; `layout` is never consulted for the BEIR mix
+    } else {
+        detect_layout(converted_path)
+    };
+    let layout_label = if is_beir_mix {
+        "beir-mix-subset-stores"
+    } else {
+        match layout {
+            ConvertedLayout::ShardedStore => "sharded-store",
+            ConvertedLayout::Missing => "missing",
+        }
+    };
+
+    let store_dir = store_dir_for(converted_path);
+    let display_path = if is_beir_mix { // one "<subset>-minne (...)" entry per subset store
+        beir_subset_store_summary()?
+            .into_iter()
+            .map(|(subset, paragraphs, questions)| {
+                format!("{subset}-minne ({paragraphs} paragraphs, {questions} questions)")
+            })
+            .collect::<Vec<_>>()
+            .join("; ")
+    } else {
+        store_dir.display().to_string()
+    };
+
+    let manifest_path = slice::cached_manifest_path(config);
+    let slice_config = slice::slice_config_with_limit(config, ledger_target(config));
+    let slice_manifest = manifest_path
+        .as_ref()
+        .and_then(|path| slice::read_manifest_if_exists(path).ok().flatten()); // read errors => None
+
+    let slice_ledger = SliceLedgerStatus {
+        ready: slice_manifest
+            .as_ref()
+            .is_some_and(|manifest| slice::manifest_is_complete(manifest, &slice_config)),
+        path: manifest_path.as_ref().map(|path| path.display().to_string()),
+        cases: slice_manifest.as_ref().map(|manifest| manifest.case_count),
+        positives: slice_manifest.as_ref().map(|manifest| manifest.positive_paragraphs),
+        negatives: slice_manifest.as_ref().map(|manifest| manifest.negative_paragraphs),
+    };
+
+    let beir_paragraph_ids = slice_manifest.as_ref().map(|manifest| { // ids listed in the manifest
+        manifest
+            .paragraphs
+            .iter()
+            .map(|entry| entry.id.clone())
+            .collect::<std::collections::HashSet<_>>()
+    });
+
+    let converted_ready = if is_beir_mix { // needs a complete ledger plus ready subset stores
+        slice_ledger.ready
+            && beir_paragraph_ids
+                .as_ref()
+                .is_some_and(|ids| beir_subset_stores_ready(ids).unwrap_or(false)) // error => not ready
+    } else {
+        layout == ConvertedLayout::ShardedStore
+    };
+
+    let checksum = if is_beir_mix {
+        beir_paragraph_ids
+            .as_ref()
+            .and_then(|ids| mix_content_checksum(ids).ok()) // checksum failure => None
+    } else if layout == ConvertedLayout::ShardedStore {
+        content_checksum_for_layout(converted_path).ok()
+    } else {
+        None
+    };
+
+    let partial_load_eligible = slice_ledger.ready && config.slice.is_some();
+
+    let corpus_cache = if let Some(manifest) = slice_manifest.as_ref() {
+        let cache_settings = CorpusCacheConfig::from(config);
+        let base_dir = corpus::cached_corpus_dir(
+            &cache_settings,
+            config.dataset.id(),
+            manifest.slice_id.as_str(),
+        );
+        let manifest_present = corpus::load_cached_manifest(&base_dir)?.is_some(); // NOTE(review): `?` aborts on a load error, unlike the lenient reload below — confirm intended
+        CorpusCacheStatus {
+            ready: manifest_present,
+            path: Some(base_dir.display().to_string()),
+            manifest_present,
+        }
+    } else {
+        CorpusCacheStatus {
+            ready: false,
+            path: None,
+            manifest_present: false,
+        }
+    };
+
+    let namespace = config
+        .database
+        .db_namespace
+        .clone()
+        .unwrap_or_else(|| { // no explicit namespace: derive one from dataset, limit and slice
+            default_namespace(
+                config.dataset.id(),
+                config.limit,
+                config.slice.as_deref(),
+            )
+        });
+    let database = config
+        .database
+        .db_database
+        .clone()
+        .unwrap_or_else(default_database);
+
+    let namespace_seed = corpus_cache.path.as_ref().and_then(|path| { // second manifest read
+        corpus::load_cached_manifest(Path::new(path))
+            .ok() // load errors are ignored here
+            .flatten()
+            .and_then(|manifest| manifest.metadata.namespace_seed)
+    });
+
+    let (seeded, namespace_seed_recorded) = match connect_eval_db(config, &namespace, &database).await {
+        Ok(db) => {
+            let has_corpus = namespace_has_corpus(&db).await.unwrap_or(false); // query error => unseeded
+            (has_corpus, namespace_seed.is_some())
+        }
+        Err(err) => { // connection failed: record a note instead of returning the error
+            notes.push(format!("SurrealDB unavailable: {err}"));
+            (false, false)
+        }
+    };
+
+    let query_ready = converted_ready
+        && slice_ledger.ready
+        && corpus_cache.ready
+        && seeded
+        && namespace_seed_recorded;
+
+    if !query_ready { // add a hint naming the warm-up command
+        notes.push("Run `cargo eval --warm --slice <id>` to prepare corpus and namespace.".into());
+    }
+
+    Ok(EvalStatus {
+        dataset: config.dataset.id().to_string(),
+        slice: config.slice.clone(),
+        converted: ConvertedStatus {
+            layout: layout_label.to_string(),
+            path: display_path,
+            ready: converted_ready,
+            partial_load_eligible,
+            checksum,
+        },
+        slice_ledger,
+        corpus_cache,
+        namespace: NamespaceStatus {
+            namespace,
+            database,
+            seeded,
+            namespace_seed_recorded,
+        },
+        query_ready,
+        notes,
+    })
+}
+
+pub fn print_status(status: &EvalStatus) { // writes a line-oriented report to stdout
+    println!("Evaluation status for dataset `{}`", status.dataset);
+    if let Some(slice) = &status.slice {
+        println!("Slice: {slice}");
+    }
+    println!(
+        "Converted: {} ({})",
+        if status.converted.ready {
+            "ready"
+        } else {
+            "missing" // printed whenever `ready` is false, whatever the layout label says
+        },
+        status.converted.layout
+    );
+    println!("Converted path: {}", status.converted.path);
+    if status.converted.partial_load_eligible {
+        println!("Slice-first loading: eligible");
+    }
+    println!(
+        "Slice ledger: {}",
+        if status.slice_ledger.ready {
+            format!(
+                "ready ({} cases, {} positives, {} negatives)",
+                status.slice_ledger.cases.unwrap_or(0), // absent counts print as 0
+                status.slice_ledger.positives.unwrap_or(0),
+                status.slice_ledger.negatives.unwrap_or(0)
+            )
+        } else {
+            "missing or incomplete".to_string()
+        }
+    );
+    if let Some(path) = &status.slice_ledger.path {
+        println!("Slice ledger path: {path}");
+    }
+    println!(
+        "Corpus cache: {}",
+        if status.corpus_cache.ready {
+            "ready"
+        } else {
+            "missing"
+        }
+    );
+    if let Some(path) = &status.corpus_cache.path {
+        println!("Corpus cache path: {path}");
+    }
+    println!(
+        "Namespace `{}` / `{}`: seeded={}, namespace_seed_recorded={}",
+        status.namespace.namespace,
+        status.namespace.database,
+        status.namespace.seeded,
+        status.namespace.namespace_seed_recorded
+    );
+    println!(
+        "Query-ready: {}",
+        if status.query_ready {
+            "yes"
+        } else {
+            "no"
+        }
+    );
+    for note in &status.notes { // notes come last, one per line
+        println!("Note: {note}");
+    }
+}
+
+pub async fn warm(config: &Config) -> Result<()> { // prepare, warm, then report status
+    let loaded =
+        crate::datasets::prepare_dataset(config.dataset, config).context("preparing dataset")?;
+    crate::pipeline::warm_evaluation(&loaded.dataset, config, &loaded.content_checksum)
+        .await
+        .context("warming evaluation corpus and namespace")?;
+    let status = collect_status(config).await?; // re-collected after warming
+    print_status(&status);
+    Ok(()) // returns Ok even if `status.query_ready` is still false
+}
+
+pub async fn ensure_query_ready(config: &Config) -> Result<()> { // guard: Ok only if query-ready
+    let status = collect_status(config).await?;
+    if status.query_ready {
+        return Ok(());
+    }
+    print_status(&status); // show what is missing before failing
+    anyhow::bail!(
+        "evaluation is not query-ready; run `cargo eval --warm --slice {}` first",
+        config.slice.as_deref().unwrap_or("<slice-id>") // placeholder when no slice is configured
+    );
+}
diff --git a/evaluations/src/context_stats.rs b/evaluations/src/context_stats.rs
new file mode 100644
index 0000000..ffcd425
--- /dev/null
+++ b/evaluations/src/context_stats.rs
@@ -0,0 +1,177 @@
+use serde::{Deserialize, Serialize};
+
+use common::storage::types::StoredObject;
+
+use crate::types::EvaluationCandidate;
+
+const TOKENIZER_LABEL: &str = "estimated (~chars/4; ingestion uses bert-base-cased)";
+
+#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
+pub struct RetrievedContextStats { // context size for one candidate list
+    pub chunk_count: usize, // `stats_for_candidates` counts each chunk id once
+    pub char_count: usize, // sum of `chars().count()` over the counted chunks
+    pub token_count: usize, // sum of per-chunk estimates (chars / 4, rounded up)
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct RetrievalContextStats { // aggregate built by `aggregate_context_stats`
+    pub tokenizer: String, // always `TOKENIZER_LABEL` when built by `aggregate_context_stats`
+    pub queries: usize, // number of per-query entries aggregated
+    pub total_chunks: usize,
+    pub total_chars: usize,
+    pub total_tokens: usize,
+    pub avg_chunks_per_query: f64, // totals divided by `queries`; 0.0 when there are none
+    pub avg_chars_per_query: f64,
+    pub avg_tokens_per_query: f64,
+    pub p50_tokens_per_query: usize, // via `percentile_usize(.., 0.50)`
+    pub p95_tokens_per_query: usize, // via `percentile_usize(.., 0.95)`
+    pub max_tokens_per_query: usize, // largest per-query `token_count`
+}
+
+pub fn stats_for_candidates(candidates: &[EvaluationCandidate]) -> RetrievedContextStats { // sums distinct chunks
+    let mut seen_chunk_ids = std::collections::HashSet::new();
+    let mut stats = RetrievedContextStats::default();
+
+    for candidate in candidates {
+        for chunk in &candidate.chunks {
+            let chunk_id = chunk.chunk.id().to_string();
+            if !seen_chunk_ids.insert(chunk_id) { // `insert` returns false for an id already seen
+                continue; // count a chunk shared by several candidates only once
+            }
+            let text = chunk.chunk.chunk.as_str();
+            stats.chunk_count += 1;
+            stats.char_count += text.chars().count(); // Unicode scalar values, not bytes
+            stats.token_count += estimate_ingestion_tokens(text);
+        }
+    }
+
+    stats
+}
+
+pub fn aggregate_context_stats(per_query: &[RetrievedContextStats]) -> RetrievalContextStats { // totals, means, percentiles
+    let queries = per_query.len();
+    if queries == 0 { // no queries: all-zero result, which also avoids dividing by zero below
+        return RetrievalContextStats {
+            tokenizer: TOKENIZER_LABEL.to_string(),
+            queries: 0,
+            total_chunks: 0,
+            total_chars: 0,
+            total_tokens: 0,
+            avg_chunks_per_query: 0.0,
+            avg_chars_per_query: 0.0,
+            avg_tokens_per_query: 0.0,
+            p50_tokens_per_query: 0,
+            p95_tokens_per_query: 0,
+            max_tokens_per_query: 0,
+        };
+    }
+
+    let total_chunks: usize = per_query.iter().map(|stats| stats.chunk_count).sum();
+    let total_chars: usize = per_query.iter().map(|stats| stats.char_count).sum();
+    let total_tokens: usize = per_query.iter().map(|stats| stats.token_count).sum();
+    let mut tokens_per_query: Vec<usize> = per_query.iter().map(|stats| stats.token_count).collect();
+    tokens_per_query.sort_unstable(); // ascending order for the percentile lookups and the max
+    let max_tokens_per_query = *tokens_per_query.last().unwrap_or(&0); // last element of the sorted list
+
+    RetrievalContextStats {
+        tokenizer: TOKENIZER_LABEL.to_string(),
+        queries,
+        total_chunks,
+        total_chars,
+        total_tokens,
+        avg_chunks_per_query: total_chunks as f64 / queries as f64, // `queries` > 0 here
+        avg_chars_per_query: total_chars as f64 / queries as f64,
+        avg_tokens_per_query: total_tokens as f64 / queries as f64,
+        p50_tokens_per_query: percentile_usize(&tokens_per_query, 0.50),
+        p95_tokens_per_query: percentile_usize(&tokens_per_query, 0.95),
+        max_tokens_per_query,
+    }
+}
+
+/// Estimates the token count of `text` as its character count divided by 4, rounded up.
+fn estimate_ingestion_tokens(text: &str) -> usize {
+    match text.chars().count() {
+        0 => 0,
+        chars => chars.div_ceil(4),
+    }
+}
+
+/// Returns `sorted[round((len - 1) * fraction)]` with `fraction` clamped to 0..=1; 0 if empty.
+#[allow(clippy::cast_precision_loss, clippy::indexing_slicing, clippy::arithmetic_side_effects)]
+fn percentile_usize(sorted: &[usize], fraction: f64) -> usize {
+    let Some(last) = sorted.len().checked_sub(1) else {
+        return 0;
+    };
+    let rank = (last as f64 * fraction.clamp(0.0, 1.0)).round() as usize;
+    sorted[rank.min(last)]
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use super::*;
+    use common::storage::types::text_chunk::TextChunk;
+    use retrieval_pipeline::RetrievedChunk;
+
+    #[test]
+    fn deduplicates_chunks_when_counting_context() { // one chunk shared by two candidates counts once
+        let shared = Arc::new(TextChunk::new(
+            "src".into(),
+            "hello world".into(),
+            "user".into(),
+        ));
+        let candidates = vec![
+            EvaluationCandidate {
+                entity_id: "a".into(),
+                source_id: "src".into(),
+                entity_name: "A".into(),
+                entity_description: None,
+                entity_category: None,
+                score: 1.0,
+                chunks: vec![RetrievedChunk {
+                    chunk: Arc::clone(&shared), // same chunk instance as candidate "b"
+                    score: 1.0,
+                }],
+            },
+            EvaluationCandidate {
+                entity_id: "b".into(),
+                source_id: "src".into(),
+                entity_name: "B".into(),
+                entity_description: None,
+                entity_category: None,
+                score: 0.9,
+                chunks: vec![RetrievedChunk {
+                    chunk: shared,
+                    score: 0.9,
+                }],
+            },
+        ];
+        let stats = stats_for_candidates(&candidates);
+        assert_eq!(stats.chunk_count, 1);
+        assert_eq!(stats.char_count, "hello world".chars().count());
+        assert_eq!(stats.token_count, 3); // 11 chars / 4, rounded up
+    }
+
+    #[test]
+    fn aggregates_per_query_token_totals() { // checks totals, max and mean over two queries
+        let per_query = vec![
+            RetrievedContextStats {
+                chunk_count: 2,
+                char_count: 100,
+                token_count: 40,
+            },
+            RetrievedContextStats {
+                chunk_count: 5,
+                char_count: 250,
+                token_count: 100,
+            },
+        ];
+        let aggregate = aggregate_context_stats(&per_query);
+        assert_eq!(aggregate.queries, 2);
+        assert_eq!(aggregate.total_chunks, 7);
+        assert_eq!(aggregate.total_tokens, 140);
+        assert_eq!(aggregate.max_tokens_per_query, 100);
+        assert!((aggregate.avg_tokens_per_query - 70.0).abs() < f64::EPSILON); // 140 tokens / 2 queries
+    }
+}
diff --git a/evaluations/src/corpus/config.rs b/evaluations/src/corpus/config.rs
index a7e6045..880771b 100644
--- a/evaluations/src/corpus/config.rs
+++ b/evaluations/src/corpus/config.rs
@@ -11,32 +11,14 @@ pub struct CorpusCacheConfig {
     pub ingestion_max_retries: usize,
 }
 
-impl CorpusCacheConfig {
-    pub fn new(
-        ingestion_cache_dir: impl Into<PathBuf>,
-        force_refresh: bool,
-        refresh_embeddings_only: bool,
-        ingestion_batch_size: usize,
-        ingestion_max_retries: usize,
-    ) -> Self {
+impl From<&Config> for CorpusCacheConfig {
+    fn from(config: &Config) -> Self {
         Self {
-            ingestion_cache_dir: ingestion_cache_dir.into(),
-            force_refresh,
-            refresh_embeddings_only,
-            ingestion_batch_size,
-            ingestion_max_retries,
+            ingestion_cache_dir: config.ingest.ingestion_cache_dir.clone(),
+            force_refresh: config.force_convert || config.ingest.slice_reset_ingestion,
+            refresh_embeddings_only: config.ingest.refresh_embeddings_only,
+            ingestion_batch_size: config.ingest.ingestion_batch_size,
+            ingestion_max_retries: config.ingest.ingestion_max_retries,
         }
     }
 }
-
-impl From<&Config> for CorpusCacheConfig {
-    fn from(config: &Config) -> Self {
-        CorpusCacheConfig::new(
-            config.ingest.ingestion_cache_dir.clone(),
-            config.force_convert || config.ingest.slice_reset_ingestion,
-            config.ingest.refresh_embeddings_only,
-            config.ingest.ingestion_batch_size,
-            config.ingest.ingestion_max_retries,
-        )
-    }
-}
diff --git a/evaluations/src/corpus/mod.rs b/evaluations/src/corpus/mod.rs
index 5804384..ba41444 100644
--- a/evaluations/src/corpus/mod.rs
+++ b/evaluations/src/corpus/mod.rs
@@ -5,11 +5,11 @@ pub(crate) mod store;
 pub use config::CorpusCacheConfig;
 pub use orchestrator::{
     cached_corpus_dir, compute_ingestion_fingerprint, corpus_handle_from_manifest, ensure_corpus,
-    load_cached_manifest,
+    load_cached_manifest, persist_corpus_manifest,
 };
 pub use store::{
     seed_manifest_into_db, window_manifest, CorpusHandle, CorpusManifest, CorpusMetadata,
-    CorpusQuestion, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
+    CorpusQuestion, NamespaceSeedRecord, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
 };
 
 pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig {
@@ -20,6 +20,6 @@ pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline
             chunk_overlap_tokens: config.ingest.ingest_chunk_overlap_tokens,
             ..Default::default()
         },
-        chunk_only: config.ingest.ingest_chunks_only,
+        chunk_only: !config.ingest.include_entities,
     }
 }
diff --git a/evaluations/src/corpus/orchestrator.rs b/evaluations/src/corpus/orchestrator.rs
index a445f9e..3575090 100644
--- a/evaluations/src/corpus/orchestrator.rs
+++ b/evaluations/src/corpus/orchestrator.rs
@@ -9,8 +9,6 @@ use std::{
 use anyhow::{anyhow, Context, Result};
 use async_openai::Client;
 use chrono::Utc;
-#[cfg(not(test))]
-use common::utils::config::get_config;
 use common::{
     storage::{
         db::SurrealDbClient,
@@ -125,10 +123,14 @@ pub async fn ensure_corpus(
     openai: Arc<OpenAIClient>,
     user_id: &str,
     converted_path: &Path,
+    precomputed_checksum: Option<&str>,
     ingestion_config: IngestionConfig,
 ) -> Result<CorpusHandle> {
-    let checksum = compute_file_checksum(converted_path)
-        .with_context(|| format!("computing checksum for {}", converted_path.display()))?;
+    let checksum = match precomputed_checksum {
+        Some(value) => value.to_string(),
+        None => crate::datasets::content_checksum_for_layout(converted_path)
+            .with_context(|| format!("computing checksum for {}", converted_path.display()))?,
+    };
     let ingestion_fingerprint =
         build_ingestion_fingerprint(dataset, slice, &checksum, &ingestion_config);
 
@@ -381,6 +383,7 @@ pub async fn ensure_corpus(
             chunk_min_tokens: ingestion_config.tuning.chunk_min_tokens,
             chunk_max_tokens: ingestion_config.tuning.chunk_max_tokens,
             chunk_only: ingestion_config.chunk_only,
+            namespace_seed: None,
         },
         paragraphs: corpus_paragraphs,
         questions: corpus_questions,
@@ -415,7 +418,7 @@ pub async fn ensure_corpus(
         negative_ingested: stats.negative_ingested,
     };
 
-    persist_manifest(&handle).context("persisting corpus manifest")?;
+    persist_corpus_manifest(&handle).context("persisting corpus manifest")?;
 
     Ok(handle)
 }
@@ -501,7 +504,6 @@ async fn ingest_paragraph_batch(
     Ok(shards)
 }
 
-#[cfg(test)]
 async fn create_ingest_db(namespace: &str) -> Result<Arc<SurrealDbClient>> {
     let db = SurrealDbClient::memory(namespace, "corpus")
         .await
@@ -509,21 +511,6 @@ async fn create_ingest_db(namespace: &str) -> Result<Arc<SurrealDbClient>> {
     Ok(Arc::new(db))
 }
 
-#[cfg(not(test))]
-async fn create_ingest_db(namespace: &str) -> Result<Arc<SurrealDbClient>> {
-    let config = get_config().context("loading app config for ingestion database")?;
-    let db = SurrealDbClient::new(
-        &config.surrealdb_address,
-        &config.surrealdb_username,
-        &config.surrealdb_password,
-        namespace,
-        "corpus",
-    )
-    .await
-    .context("creating surrealdb database for ingestion")?;
-    Ok(Arc::new(db))
-}
-
 #[allow(clippy::too_many_arguments)]
 async fn ingest_single_paragraph(
     pipeline: Arc<IngestionPipeline>,
@@ -631,8 +618,12 @@ pub fn compute_ingestion_fingerprint(
     slice: &ResolvedSlice<'_>,
     converted_path: &Path,
     ingestion_config: &IngestionConfig,
+    precomputed_checksum: Option<&str>,
 ) -> Result<String> {
-    let checksum = compute_file_checksum(converted_path)?;
+    let checksum = match precomputed_checksum {
+        Some(value) => value.to_string(),
+        None => crate::datasets::content_checksum_for_layout(converted_path)?,
+    };
     Ok(build_ingestion_fingerprint(
         dataset,
         slice,
@@ -641,7 +632,7 @@ pub fn compute_ingestion_fingerprint(
     ))
 }
 
-pub fn load_cached_manifest(base_dir: &Path) -> Result<Option<CorpusManifest>> {
+pub fn load_cached_manifest(base_dir: &std::path::Path) -> Result<Option<CorpusManifest>> {
     let path = base_dir.join("manifest.json");
     if !path.exists() {
         return Ok(None);
@@ -656,7 +647,7 @@ pub fn load_cached_manifest(base_dir: &Path) -> Result<Option<CorpusManifest>> {
     Ok(Some(manifest))
 }
 
-fn persist_manifest(handle: &CorpusHandle) -> Result<()> {
+pub fn persist_corpus_manifest(handle: &CorpusHandle) -> Result<()> {
     let path = handle.path.join("manifest.json");
     if let Some(parent) = path.parent() {
         fs::create_dir_all(parent)
@@ -685,24 +676,6 @@ pub fn corpus_handle_from_manifest(manifest: CorpusManifest, base_dir: PathBuf)
     }
 }
 
-#[allow(clippy::indexing_slicing)]
-fn compute_file_checksum(path: &Path) -> Result<String> {
-    let mut file = fs::File::open(path)
-        .with_context(|| format!("opening file {} for checksum", path.display()))?;
-    let mut hasher = Sha256::new();
-    let mut buffer = [0u8; 8192];
-    loop {
-        let read = file
-            .read(&mut buffer)
-            .with_context(|| format!("reading {} for checksum", path.display()))?;
-        if read == 0 {
-            break;
-        }
-        hasher.update(&buffer[..read]);
-    }
-    Ok(format!("{:x}", hasher.finalize()))
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -731,7 +704,6 @@ mod tests {
             metadata: crate::datasets::DatasetMetadata::for_kind(
                 DatasetKind::default(),
                 false,
-                None,
             ),
             source: "src".to_string(),
             paragraphs: vec![paragraph],
diff --git a/evaluations/src/corpus/store.rs b/evaluations/src/corpus/store.rs
index f219251..294ceed 100644
--- a/evaluations/src/corpus/store.rs
+++ b/evaluations/src/corpus/store.rs
@@ -42,7 +42,7 @@ fn default_chunk_max_tokens() -> usize {
 }
 
 fn default_chunk_only() -> bool {
-    false
+    true
 }
 
 // Reuse the pipeline's canonical embedded-artifact types so the on-disk corpus
@@ -122,6 +122,14 @@ pub struct CorpusManifest {
     pub questions: Vec<CorpusQuestion>,
 }
 
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct NamespaceSeedRecord { // stored in `CorpusMetadata::namespace_seed`
+    pub namespace: String, // presumably the namespace that was seeded — confirm at the write site
+    pub database: String, // presumably the database that was seeded — confirm at the write site
+    pub slice_case_count: usize, // NOTE(review): looks like the slice's case count at seed time — confirm
+    pub seeded_at: DateTime<Utc>, // UTC timestamp; presumably when seeding finished — confirm
+}
+
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct CorpusMetadata {
     pub dataset_id: String,
@@ -144,6 +152,8 @@ pub struct CorpusMetadata {
     pub chunk_max_tokens: usize,
     #[serde(default = "default_chunk_only")]
     pub chunk_only: bool,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub namespace_seed: Option<NamespaceSeedRecord>,
 }
 
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
@@ -629,6 +639,7 @@ mod tests {
                 chunk_min_tokens: 1,
                 chunk_max_tokens: 10,
                 chunk_only: false,
+                namespace_seed: None,
             },
             paragraphs: vec![paragraph_one, paragraph_two],
             questions: vec![question],
diff --git a/evaluations/src/datasets/beir.rs b/evaluations/src/datasets/beir.rs
index a06a529..be355a0 100644
--- a/evaluations/src/datasets/beir.rs
+++ b/evaluations/src/datasets/beir.rs
@@ -1,5 +1,5 @@
 use std::{
-    collections::{BTreeMap, HashMap},
+    collections::{BTreeMap, HashMap, HashSet},
     fs::File,
     io::{BufRead, BufReader},
     path::{Path, PathBuf},
@@ -47,20 +47,71 @@ struct QrelEntry {
     score: i32,
 }
 
+/// Convert only documents that appear in qrels (the BEIR evaluation closed world).
 #[allow(clippy::arithmetic_side_effects, clippy::indexing_slicing)]
 pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<ConvertedParagraph>> {
+    convert_beir_documents(raw_dir, dataset, None)
+}
+
+/// Convert a subset of qrels-world documents. `doc_ids` use corpus ids (unprefixed).
+#[allow(
+    clippy::too_many_lines,
+    clippy::arithmetic_side_effects,
+    clippy::indexing_slicing
+)]
+pub fn convert_beir_documents(
+    raw_dir: &Path,
+    dataset: DatasetKind,
+    doc_ids: Option<&HashSet<String>>,
+) -> Result<Vec<ConvertedParagraph>> {
     let corpus_path = raw_dir.join("corpus.jsonl");
     let queries_path = raw_dir.join("queries.jsonl");
     let qrels_path = resolve_qrels_path(raw_dir)?;
 
-    let corpus = load_corpus(&corpus_path)?;
     let queries = load_queries(&queries_path)?;
     let qrels = load_qrels(&qrels_path)?;
 
-    let mut paragraphs = Vec::with_capacity(corpus.len());
+    let mut qrels_doc_ids = HashSet::new();
+    for entries in qrels.values() {
+        for entry in entries {
+            qrels_doc_ids.insert(entry.doc_id.clone());
+        }
+    }
+
+    let target_doc_ids: HashSet<String> = match doc_ids {
+        Some(ids) => ids
+            .iter()
+            .filter(|id| qrels_doc_ids.contains(*id))
+            .cloned()
+            .collect(),
+        None => qrels_doc_ids.clone(),
+    };
+
+    if target_doc_ids.is_empty() {
+        return Err(anyhow!(
+            "no qrels documents to convert for {} at {}",
+            dataset.id(),
+            raw_dir.display()
+        ));
+    }
+
+    let corpus = load_corpus_filtered(&corpus_path, &target_doc_ids)?;
+
+    let mut doc_ids_sorted: Vec<String> = target_doc_ids.into_iter().collect();
+    doc_ids_sorted.sort();
+
+    let mut paragraphs = Vec::with_capacity(doc_ids_sorted.len());
     let mut paragraph_index = HashMap::new();
 
-    for (doc_id, entry) in &corpus {
+    for doc_id in &doc_ids_sorted {
+        let Some(entry) = corpus.get(doc_id) else {
+            warn!(
+                doc_id = %doc_id,
+                dataset = %dataset.id(),
+                "Skipping qrels document missing from corpus"
+            );
+            continue;
+        };
         let paragraph_id = format!("{}-{doc_id}", dataset.source_prefix());
         let paragraph = ConvertedParagraph {
             id: paragraph_id.clone(),
@@ -87,6 +138,12 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
             continue;
         };
 
+        if let Some(filter) = doc_ids {
+            if !filter.contains(&best.doc_id) {
+                continue;
+            }
+        }
+
         let Some(&paragraph_slot) = paragraph_index.get(&best.doc_id) else {
             missing_docs += 1;
             warn!(
@@ -106,7 +163,6 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
             );
             continue;
         };
-        let answers = vec![snippet];
 
         let question_id = format!("{}-{query_id}", dataset.source_prefix());
         paragraphs[paragraph_slot]
@@ -114,7 +170,7 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
             .push(ConvertedQuestion {
                 id: question_id,
                 question: query.text.clone(),
-                answers,
+                answers: vec![snippet],
                 is_impossible: false,
             });
     }
@@ -122,13 +178,23 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
     if missing_queries + missing_docs + skipped_answers > 0 {
         warn!(
             missing_queries,
-            missing_docs, skipped_answers, "Skipped some BEIR qrels entries during conversion"
+            missing_docs,
+            skipped_answers,
+            dataset = %dataset.id(),
+            "Skipped some BEIR qrels entries during conversion"
         );
     }
 
     Ok(paragraphs)
 }
 
+/// Strips the `<source_prefix>-` prefix from `paragraph_id`; `None` when it is not present.
+pub fn corpus_doc_id(paragraph_id: &str, dataset: DatasetKind) -> Option<String> {
+    let expected_prefix = format!("{}-", dataset.source_prefix());
+    let doc_id = paragraph_id.strip_prefix(expected_prefix.as_str())?;
+    Some(doc_id.to_string())
+}
+
 fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
     let qrels_dir = raw_dir.join("qrels");
     let candidates = ["test.tsv", "dev.tsv", "train.tsv"];
@@ -148,7 +214,10 @@ fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
 }
 
 #[allow(clippy::arithmetic_side_effects)]
-fn load_corpus(path: &Path) -> Result<BTreeMap<String, BeirParagraph>> {
+fn load_corpus_filtered(
+    path: &Path,
+    doc_ids: &HashSet<String>,
+) -> Result<BTreeMap<String, BeirParagraph>> {
     let file =
         File::open(path).with_context(|| format!("opening BEIR corpus at {}", path.display()))?;
     let reader = BufReader::new(file);
@@ -167,6 +236,9 @@ fn load_corpus(path: &Path) -> Result<BTreeMap<String, BeirParagraph>> {
                 path.display()
             )
         })?;
+        if !doc_ids.contains(&corpus_row.id) {
+            continue;
+        }
         let title = corpus_row.title.unwrap_or_else(|| corpus_row.id.clone());
         let text = corpus_row.text.unwrap_or_default();
         let context = build_context(&title, &text);
@@ -296,10 +368,8 @@ mod tests {
     use std::fs;
     use tempfile::tempdir;
 
-    #[test]
-    #[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)]
-    fn converts_basic_beir_layout() {
-        let dir = tempdir().unwrap();
+    #[allow(clippy::unwrap_used)]
+    fn write_fixture(dir: &tempfile::TempDir) {
         let corpus = r#"
 {"_id":"d1","title":"Doc 1","text":"Doc one has some text for testing."}
 {"_id":"d2","title":"Doc 2","text":"Second document content."}
@@ -313,24 +383,34 @@ mod tests {
         fs::write(dir.path().join("queries.jsonl"), queries.trim()).unwrap();
         fs::create_dir_all(dir.path().join("qrels")).unwrap();
         fs::write(dir.path().join("qrels/test.tsv"), qrels).unwrap();
+    }
+
+    #[test]
+    #[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)]
+    fn converts_qrels_world_only() {
+        let dir = tempdir().unwrap();
+        write_fixture(&dir);
 
         let paragraphs = convert_beir(dir.path(), DatasetKind::Fever).unwrap();
 
-        assert_eq!(paragraphs.len(), 2);
-        let doc_one = paragraphs
-            .iter()
-            .find(|p| p.id == "fever-d1")
-            .expect("missing paragraph for d1");
+        assert_eq!(paragraphs.len(), 1);
+        let doc_one = &paragraphs[0];
+        assert_eq!(doc_one.id, "fever-d1");
         assert_eq!(doc_one.questions.len(), 1);
-        let question = &doc_one.questions[0];
-        assert_eq!(question.id, "fever-q1");
-        assert!(!question.answers.is_empty());
-        assert!(doc_one.context.contains(&question.answers[0]));
+        assert_eq!(doc_one.questions[0].id, "fever-q1");
+    }
 
-        let doc_two = paragraphs
-            .iter()
-            .find(|p| p.id == "fever-d2")
-            .expect("missing paragraph for d2");
-        assert!(doc_two.questions.is_empty());
+    #[test]
+    #[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)]
+    fn converts_filtered_doc_ids() {
+        let dir = tempdir().unwrap();
+        write_fixture(&dir);
+
+        let mut ids = HashSet::new();
+        ids.insert("d1".to_string());
+        let paragraphs =
+            convert_beir_documents(dir.path(), DatasetKind::Fever, Some(&ids)).unwrap();
+        assert_eq!(paragraphs.len(), 1);
+        assert_eq!(paragraphs[0].id, "fever-d1");
     }
 }
diff --git a/evaluations/src/datasets/beir_mix.rs b/evaluations/src/datasets/beir_mix.rs
new file mode 100644
index 0000000..45a8e66
--- /dev/null
+++ b/evaluations/src/datasets/beir_mix.rs
@@ -0,0 +1,262 @@
+use std::collections::{HashMap, HashSet};
+
+use anyhow::{anyhow, Context, Result};
+use sha2::{Digest, Sha256};
+use tracing::info;
+
+use super::{
+    beir,
+    checksum::hash_file,
+    store::{
+        self, build_dataset_from_catalog, paragraph_path, read_meta, store_dir_for,
+        upsert_sharded_paragraphs, write_sharded,
+    },
+    BEIR_DATASETS, ConvertedDataset, DatasetKind, DatasetMetadata,
+};
+use crate::{
+    args::Config,
+    slice,
+};
+
+/// Routes a prefixed paragraph id to the BEIR subset whose `{source_prefix}-` it starts with.
+/// When several prefixes match, the longest one wins (first in `BEIR_DATASETS` on ties).
+pub fn subset_for_paragraph_id(paragraph_id: &str) -> Option<DatasetKind> {
+    let matches_prefix = |kind: &DatasetKind| {
+        let rest = paragraph_id.strip_prefix(kind.source_prefix());
+        rest.is_some_and(|rest| rest.starts_with('-'))
+    };
+    // `min_by_key` over `Reverse(len)` keeps the first longest match without sorting a copy.
+    let candidates = BEIR_DATASETS.iter().copied().filter(matches_prefix);
+    candidates.min_by_key(|kind| std::cmp::Reverse(kind.source_prefix().len()))
+}
+
+pub fn build_beir_mix_qrels_dataset(include_unanswerable: bool) -> Result<ConvertedDataset> {
+    if include_unanswerable {
+        tracing::warn!("BEIR mix ignores include_unanswerable flag; all questions are answerable");
+    }
+
+    let mut paragraphs = Vec::new();
+    for subset in BEIR_DATASETS {
+        let entry = super::dataset_entry_for_kind(subset)?;
+        let subset_paragraphs = beir::convert_beir(&entry.raw_path, subset)?;
+        paragraphs.extend(subset_paragraphs);
+    }
+
+    Ok(ConvertedDataset {
+        generated_at: super::base_timestamp(),
+        metadata: DatasetMetadata::for_kind(DatasetKind::Beir, include_unanswerable),
+        source: "beir-mix".to_string(),
+        paragraphs,
+    })
+}
+
+pub fn prepare_beir_mix(config: &Config) -> Result<super::loader::LoadedDataset> {
+    let virtual_ds = build_beir_mix_qrels_dataset(config.llm_mode)?; // in-memory mix of all subsets
+    let slice_config = slice::slice_config_with_limit(config, slice::ledger_target(config));
+    let resolved = slice::resolve_slice(&virtual_ds, &slice_config).context(
+        "resolving BEIR mix slice ledger (check --slice and --limit match your intent)",
+    )?;
+
+    let unique: HashSet<String> = resolved // de-duplicated paragraph ids from the slice manifest
+        .manifest
+        .paragraphs
+        .iter()
+        .map(|entry| entry.id.clone())
+        .collect();
+
+    materialize_subset_stores(&unique, config.force_convert)?; // convert only what the slice needs
+
+    let dataset = load_beir_mix_from_subsets(&unique)?; // re-read from the per-subset stores
+    let checksum = mix_content_checksum(&unique)?; // hashes just the selected paragraph files
+
+    info!(
+        slice = resolved.manifest.slice_id.as_str(),
+        paragraphs = unique.len(),
+        checksum = %checksum,
+        "Prepared BEIR mix from per-subset converted stores"
+    );
+
+    Ok(super::loader::LoadedDataset {
+        dataset,
+        content_checksum: checksum,
+        partial: true, // the dataset holds only the paragraphs addressed by the slice
+    })
+}
+
+/// Ensures every requested paragraph exists in its subset's converted store, converting only
+/// the corpus documents that are missing (or all requested ones when `force` is set).
+pub fn materialize_subset_stores(
+    paragraph_ids: &HashSet<String>,
+    force: bool,
+) -> Result<()> {
+    let mut by_subset: HashMap<DatasetKind, Vec<String>> = HashMap::new();
+    for paragraph_id in paragraph_ids {
+        let kind = subset_for_paragraph_id(paragraph_id).with_context(|| {
+            format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store")
+        })?;
+        by_subset.entry(kind).or_default().push(paragraph_id.clone());
+    }
+
+    for (kind, ids) in by_subset {
+        let entry = super::dataset_entry_for_kind(kind)?;
+        let store_dir = store_dir_for(&entry.converted_path);
+        let store_exists = store_dir.join("meta.json").is_file();
+        // A forced run reconverts everything, so the existing id listing is not read at all;
+        // that also lets `force` repair a store whose listing can no longer be loaded.
+        let missing: Vec<String> = if force || !store_exists {
+            ids
+        } else {
+            let existing = store::load_paragraph_ids_set(&store_dir)?;
+            ids.into_iter()
+                .filter(|paragraph_id| !existing.contains(paragraph_id))
+                .collect()
+        };
+
+        if missing.is_empty() {
+            continue;
+        }
+
+        let corpus_ids: HashSet<String> = missing
+            .iter()
+            .filter_map(|paragraph_id| beir::corpus_doc_id(paragraph_id, kind))
+            .collect();
+        let paragraphs = beir::convert_beir_documents(
+            &entry.raw_path,
+            kind,
+            Some(&corpus_ids),
+        )?;
+
+        if store_exists {
+            upsert_sharded_paragraphs(&store_dir, &paragraphs)?;
+        } else {
+            let question_count = paragraphs
+                .iter()
+                .map(|paragraph| paragraph.questions.len())
+                .sum::<usize>();
+            let dataset = ConvertedDataset {
+                generated_at: super::base_timestamp(),
+                metadata: DatasetMetadata::for_kind(kind, false),
+                source: entry.raw_path.display().to_string(),
+                paragraphs,
+            };
+            write_sharded(&dataset, &store_dir)?;
+            info!(
+                subset = kind.id(),
+                store = %store_dir.display(),
+                paragraphs = dataset.paragraphs.len(),
+                questions = question_count,
+                "Created subset converted store for BEIR mix"
+            );
+        }
+    }
+
+    Ok(())
+}
+
+pub fn load_beir_mix_from_subsets(paragraph_ids: &HashSet<String>) -> Result<ConvertedDataset> {
+    let mut by_subset: HashMap<DatasetKind, HashSet<String>> = HashMap::new();
+    for paragraph_id in paragraph_ids {
+        let kind = subset_for_paragraph_id(paragraph_id).with_context(|| {
+            format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store")
+        })?;
+        by_subset
+            .entry(kind)
+            .or_default()
+            .insert(paragraph_id.clone());
+    }
+
+    let mut paragraphs = Vec::with_capacity(paragraph_ids.len());
+    for (kind, subset_ids) in by_subset {
+        let entry = super::dataset_entry_for_kind(kind)?;
+        let store_dir = store_dir_for(&entry.converted_path);
+        let partial = build_dataset_from_catalog(&store_dir, &subset_ids)?;
+        paragraphs.extend(partial.paragraphs);
+    }
+
+    paragraphs.sort_by(|left, right| left.id.cmp(&right.id));
+
+    Ok(ConvertedDataset {
+        generated_at: super::base_timestamp(),
+        metadata: DatasetMetadata::for_kind(DatasetKind::Beir, false),
+        source: "beir-mix".to_string(),
+        paragraphs,
+    })
+}
+
+pub fn mix_content_checksum(paragraph_ids: &HashSet<String>) -> Result<String> {
+    let mut ids: Vec<String> = paragraph_ids.iter().cloned().collect();
+    ids.sort();
+
+    let mut hasher = Sha256::new();
+    for paragraph_id in ids {
+        let kind = subset_for_paragraph_id(&paragraph_id)
+            .ok_or_else(|| anyhow!("unknown BEIR subset for paragraph '{paragraph_id}'"))?;
+        let entry = super::dataset_entry_for_kind(kind)?;
+        let store_dir = store_dir_for(&entry.converted_path);
+        let path = paragraph_path(&store_dir, &paragraph_id);
+        if !path.is_file() {
+            return Err(anyhow!(
+                "missing converted paragraph {} at {}",
+                paragraph_id,
+                path.display()
+            ));
+        }
+        hasher.update(paragraph_id.as_bytes());
+        hasher.update([0]);
+        hasher.update(hash_file(&path)?.as_bytes());
+    }
+
+    Ok(format!("{:x}", hasher.finalize()))
+}
+
+pub fn beir_subset_stores_ready(paragraph_ids: &HashSet<String>) -> Result<bool> {
+    for paragraph_id in paragraph_ids {
+        let kind = subset_for_paragraph_id(paragraph_id).with_context(|| {
+            format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store")
+        })?;
+        let entry = super::dataset_entry_for_kind(kind)?;
+        let store_dir = store_dir_for(&entry.converted_path);
+        if !store_dir.join("meta.json").is_file() {
+            return Ok(false);
+        }
+        if !paragraph_path(&store_dir, paragraph_id).is_file() {
+            return Ok(false);
+        }
+    }
+    Ok(true)
+}
+
+pub fn beir_subset_store_summary() -> Result<Vec<(String, usize, usize)>> {
+    let mut summary = Vec::new();
+    for kind in BEIR_DATASETS {
+        let entry = super::dataset_entry_for_kind(kind)?;
+        let store_dir = store_dir_for(&entry.converted_path);
+        if store_dir.join("meta.json").is_file() {
+            let meta = read_meta(&store_dir)?;
+            summary.push((kind.id().to_string(), meta.paragraph_count, meta.question_count));
+        }
+    }
+    Ok(summary)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn routes_prefixed_paragraph_ids() {
+        assert_eq!(
+            subset_for_paragraph_id("fever-doc-1"),
+            Some(DatasetKind::Fever)
+        );
+        assert_eq!(
+            subset_for_paragraph_id("nq-beir-doc-1"),
+            Some(DatasetKind::NqBeir)
+        );
+        assert_eq!(
+            subset_for_paragraph_id("trec-covid-doc-1"),
+            Some(DatasetKind::TrecCovid)
+        );
+        assert!(subset_for_paragraph_id("unknown-doc").is_none());
+    }
+}
diff --git a/evaluations/src/datasets/checksum.rs b/evaluations/src/datasets/checksum.rs
new file mode 100644
index 0000000..331457a
--- /dev/null
+++ b/evaluations/src/datasets/checksum.rs
@@ -0,0 +1,216 @@
+use std::{
+    fs::{self, File},
+    io::Read,
+    path::Path,
+};
+
+#[cfg(test)]
+use std::path::PathBuf;
+
+use anyhow::{Context, Result};
+use serde::{Deserialize, Serialize};
+use sha2::{Digest, Sha256};
+
+const SIDECAR_VERSION: u32 = 1;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChecksumSidecar {
+    pub version: u32,
+    pub sha256: String,
+    pub size_bytes: u64,
+    #[serde(default)]
+    pub modified_unix_secs: u64,
+}
+
+impl ChecksumSidecar {
+    #[cfg(test)]
+    pub fn sidecar_path(content_path: &Path) -> PathBuf {
+        content_path.with_extension("sha256") // replaces the extension: `a.json` -> `a.sha256`
+    }
+
+    #[cfg(test)]
+    pub fn is_valid_for(&self, content_path: &Path) -> bool {
+        if self.version != SIDECAR_VERSION {
+            return false;
+        }
+        let Ok(metadata) = fs::metadata(content_path) else {
+            return false; // content file is gone or unreadable
+        };
+        if metadata.len() != self.size_bytes {
+            return false;
+        }
+        if self.modified_unix_secs != 0 { // 0 means no mtime was recorded in the sidecar
+            let Ok(modified) = metadata.modified() else {
+                return true; // mtime unavailable now: rely on the version and size checks alone
+            };
+            let Ok(secs) = modified.duration_since(std::time::UNIX_EPOCH) else {
+                return true; // mtime predates the Unix epoch: same fallback
+            };
+            if secs.as_secs() != self.modified_unix_secs {
+                return false;
+            }
+        }
+        true
+    }
+}
+
+/// Streams the file at `path` through SHA-256 and returns the lowercase hex digest.
+#[allow(clippy::indexing_slicing)]
+pub fn hash_file(path: &Path) -> Result<String> {
+    let mut file =
+        File::open(path).with_context(|| format!("opening file {} for checksum", path.display()))?;
+    let mut hasher = Sha256::new();
+    let mut buffer = vec![0u8; 65_536];
+    loop {
+        match file.read(&mut buffer) {
+            Ok(0) => break,
+            Ok(read) => hasher.update(&buffer[..read]),
+            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => {} // transient: retry
+            Err(err) => return Err(err).context(format!("reading {} for checksum", path.display())),
+        }
+    }
+    Ok(format!("{:x}", hasher.finalize()))
+}
+
+pub fn read_sidecar(path: &Path) -> Result<Option<ChecksumSidecar>> {
+    if !path.exists() {
+        return Ok(None);
+    }
+    let raw = fs::read_to_string(path)
+        .with_context(|| format!("reading checksum sidecar {}", path.display()))?;
+    let sidecar: ChecksumSidecar = serde_json::from_str(&raw)
+        .with_context(|| format!("parsing checksum sidecar {}", path.display()))?;
+    Ok(Some(sidecar))
+}
+
+#[cfg(test)]
+pub fn write_sidecar(content_path: &Path, sha256: &str) -> Result<()> {
+    let metadata = fs::metadata(content_path)
+        .with_context(|| format!("reading metadata for {}", content_path.display()))?;
+    let modified_unix_secs = metadata
+        .modified()
+        .ok()
+        .and_then(|time| time.duration_since(std::time::UNIX_EPOCH).ok())
+        .map_or(0, |duration| duration.as_secs());
+    let sidecar = ChecksumSidecar {
+        version: SIDECAR_VERSION,
+        sha256: sha256.to_string(),
+        size_bytes: metadata.len(),
+        modified_unix_secs,
+    };
+    let path = ChecksumSidecar::sidecar_path(content_path);
+    if let Some(parent) = path.parent() {
+        fs::create_dir_all(parent)
+            .with_context(|| format!("creating checksum sidecar directory {}", parent.display()))?;
+    }
+    let blob = serde_json::to_vec_pretty(&sidecar).context("serialising checksum sidecar")?;
+    fs::write(&path, blob)
+        .with_context(|| format!("writing checksum sidecar {}", path.display()))?;
+    Ok(())
+}
+
+#[cfg(test)]
+pub fn content_checksum(content_path: &Path) -> Result<String> {
+    let sidecar_path = ChecksumSidecar::sidecar_path(content_path);
+    if let Some(sidecar) = read_sidecar(&sidecar_path)? {
+        if sidecar.is_valid_for(content_path) {
+            return Ok(sidecar.sha256);
+        }
+    }
+    let sha256 = hash_file(content_path)?;
+    write_sidecar(content_path, &sha256)?;
+    Ok(sha256)
+}
+
+/// Returns the SHA-256 digest over every file in `store_dir` (relative path + file hash),
+/// caching it in `checksum.sha256`. The cache is reused only while it is at least as new as
+/// `meta.json`; an unreadable or version-mismatched cache is recomputed rather than failing.
+pub fn store_aggregate_checksum(store_dir: &Path) -> Result<String> {
+    let marker = store_dir.join("checksum.sha256");
+    let meta = store_dir.join("meta.json");
+    let modified = |path: &Path| path.metadata().and_then(|info| info.modified()).ok();
+    let cache_is_fresh = marker.is_file()
+        && meta.is_file()
+        && modified(marker.as_path())
+            .zip(modified(meta.as_path()))
+            .is_some_and(|(marker_modified, meta_modified)| marker_modified >= meta_modified);
+    if cache_is_fresh {
+        // The marker is only a cache: a truncated or corrupt file must not block the run.
+        let cached = read_sidecar(&marker).ok().flatten();
+        if let Some(sidecar) = cached.filter(|sidecar| sidecar.version == SIDECAR_VERSION) {
+            return Ok(sidecar.sha256);
+        }
+    }
+
+    let mut entries = Vec::new();
+    collect_store_files(store_dir, store_dir, &mut entries)?;
+    entries.sort();
+
+    // `collect_store_files` already skips `checksum.sha256`, so every entry is store content.
+    let mut hasher = Sha256::new();
+    for relative in &entries {
+        hasher.update(relative.as_bytes());
+        hasher.update([0]);
+        let file_hash = hash_file(&store_dir.join(relative))?;
+        hasher.update(file_hash.as_bytes());
+    }
+    let digest = format!("{:x}", hasher.finalize());
+
+    let sidecar = ChecksumSidecar {
+        version: SIDECAR_VERSION,
+        sha256: digest.clone(),
+        // For the aggregate marker this field records the number of hashed files.
+        size_bytes: entries.len() as u64,
+        modified_unix_secs: std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .map_or(0, |duration| duration.as_secs()),
+    };
+    if let Some(parent) = marker.parent() {
+        fs::create_dir_all(parent)?;
+    }
+    fs::write(&marker, serde_json::to_vec_pretty(&sidecar)?)?;
+    Ok(digest)
+}
+
+fn collect_store_files(base: &Path, current: &Path, entries: &mut Vec<String>) -> Result<()> {
+    for entry in fs::read_dir(current)? {
+        let entry = entry?;
+        let path = entry.path();
+        if path.file_name().is_some_and(|name| name == "checksum.sha256") {
+            continue;
+        }
+        if path.is_dir() {
+            collect_store_files(base, &path, entries)?;
+        } else if path.is_file() {
+            let relative = path
+                .strip_prefix(base)
+                .unwrap_or(&path)
+                .to_string_lossy()
+                .replace('\\', "/");
+            entries.push(relative);
+        }
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::tempdir;
+
+    #[test]
+    fn sidecar_round_trip() -> Result<()> {
+        let dir = tempdir()?;
+        let file = dir.path().join("sample.json");
+        fs::write(&file, br#"{"hello":"world"}"#)?;
+
+        let first = content_checksum(&file)?;
+        let second = content_checksum(&file)?;
+        assert_eq!(first, second);
+
+        fs::write(&file, br#"{"hello":"world!"}"#)?;
+        let third = content_checksum(&file)?;
+        assert_ne!(first, third);
+        Ok(())
+    }
+}
diff --git a/evaluations/src/datasets/loader.rs b/evaluations/src/datasets/loader.rs
new file mode 100644
index 0000000..752ad93
--- /dev/null
+++ b/evaluations/src/datasets/loader.rs
@@ -0,0 +1,197 @@
+use std::collections::HashSet;
+
+use anyhow::{Context, Result};
+use tracing::info;
+
+use super::{
+    catalog,
+    store::{
+        self, build_dataset_from_catalog, detect_layout, read_meta, store_dir_for, write_sharded,
+        ConvertedLayout,
+    },
+    ConvertedDataset, DatasetKind,
+};
+use crate::{
+    args::Config,
+    slice::{self, SliceConfig},
+};
+
+#[derive(Debug, Clone)]
+pub struct LoadedDataset {
+    pub dataset: ConvertedDataset,
+    pub content_checksum: String,
+    pub partial: bool,
+}
+
+pub fn prepare_dataset(dataset_kind: DatasetKind, config: &Config) -> Result<LoadedDataset> {
+    if dataset_kind == DatasetKind::Beir {
+        return super::beir_mix::prepare_beir_mix(config);
+    }
+
+    let converted_path = &config.converted_dataset_path;
+    let layout = detect_layout(converted_path);
+    let store_dir = store_dir_for(converted_path);
+
+    if layout == ConvertedLayout::Missing || config.force_convert {
+        return convert_and_load(dataset_kind, config);
+    }
+
+    load_from_store(dataset_kind, config, &store_dir, true)
+}
+
+fn convert_and_load(dataset_kind: DatasetKind, config: &Config) -> Result<LoadedDataset> {
+    let dataset = super::convert(
+        config.raw_dataset_path.as_path(),
+        dataset_kind,
+        config.llm_mode,
+    )
+    .with_context(|| format!("converting {} dataset", dataset_kind.label()))?;
+
+    let store_dir = store_dir_for(&config.converted_dataset_path);
+    write_sharded(&dataset, &store_dir)?;
+    prebuild_catalog_slices(&dataset, config)?;
+    let checksum = crate::datasets::store_aggregate_checksum(&store_dir)?;
+
+    Ok(LoadedDataset {
+        dataset,
+        content_checksum: checksum,
+        partial: false,
+    })
+}
+
+fn load_from_store(
+    dataset_kind: DatasetKind,
+    config: &Config,
+    store_dir: &std::path::Path,
+    allow_partial: bool, // permits the slice-manifest fast path below
+) -> Result<LoadedDataset> {
+    let checksum = crate::datasets::store_aggregate_checksum(store_dir)?; // covers the whole store
+    let meta = read_meta(store_dir)?;
+    validate_metadata_fields(&meta.metadata, dataset_kind, config)?; // bail on id/flag mismatch
+
+    if allow_partial {
+        if let Some(paragraph_ids) = slice_paragraph_ids_for_fast_path(config)? {
+            let unique: HashSet<String> = paragraph_ids.into_iter().collect(); // drop duplicates
+            info!(
+                paragraphs = unique.len(),
+                store = %store_dir.display(),
+                "Loading slice-addressed paragraphs from sharded converted store"
+            );
+            let dataset = build_dataset_from_catalog(store_dir, &unique)?;
+            return Ok(LoadedDataset {
+                dataset,
+                content_checksum: checksum, // still the full-store checksum, not per-slice
+                partial: true,
+            });
+        }
+    }
+
+    info!(
+        store = %store_dir.display(),
+        paragraphs = meta.paragraph_count,
+        "Loading full sharded converted store"
+    );
+    let dataset = store::load_sharded_full(store_dir)?; // no complete cached manifest: load all
+    Ok(LoadedDataset {
+        dataset,
+        content_checksum: checksum,
+        partial: false,
+    })
+}
+
+fn slice_paragraph_ids_for_fast_path(config: &Config) -> Result<Option<Vec<String>>> {
+    let Some(manifest_path) = slice::cached_manifest_path(config) else {
+        return Ok(None);
+    };
+    let Some(manifest) = slice::read_manifest_if_exists(&manifest_path)? else {
+        return Ok(None);
+    };
+    let slice_config = slice::slice_config_with_limit(config, slice::ledger_target(config));
+    if !slice::manifest_is_complete(&manifest, &slice_config) {
+        return Ok(None);
+    }
+    Ok(Some(
+        manifest
+            .paragraphs
+            .iter()
+            .map(|entry| entry.id.clone())
+            .collect(),
+    ))
+}
+
+fn validate_metadata_fields(
+    metadata: &super::DatasetMetadata,
+    dataset_kind: DatasetKind,
+    config: &Config,
+) -> Result<()> {
+    if metadata.id != dataset_kind.id() {
+        anyhow::bail!(
+            "converted dataset targets '{}', expected '{}'",
+            metadata.id,
+            dataset_kind.id()
+        );
+    }
+    if metadata.include_unanswerable != config.llm_mode {
+        anyhow::bail!(
+            "converted dataset include_unanswerable mismatch (expected {}, found {})",
+            config.llm_mode,
+            metadata.include_unanswerable
+        );
+    }
+    Ok(())
+}
+
+pub fn prebuild_catalog_slices(dataset: &ConvertedDataset, config: &Config) -> Result<()> {
+    let catalog = catalog()?;
+    let entry = catalog.dataset(dataset.metadata.id.as_str())?;
+    if entry.slices.is_empty() {
+        return Ok(());
+    }
+
+    info!(
+        dataset = dataset.metadata.id.as_str(),
+        slices = entry.slices.len(),
+        "Prebuilding catalog slice ledgers"
+    );
+
+    for slice_entry in &entry.slices {
+        let slice_config = slice_config_for_catalog_entry(config, slice_entry);
+        match slice::resolve_slice(dataset, &slice_config) {
+            Ok(resolved) => info!(
+                slice = resolved.manifest.slice_id.as_str(),
+                cases = resolved.manifest.case_count,
+                positives = resolved.manifest.positive_paragraphs,
+                negatives = resolved.manifest.negative_paragraphs,
+                "Prebuilt catalog slice ledger"
+            ),
+            Err(err) => tracing::warn!(
+                slice = slice_entry.id.as_str(),
+                error = %err,
+                "Failed to prebuild catalog slice ledger"
+            ),
+        }
+    }
+
+    Ok(())
+}
+
+fn slice_config_for_catalog_entry<'a>(
+    config: &'a Config,
+    slice_entry: &'a super::SliceEntry,
+) -> SliceConfig<'a> {
+    SliceConfig {
+        cache_dir: config.cache_dir.as_path(),
+        force_convert: config.force_convert,
+        explicit_slice: Some(slice_entry.id.as_str()),
+        limit: slice_entry.limit,
+        corpus_limit: slice_entry.corpus_limit,
+        slice_seed: slice_entry.seed.unwrap_or(config.slice_seed),
+        llm_mode: slice_entry
+            .include_unanswerable
+            .unwrap_or(config.llm_mode),
+        negative_multiplier: slice_entry
+            .negative_multiplier
+            .unwrap_or(config.negative_multiplier),
+        require_verified_chunks: config.retrieval.require_verified_chunks,
+    }
+}
diff --git a/evaluations/src/datasets/mod.rs b/evaluations/src/datasets/mod.rs
index 1274c5e..7380d77 100644
--- a/evaluations/src/datasets/mod.rs
+++ b/evaluations/src/datasets/mod.rs
@@ -1,6 +1,10 @@
 mod beir;
+mod beir_mix;
+mod checksum;
+mod loader;
 mod nq;
 mod squad;
+mod store;
 
 use std::{
     collections::{BTreeMap, HashMap},
@@ -20,38 +24,31 @@ const MANIFEST_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/manifest.yaml"
 static DATASET_CATALOG: OnceCell<DatasetCatalog> = OnceCell::new();
 
 #[derive(Debug, Clone)]
-#[allow(dead_code)]
 pub struct DatasetCatalog {
     datasets: BTreeMap<String, DatasetEntry>,
     slices: HashMap<String, SliceLocation>,
-    default_dataset: String,
 }
 
 #[derive(Debug, Clone)]
-#[allow(dead_code)]
 pub struct DatasetEntry {
     pub metadata: DatasetMetadata,
     pub raw_path: PathBuf,
     pub converted_path: PathBuf,
-    pub include_unanswerable: bool,
     pub slices: Vec<SliceEntry>,
 }
 
 #[derive(Debug, Clone)]
-#[allow(dead_code)]
 pub struct SliceEntry {
     pub id: String,
     pub dataset_id: String,
-    pub label: String,
-    pub description: Option<String>,
     pub limit: Option<usize>,
     pub corpus_limit: Option<usize>,
     pub include_unanswerable: Option<bool>,
     pub seed: Option<u64>,
+    pub negative_multiplier: Option<f32>,
 }
 
 #[derive(Debug, Clone)]
-#[allow(dead_code)]
 struct SliceLocation {
     dataset_id: String,
     slice_index: usize,
@@ -59,7 +56,6 @@ struct SliceLocation {
 
 #[derive(Debug, Deserialize)]
 struct ManifestFile {
-    default_dataset: Option<String>,
     datasets: Vec<ManifestDataset>,
 }
 
@@ -81,6 +77,7 @@ struct ManifestDataset {
 }
 
 #[derive(Debug, Deserialize)]
+#[allow(dead_code)]
 struct ManifestSlice {
     id: String,
     label: String,
@@ -94,6 +91,8 @@ struct ManifestSlice {
     include_unanswerable: Option<bool>,
     #[serde(default)]
     seed: Option<u64>,
+    #[serde(default)]
+    negative_multiplier: Option<f32>,
 }
 
 impl DatasetCatalog {
@@ -111,18 +110,19 @@ impl DatasetCatalog {
             let raw_path = resolve_path(root, &dataset.raw);
             let converted_path = resolve_path(root, &dataset.converted);
 
-            if !raw_path.exists() {
+            if !raw_path.exists() && dataset.id != "beir" {
                 bail!(
                     "dataset '{}' raw file missing at {}",
                     dataset.id,
                     raw_path.display()
                 );
             }
-            if !converted_path.exists() {
+            let store_dir = store::store_dir_for(&converted_path);
+            if !converted_path.exists() && !store_dir.join("meta.json").is_file() {
                 warn!(
-                    "dataset '{}' converted file missing at {}; the next conversion run will regenerate it",
+                    "dataset '{}' converted store missing at {}; the next conversion run will regenerate it",
                     dataset.id,
-                    converted_path.display()
+                    store_dir.display()
                 );
             }
 
@@ -139,7 +139,6 @@ impl DatasetCatalog {
                     .clone()
                     .unwrap_or_else(|| dataset.id.clone()),
                 include_unanswerable: dataset.include_unanswerable,
-                context_token_limit: None,
             };
 
             let mut entry_slices = Vec::with_capacity(dataset.slices.len());
@@ -154,12 +153,11 @@ impl DatasetCatalog {
                 entry_slices.push(SliceEntry {
                     id: manifest_slice.id.clone(),
                     dataset_id: dataset.id.clone(),
-                    label: manifest_slice.label,
-                    description: manifest_slice.description,
                     limit: manifest_slice.limit,
                     corpus_limit: manifest_slice.corpus_limit,
                     include_unanswerable: manifest_slice.include_unanswerable,
                     seed: manifest_slice.seed,
+                    negative_multiplier: manifest_slice.negative_multiplier,
                 });
                 slices.insert(
                     manifest_slice.id,
@@ -176,22 +174,16 @@ impl DatasetCatalog {
                     metadata,
                     raw_path,
                     converted_path,
-                    include_unanswerable: dataset.include_unanswerable,
                     slices: entry_slices,
                 },
             );
         }
 
-        let default_dataset = manifest
-            .default_dataset
-            .or_else(|| datasets.keys().next().cloned())
-            .ok_or_else(|| anyhow!("dataset manifest does not include any datasets"))?;
+        if datasets.is_empty() {
+            bail!("dataset manifest does not include any datasets");
+        }
 
-        Ok(Self {
-            datasets,
-            slices,
-            default_dataset,
-        })
+        Ok(Self { datasets, slices })
     }
 
     pub fn global() -> Result<&'static Self> {
@@ -204,12 +196,6 @@ impl DatasetCatalog {
             .ok_or_else(|| anyhow!("unknown dataset '{id}' in manifest"))
     }
 
-    #[allow(dead_code)]
-    pub fn default_dataset(&self) -> Result<&DatasetEntry> {
-        self.dataset(&self.default_dataset)
-    }
-
-    #[allow(dead_code)]
     pub fn slice(&self, slice_id: &str) -> Result<(&DatasetEntry, &SliceEntry)> {
         let location = self
             .slices
@@ -236,20 +222,29 @@ fn resolve_path(root: &Path, value: &str) -> PathBuf {
     }
 }
 
+pub use checksum::store_aggregate_checksum;
+pub use beir_mix::{
+    beir_subset_store_summary, beir_subset_stores_ready, mix_content_checksum,
+};
+pub use loader::{prebuild_catalog_slices, prepare_dataset};
+pub use store::{
+    content_checksum_for_layout, detect_layout, store_dir_for, write_sharded, ConvertedLayout,
+};
+
 pub fn catalog() -> Result<&'static DatasetCatalog> {
     DatasetCatalog::global()
 }
 
-fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> {
+pub(crate) fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> {
     let catalog = catalog()?;
     catalog.dataset(kind.id())
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Default)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, ValueEnum, Default)]
 pub enum DatasetKind {
-    #[default]
     SquadV2,
     NaturalQuestions,
+    #[default]
     Beir,
     #[value(name = "fever")]
     Fever,
@@ -416,16 +411,10 @@ pub struct DatasetMetadata {
     pub source_prefix: String,
     #[serde(default)]
     pub include_unanswerable: bool,
-    #[serde(default)]
-    pub context_token_limit: Option<usize>,
 }
 
 impl DatasetMetadata {
-    pub fn for_kind(
-        kind: DatasetKind,
-        include_unanswerable: bool,
-        context_token_limit: Option<usize>,
-    ) -> Self {
+    pub fn for_kind(kind: DatasetKind, include_unanswerable: bool) -> Self {
         if let Ok(entry) = dataset_entry_for_kind(kind) {
             return Self {
                 id: entry.metadata.id.clone(),
@@ -434,7 +423,6 @@ impl DatasetMetadata {
                 entity_suffix: entry.metadata.entity_suffix.clone(),
                 source_prefix: entry.metadata.source_prefix.clone(),
                 include_unanswerable,
-                context_token_limit,
             };
         }
 
@@ -445,13 +433,12 @@ impl DatasetMetadata {
             entity_suffix: kind.entity_suffix().to_string(),
             source_prefix: kind.source_prefix().to_string(),
             include_unanswerable,
-            context_token_limit,
         }
     }
 }
 
 fn default_metadata() -> DatasetMetadata {
-    DatasetMetadata::for_kind(DatasetKind::default(), false, None)
+    DatasetMetadata::for_kind(DatasetKind::default(), false)
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -483,14 +470,15 @@ pub fn convert(
     raw_path: &Path,
     dataset: DatasetKind,
     include_unanswerable: bool,
-    context_token_limit: Option<usize>,
 ) -> Result<ConvertedDataset> {
     let paragraphs = match dataset {
         DatasetKind::SquadV2 => squad::convert_squad(raw_path)?,
-        DatasetKind::NaturalQuestions => {
-            nq::convert_nq(raw_path, include_unanswerable, context_token_limit)?
+        DatasetKind::NaturalQuestions => nq::convert_nq(raw_path, include_unanswerable)?,
+        DatasetKind::Beir => {
+            bail!(
+                "BEIR mix is prepared via slice-first subset stores; use prepare_beir_mix instead of convert"
+            );
         }
-        DatasetKind::Beir => convert_beir_mix(include_unanswerable, context_token_limit)?,
         DatasetKind::Fever
         | DatasetKind::Fiqa
         | DatasetKind::HotpotQa
@@ -501,11 +489,6 @@ pub fn convert(
         | DatasetKind::NqBeir => beir::convert_beir(raw_path, dataset)?,
     };
 
-    let metadata_limit = match dataset {
-        DatasetKind::NaturalQuestions => None,
-        _ => context_token_limit,
-    };
-
     let generated_at = match dataset {
         DatasetKind::Beir
         | DatasetKind::Fever
@@ -526,100 +509,12 @@ pub fn convert(
 
     Ok(ConvertedDataset {
         generated_at,
-        metadata: DatasetMetadata::for_kind(dataset, include_unanswerable, metadata_limit),
+        metadata: DatasetMetadata::for_kind(dataset, include_unanswerable),
         source: source_label,
         paragraphs,
     })
 }
 
-fn convert_beir_mix(
-    include_unanswerable: bool,
-    _context_token_limit: Option<usize>,
-) -> Result<Vec<ConvertedParagraph>> {
-    if include_unanswerable {
-        warn!("BEIR mix ignores include_unanswerable flag; all questions are answerable");
-    }
-
-    let mut paragraphs = Vec::new();
-    for subset in BEIR_DATASETS {
-        let entry = dataset_entry_for_kind(subset)?;
-        let subset_paragraphs = beir::convert_beir(&entry.raw_path, subset)?;
-        paragraphs.extend(subset_paragraphs);
-    }
-
-    Ok(paragraphs)
-}
-
-fn ensure_parent(path: &Path) -> Result<()> {
-    if let Some(parent) = path.parent() {
-        fs::create_dir_all(parent)
-            .with_context(|| format!("creating parent directory for {}", path.display()))?;
-    }
-    Ok(())
-}
-
-pub fn write_converted(dataset: &ConvertedDataset, converted_path: &Path) -> Result<()> {
-    ensure_parent(converted_path)?;
-    let json =
-        serde_json::to_string_pretty(dataset).context("serialising converted dataset to JSON")?;
-    fs::write(converted_path, json)
-        .with_context(|| format!("writing converted dataset to {}", converted_path.display()))
-}
-
-pub fn read_converted(converted_path: &Path) -> Result<ConvertedDataset> {
-    let raw = fs::read_to_string(converted_path)
-        .with_context(|| format!("reading converted dataset at {}", converted_path.display()))?;
-    let mut dataset: ConvertedDataset = serde_json::from_str(&raw)
-        .with_context(|| format!("parsing converted dataset at {}", converted_path.display()))?;
-    if dataset.metadata.id.trim().is_empty() {
-        dataset.metadata = default_metadata();
-    }
-    if dataset.source.is_empty() {
-        dataset.source = converted_path.display().to_string();
-    }
-    Ok(dataset)
-}
-
-pub fn ensure_converted(
-    dataset_kind: DatasetKind,
-    raw_path: &Path,
-    converted_path: &Path,
-    force: bool,
-    include_unanswerable: bool,
-    context_token_limit: Option<usize>,
-) -> Result<ConvertedDataset> {
-    if force || !converted_path.exists() {
-        let dataset = convert(
-            raw_path,
-            dataset_kind,
-            include_unanswerable,
-            context_token_limit,
-        )?;
-        write_converted(&dataset, converted_path)?;
-        return Ok(dataset);
-    }
-
-    match read_converted(converted_path) {
-        Ok(dataset)
-            if dataset.metadata.id == dataset_kind.id()
-                && dataset.metadata.include_unanswerable == include_unanswerable
-                && dataset.metadata.context_token_limit == context_token_limit =>
-        {
-            Ok(dataset)
-        }
-        _ => {
-            let dataset = convert(
-                raw_path,
-                dataset_kind,
-                include_unanswerable,
-                context_token_limit,
-            )?;
-            write_converted(&dataset, converted_path)?;
-            Ok(dataset)
-        }
-    }
-}
-
 pub fn base_timestamp() -> DateTime<Utc> {
     Utc.with_ymd_and_hms(2023, 1, 1, 0, 0, 0).unwrap()
 }
diff --git a/evaluations/src/datasets/nq.rs b/evaluations/src/datasets/nq.rs
index a39956d..2b72682 100644
--- a/evaluations/src/datasets/nq.rs
+++ b/evaluations/src/datasets/nq.rs
@@ -16,11 +16,7 @@ use super::{ConvertedParagraph, ConvertedQuestion};
     clippy::arithmetic_side_effects,
     clippy::cast_sign_loss
 )]
-pub fn convert_nq(
-    raw_path: &Path,
-    include_unanswerable: bool,
-    _context_token_limit: Option<usize>,
-) -> Result<Vec<ConvertedParagraph>> {
+pub fn convert_nq(raw_path: &Path, include_unanswerable: bool) -> Result<Vec<ConvertedParagraph>> {
     #[allow(dead_code)]
     #[derive(Debug, Deserialize)]
     struct NqExample {
diff --git a/evaluations/src/datasets/store.rs b/evaluations/src/datasets/store.rs
new file mode 100644
index 0000000..a4e85d9
--- /dev/null
+++ b/evaluations/src/datasets/store.rs
@@ -0,0 +1,410 @@
+use std::{
+    collections::{HashMap, HashSet},
+    fs::{self, File, OpenOptions},
+    io::{BufRead, BufReader, Write},
+    path::{Path, PathBuf},
+};
+
+use anyhow::{anyhow, Context, Result};
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
+use tracing::info;
+
+use super::{
+    checksum::store_aggregate_checksum,
+    ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetMetadata,
+};
+use crate::slice;
+
+pub const SHARDED_STORE_VERSION: u32 = 1; // stored as `ShardedMeta::version` by `write_sharded`
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ShardedMeta { // contents of a sharded store's `meta.json`
+    pub version: u32, // set to SHARDED_STORE_VERSION when the store is written
+    pub generated_at: DateTime<Utc>, // copied from the source ConvertedDataset
+    pub metadata: DatasetMetadata, // copied from the source ConvertedDataset
+    pub source: String, // copied from the source ConvertedDataset
+    pub paragraph_count: usize, // paragraphs written; increased by upserts of new ids
+    pub question_count: usize, // questions across all paragraphs; increased by upserts
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub(crate) struct QuestionRecord { // one line of `questions.jsonl`
+    paragraph_id: String, // id of the paragraph the question belongs to
+    #[serde(flatten)] // question fields are serialised inline, next to `paragraph_id`
+    question: ConvertedQuestion,
+}
+
+#[derive(Debug, Clone)]
+pub struct QuestionCatalog { // in-memory form of `questions.jsonl`
+    pub entries: Vec<QuestionRecord>, // records in file order (blank lines are skipped on load)
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ConvertedLayout { // on-disk state reported by `detect_layout`
+    ShardedStore, // the store directory contains a `meta.json` file
+    Missing, // no `meta.json` file was found in the store directory
+}
+
+/// Maps a converted-dataset path to its store directory: the path's parent
+/// joined with its file stem ("dataset" when the path has no stem).
+pub fn store_dir_for(converted_path: &Path) -> PathBuf {
+    let dir_name: String = match converted_path.file_stem() {
+        Some(stem) => stem.to_string_lossy().into(),
+        None => "dataset".to_string(),
+    };
+    let base = converted_path.parent().unwrap_or_else(|| Path::new("."));
+    base.join(dir_name)
+}
+
+/// Reports `ShardedStore` when the derived store directory holds `meta.json`, else `Missing`.
+pub fn detect_layout(converted_path: &Path) -> ConvertedLayout {
+    let meta_path = store_dir_for(converted_path).join("meta.json");
+    if !meta_path.is_file() {
+        return ConvertedLayout::Missing;
+    }
+    ConvertedLayout::ShardedStore
+}
+
+fn paragraph_file_name(paragraph_id: &str) -> String { // file name of one paragraph shard
+    format!("{}.json", slice::paragraph_storage_key(paragraph_id)) // key comes from `slice`
+}
+
+/// Path of a paragraph's JSON shard inside `<store_dir>/paragraphs`.
+pub fn paragraph_path(store_dir: &Path, paragraph_id: &str) -> PathBuf {
+    let file_name = paragraph_file_name(paragraph_id);
+    store_dir.join("paragraphs").join(file_name)
+}
+
+/// Writes `dataset` as a fresh sharded store under `store_dir`, removing any store that is
+/// already there, and returns the aggregate checksum computed once every file is written.
+/// Any filesystem or serialisation failure is returned as an error.
+pub fn write_sharded(dataset: &ConvertedDataset, store_dir: &Path) -> Result<String> {
+    if store_dir.exists() {
+        fs::remove_dir_all(store_dir)
+            .with_context(|| format!("clearing sharded store {}", store_dir.display()))?;
+    }
+    fs::create_dir_all(store_dir.join("paragraphs"))
+        .with_context(|| format!("creating sharded store {}", store_dir.display()))?;
+
+    let mut questions_file = File::create(store_dir.join("questions.jsonl"))
+        .context("creating questions.jsonl for sharded store")?;
+    let mut paragraph_ids_file = File::create(store_dir.join("paragraph_ids.jsonl"))
+        .context("creating paragraph_ids.jsonl for sharded store")?;
+
+    for paragraph in &dataset.paragraphs {
+        writeln!(paragraph_ids_file, "{}", paragraph.id)
+            .context("writing paragraph id to paragraph_ids.jsonl")?;
+        for question in &paragraph.questions {
+            let record = QuestionRecord {
+                paragraph_id: paragraph.id.clone(),
+                question: question.clone(),
+            };
+            serde_json::to_writer(&mut questions_file, &record)
+                .context("writing question record to questions.jsonl")?;
+            questions_file.write_all(b"\n")?;
+        }
+
+        let path = paragraph_path(store_dir, &paragraph.id);
+        if let Some(parent) = path.parent() {
+            fs::create_dir_all(parent)?;
+        }
+        fs::write(
+            &path,
+            serde_json::to_vec(paragraph).context("serialising sharded paragraph")?,
+        )
+        .with_context(|| format!("writing sharded paragraph {}", path.display()))?;
+    }
+
+    // `meta.json` is written last: `detect_layout` treats its presence as "store exists", so
+    // an interrupted write must not leave it behind next to a partial set of shards.
+    let question_count: usize = dataset.paragraphs.iter().map(|p| p.questions.len()).sum();
+    let meta = ShardedMeta {
+        version: SHARDED_STORE_VERSION,
+        generated_at: dataset.generated_at,
+        metadata: dataset.metadata.clone(),
+        source: dataset.source.clone(),
+        paragraph_count: dataset.paragraphs.len(),
+        question_count,
+    };
+    let meta_path = store_dir.join("meta.json");
+    fs::write(
+        &meta_path,
+        serde_json::to_vec_pretty(&meta).context("serialising sharded store metadata")?,
+    )
+    .with_context(|| format!("writing sharded metadata {}", meta_path.display()))?;
+
+    let digest = store_aggregate_checksum(store_dir)?;
+    info!(
+        store = %store_dir.display(),
+        paragraphs = dataset.paragraphs.len(),
+        questions = question_count,
+        checksum = %digest,
+        "Wrote sharded converted dataset"
+    );
+    Ok(digest)
+}
+
+pub fn read_meta(store_dir: &Path) -> Result<ShardedMeta> { // loads `<store_dir>/meta.json`
+    let path = store_dir.join("meta.json");
+    let raw = fs::read_to_string(&path) // whole file as text
+        .with_context(|| format!("reading sharded metadata {}", path.display()))?;
+    serde_json::from_str(&raw) // target type is inferred from the return type
+        .with_context(|| format!("parsing sharded metadata {}", path.display()))
+}
+
+/// Aggregate checksum of the sharded store behind `converted_path`.
+/// Errors when no store is detected for that path.
+pub fn content_checksum_for_layout(converted_path: &Path) -> Result<String> {
+    if detect_layout(converted_path) == ConvertedLayout::Missing {
+        return Err(anyhow!(
+            "converted dataset missing at {}",
+            converted_path.display()
+        ));
+    }
+    crate::datasets::store_aggregate_checksum(&store_dir_for(converted_path))
+}
+
+fn load_paragraph(store_dir: &Path, paragraph_id: &str) -> Result<ConvertedParagraph> {
+    let path = paragraph_path(store_dir, paragraph_id); // shard file for this paragraph id
+    let raw = fs::read(&path) // raw bytes of the shard
+        .with_context(|| format!("reading sharded paragraph {}", path.display()))?;
+    serde_json::from_slice(&raw) // target type is inferred from the return type
+        .with_context(|| format!("parsing sharded paragraph {}", path.display()))
+}
+
+fn load_paragraphs(store_dir: &Path, paragraph_ids: &[String]) -> Result<Vec<ConvertedParagraph>> {
+    paragraph_ids
+        .iter()
+        .map(|paragraph_id| load_paragraph(store_dir, paragraph_id)) // one shard per id, in order
+        .collect() // collecting into `Result` stops at the first shard that fails to load
+}
+
+pub fn load_sharded_partial(store_dir: &Path, paragraph_ids: &[String]) -> Result<ConvertedDataset> {
+    let meta = read_meta(store_dir)?; // dataset-level fields come from `meta.json`
+    let mut paragraphs = load_paragraphs(store_dir, paragraph_ids)?; // only the requested shards
+    paragraphs.sort_by(|left, right| left.id.cmp(&right.id)); // result order is by id, not input order
+    Ok(ConvertedDataset {
+        generated_at: meta.generated_at,
+        metadata: meta.metadata,
+        source: meta.source,
+        paragraphs,
+    })
+}
+
+pub fn load_sharded_full(store_dir: &Path) -> Result<ConvertedDataset> { // loads every shard
+    let meta = read_meta(store_dir)?; // dataset-level fields come from `meta.json`
+    let ids = load_paragraph_ids(store_dir)?; // ids in the order recorded in the index file
+    let paragraphs = load_paragraphs(store_dir, &ids)?; // kept in index order (no sorting here)
+    Ok(ConvertedDataset {
+        generated_at: meta.generated_at,
+        metadata: meta.metadata,
+        source: meta.source,
+        paragraphs,
+    })
+}
+
+pub fn load_paragraph_ids_set(store_dir: &Path) -> Result<HashSet<String>> { // index as a set
+    Ok(load_paragraph_ids(store_dir)?.into_iter().collect()) // repeated ids collapse into one
+}
+
+#[allow(clippy::arithmetic_side_effects)] // the counters below use plain `+` / `+=`
+pub fn upsert_sharded_paragraphs( // writes each shard; indexes only ids not seen before
+    store_dir: &Path,
+    paragraphs: &[ConvertedParagraph],
+) -> Result<()> {
+    if paragraphs.is_empty() {
+        return Ok(());
+    }
+    if !store_dir.join("meta.json").is_file() {
+        return Err(anyhow!(
+            "cannot upsert into missing sharded store at {}",
+            store_dir.display()
+        ));
+    }
+
+    fs::create_dir_all(store_dir.join("paragraphs"))
+        .with_context(|| format!("creating paragraphs directory in {}", store_dir.display()))?;
+
+    let mut existing = load_paragraph_ids_set(store_dir)?; // ids indexed so far, incl. this batch
+    let questions_path = store_dir.join("questions.jsonl");
+    let mut questions_file = OpenOptions::new()
+        .create(true)
+        .append(true)
+        .open(&questions_path)
+        .with_context(|| format!("opening question catalog {}", questions_path.display()))?;
+
+    let mut ids_file = None; // opened lazily, only once a new id has to be appended
+    let mut new_paragraphs = 0usize;
+    let mut new_questions = 0usize;
+
+    for paragraph in paragraphs {
+        let is_new = existing.insert(paragraph.id.clone()); // false for repeats in this batch too
+        let path = paragraph_path(store_dir, &paragraph.id);
+        if let Some(parent) = path.parent() {
+            fs::create_dir_all(parent)?;
+        }
+        fs::write(
+            &path,
+            serde_json::to_vec(paragraph).context("serialising sharded paragraph")?,
+        )
+        .with_context(|| format!("writing sharded paragraph {}", path.display()))?;
+
+        if is_new { // known ids only get their shard rewritten; index and catalog stay as-is
+            if ids_file.is_none() {
+                ids_file = Some(
+                    OpenOptions::new()
+                        .create(true)
+                        .append(true)
+                        .open(store_dir.join("paragraph_ids.jsonl"))
+                        .context("opening paragraph_ids.jsonl for append")?,
+                );
+            }
+            if let Some(file) = ids_file.as_mut() {
+                writeln!(file, "{}", paragraph.id).context("appending paragraph id")?;
+            }
+            new_paragraphs += 1;
+
+            for question in &paragraph.questions {
+                let record = QuestionRecord {
+                    paragraph_id: paragraph.id.clone(),
+                    question: question.clone(),
+                };
+                serde_json::to_writer(&mut questions_file, &record)
+                    .context("writing question record to questions.jsonl")?;
+                questions_file.write_all(b"\n")?;
+                new_questions += 1;
+            }
+        }
+    }
+
+    if new_paragraphs > 0 || new_questions > 0 {
+        let meta = read_meta(store_dir)?;
+        let updated = ShardedMeta {
+            paragraph_count: meta.paragraph_count + new_paragraphs,
+            question_count: meta.question_count + new_questions,
+            ..meta
+        };
+        fs::write(
+            store_dir.join("meta.json"),
+            serde_json::to_vec_pretty(&updated).context("serialising updated sharded metadata")?,
+        )?;
+        store_aggregate_checksum(store_dir)?; // returned digest is not used here
+        info!(
+            store = %store_dir.display(),
+            new_paragraphs,
+            new_questions,
+            "Upserted paragraphs into sharded converted store"
+        );
+    }
+
+    Ok(())
+}
+
+/// Reads `paragraph_ids.jsonl` and returns the trimmed ids in file order.
+///
+/// # Errors
+/// Fails when the index cannot be opened or read, or when a line is blank.
+pub fn load_paragraph_ids(store_dir: &Path) -> Result<Vec<String>> {
+    let index_path = store_dir.join("paragraph_ids.jsonl");
+    let index = File::open(&index_path)
+        .with_context(|| format!("opening paragraph id index {}", index_path.display()))?;
+
+    let mut ids = Vec::new();
+    for line in BufReader::new(index).lines() {
+        let line = line.context("reading paragraph id index line")?;
+        let id = line.trim();
+        if id.is_empty() {
+            return Err(anyhow!("empty paragraph id in index"));
+        }
+        ids.push(id.to_string());
+    }
+    Ok(ids)
+}
+
+/// Parses `questions.jsonl` into a `QuestionCatalog`, skipping blank lines.
+pub fn load_question_catalog(store_dir: &Path) -> Result<QuestionCatalog> {
+    let catalog_path = store_dir.join("questions.jsonl");
+    let catalog_file = File::open(&catalog_path)
+        .with_context(|| format!("opening question catalog {}", catalog_path.display()))?;
+
+    let entries = BufReader::new(catalog_file)
+        .lines()
+        .filter(|line| !matches!(line, Ok(text) if text.trim().is_empty()))
+        .map(|line| {
+            let text = line.context("reading question catalog line")?;
+            serde_json::from_str::<QuestionRecord>(&text)
+                .context("parsing question catalog record")
+        })
+        .collect::<Result<Vec<_>>>()?;
+    Ok(QuestionCatalog { entries })
+}
+
+/// Builds a dataset for `paragraph_ids` whose questions come from `questions.jsonl`
+/// rather than from the paragraph shards themselves.
+///
+/// Paragraphs without any catalog entry end up with an empty question list.
+pub fn build_dataset_from_catalog(
+    store_dir: &Path,
+    paragraph_ids: &HashSet<String>,
+) -> Result<ConvertedDataset> {
+    let catalog = load_question_catalog(store_dir)?;
+    let mut grouped: HashMap<String, Vec<ConvertedQuestion>> = HashMap::new();
+    for record in catalog.entries {
+        if !paragraph_ids.contains(&record.paragraph_id) {
+            continue;
+        }
+        grouped
+            .entry(record.paragraph_id)
+            .or_default()
+            .push(record.question);
+    }
+
+    let wanted: Vec<String> = paragraph_ids.iter().cloned().collect();
+    let mut dataset = load_sharded_partial(store_dir, &wanted)?;
+    for paragraph in &mut dataset.paragraphs {
+        // A missing map entry yields the empty default, matching an explicit clear.
+        paragraph.questions = grouped.remove(&paragraph.id).unwrap_or_default();
+    }
+
+    Ok(dataset)
+}
+
+#[cfg(test)]
+mod tests { // unit tests for the sharded store
+    use super::*;
+    use crate::datasets::{DatasetKind, DatasetMetadata};
+
+    fn sample_dataset() -> ConvertedDataset { // one paragraph ("p1") holding one question ("q1")
+        ConvertedDataset {
+            generated_at: Utc::now(),
+            metadata: DatasetMetadata::for_kind(DatasetKind::SquadV2, false),
+            source: "test".to_string(),
+            paragraphs: vec![ConvertedParagraph {
+                id: "p1".to_string(),
+                title: "Title".to_string(),
+                context: "Body".to_string(),
+                questions: vec![ConvertedQuestion {
+                    id: "q1".to_string(),
+                    question: "Question?".to_string(),
+                    answers: vec!["Answer".to_string()],
+                    is_impossible: false,
+                }],
+            }],
+        }
+    }
+
+    #[test]
+    #[allow(clippy::indexing_slicing)] // direct indexing is used in the assertions below
+    fn sharded_round_trip() -> Result<()> { // write a store, load it back, compare contents
+        let dir = tempfile::tempdir()?;
+        let store_dir = dir.path().join("sample"); // store lives inside the temporary directory
+        let dataset = sample_dataset();
+        write_sharded(&dataset, &store_dir)?;
+
+        let loaded = load_sharded_full(&store_dir)?;
+        assert_eq!(loaded.paragraphs.len(), 1);
+        assert_eq!(loaded.paragraphs[0].questions[0].id, "q1");
+        Ok(())
+    }
+}
diff --git a/evaluations/src/namespace.rs b/evaluations/src/db/connect.rs
similarity index 67%
rename from evaluations/src/namespace.rs
rename to evaluations/src/db/connect.rs
index ff39a98..d200c5e 100644
--- a/evaluations/src/namespace.rs
+++ b/evaluations/src/db/connect.rs
@@ -1,22 +1,22 @@
-//! Database namespace management utilities.
-
 use anyhow::{anyhow, Context, Result};
 use chrono::Utc;
-use common::storage::{
-    db::SurrealDbClient,
-    types::user::{Theme, User},
-    types::StoredObject,
+use common::{
+    storage::{
+        db::SurrealDbClient,
+        types::user::{Theme, User},
+        types::StoredObject,
+    },
+    utils::embedding::EmbeddingProvider,
 };
 use serde::Deserialize;
 use tracing::{info, warn};
 
 use crate::{
     args::Config,
+    corpus::{self, CorpusHandle, CorpusManifest, NamespaceSeedRecord},
     datasets,
-    snapshot::{self, DbSnapshotState},
 };
 
-/// Connect to the evaluation database with fallback auth strategies.
 pub(crate) async fn connect_eval_db(
     config: &Config,
     namespace: &str,
@@ -73,7 +73,6 @@ pub(crate) async fn connect_eval_db(
     }
 }
 
-/// Check if the namespace contains any corpus data.
 pub(crate) async fn namespace_has_corpus(db: &SurrealDbClient) -> Result<bool> {
     #[derive(Deserialize)]
     struct CountRow {
@@ -89,41 +88,52 @@ pub(crate) async fn namespace_has_corpus(db: &SurrealDbClient) -> Result<bool> {
     Ok(rows.first().map_or(0, |row| row.count) > 0)
 }
 
-/// Determine if we can reuse an existing namespace based on cached state.
+fn manifest_matches_runtime( // true when the manifest metadata equals the runtime values below
+    manifest: &CorpusManifest,
+    embedding_provider: &EmbeddingProvider,
+    ingestion_fingerprint: &str,
+) -> bool {
+    let metadata = &manifest.metadata;
+    metadata.ingestion_fingerprint == ingestion_fingerprint // checked first; later checks short-circuit
+        && metadata.embedding_backend == embedding_provider.backend_label()
+        && metadata.embedding_model == embedding_provider.model_code()
+        && metadata.embedding_dimension == embedding_provider.dimension()
+}
+
 #[allow(clippy::too_many_arguments)]
 pub(crate) async fn can_reuse_namespace(
     db: &SurrealDbClient,
-    descriptor: &snapshot::Descriptor,
+    manifest: &CorpusManifest,
+    embedding_provider: &EmbeddingProvider,
     namespace: &str,
     database: &str,
-    dataset_id: &str,
-    slice_id: &str,
     ingestion_fingerprint: &str,
     slice_case_count: usize,
 ) -> Result<bool> {
-    let Some(state) = descriptor.load_db_state().await? else {
-        info!("No namespace state recorded; reseeding corpus from cached shards");
+    if !manifest_matches_runtime(manifest, embedding_provider, ingestion_fingerprint) {
+        info!("Corpus manifest metadata mismatch; rebuilding namespace from cached shards");
+        return Ok(false);
+    }
+
+    let Some(seed) = manifest.metadata.namespace_seed.as_ref() else {
+        info!("No namespace seed recorded in corpus manifest; reseeding");
         return Ok(false);
     };
 
-    if state.slice_case_count != slice_case_count {
+    if seed.slice_case_count != slice_case_count {
         info!(
             requested_cases = slice_case_count,
-            stored_cases = state.slice_case_count,
-            "Skipping live namespace reuse; cached state does not match requested window"
+            stored_cases = seed.slice_case_count,
+            "Skipping namespace reuse; case window mismatch"
         );
         return Ok(false);
     }
 
-    if state.dataset_id != dataset_id
-        || state.slice_id != slice_id
-        || state.ingestion_fingerprint != ingestion_fingerprint
-        || state.namespace.as_deref() != Some(namespace)
-        || state.database.as_deref() != Some(database)
-    {
+    if seed.namespace != namespace || seed.database != database {
         info!(
             namespace,
-            database, "Cached namespace metadata mismatch; rebuilding corpus from ingestion cache"
+            database,
+            "Corpus manifest namespace metadata mismatch; reseeding"
         );
         return Ok(false);
     }
@@ -140,28 +150,20 @@ pub(crate) async fn can_reuse_namespace(
     }
 }
 
-/// Record the current namespace state to allow future reuse checks.
-pub(crate) async fn record_namespace_state(
-    descriptor: &snapshot::Descriptor,
-    dataset_id: &str,
-    slice_id: &str,
-    ingestion_fingerprint: &str,
+pub(crate) async fn record_namespace_seed(
+    handle: &mut CorpusHandle,
     namespace: &str,
     database: &str,
     slice_case_count: usize,
 ) {
-    let state = DbSnapshotState {
-        dataset_id: dataset_id.to_string(),
-        slice_id: slice_id.to_string(),
-        ingestion_fingerprint: ingestion_fingerprint.to_string(),
-        snapshot_hash: descriptor.metadata_hash().to_string(),
-        updated_at: Utc::now(),
-        namespace: Some(namespace.to_string()),
-        database: Some(database.to_string()),
+    handle.manifest.metadata.namespace_seed = Some(NamespaceSeedRecord {
+        namespace: namespace.to_string(),
+        database: database.to_string(),
         slice_case_count,
-    };
-    if let Err(err) = descriptor.store_db_state(&state).await {
-        warn!(error = %err, "Failed to record namespace state");
+        seeded_at: Utc::now(),
+    });
+    if let Err(err) = corpus::persist_corpus_manifest(handle) {
+        warn!(error = %err, "Failed to record namespace seed in corpus manifest");
     }
 }
 
@@ -185,8 +187,17 @@ fn sanitize_identifier(input: &str) -> String {
     cleaned
 }
 
-/// Generate a default namespace name based on dataset and limit.
-pub(crate) fn default_namespace(dataset_id: &str, limit: Option<usize>) -> String {
+pub(crate) fn default_namespace(
+    dataset_id: &str,
+    limit: Option<usize>,
+    slice_id: Option<&str>,
+) -> String {
+    if let Some(slice_id) = slice_id {
+        let sanitized = sanitize_identifier(slice_id);
+        if !sanitized.is_empty() {
+            return format!("eval_{sanitized}");
+        }
+    }
     let dataset_component = sanitize_identifier(dataset_id);
     let limit_component = match limit {
         Some(value) if value > 0 => format!("limit{value}"),
@@ -195,12 +206,10 @@ pub(crate) fn default_namespace(dataset_id: &str, limit: Option<usize>) -> Strin
     format!("eval_{dataset_component}_{limit_component}")
 }
 
-/// Generate the default database name for evaluations.
 pub(crate) fn default_database() -> String {
     "retrieval_eval".to_string()
 }
 
-/// Ensure the evaluation user exists in the database.
 pub(crate) async fn ensure_eval_user(db: &SurrealDbClient) -> Result<User> {
     let timestamp = datasets::base_timestamp();
     let user = User {
@@ -225,3 +234,7 @@ pub(crate) async fn ensure_eval_user(db: &SurrealDbClient) -> Result<User> {
         .context("storing evaluation user")?;
     Ok(user)
 }
+
+pub(crate) fn sanitize_model_code(code: &str) -> String { // crate-visible wrapper
+    sanitize_identifier(code) // same cleaning that `default_namespace` applies to its inputs
+}
diff --git a/evaluations/src/db_helpers.rs b/evaluations/src/db/lifecycle.rs
similarity index 75%
rename from evaluations/src/db_helpers.rs
rename to evaluations/src/db/lifecycle.rs
index e154b73..00ded78 100644
--- a/evaluations/src/db_helpers.rs
+++ b/evaluations/src/db/lifecycle.rs
@@ -2,13 +2,6 @@ use anyhow::{Context, Result};
 use common::storage::{db::SurrealDbClient, indexes::ensure_runtime};
 use tracing::info;
 
-// Helper functions for index management during namespace reseed
-pub async fn remove_all_indexes(db: &SurrealDbClient) -> Result<()> {
-    let _ = db;
-    info!("Removing ALL indexes before namespace reseed (no-op placeholder)");
-    Ok(())
-}
-
 pub async fn recreate_indexes(db: &SurrealDbClient, dimension: usize) -> Result<()> {
     info!("Recreating ALL indexes after namespace reseed via shared runtime helper");
     ensure_runtime(db, dimension)
@@ -34,14 +27,39 @@ pub async fn reset_namespace(db: &SurrealDbClient, namespace: &str, database: &s
     Ok(())
 }
 
-// // Test helper to force index dimension change
-// #[allow(dead_code)]
-// pub async fn change_embedding_length_in_hnsw_indexes(
-//     db: &SurrealDbClient,
-//     dimension: usize,
-// ) -> Result<()> {
-//     recreate_indexes(db, dimension).await
-// }
+#[allow(clippy::cast_precision_loss)] // `i as f32` below converts a usize index to f32
+pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> Result<()> {
+    let dummy_embedding: Vec<f32> = (0..dimension).map(|i| (i as f32).sin()).collect(); // sin(i) per slot
+
+    info!("Warming HNSW caches with sample queries");
+
+    let _ = db // response is discarded; only transport-level errors are propagated here
+        .client
+        .query(
+            r#"SELECT chunk_id
+               FROM text_chunk_embedding
+               WHERE embedding <|1,1|> $embedding
+               LIMIT 5"#,
+        )
+        .bind(("embedding", dummy_embedding.clone())) // cloned because the vector is reused below
+        .await
+        .context("warming text chunk HNSW cache")?;
+
+    let _ = db // same probe against the entity embedding table
+        .client
+        .query(
+            r#"SELECT entity_id
+               FROM knowledge_entity_embedding
+               WHERE embedding <|1,1|> $embedding
+               LIMIT 5"#,
+        )
+        .bind(("embedding", dummy_embedding))
+        .await
+        .context("warming knowledge entity HNSW cache")?;
+
+    info!("HNSW cache warming completed");
+    Ok(())
+}
 
 #[cfg(test)]
 mod tests {
diff --git a/evaluations/src/db/mod.rs b/evaluations/src/db/mod.rs
new file mode 100644
index 0000000..ee57459
--- /dev/null
+++ b/evaluations/src/db/mod.rs
@@ -0,0 +1,9 @@
+mod connect;
+mod lifecycle;
+
+pub(crate) use connect::{
+    can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user,
+    namespace_has_corpus, record_namespace_seed, sanitize_model_code,
+};
+pub use lifecycle::{recreate_indexes, reset_namespace};
+pub(crate) use lifecycle::warm_hnsw_cache;
diff --git a/evaluations/src/eval.rs b/evaluations/src/eval.rs
deleted file mode 100644
index e0abf4b..0000000
--- a/evaluations/src/eval.rs
+++ /dev/null
@@ -1,128 +0,0 @@
-//! Evaluation utilities module - re-exports from focused submodules.
-
-// Re-export types from the root types module
-pub use crate::types::*;
-
-// Re-export from focused modules at crate root (crate-internal only)
-pub(crate) use crate::cases::{cases_from_manifest, SeededCase};
-pub(crate) use crate::namespace::{
-    can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user,
-    record_namespace_state,
-};
-pub(crate) use crate::settings::{enforce_system_settings, load_or_init_system_settings};
-
-use std::path::Path;
-
-use anyhow::{Context, Result};
-use common::storage::db::SurrealDbClient;
-use tokio::io::AsyncWriteExt;
-use tracing::info;
-
-use crate::{
-    args::{self, Config},
-    datasets::ConvertedDataset,
-    slice::{self},
-};
-
-/// Grow the slice ledger to contain the target number of cases.
-pub fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> {
-    let ledger_limit = ledger_target(config);
-    let slice_settings = slice::slice_config_with_limit(config, ledger_limit);
-    let slice =
-        slice::resolve_slice(dataset, &slice_settings).context("resolving dataset slice")?;
-    info!(
-        slice = slice.manifest.slice_id.as_str(),
-        cases = slice.manifest.case_count,
-        positives = slice.manifest.positive_paragraphs,
-        negatives = slice.manifest.negative_paragraphs,
-        total_paragraphs = slice.manifest.total_paragraphs,
-        "Slice ledger ready"
-    );
-    println!(
-        "Slice `{}` now contains {} questions ({} positives, {} negatives)",
-        slice.manifest.slice_id,
-        slice.manifest.case_count,
-        slice.manifest.positive_paragraphs,
-        slice.manifest.negative_paragraphs
-    );
-    Ok(())
-}
-
-pub(crate) fn ledger_target(config: &Config) -> Option<usize> {
-    match (config.slice_grow, config.limit) {
-        (Some(grow), Some(limit)) => Some(limit.max(grow)),
-        (Some(grow), None) => Some(grow),
-        (None, limit) => limit,
-    }
-}
-
-pub(crate) async fn write_chunk_diagnostics(path: &Path, cases: &[CaseDiagnostics]) -> Result<()> {
-    args::ensure_parent(path)?;
-    let mut file = tokio::fs::File::create(path)
-        .await
-        .with_context(|| format!("creating diagnostics file {}", path.display()))?;
-    for case in cases {
-        let line = serde_json::to_vec(case).context("serialising chunk diagnostics entry")?;
-        file.write_all(&line).await?;
-        file.write_all(b"\n").await?;
-    }
-    file.flush().await?;
-    Ok(())
-}
-
-#[allow(clippy::cast_precision_loss)]
-pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> Result<()> {
-    let dummy_embedding: Vec<f32> = (0..dimension).map(|i| (i as f32).sin()).collect();
-
-    info!("Warming HNSW caches with sample queries");
-
-    // Warm up chunk embedding index - just query the embedding table to load HNSW index
-    let _ = db
-        .client
-        .query(
-            r#"SELECT chunk_id
-               FROM text_chunk_embedding
-               WHERE embedding <|1,1|> $embedding
-               LIMIT 5"#,
-        )
-        .bind(("embedding", dummy_embedding.clone()))
-        .await
-        .context("warming text chunk HNSW cache")?;
-
-    // Warm up entity embedding index
-    let _ = db
-        .client
-        .query(
-            r#"SELECT entity_id
-               FROM knowledge_entity_embedding
-               WHERE embedding <|1,1|> $embedding
-               LIMIT 5"#,
-        )
-        .bind(("embedding", dummy_embedding))
-        .await
-        .context("warming knowledge entity HNSW cache")?;
-
-    info!("HNSW cache warming completed");
-    Ok(())
-}
-
-use chrono::{DateTime, SecondsFormat, Utc};
-
-pub fn format_timestamp(timestamp: &DateTime<Utc>) -> String {
-    timestamp.to_rfc3339_opts(SecondsFormat::Secs, true)
-}
-
-pub(crate) fn sanitize_model_code(code: &str) -> String {
-    code.chars()
-        .map(|ch| {
-            if ch.is_ascii_alphanumeric() {
-                ch.to_ascii_lowercase()
-            } else {
-                '_'
-            }
-        })
-        .collect()
-}
-
-// Re-export run_evaluation from the pipeline module at crate root
-pub use crate::pipeline::run_evaluation;
diff --git a/evaluations/src/inspection.rs b/evaluations/src/inspection.rs
index ba71f0b..cca57b9 100644
--- a/evaluations/src/inspection.rs
+++ b/evaluations/src/inspection.rs
@@ -1,13 +1,13 @@
 use std::{
     collections::HashMap,
     fs,
-    path::{Path, PathBuf},
+    path::Path,
 };
 
 use anyhow::{anyhow, Context, Result};
 use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk};
 
-use crate::{args::Config, corpus, eval::connect_eval_db, snapshot::DbSnapshotState};
+use crate::{args::Config, corpus, db::connect_eval_db};
 
 pub async fn inspect_question(config: &Config) -> Result<()> {
     let question_id = config
@@ -64,39 +64,26 @@ pub async fn inspect_question(config: &Config) -> Result<()> {
         );
     }
 
-    let db_state_path = config
-        .database
-        .inspect_db_state
-        .clone()
-        .unwrap_or_else(|| default_state_path(config, &manifest));
-    if let Some(state) = load_db_state(&db_state_path)? {
-        if let (Some(ns), Some(db_name)) = (state.namespace.as_deref(), state.database.as_deref()) {
-            match connect_eval_db(config, ns, db_name).await {
-                Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? {
-                    MissingChunks::None => println!(
-                        "All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'"
-                    ),
-                    MissingChunks::Missing(list) => println!(
-                        "Missing chunks in namespace '{ns}', database '{db_name}': {list:?}"
-                    ),
-                },
-                Err(err) => {
-                    println!(
-                        "Failed to connect to SurrealDB namespace '{ns}' / database '{db_name}': {err}"
-                    );
-                }
+    if let Some(seed) = manifest.metadata.namespace_seed.as_ref() {
+        let ns = seed.namespace.as_str();
+        let db_name = seed.database.as_str();
+        match connect_eval_db(config, ns, db_name).await {
+            Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? {
+                MissingChunks::None => println!(
+                    "All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'"
+                ),
+                MissingChunks::Missing(list) => println!(
+                    "Missing chunks in namespace '{ns}', database '{db_name}': {list:?}"
+                ),
+            },
+            Err(err) => {
+                println!(
+                    "Failed to connect to SurrealDB namespace '{ns}' / database '{db_name}': {err}"
+                );
             }
-        } else {
-            println!(
-                "State file {} is missing namespace/database fields; skipping live DB validation",
-                db_state_path.display()
-            );
         }
     } else {
-        println!(
-            "State file {} not found; skipping live DB validation",
-            db_state_path.display()
-        );
+        println!("Corpus manifest has no namespace seed; skipping live DB validation");
     }
 
     Ok(())
@@ -137,25 +124,6 @@ fn build_chunk_lookup(manifest: &corpus::CorpusManifest) -> HashMap<String, Chun
     lookup
 }
 
-fn default_state_path(config: &Config, manifest: &corpus::CorpusManifest) -> PathBuf {
-    config
-        .cache_dir
-        .join("snapshots")
-        .join(&manifest.metadata.dataset_id)
-        .join(&manifest.metadata.slice_id)
-        .join("db/state.json")
-}
-
-fn load_db_state(path: &Path) -> Result<Option<DbSnapshotState>> {
-    if !path.exists() {
-        return Ok(None);
-    }
-    let bytes = fs::read(path).with_context(|| format!("reading db state {}", path.display()))?;
-    let state = serde_json::from_slice(&bytes)
-        .with_context(|| format!("parsing db state {}", path.display()))?;
-    Ok(Some(state))
-}
-
 enum MissingChunks {
     None,
     Missing(Vec<String>),
diff --git a/evaluations/src/main.rs b/evaluations/src/main.rs
index 196878c..087c73c 100644
--- a/evaluations/src/main.rs
+++ b/evaluations/src/main.rs
@@ -1,19 +1,17 @@
 mod args;
-mod cache;
+mod context_stats;
 mod cases;
+mod cli;
 mod corpus;
 mod datasets;
-mod db_helpers;
-mod eval;
+mod db;
 mod inspection;
-mod namespace;
 mod openai;
 mod perf;
 mod pipeline;
 mod report;
 mod settings;
 mod slice;
-mod snapshot;
 mod types;
 
 use anyhow::Context;
@@ -24,7 +22,6 @@ use tracing_subscriber::{fmt, EnvFilter};
 /// Configure `SurrealDB` environment variables for optimal performance
 #[allow(clippy::arithmetic_side_effects, clippy::unwrap_used)]
 fn configure_surrealdb_performance(cpu_count: usize) {
-    // Set environment variables only if they're not already set
     let indexing_batch_size = std::env::var("SURREAL_INDEXING_BATCH_SIZE")
         .unwrap_or_else(|_| (cpu_count * 2).to_string());
     std::env::set_var("SURREAL_INDEXING_BATCH_SIZE", indexing_batch_size);
@@ -62,12 +59,11 @@ fn configure_surrealdb_performance(cpu_count: usize) {
 }
 
 fn main() -> anyhow::Result<()> {
-    // Create an explicit multi-threaded runtime with optimized configuration
     let runtime = Builder::new_multi_thread()
         .enable_all()
         .worker_threads(std::thread::available_parallelism()?.get())
         .max_blocking_threads(std::thread::available_parallelism()?.get())
-        .thread_stack_size(10 * 1024 * 1024) // 10MiB stack size
+        .thread_stack_size(10 * 1024 * 1024)
         .thread_name("eval-retrieval-worker")
         .build()
         .context("failed to create tokio runtime")?;
@@ -77,7 +73,6 @@ fn main() -> anyhow::Result<()> {
 
 #[allow(clippy::too_many_lines)]
 async fn async_main() -> anyhow::Result<()> {
-    // Log runtime configuration
     let cpu_count = std::thread::available_parallelism()?.get();
     info!(
         cpu_cores = cpu_count,
@@ -87,7 +82,6 @@ async fn async_main() -> anyhow::Result<()> {
         "Started multi-threaded tokio runtime"
     );
 
-    // Configure SurrealDB environment variables for better performance
     configure_surrealdb_performance(cpu_count);
 
     let filter = std::env::var("RUST_LOG").unwrap_or_else(|_| "info".to_string());
@@ -97,13 +91,22 @@ async fn async_main() -> anyhow::Result<()> {
 
     let parsed = args::parse()?;
 
-    // Clap handles help automatically, so we don't need to check for it manually
-
     if parsed.config.inspect_question.is_some() {
         inspection::inspect_question(&parsed.config).await?;
         return Ok(());
     }
 
+    if parsed.config.status {
+        let status = cli::collect_status(&parsed.config).await?;
+        cli::print_status(&status);
+        return Ok(());
+    }
+
+    if parsed.config.warm {
+        cli::warm(&parsed.config).await?;
+        return Ok(());
+    }
+
     let dataset_kind = parsed.config.dataset;
 
     if parsed.config.convert_only {
@@ -115,7 +118,6 @@ async fn async_main() -> anyhow::Result<()> {
             parsed.config.raw_dataset_path.as_path(),
             dataset_kind,
             parsed.config.llm_mode,
-            parsed.config.context_token_limit(),
         )
         .with_context(|| {
             format!(
@@ -124,56 +126,56 @@ async fn async_main() -> anyhow::Result<()> {
                 parsed.config.raw_dataset_path.display()
             )
         })?;
-        crate::datasets::write_converted(&dataset, parsed.config.converted_dataset_path.as_path())
-            .with_context(|| {
-                format!(
-                    "writing converted dataset to {}",
-                    parsed.config.converted_dataset_path.display()
-                )
-            })?;
+        let store_dir = datasets::store_dir_for(&parsed.config.converted_dataset_path);
+        datasets::write_sharded(&dataset, &store_dir)?;
+        datasets::prebuild_catalog_slices(&dataset, &parsed.config)?;
         println!(
-            "Converted dataset written to {}",
-            parsed.config.converted_dataset_path.display()
+            "Converted dataset written under {}",
+            store_dir.display()
         );
         return Ok(());
     }
 
+    if parsed.config.require_ready {
+        cli::ensure_query_ready(&parsed.config).await?;
+    }
+
     info!(dataset = dataset_kind.id(), "Preparing converted dataset");
-    let dataset = crate::datasets::ensure_converted(
-        dataset_kind,
-        parsed.config.raw_dataset_path.as_path(),
-        parsed.config.converted_dataset_path.as_path(),
-        parsed.config.force_convert,
-        parsed.config.llm_mode,
-        parsed.config.context_token_limit(),
-    )
-    .with_context(|| {
-        format!(
-            "preparing converted dataset at {}",
-            parsed.config.converted_dataset_path.display()
-        )
-    })?;
+    let loaded = crate::datasets::prepare_dataset(dataset_kind, &parsed.config).with_context(
+        || {
+            format!(
+                "preparing converted dataset at {}",
+                parsed.config.converted_dataset_path.display()
+            )
+        },
+    )?;
 
     info!(
-        questions = dataset
+        questions = loaded
+            .dataset
             .paragraphs
             .iter()
             .map(|p| p.questions.len())
             .sum::<usize>(),
-        paragraphs = dataset.paragraphs.len(),
-        dataset = dataset.metadata.id.as_str(),
+        paragraphs = loaded.dataset.paragraphs.len(),
+        partial = loaded.partial,
+        dataset = loaded.dataset.metadata.id.as_str(),
         "Dataset ready"
     );
 
     if parsed.config.slice_grow.is_some() {
-        eval::grow_slice(&dataset, &parsed.config).context("growing slice ledger")?;
+        slice::grow_slice(&loaded.dataset, &parsed.config).context("growing slice ledger")?;
         return Ok(());
     }
 
     info!("Running retrieval evaluation");
-    let summary = eval::run_evaluation(&dataset, &parsed.config)
-        .await
-        .context("running retrieval evaluation")?;
+    let summary = pipeline::run_evaluation(
+        &loaded.dataset,
+        &parsed.config,
+        Some(loaded.content_checksum.as_str()),
+    )
+    .await
+    .context("running retrieval evaluation")?;
 
     let report = report::write_reports(
         &summary,
@@ -226,12 +228,17 @@ async fn async_main() -> anyhow::Result<()> {
         );
     } else {
         println!(
-            "[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) → JSON: {json} | Markdown: {md} | History: {history}{perf_note}",
+            "[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) | Retrieved context: {chunks} chunks, {tokens} tokens ({tokenizer}, avg {avg_tokens:.0}/query, p95 {p95}) → JSON: {json} | Markdown: {md} | History: {history}{perf_note}",
             summary.dataset_label,
             k = summary.k,
             precision = summary.precision,
             correct = summary.correct,
             retrieval_total = summary.retrieval_cases,
+            chunks = summary.retrieved_context.total_chunks,
+            tokens = summary.retrieved_context.total_tokens,
+            tokenizer = summary.retrieved_context.tokenizer,
+            avg_tokens = summary.retrieved_context.avg_tokens_per_query,
+            p95 = summary.retrieved_context.p95_tokens_per_query,
             json = report.paths.json.display(),
             md = report.paths.markdown.display(),
             history = report.history_path.display(),
diff --git a/evaluations/src/openai.rs b/evaluations/src/openai.rs
index 7c5e644..1928dc6 100644
--- a/evaluations/src/openai.rs
+++ b/evaluations/src/openai.rs
@@ -1,9 +1,34 @@
+use std::sync::Arc;
+
 use anyhow::{Context, Result};
 use async_openai::{config::OpenAIConfig, Client};
 
 const DEFAULT_BASE_URL: &str = "https://api.openai.com/v1";
 
-pub fn build_client_from_env() -> Result<(Client<OpenAIConfig>, String)> {
+/// Returns the OpenAI client used during ingestion, plus the base URL when one was resolved.
+///
+/// With `include_entities` set, the client is built by `build_client_from_env` and its base
+/// URL is returned alongside it. Otherwise a client with the default [`OpenAIConfig`] is
+/// returned and the base URL is `None`.
+///
+/// # Errors
+///
+/// Returns an error when `include_entities` is set and `build_client_from_env` fails.
+pub fn ingestion_openai_client(
+    include_entities: bool,
+) -> Result<(Arc<Client<OpenAIConfig>>, Option<String>)> {
+    if !include_entities {
+        let default_client = Client::with_config(OpenAIConfig::default());
+        return Ok((Arc::new(default_client), None));
+    }
+
+    let (client, base_url) = build_client_from_env().context(
+        "OPENAI_API_KEY must be set when --include-entities is enabled (entity extraction uses OpenAI)",
+    )?;
+    Ok((Arc::new(client), Some(base_url)))
+}
+
+fn build_client_from_env() -> Result<(Client<OpenAIConfig>, String)> {
     let api_key = std::env::var("OPENAI_API_KEY")
         .context("OPENAI_API_KEY must be set to run retrieval evaluations")?;
     let base_url =
diff --git a/evaluations/src/perf.rs b/evaluations/src/perf.rs
index dc29036..de12810 100644
--- a/evaluations/src/perf.rs
+++ b/evaluations/src/perf.rs
@@ -7,8 +7,8 @@ use anyhow::{Context, Result};
 
 use crate::{
     args,
-    eval::EvaluationSummary,
     report::{self, EvaluationReport},
+    types::EvaluationSummary,
 };
 
 pub fn mirror_perf_outputs(
@@ -91,23 +91,23 @@ fn format_duration(value: Option<u128>) -> String {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::eval::{EvaluationStageTimings, PerformanceTimings};
+    use crate::types::{EvaluationStageTimings, PerformanceTimings, LatencyStats, StageLatency, StageLatencyBreakdown};
     use chrono::Utc;
     use tempfile::tempdir;
 
-    fn sample_latency() -> crate::eval::LatencyStats {
-        crate::eval::LatencyStats {
+    fn sample_latency() -> LatencyStats {
+        LatencyStats {
             avg: 10.0,
             p50: 8,
             p95: 15,
         }
     }
 
-    fn sample_stage_latency() -> crate::eval::StageLatencyBreakdown {
-        crate::eval::StageLatencyBreakdown {
+    fn sample_stage_latency() -> StageLatencyBreakdown {
+        StageLatencyBreakdown {
             stages: ["embed", "search", "rerank", "resolve_entities", "assemble"]
                 .into_iter()
-                .map(|stage| crate::eval::StageLatency {
+                .map(|stage| StageLatency {
                     stage: stage.to_string(),
                     stats: sample_latency(),
                 })
@@ -206,6 +206,7 @@ mod tests {
             chunk_vector_take: 20,
             chunk_fts_take: 20,
             max_chunks_per_entity: 4,
+            retrieved_context: crate::context_stats::aggregate_context_stats(&[]),
             cases: Vec::new(),
         }
     }
diff --git a/evaluations/src/pipeline/context.rs b/evaluations/src/pipeline/context.rs
index 9f2fb9c..08d8723 100644
--- a/evaluations/src/pipeline/context.rs
+++ b/evaluations/src/pipeline/context.rs
@@ -20,11 +20,11 @@ use retrieval_pipeline::{
 
 use crate::{
     args::Config,
-    cache::EmbeddingCache,
+    cases::SeededCase,
     corpus,
     datasets::ConvertedDataset,
-    eval::{CaseDiagnostics, CaseSummary, EvaluationStageTimings, EvaluationSummary, SeededCase},
-    slice, snapshot,
+    slice,
+    types::{CaseDiagnostics, CaseSummary, EvaluationStageTimings, EvaluationSummary},
 };
 
 #[allow(clippy::struct_excessive_bools)]
@@ -41,12 +41,10 @@ pub(super) struct EvaluationContext<'a> {
     pub namespace: String,
     pub database: String,
     pub db: Option<SurrealDbClient>,
-    pub descriptor: Option<snapshot::Descriptor>,
     pub settings: Option<SystemSettings>,
     pub settings_missing: bool,
     pub must_reapply_settings: bool,
     pub embedding_provider: Option<EmbeddingProvider>,
-    pub embedding_cache: Option<EmbeddingCache>,
     pub openai_client: Option<Arc<Client<async_openai::config::OpenAIConfig>>>,
     pub openai_base_url: Option<String>,
     pub expected_fingerprint: Option<String>,
@@ -67,13 +65,19 @@ pub(super) struct EvaluationContext<'a> {
     pub summary: Option<EvaluationSummary>,
     pub diagnostics_path: Option<PathBuf>,
     pub diagnostics_enabled: bool,
+    pub content_checksum: Option<String>,
 }
 
 impl<'a> EvaluationContext<'a> {
-    pub fn new(dataset: &'a ConvertedDataset, config: &'a Config) -> Self {
+    pub fn new(
+        dataset: &'a ConvertedDataset,
+        config: &'a Config,
+        content_checksum: Option<String>,
+    ) -> Self {
         Self {
             dataset,
             config,
+            content_checksum,
             stage_timings: EvaluationStageTimings::default(),
             ledger_limit: None,
             slice_settings: None,
@@ -84,12 +88,10 @@ impl<'a> EvaluationContext<'a> {
             namespace: String::new(),
             database: String::new(),
             db: None,
-            descriptor: None,
             settings: None,
             settings_missing: false,
             must_reapply_settings: false,
             embedding_provider: None,
-            embedding_cache: None,
             openai_client: None,
             openai_base_url: None,
             expected_fingerprint: None,
@@ -133,12 +135,6 @@ impl<'a> EvaluationContext<'a> {
             .ok_or_else(|| anyhow!("database connection missing"))
     }
 
-    pub fn descriptor(&self) -> Result<&snapshot::Descriptor> {
-        self.descriptor
-            .as_ref()
-            .ok_or_else(|| anyhow!("snapshot descriptor unavailable"))
-    }
-
     pub fn embedding_provider(&self) -> Result<&EmbeddingProvider> {
         self.embedding_provider
             .as_ref()
@@ -159,6 +155,11 @@ impl<'a> EvaluationContext<'a> {
             .ok_or_else(|| anyhow!("corpus handle missing"))
     }
 
+    /// Returns a borrowed view of the `content_checksum` field, or `None` when it is unset.
+    pub fn content_checksum(&self) -> Option<&str> {
+        self.content_checksum.as_deref()
+    }
+
     pub fn evaluation_user(&self) -> Result<&User> {
         self.eval_user
             .as_ref()
diff --git a/evaluations/src/pipeline/diagnostics.rs b/evaluations/src/pipeline/diagnostics.rs
new file mode 100644
index 0000000..94bf940
--- /dev/null
+++ b/evaluations/src/pipeline/diagnostics.rs
@@ -0,0 +1,32 @@
+use std::path::Path;
+
+use anyhow::{Context, Result};
+use tokio::io::AsyncWriteExt;
+
+use crate::{args, types::CaseDiagnostics};
+
+/// Writes `cases` to `path` as newline-delimited JSON, one [`CaseDiagnostics`] per line.
+///
+/// Calls [`args::ensure_parent`] on `path` first, then creates the file and streams the
+/// entries through a buffered writer.
+///
+/// # Errors
+///
+/// Returns an error if preparing the parent, creating the file, serialising an entry, or
+/// writing/flushing the output fails.
+pub(crate) async fn write_chunk_diagnostics(path: &Path, cases: &[CaseDiagnostics]) -> Result<()> {
+    args::ensure_parent(path)?;
+    let file = tokio::fs::File::create(path)
+        .await
+        .with_context(|| format!("creating diagnostics file {}", path.display()))?;
+    // Buffer the many small per-case writes so each one is not dispatched to the file separately.
+    let mut writer = tokio::io::BufWriter::new(file);
+    for case in cases {
+        let line = serde_json::to_vec(case).context("serialising chunk diagnostics entry")?;
+        writer.write_all(&line).await?;
+        writer.write_all(b"\n").await?;
+    }
+    // Flushing the `BufWriter` also flushes the underlying file.
+    writer.flush().await?;
+    Ok(())
+}
diff --git a/evaluations/src/pipeline/mod.rs b/evaluations/src/pipeline/mod.rs
index d8386b5..f154e2a 100644
--- a/evaluations/src/pipeline/mod.rs
+++ b/evaluations/src/pipeline/mod.rs
@@ -1,6 +1,6 @@
 mod context;
+mod diagnostics;
 mod stages;
-mod state;
 
 use anyhow::Result;
 
@@ -8,20 +8,66 @@ use crate::{args::Config, datasets::ConvertedDataset, types::EvaluationSummary};
 
 use context::EvaluationContext;
 
+/// Builds an [`EvaluationContext`] and runs the preparation stages shared by warming and
+/// evaluation: slice, database, corpus and namespace, in that order.
+///
+/// # Errors
+///
+/// Returns the first error produced by any of the preparation stages.
+async fn run_through_namespace<'a>(
+    dataset: &'a ConvertedDataset,
+    config: &'a Config,
+    content_checksum: Option<String>,
+) -> Result<EvaluationContext<'a>> {
+    let mut ctx = EvaluationContext::new(dataset, config, content_checksum);
+    stages::prepare_slice(&mut ctx).await?;
+    stages::prepare_db(&mut ctx).await?;
+    stages::prepare_corpus(&mut ctx).await?;
+    stages::prepare_namespace(&mut ctx).await?;
+    Ok(ctx)
+}
+
+/// Runs only the preparation stages (through namespace setup) and drops the resulting context
+/// without executing any queries.
+///
+/// # Errors
+///
+/// Returns the first error produced by any of the preparation stages.
+pub async fn warm_evaluation(
+    dataset: &ConvertedDataset,
+    config: &Config,
+    content_checksum: &str,
+) -> Result<()> {
+    let _ctx = run_through_namespace(
+        dataset,
+        config,
+        Some(content_checksum.to_string()),
+    )
+    .await?;
+    Ok(())
+}
+
+/// Runs the full evaluation pipeline and returns its summary.
+///
+/// The preparation stages are shared with [`warm_evaluation`]; the query, summarise and
+/// finalise stages then run on the same context.
+///
+/// # Errors
+///
+/// Returns the first error produced by any stage or by extracting the summary.
 pub async fn run_evaluation(
     dataset: &ConvertedDataset,
     config: &Config,
+    content_checksum: Option<&str>,
 ) -> Result<EvaluationSummary> {
-    let mut ctx = EvaluationContext::new(dataset, config);
-    let machine = state::ready();
-
-    let machine = stages::prepare_slice(machine, &mut ctx).await?;
-    let machine = stages::prepare_db(machine, &mut ctx).await?;
-    let machine = stages::prepare_corpus(machine, &mut ctx).await?;
-    let machine = stages::prepare_namespace(machine, &mut ctx).await?;
-    let machine = stages::run_queries(machine, &mut ctx).await?;
-    let machine = stages::summarize(machine, &mut ctx).await?;
-    let _ = stages::finalize(machine, &mut ctx).await?;
-
+    let mut ctx = run_through_namespace(
+        dataset,
+        config,
+        content_checksum.map(str::to_string),
+    )
+    .await?;
+    stages::run_queries(&mut ctx).await?;
+    stages::summarize(&mut ctx).await?;
+    stages::finalize(&mut ctx).await?;
     ctx.into_summary()
 }
diff --git a/evaluations/src/pipeline/stages/finalize.rs b/evaluations/src/pipeline/stages/finalize.rs
index b54708b..82d8e53 100644
--- a/evaluations/src/pipeline/stages/finalize.rs
+++ b/evaluations/src/pipeline/stages/finalize.rs
@@ -3,18 +3,12 @@ use std::time::Instant;
 use anyhow::Context;
 use tracing::info;
 
-use crate::eval::write_chunk_diagnostics;
-
 use super::super::{
     context::{EvalStage, EvaluationContext},
-    state::{Completed, EvaluationMachine, Summarized},
+    diagnostics::write_chunk_diagnostics,
 };
-use super::{map_guard_error, StageResult};
 
-pub(crate) async fn finalize(
-    machine: EvaluationMachine<(), Summarized>,
-    ctx: &mut EvaluationContext<'_>,
-) -> StageResult<Completed> {
+pub(crate) async fn finalize(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
     let stage = EvalStage::Finalize;
     info!(
         evaluation_stage = stage.label(),
@@ -22,13 +16,6 @@ pub(crate) async fn finalize(
     );
     let started = Instant::now();
 
-    if let Some(cache) = ctx.embedding_cache.as_ref() {
-        cache
-            .persist()
-            .await
-            .context("persisting embedding cache")?;
-    }
-
     if let Some(path) = ctx.diagnostics_path.as_ref() {
         if ctx.diagnostics_enabled {
             write_chunk_diagnostics(path.as_path(), &ctx.diagnostics_output)
@@ -53,7 +40,5 @@ pub(crate) async fn finalize(
         "completed evaluation stage"
     );
 
-    machine
-        .finalize()
-        .map_err(|(_, guard)| map_guard_error("finalize", &guard))
+    Ok(())
 }
diff --git a/evaluations/src/pipeline/stages/mod.rs b/evaluations/src/pipeline/stages/mod.rs
index 356b532..99b35cc 100644
--- a/evaluations/src/pipeline/stages/mod.rs
+++ b/evaluations/src/pipeline/stages/mod.rs
@@ -13,14 +13,3 @@ pub(crate) use prepare_namespace::prepare_namespace;
 pub(crate) use prepare_slice::prepare_slice;
 pub(crate) use run_queries::run_queries;
 pub(crate) use summarize::summarize;
-
-use anyhow::Result;
-use state_machines::core::GuardError;
-
-use super::state::EvaluationMachine;
-
-fn map_guard_error(event: &str, guard: &GuardError) -> anyhow::Error {
-    anyhow::anyhow!("invalid evaluation pipeline transition during {event}: {guard:?}")
-}
-
-type StageResult<S> = Result<EvaluationMachine<(), S>>;
diff --git a/evaluations/src/pipeline/stages/prepare_corpus.rs b/evaluations/src/pipeline/stages/prepare_corpus.rs
index a8a16f1..d5da651 100644
--- a/evaluations/src/pipeline/stages/prepare_corpus.rs
+++ b/evaluations/src/pipeline/stages/prepare_corpus.rs
@@ -3,19 +3,12 @@ use std::time::Instant;
 use anyhow::Context;
 use tracing::info;
 
-use crate::{corpus, eval::can_reuse_namespace, slice, snapshot};
+use crate::{corpus, db::can_reuse_namespace, slice};
 
-use super::super::{
-    context::{EvalStage, EvaluationContext},
-    state::{CorpusReady, DbReady, EvaluationMachine},
-};
-use super::{map_guard_error, StageResult};
+use super::super::context::{EvalStage, EvaluationContext};
 
 #[allow(clippy::too_many_lines)]
-pub(crate) async fn prepare_corpus(
-    machine: EvaluationMachine<(), DbReady>,
-    ctx: &mut EvaluationContext<'_>,
-) -> StageResult<CorpusReady> {
+pub(crate) async fn prepare_corpus(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
     let stage = EvalStage::PrepareCorpus;
     info!(
         evaluation_stage = stage.label(),
@@ -31,13 +24,13 @@ pub(crate) async fn prepare_corpus(
     let window = slice::select_window(slice, ctx.config().slice_offset, ctx.config().limit)
         .context("selecting slice window for corpus preparation")?;
 
-    let descriptor = snapshot::Descriptor::new(config, slice, ctx.embedding_provider()?);
     let ingestion_config = corpus::make_ingestion_config(config);
     let expected_fingerprint = corpus::compute_ingestion_fingerprint(
         ctx.dataset(),
         slice,
         config.converted_dataset_path.as_path(),
         &ingestion_config,
+        ctx.content_checksum(),
     )?;
     let base_dir = corpus::cached_corpus_dir(
         &cache_settings,
@@ -47,19 +40,18 @@ pub(crate) async fn prepare_corpus(
 
     if !config.reseed_slice {
         let requested_cases = window.cases.len();
-        if can_reuse_namespace(
-            ctx.db()?,
-            &descriptor,
-            &ctx.namespace,
-            &ctx.database,
-            ctx.dataset().metadata.id.as_str(),
-            slice.manifest.slice_id.as_str(),
-            expected_fingerprint.as_str(),
-            requested_cases,
-        )
-        .await?
-        {
-            if let Some(manifest) = corpus::load_cached_manifest(&base_dir)? {
+        if let Some(manifest) = corpus::load_cached_manifest(&base_dir)? {
+            if can_reuse_namespace(
+                ctx.db()?,
+                &manifest,
+                &embedding_provider,
+                &ctx.namespace,
+                &ctx.database,
+                expected_fingerprint.as_str(),
+                requested_cases,
+            )
+            .await?
+            {
                 info!(
                     cache = %base_dir.display(),
                     namespace = ctx.namespace.as_str(),
@@ -70,7 +62,6 @@ pub(crate) async fn prepare_corpus(
                 ctx.corpus_handle = Some(corpus_handle);
                 ctx.expected_fingerprint = Some(expected_fingerprint);
                 ctx.ingestion_duration_ms = 0;
-                ctx.descriptor = Some(descriptor);
 
                 let elapsed = started.elapsed();
                 ctx.record_stage_duration(stage, elapsed);
@@ -80,14 +71,8 @@ pub(crate) async fn prepare_corpus(
                     "completed evaluation stage"
                 );
 
-                return machine
-                    .prepare_corpus()
-                    .map_err(|(_, guard)| map_guard_error("prepare_corpus", &guard));
+                return Ok(());
             }
-            info!(
-                cache = %base_dir.display(),
-                "Namespace reusable but cached manifest missing; regenerating corpus"
-            );
         }
     }
 
@@ -103,6 +88,7 @@ pub(crate) async fn prepare_corpus(
             openai_client,
             &eval_user_id,
             config.converted_dataset_path.as_path(),
+            ctx.content_checksum(),
             ingestion_config.clone(),
         )
         .await
@@ -126,7 +112,6 @@ pub(crate) async fn prepare_corpus(
     ctx.corpus_handle = Some(corpus_handle);
     ctx.expected_fingerprint = Some(expected_fingerprint);
     ctx.ingestion_duration_ms = ingestion_duration_ms;
-    ctx.descriptor = Some(descriptor);
 
     let elapsed = started.elapsed();
     ctx.record_stage_duration(stage, elapsed);
@@ -136,7 +121,5 @@ pub(crate) async fn prepare_corpus(
         "completed evaluation stage"
     );
 
-    machine
-        .prepare_corpus()
-        .map_err(|(_, guard)| map_guard_error("prepare_corpus", &guard))
+    Ok(())
 }
diff --git a/evaluations/src/pipeline/stages/prepare_db.rs b/evaluations/src/pipeline/stages/prepare_db.rs
index 01eff64..3eb09cb 100644
--- a/evaluations/src/pipeline/stages/prepare_db.rs
+++ b/evaluations/src/pipeline/stages/prepare_db.rs
@@ -1,28 +1,19 @@
-use std::{sync::Arc, time::Instant};
+use std::time::Instant;
 
 use anyhow::{anyhow, Context};
 use tracing::info;
 
 use crate::{
     args::EmbeddingBackend,
-    cache::EmbeddingCache,
-    eval::{
-        connect_eval_db, enforce_system_settings, load_or_init_system_settings, sanitize_model_code,
-    },
+    db::{connect_eval_db, sanitize_model_code},
     openai,
+    settings::{enforce_system_settings, load_or_init_system_settings},
 };
 use common::utils::embedding::{default_embedding_pool_size, EmbeddingProvider};
 
-use super::super::{
-    context::{EvalStage, EvaluationContext},
-    state::{DbReady, EvaluationMachine, SlicePrepared},
-};
-use super::{map_guard_error, StageResult};
+use super::super::context::{EvalStage, EvaluationContext};
 
-pub(crate) async fn prepare_db(
-    machine: EvaluationMachine<(), SlicePrepared>,
-    ctx: &mut EvaluationContext<'_>,
-) -> StageResult<DbReady> {
+pub(crate) async fn prepare_db(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
     let stage = EvalStage::PrepareDb;
     info!(
         evaluation_stage = stage.label(),
@@ -36,19 +27,18 @@ pub(crate) async fn prepare_db(
 
     let db = connect_eval_db(config, &namespace, &database).await?;
 
-    let (raw_openai_client, openai_base_url) =
-        openai::build_client_from_env().context("building OpenAI client")?;
-    let openai_client = Arc::new(raw_openai_client);
+    let (openai_client, openai_base_url) =
+        openai::ingestion_openai_client(config.ingest.include_entities)
+            .context("building OpenAI client for ingestion")?;
 
-    // Create embedding provider directly from config (eval only supports FastEmbed and Hashed)
     let embedding_provider = match config.embedding_backend {
-        crate::args::EmbeddingBackend::FastEmbed => EmbeddingProvider::new_fastembed(
+        EmbeddingBackend::FastEmbed => EmbeddingProvider::new_fastembed(
             config.embedding_model.clone(),
             default_embedding_pool_size(),
         )
         .await
         .context("creating FastEmbed provider")?,
-        crate::args::EmbeddingBackend::Hashed => {
+        EmbeddingBackend::Hashed => {
             EmbeddingProvider::new_hashed(1536).context("creating Hashed provider")?
         }
     };
@@ -68,12 +58,14 @@ pub(crate) async fn prepare_db(
         dimension = provider_dimension,
         "Embedding provider initialised"
     );
-    info!(openai_base_url = %openai_base_url, "OpenAI client configured");
+    if let Some(base_url) = &openai_base_url {
+        info!(openai_base_url = %base_url, "OpenAI client configured for entity ingestion");
+    }
 
     let (mut settings, settings_missing) =
         load_or_init_system_settings(&db, provider_dimension).await?;
 
-    let embedding_cache = if config.embedding_backend == EmbeddingBackend::FastEmbed {
+    if config.embedding_backend == EmbeddingBackend::FastEmbed {
         if let Some(model_code) = embedding_provider.model_code() {
             let sanitized = sanitize_model_code(&model_code);
             let path = config.cache_dir.join(format!("{sanitized}.json"));
@@ -83,15 +75,8 @@ pub(crate) async fn prepare_db(
                     .with_context(|| format!("removing stale cache {}", path.display()))
                     .ok();
             }
-            let cache = EmbeddingCache::load(&path).await?;
-            info!(path = %path.display(), "Embedding cache ready");
-            Some(cache)
-        } else {
-            None
         }
-    } else {
-        None
-    };
+    }
 
     let must_reapply_settings = settings_missing;
     let defer_initial_enforce = settings_missing && !config.reseed_slice;
@@ -104,9 +89,8 @@ pub(crate) async fn prepare_db(
     ctx.must_reapply_settings = must_reapply_settings;
     ctx.settings = Some(settings);
     ctx.embedding_provider = Some(embedding_provider);
-    ctx.embedding_cache = embedding_cache;
     ctx.openai_client = Some(openai_client);
-    ctx.openai_base_url = Some(openai_base_url);
+    ctx.openai_base_url = openai_base_url;
 
     let elapsed = started.elapsed();
     ctx.record_stage_duration(stage, elapsed);
@@ -116,7 +100,5 @@ pub(crate) async fn prepare_db(
         "completed evaluation stage"
     );
 
-    machine
-        .prepare_db()
-        .map_err(|(_, guard)| map_guard_error("prepare_db", &guard))
+    Ok(())
 }
diff --git a/evaluations/src/pipeline/stages/prepare_namespace.rs b/evaluations/src/pipeline/stages/prepare_namespace.rs
index 1af78d0..8eb635d 100644
--- a/evaluations/src/pipeline/stages/prepare_namespace.rs
+++ b/evaluations/src/pipeline/stages/prepare_namespace.rs
@@ -5,25 +5,19 @@ use common::storage::types::system_settings::SystemSettings;
 use tracing::{info, warn};
 
 use crate::{
+    cases::cases_from_manifest,
     corpus,
-    db_helpers::{recreate_indexes, remove_all_indexes, reset_namespace},
-    eval::{
-        can_reuse_namespace, cases_from_manifest, enforce_system_settings, ensure_eval_user,
-        record_namespace_state, warm_hnsw_cache,
+    db::{
+        can_reuse_namespace, ensure_eval_user, record_namespace_seed, recreate_indexes,
+        reset_namespace, warm_hnsw_cache,
     },
+    settings::enforce_system_settings,
 };
 
-use super::super::{
-    context::{EvalStage, EvaluationContext},
-    state::{CorpusReady, EvaluationMachine, NamespaceReady},
-};
-use super::{map_guard_error, StageResult};
+use super::super::context::{EvalStage, EvaluationContext};
 
 #[allow(clippy::too_many_lines)]
-pub(crate) async fn prepare_namespace(
-    machine: EvaluationMachine<(), CorpusReady>,
-    ctx: &mut EvaluationContext<'_>,
-) -> StageResult<NamespaceReady> {
+pub(crate) async fn prepare_namespace(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
     let stage = EvalStage::PrepareNamespace;
     info!(
         evaluation_stage = stage.label(),
@@ -32,7 +26,6 @@ pub(crate) async fn prepare_namespace(
     let started = Instant::now();
 
     let config = ctx.config();
-    let dataset = ctx.dataset();
     let expected_fingerprint = ctx
         .expected_fingerprint
         .as_deref()
@@ -60,20 +53,16 @@ pub(crate) async fn prepare_namespace(
 
     let mut namespace_reused = false;
     if !config.reseed_slice {
-        namespace_reused = {
-            let slice = ctx.slice()?;
-            can_reuse_namespace(
-                ctx.db()?,
-                ctx.descriptor()?,
-                &namespace,
-                &database,
-                dataset.metadata.id.as_str(),
-                slice.manifest.slice_id.as_str(),
-                expected_fingerprint.as_str(),
-                requested_cases,
-            )
-            .await?
-        };
+        namespace_reused = can_reuse_namespace(
+            ctx.db()?,
+            base_manifest,
+            &embedding_provider,
+            &namespace,
+            &database,
+            expected_fingerprint.as_str(),
+            requested_cases,
+        )
+        .await?;
     }
 
     let mut namespace_seed_ms = None;
@@ -114,34 +103,20 @@ pub(crate) async fn prepare_namespace(
                 "Seeding ingestion corpus into SurrealDB"
             );
         }
-        let indexes_disabled = remove_all_indexes(ctx.db()?).await.is_ok();
-
         let seed_start = Instant::now();
         corpus::seed_manifest_into_db(ctx.db()?, &manifest_for_seed)
             .await
             .context("seeding ingestion corpus from manifest")?;
         namespace_seed_ms = Some(seed_start.elapsed().as_millis());
 
-        // Recreate indexes AFTER data is loaded (correct bulk loading pattern)
-        if indexes_disabled {
-            info!("Recreating indexes after seeding data");
-            recreate_indexes(ctx.db()?, embedding_provider.dimension())
-                .await
-                .context("recreating indexes with correct dimension")?;
-            warm_hnsw_cache(ctx.db()?, embedding_provider.dimension()).await?;
-        }
-        {
-            let slice = ctx.slice()?;
-            record_namespace_state(
-                ctx.descriptor()?,
-                dataset.metadata.id.as_str(),
-                slice.manifest.slice_id.as_str(),
-                expected_fingerprint.as_str(),
-                &namespace,
-                &database,
-                requested_cases,
-            )
-            .await;
+        info!("Recreating indexes after seeding data");
+        recreate_indexes(ctx.db()?, embedding_provider.dimension())
+            .await
+            .context("recreating indexes with correct dimension")?;
+        warm_hnsw_cache(ctx.db()?, embedding_provider.dimension()).await?;
+
+        if let Some(handle) = ctx.corpus_handle.as_mut() {
+            record_namespace_seed(handle, &namespace, &database, requested_cases).await;
         }
     }
 
@@ -198,7 +173,5 @@ pub(crate) async fn prepare_namespace(
         "completed evaluation stage"
     );
 
-    machine
-        .prepare_namespace()
-        .map_err(|(_, guard)| map_guard_error("prepare_namespace", &guard))
+    Ok(())
 }
diff --git a/evaluations/src/pipeline/stages/prepare_slice.rs b/evaluations/src/pipeline/stages/prepare_slice.rs
index 861c1c6..9b2493e 100644
--- a/evaluations/src/pipeline/stages/prepare_slice.rs
+++ b/evaluations/src/pipeline/stages/prepare_slice.rs
@@ -3,21 +3,11 @@ use std::time::Instant;
 use anyhow::Context;
 use tracing::info;
 
-use crate::{
-    eval::{default_database, default_namespace, ledger_target},
-    slice,
-};
+use crate::{db::{default_database, default_namespace}, slice};
 
-use super::super::{
-    context::{EvalStage, EvaluationContext},
-    state::{EvaluationMachine, Ready, SlicePrepared},
-};
-use super::{map_guard_error, StageResult};
+use super::super::context::{EvalStage, EvaluationContext};
 
-pub(crate) async fn prepare_slice(
-    machine: EvaluationMachine<(), Ready>,
-    ctx: &mut EvaluationContext<'_>,
-) -> StageResult<SlicePrepared> {
+pub(crate) async fn prepare_slice(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
     let stage = EvalStage::PrepareSlice;
     info!(
         evaluation_stage = stage.label(),
@@ -25,7 +15,7 @@ pub(crate) async fn prepare_slice(
     );
     let started = Instant::now();
 
-    let ledger_limit = ledger_target(ctx.config());
+    let ledger_limit = slice::ledger_target(ctx.config());
     let slice_settings = slice::slice_config_with_limit(ctx.config(), ledger_limit);
     let resolved_slice =
         slice::resolve_slice(ctx.dataset(), &slice_settings).context("resolving dataset slice")?;
@@ -49,7 +39,11 @@ pub(crate) async fn prepare_slice(
         .db_namespace
         .clone()
         .unwrap_or_else(|| {
-            default_namespace(ctx.dataset().metadata.id.as_str(), ctx.config().limit)
+            default_namespace(
+                ctx.dataset().metadata.id.as_str(),
+                ctx.config().limit,
+                ctx.config().slice.as_deref(),
+            )
         });
     ctx.database = ctx
         .config()
@@ -66,7 +60,5 @@ pub(crate) async fn prepare_slice(
         "completed evaluation stage"
     );
 
-    machine
-        .prepare_slice()
-        .map_err(|(_, guard)| map_guard_error("prepare_slice", &guard))
+    Ok(())
 }
diff --git a/evaluations/src/pipeline/stages/run_queries.rs b/evaluations/src/pipeline/stages/run_queries.rs
index c8683f5..96fd948 100644
--- a/evaluations/src/pipeline/stages/run_queries.rs
+++ b/evaluations/src/pipeline/stages/run_queries.rs
@@ -5,9 +5,13 @@ use common::storage::types::StoredObject;
 use futures::stream::{self, StreamExt};
 use tracing::{debug, info};
 
-use crate::eval::{
-    adapt_retrieval_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics,
-    CaseSummary, RetrievedSummary,
+use crate::{
+    cases::SeededCase,
+    context_stats,
+    types::{
+        adapt_retrieval_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics,
+        CaseSummary, RetrievedSummary,
+    },
 };
 use retrieval_pipeline::{
     pipeline::{self, RetrievalConfig, StageTimings},
@@ -15,17 +19,10 @@ use retrieval_pipeline::{
 };
 use tokio::sync::Semaphore;
 
-use super::super::{
-    context::{EvalStage, EvaluationContext},
-    state::{EvaluationMachine, NamespaceReady, QueriesFinished},
-};
-use super::{map_guard_error, StageResult};
+use super::super::context::{EvalStage, EvaluationContext};
 
 #[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)]
-pub(crate) async fn run_queries(
-    machine: EvaluationMachine<(), NamespaceReady>,
-    ctx: &mut EvaluationContext<'_>,
-) -> StageResult<QueriesFinished> {
+pub(crate) async fn run_queries(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
     let stage = EvalStage::RunQueries;
     info!(
         evaluation_stage = stage.label(),
@@ -153,7 +150,7 @@ pub(crate) async fn run_queries(
                     .await
                     .context("acquiring query semaphore permit")?;
 
-                let crate::eval::SeededCase {
+                let SeededCase {
                     question_id,
                     question,
                     expected_source,
@@ -197,6 +194,7 @@ pub(crate) async fn run_queries(
                 let query_latency = query_start.elapsed().as_millis();
 
                 let candidates = adapt_retrieval_output(result_output);
+                let retrieved_context = context_stats::stats_for_candidates(&candidates);
                 let mut retrieved = Vec::new();
                 let mut match_rank = None;
                 let answers_lower: Vec<String> =
@@ -288,6 +286,7 @@ pub(crate) async fn run_queries(
                     reciprocal_rank: Some(reciprocal_rank),
                     ndcg: Some(ndcg),
                     latency_ms: query_latency,
+                    retrieved_context,
                     retrieved,
                 };
 
@@ -353,9 +352,7 @@ pub(crate) async fn run_queries(
         "completed evaluation stage"
     );
 
-    machine
-        .run_queries()
-        .map_err(|(_, guard)| map_guard_error("run_queries", &guard))
+    Ok(())
 }
 
 #[allow(clippy::arithmetic_side_effects, clippy::cast_precision_loss)]
diff --git a/evaluations/src/pipeline/stages/summarize.rs b/evaluations/src/pipeline/stages/summarize.rs
index fa7f67f..9603439 100644
--- a/evaluations/src/pipeline/stages/summarize.rs
+++ b/evaluations/src/pipeline/stages/summarize.rs
@@ -3,25 +3,19 @@ use std::time::Instant;
 use chrono::Utc;
 use tracing::info;
 
-use crate::eval::{
+use crate::types::{
     build_stage_latency_breakdown, compute_latency_stats, EvaluationSummary, PerformanceTimings,
+    RetrievedContextStats,
 };
 
-use super::super::{
-    context::{EvalStage, EvaluationContext},
-    state::{EvaluationMachine, QueriesFinished, Summarized},
-};
-use super::{map_guard_error, StageResult};
+use super::super::context::{EvalStage, EvaluationContext};
 
 #[allow(
     clippy::too_many_lines,
     clippy::arithmetic_side_effects,
     clippy::cast_precision_loss
 )]
-pub(crate) async fn summarize(
-    machine: EvaluationMachine<(), QueriesFinished>,
-    ctx: &mut EvaluationContext<'_>,
-) -> StageResult<Summarized> {
+pub(crate) async fn summarize(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
     let stage = EvalStage::Summarize;
     info!(
         evaluation_stage = stage.label(),
@@ -123,6 +117,12 @@ pub(crate) async fn summarize(
         sum_ndcg / (retrieval_cases as f64)
     };
 
+    let per_query_context: Vec<RetrievedContextStats> = summaries
+        .iter()
+        .map(|summary| summary.retrieved_context)
+        .collect();
+    let retrieved_context = crate::context_stats::aggregate_context_stats(&per_query_context);
+
     let active_tuning = ctx
         .retrieval_config
         .as_ref()
@@ -133,7 +133,7 @@ pub(crate) async fn summarize(
         openai_base_url: ctx
             .openai_base_url
             .clone()
-            .unwrap_or_else(|| "<unknown>".to_string()),
+            .unwrap_or_else(|| "n/a (chunk-only ingestion)".to_string()),
         ingestion_ms: ctx.ingestion_duration_ms,
         namespace_seed_ms: ctx.namespace_seed_ms,
         evaluation_stage_ms: ctx.stage_timings.clone(),
@@ -217,11 +217,12 @@ pub(crate) async fn summarize(
         chunk_rrf_use_fts: active_tuning.flags.chunk_rrf_use_fts.as_bool(),
         ingest_chunk_min_tokens: config.ingest.ingest_chunk_min_tokens,
         ingest_chunk_max_tokens: config.ingest.ingest_chunk_max_tokens,
-        ingest_chunks_only: config.ingest.ingest_chunks_only,
+        ingest_chunks_only: !config.ingest.include_entities,
         ingest_chunk_overlap_tokens: config.ingest.ingest_chunk_overlap_tokens,
         chunk_vector_take: active_tuning.chunk_vector_take,
         chunk_fts_take: active_tuning.chunk_fts_take,
         max_chunks_per_entity: active_tuning.max_chunks_per_entity,
+        retrieved_context,
         cases: summaries,
     });
 
@@ -233,7 +234,5 @@ pub(crate) async fn summarize(
         "completed evaluation stage"
     );
 
-    machine
-        .summarize()
-        .map_err(|(_, guard)| map_guard_error("summarize", &guard))
+    Ok(())
 }
diff --git a/evaluations/src/pipeline/state.rs b/evaluations/src/pipeline/state.rs
deleted file mode 100644
index aa9e753..0000000
--- a/evaluations/src/pipeline/state.rs
+++ /dev/null
@@ -1,31 +0,0 @@
-use state_machines::state_machine;
-
-state_machine! {
-    name: EvaluationMachine,
-    state: EvaluationState,
-    initial: Ready,
-    states: [Ready, SlicePrepared, DbReady, CorpusReady, NamespaceReady, QueriesFinished, Summarized, Completed, Failed],
-    events {
-        prepare_slice { transition: { from: Ready, to: SlicePrepared } }
-        prepare_db { transition: { from: SlicePrepared, to: DbReady } }
-        prepare_corpus { transition: { from: DbReady, to: CorpusReady } }
-        prepare_namespace { transition: { from: CorpusReady, to: NamespaceReady } }
-        run_queries { transition: { from: NamespaceReady, to: QueriesFinished } }
-        summarize { transition: { from: QueriesFinished, to: Summarized } }
-        finalize { transition: { from: Summarized, to: Completed } }
-        abort {
-            transition: { from: Ready, to: Failed }
-            transition: { from: SlicePrepared, to: Failed }
-            transition: { from: DbReady, to: Failed }
-            transition: { from: CorpusReady, to: Failed }
-            transition: { from: NamespaceReady, to: Failed }
-            transition: { from: QueriesFinished, to: Failed }
-            transition: { from: Summarized, to: Failed }
-            transition: { from: Completed, to: Failed }
-        }
-    }
-}
-
-pub fn ready() -> EvaluationMachine<(), Ready> {
-    EvaluationMachine::new(())
-}
diff --git a/evaluations/src/report.rs b/evaluations/src/report.rs
index 33b419b..e299567 100644
--- a/evaluations/src/report.rs
+++ b/evaluations/src/report.rs
@@ -7,12 +7,10 @@ use std::{
 use anyhow::{Context, Result};
 use serde::{Deserialize, Serialize};
 
-use crate::eval::{
+use crate::types::{
     format_timestamp, CaseSummary, EvaluationStageTimings, EvaluationSummary, LatencyStats,
-    StageLatencyBreakdown,
+    RetrievalContextStats, StageLatencyBreakdown,
 };
-use chrono::Utc;
-use tracing::warn;
 
 #[derive(Debug)]
 pub struct ReportPaths {
@@ -108,6 +106,7 @@ pub struct RetrievalSection {
     pub ingest_chunk_max_tokens: usize,
     pub ingest_chunk_overlap_tokens: usize,
     pub ingest_chunks_only: bool,
+    pub retrieved_context: RetrievalContextStats,
 }
 
 const fn default_chunk_rrf_k() -> f32 {
@@ -242,6 +241,7 @@ impl EvaluationReport {
             ingest_chunk_max_tokens: summary.ingest_chunk_max_tokens,
             ingest_chunk_overlap_tokens: summary.ingest_chunk_overlap_tokens,
             ingest_chunks_only: summary.ingest_chunks_only,
+            retrieved_context: summary.retrieved_context.clone(),
         };
 
         let llm = if summary.llm_cases > 0 {
@@ -345,7 +345,7 @@ impl LlmCaseEntry {
 }
 
 impl RetrievedSnippet {
-    fn from_summary(entry: &crate::eval::RetrievedSummary) -> Self {
+    fn from_summary(entry: &crate::types::RetrievedSummary) -> Self {
         Self {
             rank: entry.rank,
             source_id: entry.source_id.clone(),
@@ -558,6 +558,65 @@ fn render_markdown(report: &EvaluationReport) -> String {
     } else {
         md.push_str("| Rerank | disabled |\\n");
     }
+    write!(
+        md,
+        "| Chunk result cap | {} |\\n",
+        report.retrieval.chunk_result_cap
+    )
+    .unwrap();
+
+    md.push_str("\\n## Retrieved Context Volume\\n\\n");
+    md.push_str("| Metric | Value |\\n| --- | --- |\\n");
+    write!(
+        md,
+        "| Tokenizer | {} |\\n",
+        report.retrieval.retrieved_context.tokenizer
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Queries measured | {} |\\n",
+        report.retrieval.retrieved_context.queries
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Total chunks returned | {} |\\n",
+        report.retrieval.retrieved_context.total_chunks
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Total characters | {} |\\n",
+        report.retrieval.retrieved_context.total_chars
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Total tokens | {} |\\n",
+        report.retrieval.retrieved_context.total_tokens
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Avg chunks / query | {:.1} |\\n",
+        report.retrieval.retrieved_context.avg_chunks_per_query
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Avg tokens / query | {:.1} |\\n",
+        report.retrieval.retrieved_context.avg_tokens_per_query
+    )
+    .unwrap();
+    write!(
+        md,
+        "| P50 / P95 / max tokens / query | {} / {} / {} |\\n",
+        report.retrieval.retrieved_context.p50_tokens_per_query,
+        report.retrieval.retrieved_context.p95_tokens_per_query,
+        report.retrieval.retrieved_context.max_tokens_per_query
+    )
+    .unwrap();
 
     if let Some(llm) = &report.llm {
         md.push_str("\\n## LLM Mode Metrics\\n\\n");
@@ -797,182 +856,6 @@ pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf {
     report_dir.join(sanitize_component(dataset_id))
 }
 
-#[derive(Debug, Serialize, Deserialize)]
-struct LegacyHistoryEntry {
-    generated_at: String,
-    run_label: Option<String>,
-    dataset_id: String,
-    dataset_label: String,
-    slice_id: String,
-    slice_seed: u64,
-    slice_window_offset: usize,
-    slice_window_length: usize,
-    slice_cases: usize,
-    slice_total_cases: usize,
-    k: usize,
-    limit: Option<usize>,
-    precision: f64,
-    precision_at_1: f64,
-    precision_at_2: f64,
-    precision_at_3: f64,
-    #[serde(default)]
-    mrr: f64,
-    #[serde(default)]
-    average_ndcg: f64,
-    #[serde(default)]
-    retrieval_cases: usize,
-    #[serde(default)]
-    retrieval_precision: f64,
-    #[serde(default)]
-    llm_cases: usize,
-    #[serde(default)]
-    llm_precision: f64,
-    duration_ms: u128,
-    latency_ms: LatencyStats,
-    embedding_backend: String,
-    embedding_model: Option<String>,
-    ingestion_reused: bool,
-    ingestion_embeddings_reused: bool,
-    rerank_enabled: bool,
-    rerank_keep_top: usize,
-    rerank_pool_size: Option<usize>,
-    #[serde(default)]
-    chunk_result_cap: Option<usize>,
-    #[serde(default)]
-    ingest_chunk_min_tokens: Option<usize>,
-    #[serde(default)]
-    ingest_chunk_max_tokens: Option<usize>,
-    #[serde(default)]
-    ingest_chunk_overlap_tokens: Option<usize>,
-    #[serde(default)]
-    ingest_chunks_only: Option<bool>,
-    #[serde(default)]
-    delta: Option<LegacyHistoryDelta>,
-    openai_base_url: String,
-    ingestion_ms: u128,
-    #[serde(default)]
-    namespace_seed_ms: Option<u128>,
-}
-
-#[derive(Debug, Serialize, Deserialize)]
-struct LegacyHistoryDelta {
-    precision: f64,
-    precision_at_1: f64,
-    latency_avg_ms: f64,
-}
-
-#[allow(clippy::too_many_lines)]
-fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport {
-    let overview = OverviewSection {
-        generated_at: entry.generated_at,
-        run_label: entry.run_label,
-        total_cases: entry.slice_cases,
-        filtered_questions: 0,
-    };
-
-    let dataset = DatasetSection {
-        id: entry.dataset_id,
-        label: entry.dataset_label,
-        source: String::new(),
-        includes_unanswerable: entry.llm_cases > 0,
-        require_verified_chunks: true,
-        embedding_backend: entry.embedding_backend,
-        embedding_model: entry.embedding_model,
-        embedding_dimension: 0,
-    };
-
-    let slice = SliceSection {
-        id: entry.slice_id,
-        seed: entry.slice_seed,
-        window_offset: entry.slice_window_offset,
-        window_length: entry.slice_window_length,
-        slice_cases: entry.slice_cases,
-        ledger_total_cases: entry.slice_total_cases,
-        positives: 0,
-        negatives: 0,
-        total_paragraphs: 0,
-        negative_multiplier: 0.0,
-    };
-
-    let retrieval_cases = if entry.retrieval_cases > 0 {
-        entry.retrieval_cases
-    } else {
-        entry.slice_cases.saturating_sub(entry.llm_cases)
-    };
-    let retrieval_precision = if entry.retrieval_precision > 0.0 {
-        entry.retrieval_precision
-    } else {
-        entry.precision
-    };
-
-    let retrieval = RetrievalSection {
-        k: entry.k,
-        cases: retrieval_cases,
-        correct: 0,
-        precision: retrieval_precision,
-        precision_at_1: entry.precision_at_1,
-        precision_at_2: entry.precision_at_2,
-        precision_at_3: entry.precision_at_3,
-        mrr: entry.mrr,
-        average_ndcg: entry.average_ndcg,
-        latency: entry.latency_ms,
-        concurrency: 0,
-        resolve_entities: false,
-        rerank_enabled: entry.rerank_enabled,
-        rerank_pool_size: entry.rerank_pool_size,
-        rerank_keep_top: entry.rerank_keep_top,
-        chunk_result_cap: entry.chunk_result_cap.unwrap_or(5),
-        chunk_rrf_k: default_chunk_rrf_k(),
-        chunk_rrf_vector_weight: default_chunk_rrf_weight(),
-        chunk_rrf_fts_weight: default_chunk_rrf_weight(),
-        chunk_rrf_use_vector: default_chunk_rrf_use(),
-        chunk_rrf_use_fts: default_chunk_rrf_use(),
-        chunk_vector_take: 0,
-        chunk_fts_take: 0,
-        ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256),
-        ingest_chunk_max_tokens: entry.ingest_chunk_max_tokens.unwrap_or(512),
-        ingest_chunk_overlap_tokens: entry.ingest_chunk_overlap_tokens.unwrap_or(50),
-        ingest_chunks_only: entry.ingest_chunks_only.unwrap_or(false),
-    };
-
-    let llm = if entry.llm_cases > 0 {
-        Some(LlmSection {
-            cases: entry.llm_cases,
-            answered: 0,
-            precision: entry.llm_precision,
-        })
-    } else {
-        None
-    };
-
-    let performance = PerformanceSection {
-        openai_base_url: entry.openai_base_url,
-        ingestion_ms: entry.ingestion_ms,
-        namespace_seed_ms: entry.namespace_seed_ms,
-        evaluation_stages_ms: EvaluationStageTimings::default(),
-        stage_latency: StageLatencyBreakdown::default(),
-        namespace_reused: false,
-        ingestion_reused: entry.ingestion_reused,
-        embeddings_reused: entry.ingestion_embeddings_reused,
-        ingestion_cache_path: String::new(),
-        corpus_paragraphs: 0,
-        positive_paragraphs_reused: 0,
-        negative_paragraphs_reused: 0,
-    };
-
-    EvaluationReport {
-        overview,
-        dataset,
-        slice,
-        retrieval,
-        llm,
-        performance,
-        misses: Vec::new(),
-        llm_cases: Vec::new(),
-        detailed_report: false,
-    }
-}
-
 fn load_history(path: &Path) -> Result<Vec<EvaluationReport>> {
     if !path.exists() {
         return Ok(Vec::new());
@@ -981,34 +864,12 @@ fn load_history(path: &Path) -> Result<Vec<EvaluationReport>> {
     let contents =
         fs::read(path).with_context(|| format!("reading evaluation log {}", path.display()))?;
 
-    if let Ok(entries) = serde_json::from_slice::<Vec<EvaluationReport>>(&contents) {
-        return Ok(entries);
-    }
-
-    match serde_json::from_slice::<Vec<LegacyHistoryEntry>>(&contents) {
-        Ok(entries) => Ok(entries.into_iter().map(convert_legacy_entry).collect()),
-        Err(err) => {
-            let timestamp = Utc::now().format("%Y%m%dT%H%M%S");
-            let backup_path = path
-                .parent()
-                .unwrap_or_else(|| Path::new("."))
-                .join(format!("evaluations.json.corrupted.{timestamp}"));
-            warn!(
-                path = %path.display(),
-                backup = %backup_path.display(),
-                error = %err,
-                "Evaluation history file is corrupted; backing up and starting fresh"
-            );
-            if let Err(e) = fs::rename(path, &backup_path) {
-                warn!(
-                    path = %path.display(),
-                    error = %e,
-                    "Failed to backup corrupted evaluation history"
-                );
-            }
-            Ok(Vec::new())
-        }
-    }
+    serde_json::from_slice(&contents).with_context(|| {
+        format!(
+            "parsing evaluation history at {}; delete the file and re-run if upgrading from an older format",
+            path.display()
+        )
+    })
 }
 
 fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result<PathBuf> {
@@ -1024,9 +885,9 @@ fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result<PathBu
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::eval::{
-        EvaluationStageTimings, PerformanceTimings, RetrievedSummary, StageLatency,
-        StageLatencyBreakdown,
+    use crate::types::{
+        EvaluationStageTimings, PerformanceTimings, RetrievedContextStats, RetrievedSummary,
+        StageLatency, StageLatencyBreakdown,
     };
     use chrono::Utc;
     use tempfile::tempdir;
@@ -1101,6 +962,7 @@ mod tests {
             has_verified_chunks: !is_impossible,
             match_rank: if matched { Some(1) } else { None },
             latency_ms: 42,
+            retrieved_context: RetrievedContextStats::default(),
             retrieved: vec![RetrievedSummary {
                 rank: 1,
                 entity_id: "entity1".into(),
@@ -1199,6 +1061,13 @@ mod tests {
             chunk_vector_take: 50,
             chunk_fts_take: 50,
             max_chunks_per_entity: 4,
+            retrieved_context: crate::context_stats::aggregate_context_stats(&[
+                RetrievedContextStats {
+                    chunk_count: 1,
+                    char_count: 10,
+                    token_count: 3,
+                },
+            ]),
             cases,
         }
     }
diff --git a/evaluations/src/slice/beir.rs b/evaluations/src/slice/beir.rs
new file mode 100644
index 0000000..56108c9
--- /dev/null
+++ b/evaluations/src/slice/beir.rs
@@ -0,0 +1,174 @@
+use std::collections::{HashMap, VecDeque};
+
+use anyhow::{anyhow, Result};
+use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};
+use tracing::warn;
+
+use crate::datasets::{ConvertedDataset, BEIR_DATASETS};
+
+use super::build::{mix_seed, BuildParams};
+
+/// Picks up to `target_cases` `(paragraph index, question index)` pairs for the
+/// BEIR mix, balanced across the `BEIR_DATASETS` subsets and interleaved round-robin.
+///
+/// Each subset is shuffled with a seed mixed from the dataset id, the subset
+/// prefix and `params.base_seed`; quota a subset cannot fill is handed to subsets
+/// with spare questions. Errors when no eligible question can be selected.
+#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)]
+pub(super) fn ordered_question_refs_beir(
+    dataset: &ConvertedDataset,
+    params: &BuildParams,
+    target_cases: usize,
+) -> Result<Vec<(usize, usize)>> {
+    let prefixes: Vec<&str> = BEIR_DATASETS
+        .iter()
+        .map(|kind| kind.source_prefix())
+        .collect();
+
+    let mut grouped: HashMap<&str, Vec<(usize, usize)>> = HashMap::new();
+    for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
+        for (q_idx, question) in paragraph.questions.iter().enumerate() {
+            let include = if params.include_impossible {
+                true
+            } else {
+                !question.is_impossible && !question.answers.is_empty()
+            };
+            if !include {
+                continue;
+            }
+
+            let Some(prefix) = question_prefix(&question.id) else {
+                warn!(
+                    question_id = %question.id,
+                    "Skipping BEIR question without expected prefix"
+                );
+                continue;
+            };
+            // `question_prefix` only yields prefixes taken from `BEIR_DATASETS`, so
+            // `prefix` is always a member of `prefixes`; no extra membership check.
+            grouped.entry(prefix).or_default().push((p_idx, q_idx));
+        }
+    }
+
+    if grouped.values().all(std::vec::Vec::is_empty) {
+        return Err(anyhow!(
+            "no eligible BEIR questions found; cannot build slice"
+        ));
+    }
+
+    for prefix in &prefixes {
+        if let Some(entries) = grouped.get_mut(prefix) {
+            let seed = mix_seed(
+                &format!("{}::{prefix}", dataset.metadata.id),
+                params.base_seed,
+            );
+            let mut rng = StdRng::seed_from_u64(seed);
+            entries.shuffle(&mut rng);
+        }
+    }
+
+    let dataset_count = prefixes.len().max(1);
+    let base_quota = target_cases / dataset_count;
+    let mut remainder = target_cases % dataset_count;
+
+    let mut quotas: HashMap<&str, usize> = HashMap::new();
+    for prefix in &prefixes {
+        let mut quota = base_quota;
+        if remainder > 0 {
+            quota += 1;
+            remainder -= 1;
+        }
+        quotas.insert(*prefix, quota);
+    }
+
+    let mut take_counts: HashMap<&str, usize> = HashMap::new();
+    let mut spare_slots: HashMap<&str, usize> = HashMap::new();
+    let mut shortfall = 0usize;
+
+    for prefix in &prefixes {
+        let available = grouped.get(prefix).map_or(0, std::vec::Vec::len);
+        let quota = *quotas.get(prefix).unwrap_or(&0);
+        let take = quota.min(available);
+        let missing = quota.saturating_sub(take);
+        shortfall += missing;
+        take_counts.insert(*prefix, take);
+        spare_slots.insert(*prefix, available.saturating_sub(take));
+    }
+
+    while shortfall > 0 {
+        let mut allocated = false;
+        for prefix in &prefixes {
+            if shortfall == 0 {
+                break;
+            }
+            let spare = spare_slots.get(prefix).copied().unwrap_or(0);
+            if spare == 0 {
+                continue;
+            }
+            if let Some(count) = take_counts.get_mut(prefix) {
+                *count += 1;
+            }
+            spare_slots.insert(*prefix, spare - 1);
+            shortfall = shortfall.saturating_sub(1);
+            allocated = true;
+        }
+        if !allocated {
+            break;
+        }
+    }
+
+    let mut queues: Vec<VecDeque<(usize, usize)>> = Vec::new();
+    let mut total_selected = 0usize;
+    for prefix in &prefixes {
+        let take = *take_counts.get(prefix).unwrap_or(&0);
+        let mut deque = VecDeque::new();
+        if let Some(entries) = grouped.get(prefix) {
+            for item in entries.iter().take(take) {
+                deque.push_back(*item);
+                total_selected += 1;
+            }
+        }
+        queues.push(deque);
+    }
+
+    if total_selected < target_cases {
+        warn!(
+            requested = target_cases,
+            available = total_selected,
+            "BEIR mix requested more questions than available after balancing; continuing with capped set"
+        );
+    }
+
+    let mut output = Vec::with_capacity(total_selected);
+    loop {
+        let mut progressed = false;
+        for queue in &mut queues {
+            if let Some(item) = queue.pop_front() {
+                output.push(item);
+                progressed = true;
+            }
+        }
+        if !progressed {
+            break;
+        }
+    }
+
+    if output.is_empty() {
+        return Err(anyhow!(
+            "no eligible BEIR questions found; cannot build slice"
+        ));
+    }
+
+    Ok(output)
+}
+
+/// Returns the first `BEIR_DATASETS` source prefix that `question_id` starts with,
+/// provided the prefix is immediately followed by `-`; `None` when none matches.
+pub(super) fn question_prefix(question_id: &str) -> Option<&'static str> {
+    BEIR_DATASETS
+        .iter()
+        .map(|kind| kind.source_prefix())
+        .find(|&prefix| {
+            matches!(question_id.strip_prefix(prefix), Some(rest) if rest.starts_with('-'))
+        })
+}
diff --git a/evaluations/src/slice/build.rs b/evaluations/src/slice/build.rs
new file mode 100644
index 0000000..8018ac7
--- /dev/null
+++ b/evaluations/src/slice/build.rs
@@ -0,0 +1,19 @@
+use sha2::{Digest, Sha256};
+
+#[derive(Debug)]
+pub(super) struct BuildParams {
+    pub include_impossible: bool, // false: skip impossible/answerless questions
+    pub base_seed: u64, // mixed with an id string by `mix_seed` to seed per-subset shuffles
+    pub rng_seed: u64, // NOTE(review): not read in the code visible here — confirm its consumer
+}
+
+/// SHA-256 of `dataset_id` then little-endian `seed`; first 8 digest bytes as a LE `u64`.
+#[allow(clippy::indexing_slicing)]
+pub(super) fn mix_seed(dataset_id: &str, seed: u64) -> u64 {
+    let mut sha = Sha256::new();
+    sha.update(dataset_id.as_bytes());
+    sha.update(seed.to_le_bytes());
+    let mut head = [0u8; 8];
+    head.copy_from_slice(&sha.finalize()[..8]);
+    u64::from_le_bytes(head)
+}
diff --git a/evaluations/src/slice.rs b/evaluations/src/slice/mod.rs
similarity index 83%
rename from evaluations/src/slice.rs
rename to evaluations/src/slice/mod.rs
index d1da847..3b7d7d9 100644
--- a/evaluations/src/slice.rs
+++ b/evaluations/src/slice/mod.rs
@@ -1,5 +1,5 @@
 use std::{
-    collections::{HashMap, HashSet, VecDeque},
+    collections::{HashMap, HashSet},
     fmt::Write,
     fs,
     path::{Path, PathBuf},
@@ -12,10 +12,18 @@ use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
 use tracing::{info, warn};
 
-use crate::datasets::{
-    ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind, BEIR_DATASETS,
+use crate::{
+    args::Config,
+    datasets::{
+        ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind,
+    },
 };
 
+mod beir;
+mod build;
+
+use build::{mix_seed, BuildParams};
+
 const SLICE_VERSION: u32 = 2;
 pub const DEFAULT_NEGATIVE_MULTIPLIER: f32 = 4.0;
 
@@ -80,8 +88,12 @@ pub enum SliceParagraphKind {
     Negative,
 }
 
+pub fn paragraph_storage_key(paragraph_id: &str) -> String {
+    sanitize_identifier(paragraph_id) // also the file stem `default_shard_path` uses
+}
+
 pub(crate) fn default_shard_path(paragraph_id: &str) -> String {
-    let sanitized = sanitize_identifier(paragraph_id);
+    let sanitized = paragraph_storage_key(paragraph_id);
     format!("paragraphs/{sanitized}.json")
 }
 
@@ -210,13 +222,6 @@ struct SliceKey<'a> {
     seed: u64,
 }
 
-#[derive(Debug)]
-struct BuildParams {
-    include_impossible: bool,
-    base_seed: u64,
-    rng_seed: u64,
-}
-
 #[allow(clippy::too_many_lines)]
 pub fn resolve_slice<'a>(
     dataset: &'a ConvertedDataset,
@@ -225,15 +230,29 @@ pub fn resolve_slice<'a>(
     let index = DatasetIndex::build(dataset);
 
     if let Some(slice_arg) = config.explicit_slice {
-        let (path, manifest) = load_explicit_slice(dataset, &index, config, slice_arg)?;
-        let resolved = manifest_to_resolved(dataset, &index, manifest, path)?;
+        let path = explicit_slice_path(dataset, config, slice_arg);
+        if path.exists() {
+            let (path, manifest) = load_explicit_slice(dataset, &index, config, slice_arg)?;
+            let resolved = manifest_to_resolved(dataset, &index, manifest, path)?;
+            info!(
+                slice = %resolved.manifest.slice_id,
+                path = %resolved.path.display(),
+                cases = resolved.manifest.case_count,
+                positives = resolved.manifest.positive_paragraphs,
+                negatives = resolved.manifest.negative_paragraphs,
+                "Using explicitly selected slice"
+            );
+            return Ok(resolved);
+        }
+        let resolved =
+            materialize_slice_ledger(dataset, config, &index, slice_arg, path)?;
         info!(
             slice = %resolved.manifest.slice_id,
             path = %resolved.path.display(),
             cases = resolved.manifest.case_count,
             positives = resolved.manifest.positive_paragraphs,
             negatives = resolved.manifest.negative_paragraphs,
-            "Using explicitly selected slice"
+            "Built catalog slice ledger"
         );
         return Ok(resolved);
     }
@@ -256,6 +275,82 @@ pub fn resolve_slice<'a>(
         .join("slices")
         .join(dataset.metadata.id.as_str());
     let path = base.join(format!("{slice_id}.json"));
+    materialize_slice_ledger(dataset, config, &index, &slice_id, path)
+}
+
+/// Returns the window of `resolved`'s cases starting at `offset`, spanning `limit`
+/// cases (at least one, at most what remains) or every remaining case if `None`.
+/// Errors when the manifest reports no cases or `offset` is past the last case.
+#[allow(clippy::indexing_slicing, clippy::arithmetic_side_effects)]
+pub fn select_window<'a>(
+    resolved: &'a ResolvedSlice<'a>,
+    offset: usize,
+    limit: Option<usize>,
+) -> Result<SliceWindow<'a>> {
+    let total = resolved.manifest.case_count;
+    if total == 0 {
+        return Err(anyhow!("slice '{}' contains no cases", resolved.manifest.slice_id));
+    }
+    if offset >= total {
+        return Err(anyhow!(
+            "slice offset {offset} exceeds available cases ({total})",
+        ));
+    }
+    let remaining = total - offset;
+    let length = limit.map_or(remaining, |wanted| wanted.max(1).min(remaining));
+    let cases = resolved.cases[offset..offset + length].to_vec();
+    // Paragraph ids of the selected cases, deduplicated in first-seen order.
+    let mut seen = HashSet::new();
+    let positive_paragraph_ids = cases
+        .iter()
+        .map(|case| &case.paragraph.id)
+        .filter(|id| seen.insert(id.as_str()))
+        .cloned()
+        .collect();
+    Ok(SliceWindow {
+        offset,
+        length,
+        total_cases: total,
+        cases,
+        positive_paragraph_ids,
+    })
+}
+
+#[allow(dead_code)]
+pub fn full_window<'a>(resolved: &'a ResolvedSlice<'a>) -> Result<SliceWindow<'a>> {
+    select_window(resolved, 0, None) // offset 0 with no limit: every case in the slice
+}
+
+/// Resolves `slice_arg` to a manifest path: `slice_arg` itself when it names an
+/// existing path, otherwise `<cache_dir>/slices/<dataset id>/<slice_arg>.json`.
+fn explicit_slice_path(
+    dataset: &ConvertedDataset,
+    config: &SliceConfig<'_>,
+    slice_arg: &str,
+) -> PathBuf {
+    let literal = Path::new(slice_arg);
+    if literal.exists() {
+        return literal.to_path_buf();
+    }
+    let ledger_dir = config.cache_dir.join("slices");
+    ledger_dir
+        .join(dataset.metadata.id.as_str())
+        .join(format!("{slice_arg}.json"))
+}
+
+#[allow(clippy::too_many_lines)]
+fn materialize_slice_ledger<'a>(
+    dataset: &'a ConvertedDataset,
+    config: &SliceConfig<'_>,
+    index: &DatasetIndex,
+    slice_id: &str,
+    path: PathBuf,
+) -> Result<ResolvedSlice<'a>> {
+    let requested_corpus = config
+        .corpus_limit
+        .unwrap_or(dataset.paragraphs.len())
+        .min(dataset.paragraphs.len())
+        .max(1);
 
     let total_questions = dataset
         .paragraphs
@@ -339,7 +434,7 @@ pub fn resolve_slice<'a>(
     let mut manifest = manifest.unwrap_or_else(|| {
         empty_manifest(
             dataset,
-            slice_id.clone(),
+            slice_id.to_string(),
             &params,
             requested_corpus,
             config.negative_multiplier,
@@ -396,52 +491,7 @@ pub fn resolve_slice<'a>(
         );
     }
 
-    let resolved = manifest_to_resolved(dataset, &index, manifest.clone(), path)?;
-
-    Ok(resolved)
-}
-
-#[allow(clippy::indexing_slicing, clippy::arithmetic_side_effects)]
-pub fn select_window<'a>(
-    resolved: &'a ResolvedSlice<'a>,
-    offset: usize,
-    limit: Option<usize>,
-) -> Result<SliceWindow<'a>> {
-    let total = resolved.manifest.case_count;
-    if total == 0 {
-        return Err(anyhow!(
-            "slice '{}' contains no cases",
-            resolved.manifest.slice_id
-        ));
-    }
-    if offset >= total {
-        return Err(anyhow!(
-            "slice offset {offset} exceeds available cases ({total})",
-        ));
-    }
-    let available = total - offset;
-    let requested = limit.unwrap_or(available).max(1);
-    let length = requested.min(available);
-    let cases = resolved.cases[offset..offset + length].to_vec();
-    let mut seen = HashSet::new();
-    let mut positive_ids = Vec::new();
-    for case in &cases {
-        if seen.insert(case.paragraph.id.as_str()) {
-            positive_ids.push(case.paragraph.id.clone());
-        }
-    }
-    Ok(SliceWindow {
-        offset,
-        length,
-        total_cases: total,
-        cases,
-        positive_paragraph_ids: positive_ids,
-    })
-}
-
-#[allow(dead_code)]
-pub fn full_window<'a>(resolved: &'a ResolvedSlice<'a>) -> Result<SliceWindow<'a>> {
-    select_window(resolved, 0, None)
+    manifest_to_resolved(dataset, index, manifest, path)
 }
 
 fn load_explicit_slice(
@@ -450,16 +500,7 @@ fn load_explicit_slice(
     config: &SliceConfig<'_>,
     slice_arg: &str,
 ) -> Result<(PathBuf, SliceManifest)> {
-    let explicit_path = Path::new(slice_arg);
-    let candidate_path = if explicit_path.exists() {
-        explicit_path.to_path_buf()
-    } else {
-        config
-            .cache_dir
-            .join("slices")
-            .join(dataset.metadata.id.as_str())
-            .join(format!("{slice_arg}.json"))
-    };
+    let candidate_path = explicit_slice_path(dataset, config, slice_arg);
 
     let manifest = read_manifest(&candidate_path)
         .with_context(|| format!("reading slice manifest at {}", candidate_path.display()))?;
@@ -613,7 +654,7 @@ fn ordered_question_refs(
     target_cases: usize,
 ) -> Result<Vec<(usize, usize)>> {
     if dataset.metadata.id == DatasetKind::Beir.id() {
-        return ordered_question_refs_beir(dataset, params, target_cases);
+        return beir::ordered_question_refs_beir(dataset, params, target_cases);
     }
 
     let mut question_refs = Vec::new();
@@ -642,171 +683,6 @@ fn ordered_question_refs(
     Ok(question_refs)
 }
 
-#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)]
-fn ordered_question_refs_beir(
-    dataset: &ConvertedDataset,
-    params: &BuildParams,
-    target_cases: usize,
-) -> Result<Vec<(usize, usize)>> {
-    let prefixes: Vec<&str> = BEIR_DATASETS
-        .iter()
-        .map(|kind| kind.source_prefix())
-        .collect();
-
-    let mut grouped: HashMap<&str, Vec<(usize, usize)>> = HashMap::new();
-    for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
-        for (q_idx, question) in paragraph.questions.iter().enumerate() {
-            let include = if params.include_impossible {
-                true
-            } else {
-                !question.is_impossible && !question.answers.is_empty()
-            };
-            if !include {
-                continue;
-            }
-
-            let Some(prefix) = question_prefix(&question.id) else {
-                warn!(
-                    question_id = %question.id,
-                    "Skipping BEIR question without expected prefix"
-                );
-                continue;
-            };
-            if !prefixes.contains(&prefix) {
-                warn!(
-                    question_id = %question.id,
-                    prefix = %prefix,
-                    "Skipping BEIR question with unknown subset prefix"
-                );
-                continue;
-            }
-            grouped.entry(prefix).or_default().push((p_idx, q_idx));
-        }
-    }
-
-    if grouped.values().all(std::vec::Vec::is_empty) {
-        return Err(anyhow!(
-            "no eligible BEIR questions found; cannot build slice"
-        ));
-    }
-
-    for prefix in &prefixes {
-        if let Some(entries) = grouped.get_mut(prefix) {
-            let seed = mix_seed(
-                &format!("{}::{prefix}", dataset.metadata.id),
-                params.base_seed,
-            );
-            let mut rng = StdRng::seed_from_u64(seed);
-            entries.shuffle(&mut rng);
-        }
-    }
-
-    let dataset_count = prefixes.len().max(1);
-    let base_quota = target_cases / dataset_count;
-    let mut remainder = target_cases % dataset_count;
-
-    let mut quotas: HashMap<&str, usize> = HashMap::new();
-    for prefix in &prefixes {
-        let mut quota = base_quota;
-        if remainder > 0 {
-            quota += 1;
-            remainder -= 1;
-        }
-        quotas.insert(*prefix, quota);
-    }
-
-    let mut take_counts: HashMap<&str, usize> = HashMap::new();
-    let mut spare_slots: HashMap<&str, usize> = HashMap::new();
-    let mut shortfall = 0usize;
-
-    for prefix in &prefixes {
-        let available = grouped.get(prefix).map_or(0, std::vec::Vec::len);
-        let quota = *quotas.get(prefix).unwrap_or(&0);
-        let take = quota.min(available);
-        let missing = quota.saturating_sub(take);
-        shortfall += missing;
-        take_counts.insert(*prefix, take);
-        spare_slots.insert(*prefix, available.saturating_sub(take));
-    }
-
-    while shortfall > 0 {
-        let mut allocated = false;
-        for prefix in &prefixes {
-            if shortfall == 0 {
-                break;
-            }
-            let spare = spare_slots.get(prefix).copied().unwrap_or(0);
-            if spare == 0 {
-                continue;
-            }
-            if let Some(count) = take_counts.get_mut(prefix) {
-                *count += 1;
-            }
-            spare_slots.insert(*prefix, spare - 1);
-            shortfall = shortfall.saturating_sub(1);
-            allocated = true;
-        }
-        if !allocated {
-            break;
-        }
-    }
-
-    let mut queues: Vec<VecDeque<(usize, usize)>> = Vec::new();
-    let mut total_selected = 0usize;
-    for prefix in &prefixes {
-        let take = *take_counts.get(prefix).unwrap_or(&0);
-        let mut deque = VecDeque::new();
-        if let Some(entries) = grouped.get(prefix) {
-            for item in entries.iter().take(take) {
-                deque.push_back(*item);
-                total_selected += 1;
-            }
-        }
-        queues.push(deque);
-    }
-
-    if total_selected < target_cases {
-        warn!(
-            requested = target_cases,
-            available = total_selected,
-            "BEIR mix requested more questions than available after balancing; continuing with capped set"
-        );
-    }
-
-    let mut output = Vec::with_capacity(total_selected);
-    loop {
-        let mut progressed = false;
-        for queue in &mut queues {
-            if let Some(item) = queue.pop_front() {
-                output.push(item);
-                progressed = true;
-            }
-        }
-        if !progressed {
-            break;
-        }
-    }
-
-    if output.is_empty() {
-        return Err(anyhow!(
-            "no eligible BEIR questions found; cannot build slice"
-        ));
-    }
-
-    Ok(output)
-}
-
-fn question_prefix(question_id: &str) -> Option<&'static str> {
-    for prefix in BEIR_DATASETS.iter().map(|kind| kind.source_prefix()) {
-        if let Some(rest) = question_id.strip_prefix(prefix) {
-            if rest.starts_with('-') {
-                return Some(prefix);
-            }
-        }
-    }
-    None
-}
-
 #[allow(clippy::indexing_slicing)]
 fn ensure_negative_pool(
     dataset: &ConvertedDataset,
@@ -1028,15 +904,48 @@ fn compute_slice_id(key: &SliceKey<'_>) -> Result<String> {
         }))
 }
 
-#[allow(clippy::indexing_slicing)]
-fn mix_seed(dataset_id: &str, seed: u64) -> u64 {
-    let mut hasher = Sha256::new();
-    hasher.update(dataset_id.as_bytes());
-    hasher.update(seed.to_le_bytes());
-    let digest = hasher.finalize();
-    let mut bytes = [0u8; 8];
-    bytes.copy_from_slice(&digest[..8]);
-    u64::from_le_bytes(bytes)
+/// Reads the slice manifest stored at `path`.
+///
+/// Returns `Ok(None)` when `path` does not exist; read errors are propagated.
+pub fn read_manifest_if_exists(path: &Path) -> Result<Option<SliceManifest>> {
+    path.exists().then(|| read_manifest(path)).transpose()
+}
+
+/// Returns where the manifest for `config.slice` is expected, or `None` when no
+/// slice is configured: the argument itself if it names an existing path, else
+/// `<cache_dir>/slices/<dataset id>/<slice>.json`.
+pub fn cached_manifest_path(config: &crate::args::Config) -> Option<PathBuf> {
+    let slice_arg = config.slice.as_deref()?;
+    let literal = Path::new(slice_arg);
+    let resolved = if literal.exists() {
+        literal.to_path_buf()
+    } else {
+        let per_dataset = config.cache_dir.join("slices").join(config.dataset.id());
+        per_dataset.join(format!("{slice_arg}.json"))
+    };
+    Some(resolved)
+}
+
+/// Reports whether `manifest` already satisfies `config`: it holds at least the
+/// requested number of cases and at least as many negative paragraphs as
+/// `desired_negative_target` computes for the requested corpus size.
+pub fn manifest_is_complete(manifest: &SliceManifest, config: &SliceConfig<'_>) -> bool {
+    // At least one case is always required, so an empty manifest is never complete.
+    let cases_wanted = config.limit.unwrap_or(manifest.case_count).max(1);
+    if manifest.case_count < cases_wanted {
+        return false;
+    }
+
+    // Both the corpus size and the paragraph total are floored at 1 before computing the target.
+    let corpus_wanted = config.corpus_limit.unwrap_or(manifest.total_paragraphs).max(1);
+    let paragraph_total = manifest.total_paragraphs.max(manifest.positive_paragraphs).max(1);
+    let negatives_wanted = desired_negative_target(
+        manifest.positive_paragraphs,
+        corpus_wanted,
+        paragraph_total,
+        config.negative_multiplier,
+    );
+    manifest.negative_paragraphs >= negatives_wanted
 }
 
 fn read_manifest(path: &Path) -> Result<SliceManifest> {
@@ -1057,14 +966,38 @@ fn write_manifest(path: &Path, manifest: &SliceManifest) -> Result<()> {
     Ok(())
 }
 
-use crate::args::Config;
-
-impl<'a> From<&'a Config> for SliceConfig<'a> {
-    fn from(config: &'a Config) -> Self {
-        slice_config_with_limit(config, None)
+/// Target ledger size: the larger of `slice_grow` and `limit`, or whichever one is set.
+pub fn ledger_target(config: &Config) -> Option<usize> {
+    match config.slice_grow {
+        Some(grow) => Some(config.limit.map_or(grow, |limit| limit.max(grow))),
+        None => config.limit,
     }
 }
 
+/// Grow the slice ledger to contain the target number of cases: resolves the
+/// slice with its limit overridden by [`ledger_target`], then logs and prints
+/// the resulting case, positive and negative counts.
+pub fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> {
+    let settings = slice_config_with_limit(config, ledger_target(config));
+    let resolved = resolve_slice(dataset, &settings).context("resolving dataset slice")?;
+    let ledger = &resolved.manifest;
+    info!(
+        slice = ledger.slice_id.as_str(),
+        cases = ledger.case_count,
+        positives = ledger.positive_paragraphs,
+        negatives = ledger.negative_paragraphs,
+        total_paragraphs = ledger.total_paragraphs,
+        "Slice ledger ready"
+    );
+    println!(
+        "Slice `{}` now contains {} questions ({} positives, {} negatives)",
+        ledger.slice_id,
+        ledger.case_count,
+        ledger.positive_paragraphs,
+        ledger.negative_paragraphs
+    );
+    Ok(())
+}
+
 pub fn slice_config_with_limit(config: &Config, limit_override: Option<usize>) -> SliceConfig<'_> {
     SliceConfig {
         cache_dir: config.cache_dir.as_path(),
@@ -1088,7 +1021,7 @@ mod tests {
     use tempfile::tempdir;
 
     fn sample_dataset() -> ConvertedDataset {
-        let metadata = DatasetMetadata::for_kind(DatasetKind::SquadV2, false, None);
+        let metadata = DatasetMetadata::for_kind(DatasetKind::SquadV2, false);
         ConvertedDataset {
             generated_at: Utc::now(),
             metadata,
@@ -1226,7 +1159,7 @@ mod tests {
             }
         }
 
-        let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, false, None);
+        let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, false);
         let dataset = ConvertedDataset {
             generated_at: Utc::now(),
             metadata,
@@ -1240,11 +1173,11 @@ mod tests {
             rng_seed: 0xBB,
         };
 
-        let refs = ordered_question_refs_beir(&dataset, &params, 8)?;
+        let refs = beir::ordered_question_refs_beir(&dataset, &params, 8)?;
         let mut per_prefix: HashMap<String, usize> = HashMap::new();
         for (p_idx, q_idx) in refs {
             let question = &dataset.paragraphs[p_idx].questions[q_idx];
-            let prefix = question_prefix(&question.id).unwrap_or("unknown");
+            let prefix = beir::question_prefix(&question.id).unwrap_or("unknown");
             *per_prefix.entry(prefix.to_string()).or_default() += 1;
         }
 
diff --git a/evaluations/src/snapshot.rs b/evaluations/src/snapshot.rs
deleted file mode 100644
index 5b1a827..0000000
--- a/evaluations/src/snapshot.rs
+++ /dev/null
@@ -1,179 +0,0 @@
-use std::path::PathBuf;
-
-use anyhow::{Context, Result};
-use chrono::{DateTime, Utc};
-use serde::{Deserialize, Serialize};
-use sha2::{Digest, Sha256};
-use tokio::fs;
-
-use crate::{args::Config, slice};
-use common::utils::embedding::EmbeddingProvider;
-
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct SnapshotMetadata {
-    pub dataset_id: String,
-    pub slice_id: String,
-    pub embedding_backend: String,
-    pub embedding_model: Option<String>,
-    pub embedding_dimension: usize,
-    pub rerank_enabled: bool,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DbSnapshotState {
-    pub dataset_id: String,
-    pub slice_id: String,
-    pub ingestion_fingerprint: String,
-    pub snapshot_hash: String,
-    pub updated_at: DateTime<Utc>,
-    #[serde(default)]
-    pub namespace: Option<String>,
-    #[serde(default)]
-    pub database: Option<String>,
-    #[serde(default)]
-    pub slice_case_count: usize,
-}
-
-pub struct Descriptor {
-    #[allow(dead_code)]
-    metadata: SnapshotMetadata,
-    dir: PathBuf,
-    metadata_hash: String,
-}
-
-impl Descriptor {
-    pub fn new(
-        config: &Config,
-        slice: &slice::ResolvedSlice<'_>,
-        embedding_provider: &EmbeddingProvider,
-    ) -> Self {
-        let metadata = SnapshotMetadata {
-            dataset_id: slice.manifest.dataset_id.clone(),
-            slice_id: slice.manifest.slice_id.clone(),
-            embedding_backend: embedding_provider.backend_label().to_string(),
-            embedding_model: embedding_provider.model_code(),
-            embedding_dimension: embedding_provider.dimension(),
-            rerank_enabled: config.retrieval.rerank,
-        };
-
-        let dir = config
-            .cache_dir
-            .join("snapshots")
-            .join(&metadata.dataset_id)
-            .join(&metadata.slice_id);
-        let metadata_hash = compute_hash(&metadata);
-
-        Self {
-            metadata,
-            dir,
-            metadata_hash,
-        }
-    }
-
-    pub fn metadata_hash(&self) -> &str {
-        &self.metadata_hash
-    }
-
-    pub async fn load_db_state(&self) -> Result<Option<DbSnapshotState>> {
-        let path = self.db_state_path();
-        if !path.exists() {
-            return Ok(None);
-        }
-        let bytes = fs::read(&path)
-            .await
-            .with_context(|| format!("reading namespace state {}", path.display()))?;
-        let state = serde_json::from_slice(&bytes)
-            .with_context(|| format!("deserialising namespace state {}", path.display()))?;
-        Ok(Some(state))
-    }
-
-    pub async fn store_db_state(&self, state: &DbSnapshotState) -> Result<()> {
-        let path = self.db_state_path();
-        if let Some(parent) = path.parent() {
-            fs::create_dir_all(parent).await.with_context(|| {
-                format!("creating namespace state directory {}", parent.display())
-            })?;
-        }
-        let blob =
-            serde_json::to_vec_pretty(state).context("serialising namespace state payload")?;
-        fs::write(&path, blob)
-            .await
-            .with_context(|| format!("writing namespace state {}", path.display()))?;
-        Ok(())
-    }
-
-    fn db_dir(&self) -> PathBuf {
-        self.dir.join("db")
-    }
-
-    fn db_state_path(&self) -> PathBuf {
-        self.db_dir().join("state.json")
-    }
-
-    #[cfg(test)]
-    pub fn from_parts(metadata: SnapshotMetadata, dir: PathBuf) -> Self {
-        let metadata_hash = compute_hash(&metadata);
-        Self {
-            metadata,
-            dir,
-            metadata_hash,
-        }
-    }
-}
-
-#[allow(clippy::expect_used)]
-fn compute_hash(metadata: &SnapshotMetadata) -> String {
-    let mut hasher = Sha256::new();
-    hasher.update(
-        serde_json::to_vec(metadata).expect("snapshot metadata serialisation should succeed"),
-    );
-    format!("{:x}", hasher.finalize())
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[tokio::test]
-    #[allow(clippy::unwrap_used, clippy::expect_used)]
-    async fn state_round_trip() {
-        let temp_dir = tempfile::tempdir().unwrap();
-        let metadata = SnapshotMetadata {
-            dataset_id: "dataset".into(),
-            slice_id: "slice".into(),
-            embedding_backend: "hashed".into(),
-            embedding_model: None,
-            embedding_dimension: 128,
-            rerank_enabled: true,
-        };
-        let descriptor = Descriptor::from_parts(
-            metadata,
-            temp_dir
-                .path()
-                .join("snapshots")
-                .join("dataset")
-                .join("slice"),
-        );
-
-        let state = DbSnapshotState {
-            dataset_id: "dataset".into(),
-            slice_id: "slice".into(),
-            ingestion_fingerprint: "fingerprint".into(),
-            snapshot_hash: descriptor.metadata_hash().to_string(),
-            updated_at: Utc::now(),
-            namespace: Some("ns".into()),
-            database: Some("db".into()),
-            slice_case_count: 42,
-        };
-        descriptor.store_db_state(&state).await.unwrap();
-
-        let loaded = descriptor.load_db_state().await.unwrap().unwrap();
-        assert_eq!(loaded.dataset_id, state.dataset_id);
-        assert_eq!(loaded.slice_id, state.slice_id);
-        assert_eq!(loaded.ingestion_fingerprint, state.ingestion_fingerprint);
-        assert_eq!(loaded.snapshot_hash, state.snapshot_hash);
-        assert_eq!(loaded.namespace, state.namespace);
-        assert_eq!(loaded.database, state.database);
-        assert_eq!(loaded.slice_case_count, state.slice_case_count);
-    }
-}
diff --git a/evaluations/src/types.rs b/evaluations/src/types.rs
index 0f4950f..cb99e61 100644
--- a/evaluations/src/types.rs
+++ b/evaluations/src/types.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 
-use chrono::{DateTime, Utc};
+use chrono::{DateTime, SecondsFormat, Utc};
 use common::storage::types::StoredObject;
 use retrieval_pipeline::{
     Diagnostics, RetrievalOutput, RetrievedChunk, RetrievedEntity, StageKind, StageTimings,
@@ -8,6 +8,8 @@ use retrieval_pipeline::{
 use serde::{Deserialize, Serialize};
 use unicode_normalization::UnicodeNormalization;
 
+pub use crate::context_stats::{RetrievalContextStats, RetrievedContextStats};
+
 #[allow(clippy::struct_excessive_bools)]
 #[derive(Debug, Serialize)]
 pub struct EvaluationSummary {
@@ -83,6 +85,7 @@ pub struct EvaluationSummary {
     pub chunk_vector_take: usize,
     pub chunk_fts_take: usize,
     pub max_chunks_per_entity: usize,
+    pub retrieved_context: RetrievalContextStats,
     pub cases: Vec<CaseSummary>,
 }
 
@@ -108,6 +111,7 @@ pub struct CaseSummary {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub ndcg: Option<f64>,
     pub latency_ms: u128,
+    pub retrieved_context: RetrievedContextStats,
     pub retrieved: Vec<RetrievedSummary>,
 }
 
@@ -483,3 +487,7 @@ pub fn build_case_diagnostics(
         pipeline: pipeline_stats,
     }
 }
+
+pub fn format_timestamp(timestamp: &DateTime<Utc>) -> String {
+    timestamp.to_rfc3339_opts(SecondsFormat::Secs, true) // RFC 3339, whole seconds, `Z` for UTC
+}
diff --git a/html-router/assets/style.css b/html-router/assets/style.css
index a9691cc..57fbde4 100644
--- a/html-router/assets/style.css
+++ b/html-router/assets/style.css
@@ -44,7 +44,6 @@
     --leading-snug: 1.375;
     --leading-relaxed: 1.625;
     --ease-out: cubic-bezier(0, 0, 0.2, 1);
-    --ease-in-out: cubic-bezier(0.4, 0, 0.2, 1);
     --animate-pulse: pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite;
     --default-transition-duration: 150ms;
     --default-transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);