From fb51a8b55f90f79046acfbe2876cedc86a6799df Mon Sep 17 00:00:00 2001 From: Per Stark Date: Wed, 17 Jun 2026 19:23:11 +0200 Subject: [PATCH] evals: eval crate overhaul, simplification and performance improvements --- .cargo/config.toml | 2 +- CHANGELOG.md | 1 + Cargo.lock | 94 ---- Cargo.toml | 2 +- devenv.nix | 2 + evaluations/Cargo.toml | 2 - evaluations/README.md | 252 +++------- evaluations/REFACTOR.md | 98 ++++ evaluations/manifest.yaml | 3 +- evaluations/src/args.rs | 84 +++- evaluations/src/cache.rs | 88 ---- evaluations/src/cases.rs | 1 + evaluations/src/cli/mod.rs | 3 + evaluations/src/cli/status.rs | 316 +++++++++++++ evaluations/src/context_stats.rs | 177 +++++++ evaluations/src/corpus/config.rs | 32 +- evaluations/src/corpus/mod.rs | 6 +- evaluations/src/corpus/orchestrator.rs | 58 +-- evaluations/src/corpus/store.rs | 13 +- evaluations/src/datasets/beir.rs | 132 +++++- evaluations/src/datasets/beir_mix.rs | 262 +++++++++++ evaluations/src/datasets/checksum.rs | 216 +++++++++ evaluations/src/datasets/loader.rs | 197 ++++++++ evaluations/src/datasets/mod.rs | 181 ++----- evaluations/src/datasets/nq.rs | 6 +- evaluations/src/datasets/store.rs | 410 ++++++++++++++++ .../src/{namespace.rs => db/connect.rs} | 105 +++-- .../src/{db_helpers.rs => db/lifecycle.rs} | 48 +- evaluations/src/db/mod.rs | 9 + evaluations/src/eval.rs | 128 ----- evaluations/src/inspection.rs | 70 +-- evaluations/src/main.rs | 95 ++-- evaluations/src/openai.rs | 20 +- evaluations/src/perf.rs | 15 +- evaluations/src/pipeline/context.rs | 28 +- evaluations/src/pipeline/diagnostics.rs | 20 + evaluations/src/pipeline/mod.rs | 53 ++- evaluations/src/pipeline/stages/finalize.rs | 21 +- evaluations/src/pipeline/stages/mod.rs | 11 - .../src/pipeline/stages/prepare_corpus.rs | 55 +-- evaluations/src/pipeline/stages/prepare_db.rs | 52 +- .../src/pipeline/stages/prepare_namespace.rs | 79 +--- .../src/pipeline/stages/prepare_slice.rs | 28 +- .../src/pipeline/stages/run_queries.rs | 29 +- 
evaluations/src/pipeline/stages/summarize.rs | 29 +- evaluations/src/pipeline/state.rs | 31 -- evaluations/src/report.rs | 293 ++++-------- evaluations/src/slice/beir.rs | 174 +++++++ evaluations/src/slice/build.rs | 19 + evaluations/src/{slice.rs => slice/mod.rs} | 443 ++++++++---------- evaluations/src/snapshot.rs | 179 ------- evaluations/src/types.rs | 10 +- html-router/assets/style.css | 1 - 53 files changed, 2852 insertions(+), 1831 deletions(-) create mode 100644 evaluations/REFACTOR.md delete mode 100644 evaluations/src/cache.rs create mode 100644 evaluations/src/cli/mod.rs create mode 100644 evaluations/src/cli/status.rs create mode 100644 evaluations/src/context_stats.rs create mode 100644 evaluations/src/datasets/beir_mix.rs create mode 100644 evaluations/src/datasets/checksum.rs create mode 100644 evaluations/src/datasets/loader.rs create mode 100644 evaluations/src/datasets/store.rs rename evaluations/src/{namespace.rs => db/connect.rs} (67%) rename evaluations/src/{db_helpers.rs => db/lifecycle.rs} (75%) create mode 100644 evaluations/src/db/mod.rs delete mode 100644 evaluations/src/eval.rs create mode 100644 evaluations/src/pipeline/diagnostics.rs delete mode 100644 evaluations/src/pipeline/state.rs create mode 100644 evaluations/src/slice/beir.rs create mode 100644 evaluations/src/slice/build.rs rename evaluations/src/{slice.rs => slice/mod.rs} (83%) delete mode 100644 evaluations/src/snapshot.rs diff --git a/.cargo/config.toml b/.cargo/config.toml index 61f4796..8522063 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,2 +1,2 @@ [alias] -eval = "run -p evaluations --" +eval = "run -p evaluations --release --" diff --git a/CHANGELOG.md b/CHANGELOG.md index 32a6e7a..6278ba5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ # Changelog ## Unreleased +- Evaluations: simplified crate layout — linear pipeline, sharded-only converted store, in-memory ingestion, `db/` and `cli/` modules; namespace reuse state in corpus manifest 
(removed `cache/snapshots/`); no legacy JSON/history compatibility (re-run `--warm` after upgrade) - Performance: ingestion skips per-task index rebuild; worker runs scheduled `REBUILD INDEX` (default every 24h via `index_rebuild_interval_secs`, `0` disables) - Performance: ingestion persists all artifacts in a single SurrealDB transaction per task (atomic replace by task id) - Performance: entity embeddings during ingestion use batched `embed_batch`, matching chunk embedding diff --git a/Cargo.lock b/Cargo.lock index 5c7a3ae..1c2c9be 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -165,12 +165,6 @@ dependencies = [ "libc", ] -[[package]] -name = "anes" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" - [[package]] name = "anstream" version = "0.6.21" @@ -1071,12 +1065,6 @@ dependencies = [ "serde", ] -[[package]] -name = "cast" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" - [[package]] name = "castaway" version = "0.2.4" @@ -1582,42 +1570,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "criterion" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" -dependencies = [ - "anes", - "cast", - "ciborium", - "clap", - "criterion-plot", - "is-terminal", - "itertools 0.10.5", - "num-traits", - "once_cell", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", -] - -[[package]] -name = "criterion-plot" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" -dependencies = [ - "cast", - "itertools 0.10.5", -] - [[package]] name = "critical-section" 
version = "1.2.0" @@ -2238,7 +2190,6 @@ dependencies = [ "chrono", "clap", "common", - "criterion", "fastembed", "futures", "ingestion-pipeline", @@ -2250,7 +2201,6 @@ dependencies = [ "serde_json", "serde_yaml", "sha2", - "state-machines", "surrealdb", "tempfile", "text-splitter", @@ -4438,12 +4388,6 @@ dependencies = [ "pkg-config", ] -[[package]] -name = "oorandom" -version = "11.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" - [[package]] name = "opaque-debug" version = "0.3.1" @@ -4836,34 +4780,6 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" -[[package]] -name = "plotters" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" -dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "plotters-backend" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" - -[[package]] -name = "plotters-svg" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" -dependencies = [ - "plotters-backend", -] - [[package]] name = "polling" version = "3.11.0" @@ -6940,16 +6856,6 @@ dependencies = [ "zerovec", ] -[[package]] -name = "tinytemplate" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" -dependencies = [ - "serde", - "serde_json", -] - [[package]] name = "tinyvec" version = "1.10.0" diff --git a/Cargo.toml b/Cargo.toml index e5be857..ca8c315 100644 
--- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ members = [ "json-stream-parser", "evaluations" ] -resolver = "2" +resolver = "3" [workspace.dependencies] anyhow = "1.0.94" diff --git a/devenv.nix b/devenv.nix index b5b49bf..5594b24 100644 --- a/devenv.nix +++ b/devenv.nix @@ -13,6 +13,8 @@ let else throw "pkgs.onnxruntime.version (${pkgs.onnxruntime.version}) must match ort-version (${ortVersion})"; in { + devenv.warnOnNewVersion = false; + cachix.enable = false; packages = [ diff --git a/evaluations/Cargo.toml b/evaluations/Cargo.toml index 34d36f7..b68e061 100644 --- a/evaluations/Cargo.toml +++ b/evaluations/Cargo.toml @@ -30,8 +30,6 @@ serde_json = { workspace = true } async-trait = { workspace = true } once_cell = "1.19" serde_yaml = "0.9" -criterion = "0.5" -state-machines = { workspace = true } clap = { version = "4.4", features = ["derive", "env"] } [dev-dependencies] diff --git a/evaluations/README.md b/evaluations/README.md index b150a0e..5df7b7d 100644 --- a/evaluations/README.md +++ b/evaluations/README.md @@ -1,212 +1,102 @@ # Evaluations -The `evaluations` crate provides a retrieval evaluation framework for benchmarking Minne's information retrieval pipeline against standard datasets. +The `evaluations` crate benchmarks Minne's retrieval pipeline against standard datasets. ## Quick Start ```bash -# Run SQuAD v2.0 evaluation (vector-only, recommended) -cargo run --package evaluations -- --ingest-chunks-only +# One-time prep (convert, slice ledger, corpus cache, DB seed) +cargo eval --warm --dataset beir --slice beir-mix-600 -# Run a specific dataset -cargo run --package evaluations -- --dataset fiqa --ingest-chunks-only +# Check readiness +cargo eval --status --dataset beir --slice beir-mix-600 -# Convert dataset only (no evaluation) -cargo run --package evaluations -- --convert-only +# Run benchmark (steady state after warm) +cargo eval --dataset beir --slice beir-mix-600 --require-ready ``` +Default dataset is `beir`. 
When `--slice` is omitted, the first catalog slice for the dataset is applied automatically (e.g. `beir-mix-600`). + +Chunk-only ingestion is the default. Pass `--include-entities` to opt into entity extraction during ingestion (requires `OPENAI_API_KEY`). + +### Custom slice sizes + +`--slice` is a ledger id, not only a catalog name. You can use any id; `--limit` controls how many questions the ledger contains: + +```bash +# 200-case BEIR mix (default --limit is 200) +cargo eval --warm --dataset beir --slice beir-mix-200 +cargo eval --dataset beir --slice beir-mix-200 --require-ready +``` + +The catalog slice `beir-mix-600` in `manifest.yaml` is a preset with `limit: 600` and `negative_multiplier: 9.0`. + +### BEIR mix layout + +`beir` is a **virtual mix** across eight subset datasets (FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID, SciFact, NQ-BEIR). There is no monolithic `beir-minne/` store. + +1. Build an in-memory qrels-world mix from raw subset data +2. Resolve the slice ledger (`cache/slices/beir/.json`) +3. Materialize only ledger paragraph ids into per-subset stores (`fever-minne/`, `fiqa-minne/`, …) +4. Ingest the slice corpus and seed SurrealDB + +Conversion is **qrels-closed**: only documents that appear in qrels are exported, not the full BEIR corpus. + +Chunk-only mode may evaluate fewer cases than the slice ledger size when some questions are impossible or lack verifiable answer chunks. + +Reports include a **Retrieved Context Volume** section: total characters and estimated tokens across all chunks returned per query (`~chars/4`, comparable across `--chunk-result-cap` sweeps). Use this to compare the cost of raising `--chunk-result-cap`. + ## Prerequisites -### 1. 
SurrealDB - -Start a SurrealDB instance before running evaluations: +### SurrealDB ```bash docker-compose up -d surrealdb ``` -Or using the default endpoint configuration: +### Raw datasets -```bash -surreal start --user root_user --pass root_password -``` +Place raw datasets under `evaluations/data/raw/`. See [manifest.yaml](./manifest.yaml) for paths. -### 2. Download Raw Datasets +BEIR subsets live in sibling directories (`data/raw/fever`, `data/raw/fiqa`, …). The `data/raw/beir` entry is a virtual catalog placeholder; warm uses the subset paths. -Raw datasets must be downloaded manually and placed in `evaluations/data/raw/`. See [Dataset Sources](#dataset-sources) below for links and formats. - -## Directory Structure +## Directory structure ``` evaluations/ ├── data/ -│ ├── raw/ # Downloaded raw datasets (manual) -│ │ ├── squad/ # SQuAD v2.0 -│ │ ├── nq-dev/ # Natural Questions -│ │ ├── fiqa/ # BEIR: FiQA-2018 -│ │ ├── fever/ # BEIR: FEVER -│ │ ├── hotpotqa/ # BEIR: HotpotQA -│ │ └── ... 
# Other BEIR subsets -│ └── converted/ # Auto-generated (Minne JSON format) -├── cache/ # Ingestion and embedding caches -├── reports/ # Evaluation output (JSON + Markdown) -├── manifest.yaml # Dataset and slice definitions -└── src/ # Evaluation source code +│ ├── raw/ # Downloaded datasets (manual) +│ │ ├── fever/ # BEIR subset raw dirs (corpus.jsonl, queries.jsonl, qrels/) +│ │ ├── fiqa/ +│ │ └── … +│ └── converted/ # Sharded stores (auto-generated) +│ ├── fever-minne/ # per-BEIR-subset stores +│ ├── fiqa-minne/ +│ └── … # BEIR mix loads from subset stores (no monolithic beir-minne/) +├── cache/ +│ ├── slices/ # Slice ledgers +│ └── ingested/ # Corpus ingestion caches (manifest includes namespace seed) +├── reports/ # JSON + Markdown output from benchmark runs +├── manifest.yaml +└── src/ ``` -## Dataset Sources +**After upgrading:** delete old monolithic `*-minne.json` files, any legacy `beir-minne/` merged store, `cache/snapshots/` directories, and stale `reports/history/` artifacts, then re-run `--warm`. -### SQuAD v2.0 - -Download and place at `data/raw/squad/dev-v2.0.json`: - -```bash -mkdir -p evaluations/data/raw/squad -curl -L https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json \ - -o evaluations/data/raw/squad/dev-v2.0.json -``` - -### Natural Questions (NQ) - -Download and place at `data/raw/nq-dev/dev-all.jsonl`: - -```bash -mkdir -p evaluations/data/raw/nq-dev -# Download from Google's Natural Questions page or HuggingFace -# File: dev-all.jsonl (simplified JSONL format) -``` - -Source: [Google Natural Questions](https://ai.google.com/research/NaturalQuestions) - -### BEIR Datasets - -All BEIR datasets follow the same format structure: - -``` -data/raw// -├── corpus.jsonl # Document corpus -├── queries.jsonl # Query set -└── qrels/ - └── test.tsv # Relevance judgments (or dev.tsv) -``` - -Download datasets from the [BEIR Benchmark repository](https://github.com/beir-cellar/beir). 
Each dataset zip extracts to the required directory structure. - -| Dataset | Directory | -|------------|---------------| -| FEVER | `fever/` | -| FiQA-2018 | `fiqa/` | -| HotpotQA | `hotpotqa/` | -| NFCorpus | `nfcorpus/` | -| Quora | `quora/` | -| TREC-COVID | `trec-covid/` | -| SciFact | `scifact/` | -| NQ (BEIR) | `nq/` | - -Example download: - -```bash -cd evaluations/data/raw -curl -L https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip -o fiqa.zip -unzip fiqa.zip && rm fiqa.zip -``` - -## Dataset Conversion - -Raw datasets are automatically converted to Minne's internal JSON format on first run. To force reconversion: - -```bash -cargo run --package evaluations -- --force-convert -``` - -Converted files are saved to `data/converted/` and cached for subsequent runs. - -## CLI Reference - -### Common Options +## Common flags | Flag | Description | Default | |------|-------------|---------| -| `--dataset ` | Dataset to evaluate | `squad-v2` | -| `--limit ` | Max questions to evaluate (0 = all) | `200` | -| `--k ` | Precision@k cutoff | `5` | -| `--slice ` | Use a predefined slice from manifest | — | -| `--rerank` | Enable FastEmbed reranking stage | disabled | -| `--embedding-backend ` | `fastembed` or `hashed` | `fastembed` | -| `--ingest-chunks-only` | Skip entity extraction, ingest only text chunks | disabled | +| `--dataset` | Dataset to evaluate | `beir` | +| `--slice` | Slice ledger id (catalog or custom) | first catalog slice | +| `--limit` | Max questions in the slice ledger | `200` | +| `--warm` | Prepare without running queries | — | +| `--status` | Print readiness | — | +| `--require-ready` | Fail if not warmed | — | +| `--include-entities` | Entity extraction during ingestion | off | +| `--force-convert` | Rebuild converted store | — | +| `--chunk-result-cap` | Max chunks returned per query (raise with `--k`) | `5` | +| `--perf-log-console` | Print per-stage timings after a run | off | +| `--label` | Label stored in 
JSON/Markdown reports | — | -> [!TIP] -> Use `--ingest-chunks-only` when evaluating vector-only retrieval strategies. This skips the LLM-based entity extraction and graph generation, significantly speeding up ingestion while focusing on pure chunk-based vector search. - -### Available Datasets - -``` -squad-v2, natural-questions, beir, fever, fiqa, hotpotqa, -nfcorpus, quora, trec-covid, scifact, nq-beir -``` - -### Database Configuration - -| Flag | Environment | Default | -|------|-------------|---------| -| `--db-endpoint` | `EVAL_DB_ENDPOINT` | `ws://127.0.0.1:8000` | -| `--db-username` | `EVAL_DB_USERNAME` | `root_user` | -| `--db-password` | `EVAL_DB_PASSWORD` | `root_password` | -| `--db-namespace` | `EVAL_DB_NAMESPACE` | auto-generated | -| `--db-database` | `EVAL_DB_DATABASE` | auto-generated | - -### Example Runs - -```bash -# Vector-only evaluation (recommended for benchmarking) -cargo run --package evaluations -- \ - --dataset fiqa \ - --ingest-chunks-only \ - --limit 200 - -# Full FiQA evaluation with reranking -cargo run --package evaluations -- \ - --dataset fiqa \ - --ingest-chunks-only \ - --limit 500 \ - --rerank \ - --k 10 - -# Use a predefined slice for reproducibility -cargo run --package evaluations -- --slice fiqa-test-200 --ingest-chunks-only - -# Run the mixed BEIR benchmark -cargo run --package evaluations -- --dataset beir --slice beir-mix-600 --ingest-chunks-only -``` - -## Slices - -Slices are predefined, reproducible subsets defined in `manifest.yaml`. Each slice specifies: - -- **limit**: Number of questions -- **corpus_limit**: Maximum corpus size -- **seed**: Fixed RNG seed for reproducibility - -View available slices in [manifest.yaml](./manifest.yaml). 
- -## Reports - -Evaluations generate reports in `reports/`: - -- **JSON**: Full structured results (`*-report.json`) -- **Markdown**: Human-readable summary with sample mismatches (`*-report.md`) -- **History**: Timestamped run history (`history/`) - -## Performance Tuning - -```bash -# Log per-stage performance timings -cargo run --package evaluations -- --perf-log-console - -# Save telemetry to file -cargo run --package evaluations -- --perf-log-json ./perf.json -``` - -## License - -See [../LICENSE](../LICENSE). +See [REFACTOR.md](./REFACTOR.md) for architecture notes. diff --git a/evaluations/REFACTOR.md b/evaluations/REFACTOR.md new file mode 100644 index 0000000..1b61575 --- /dev/null +++ b/evaluations/REFACTOR.md @@ -0,0 +1,98 @@ +# Evaluations crate refactor plan + +This document records the architecture review and the simplification work applied to the +`evaluations` crate. **No backwards compatibility** is maintained for converted JSON layouts, +legacy report history, or old cache artifact formats. + +## Goals + +- Smaller, linear pipeline (no state machine ceremony) +- Sharded converted store for **all** datasets (memory-efficient partial loading) +- Slice-first loading when a catalog slice is selected +- In-memory SurrealDB for ingestion (no ephemeral server namespaces) +- Single DB lifecycle module (`db/`) +- CLI helpers under `cli/` + +## Primary workflow + +```bash +# One-time prep (converts raw data if needed, builds slice ledger, corpus cache, DB seed) +cargo eval --warm --dataset beir --slice beir-mix-600 + +# Check readiness +cargo eval --status --dataset beir --slice beir-mix-600 + +# Steady-state benchmark +cargo eval --dataset beir --slice beir-mix-600 --require-ready +``` + +Default dataset is `beir`. Chunk-only ingestion is the default; pass `--include-entities` to +opt into entity extraction (requires `OPENAI_API_KEY`). Slice tuning such as +`negative_multiplier` lives in `manifest.yaml` (e.g. `beir-mix-600` uses `9.0`). 
+ +## Cache layers (after refactor) + +| Layer | Location | Purpose | +|-------|----------|---------| +| Converted store | `data/converted//` | Sharded paragraphs + question catalog | +| Slice ledger | `cache/slices//.json` | Deterministic questions + paragraph set | +| Corpus cache | `cache/ingested///` | Ingestion paragraph shards, manifest, and namespace reuse seed | + +Namespace reuse state lives in the corpus manifest (`metadata.namespace_seed`), not a separate +`snapshots/` tree. After upgrading, delete old `*-minne.json` monolithic files, any +`cache/snapshots/` directories, and re-run `--warm`. + +## Phases applied + +### Phase 0 — dead code + +- Removed unused `criterion` dependency +- Removed unused `EmbeddingCache` +- Updated README for current CLI + +### Phase 1 — structure + +- Flattened pipeline to linear `async fn` stages +- Removed `eval.rs` hub; imports go to owning modules +- Merged `namespace.rs`, `db_helpers.rs` → `db/`; dropped standalone `snapshot.rs` +- Moved `status.rs` → `cli/status.rs` +- Fixed catalog slice bootstrap (build ledger when explicit slice manifest is missing) + +### Phase 2 — no legacy paths + +- All datasets use sharded converted store only +- Removed legacy JSON layout and migration +- Removed legacy report history format +- Auto-apply first catalog slice when `--slice` omitted +- Namespace seed folded into corpus manifest (removed `cache/snapshots/`) + +### Phase 3 — performance + +- Ingestion always uses in-memory SurrealDB +- Slice-first partial load when ledger is complete +- Default catalog slice for dataset when `--slice` not passed +- Split `slice/` into `mod.rs`, `build.rs`, and `beir.rs` + +### Phase 4 — BEIR mix slice-first + +- `beir` is a virtual mix: slice ledger references prefixed ids (`fever-…`, `fiqa-…`, …) +- Conversion is **qrels-closed** per subset (only documents appearing in qrels, not full corpus) +- Slice ledger is resolved for the requested `--slice` (catalog preset or custom id + `--limit`) +- Only 
ledger paragraph ids are materialized into per-subset stores (`fever-minne/`, `fiqa-minne/`, …) +- No monolithic `beir-minne/` merged store +- Raw BEIR data lives in per-subset dirs under `data/raw/`; `data/raw/beir` is a catalog placeholder + +## Do not re-introduce + +- Monolithic `*-minne.json` converted files +- Monolithic `beir-minne/` merged converted store (use per-subset stores + virtual mix loader) +- `state-machines` pipeline for this linear flow +- `eval.rs` re-export hub +- Legacy history migration in reports +- Ephemeral `ingest_eval_*` namespaces on the shared SurrealDB server +- Separate `cache/snapshots/` namespace state files + +## Open follow-ups + +- Generate `DatasetKind` from `manifest.yaml` at build time +- Split `report.rs` when touching reporting again diff --git a/evaluations/manifest.yaml b/evaluations/manifest.yaml index 28c32ed..55a62f4 100644 --- a/evaluations/manifest.yaml +++ b/evaluations/manifest.yaml @@ -1,4 +1,4 @@ -default_dataset: squad-v2 +default_dataset: beir datasets: - id: squad-v2 label: "SQuAD v2.0" @@ -45,6 +45,7 @@ datasets: description: "Balanced slice across FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID, SciFact, NQ-BEIR" limit: 600 corpus_limit: 6000 + negative_multiplier: 9.0 seed: 0x5eed2025 - id: fever label: "FEVER (BEIR)" diff --git a/evaluations/src/args.rs b/evaluations/src/args.rs index 1600305..b7d7f7d 100644 --- a/evaluations/src/args.rs +++ b/evaluations/src/args.rs @@ -137,9 +137,9 @@ pub struct IngestConfig { #[arg(long, default_value_t = 50)] pub ingest_chunk_overlap_tokens: usize, - /// Run ingestion in chunk-only mode (skip analyzer/graph generation) + /// Include entity extraction and graph generation during ingestion (uses LLM tokens) #[arg(long)] - pub ingest_chunks_only: bool, + pub include_entities: bool, /// Number of paragraphs to ingest concurrently #[arg(long, default_value_t = 10)] @@ -159,6 +159,7 @@ pub struct IngestConfig { } #[derive(Debug, Clone, Args)] 
+#[allow(clippy::struct_field_names)] pub struct DatabaseArgs { /// `SurrealDB` server endpoint #[arg(long, default_value = "ws://127.0.0.1:8000", env = "EVAL_DB_ENDPOINT")] @@ -179,10 +180,6 @@ pub struct DatabaseArgs { /// Override the database used on the `SurrealDB` server #[arg(long, env = "EVAL_DB_DATABASE")] pub db_database: Option, - - /// Path to inspect DB state - #[arg(long)] - pub inspect_db_state: Option, } #[derive(Parser, Debug, Clone)] @@ -233,10 +230,6 @@ pub struct Config { #[arg(long, default_value_t = 5)] pub sample: usize, - /// Disable context cropping when converting datasets (ingest entire documents) - #[arg(long)] - pub full_context: bool, - #[command(flatten)] pub retrieval: RetrievalSettings, @@ -322,6 +315,18 @@ pub struct Config { #[command(flatten)] pub database: DatabaseArgs, + /// Require warmed corpus/namespace before running queries + #[arg(long)] + pub require_ready: bool, + + /// Prepare converted data, slice, corpus, and namespace without running queries + #[arg(long, conflicts_with = "status")] + pub warm: bool, + + /// Print readiness of converted data, slice, corpus, and namespace + #[arg(long, conflicts_with = "warm")] + pub status: bool, + // Computed fields (not arguments) #[arg(skip)] pub raw_dataset_path: PathBuf, @@ -334,11 +339,6 @@ pub struct Config { } impl Config { - #[allow(clippy::unused_self)] - pub fn context_token_limit(&self) -> Option { - None - } - #[allow(clippy::too_many_lines)] pub fn finalize(&mut self) -> Result<()> { // Handle dataset paths @@ -367,9 +367,7 @@ impl Config { // Handle retrieval settings self.retrieval.require_verified_chunks = !self.llm_mode; - if self.dataset == DatasetKind::Beir { - self.negative_multiplier = 9.0; - } + self.apply_catalog_slice_defaults()?; // Validations if self.ingest.ingest_chunk_min_tokens == 0 @@ -477,6 +475,56 @@ impl Config { Ok(()) } + + fn apply_catalog_slice_defaults(&mut self) -> Result<()> { + let catalog = crate::datasets::catalog()?; + let entry = 
catalog.dataset(self.dataset.id())?; + + if self.slice.is_none() { + if let Some(default_slice) = entry.slices.first() { + self.slice = Some(default_slice.id.clone()); + } + } + + let Some(slice_id) = self.slice.as_deref() else { + return Ok(()); + }; + + let Ok((_, slice)) = catalog.slice(slice_id) else { + return Ok(()); + }; + + if slice.dataset_id != self.dataset.id() { + return Ok(()); + } + + if let Some(limit) = slice.limit { + if self.limit_arg == 200 { + self.limit_arg = limit; + self.limit = Some(limit); + } + } + if self.corpus_limit.is_none() { + self.corpus_limit = slice.corpus_limit; + } + if let Some(seed) = slice.seed { + self.slice_seed = seed; + } + if let Some(include_unanswerable) = slice.include_unanswerable { + self.llm_mode = include_unanswerable; + self.retrieval.require_verified_chunks = !include_unanswerable; + } + if let Some(multiplier) = slice.negative_multiplier { + if negative_multiplier_is_default(self.negative_multiplier) { + self.negative_multiplier = multiplier; + } + } + Ok(()) + } +} + +fn negative_multiplier_is_default(value: f32) -> bool { + (value - crate::slice::DEFAULT_NEGATIVE_MULTIPLIER).abs() < f32::EPSILON } pub struct ParsedArgs { diff --git a/evaluations/src/cache.rs b/evaluations/src/cache.rs deleted file mode 100644 index 31a8594..0000000 --- a/evaluations/src/cache.rs +++ /dev/null @@ -1,88 +0,0 @@ -use std::{ - collections::HashMap, - path::Path, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, -}; - -use anyhow::{Context, Result}; -use serde::{Deserialize, Serialize}; -use tokio::sync::Mutex; - -#[derive(Debug, Default, Serialize, Deserialize)] -struct EmbeddingCacheData { - entities: HashMap>, - chunks: HashMap>, -} - -#[derive(Clone)] -pub struct EmbeddingCache { - path: Arc, - data: Arc>, - dirty: Arc, -} - -#[allow(dead_code)] -impl EmbeddingCache { - pub async fn load(path: impl AsRef) -> Result { - let path = path.as_ref().to_path_buf(); - let data = if path.exists() { - let raw = 
tokio::fs::read(&path) - .await - .with_context(|| format!("reading embedding cache {}", path.display()))?; - serde_json::from_slice(&raw) - .with_context(|| format!("parsing embedding cache {}", path.display()))? - } else { - EmbeddingCacheData::default() - }; - - Ok(Self { - path: Arc::from(path.as_path()), - data: Arc::new(Mutex::new(data)), - dirty: Arc::new(AtomicBool::new(false)), - }) - } - - pub async fn get_entity(&self, id: &str) -> Option> { - let guard = self.data.lock().await; - guard.entities.get(id).cloned() - } - - pub async fn insert_entity(&self, id: String, embedding: Vec) { - let mut guard = self.data.lock().await; - guard.entities.insert(id, embedding); - self.dirty.store(true, Ordering::Relaxed); - } - - pub async fn get_chunk(&self, id: &str) -> Option> { - let guard = self.data.lock().await; - guard.chunks.get(id).cloned() - } - - pub async fn insert_chunk(&self, id: String, embedding: Vec) { - let mut guard = self.data.lock().await; - guard.chunks.insert(id, embedding); - self.dirty.store(true, Ordering::Relaxed); - } - - pub async fn persist(&self) -> Result<()> { - if !self.dirty.load(Ordering::Relaxed) { - return Ok(()); - } - - let guard = self.data.lock().await; - let body = serde_json::to_vec_pretty(&*guard).context("serialising embedding cache")?; - if let Some(parent) = self.path.parent() { - tokio::fs::create_dir_all(parent) - .await - .with_context(|| format!("creating cache directory {}", parent.display()))?; - } - tokio::fs::write(&*self.path, body) - .await - .with_context(|| format!("writing embedding cache {}", self.path.display()))?; - self.dirty.store(false, Ordering::Relaxed); - Ok(()) - } -} diff --git a/evaluations/src/cases.rs b/evaluations/src/cases.rs index 1b20a39..c1fe2f0 100644 --- a/evaluations/src/cases.rs +++ b/evaluations/src/cases.rs @@ -156,6 +156,7 @@ mod tests { chunk_min_tokens: 1, chunk_max_tokens: 10, chunk_only: false, + namespace_seed: None, }, paragraphs, questions, diff --git 
a/evaluations/src/cli/mod.rs b/evaluations/src/cli/mod.rs new file mode 100644 index 0000000..0bac432 --- /dev/null +++ b/evaluations/src/cli/mod.rs @@ -0,0 +1,3 @@ +pub mod status; + +pub use status::{collect_status, ensure_query_ready, print_status, warm}; diff --git a/evaluations/src/cli/status.rs b/evaluations/src/cli/status.rs new file mode 100644 index 0000000..b65a5f5 --- /dev/null +++ b/evaluations/src/cli/status.rs @@ -0,0 +1,316 @@ +#![allow(clippy::module_name_repetitions)] + +use std::path::Path; + +use anyhow::{Context, Result}; +use serde::Serialize; + +use crate::{ + args::Config, + corpus::{self, CorpusCacheConfig}, + datasets::{ + beir_subset_store_summary, beir_subset_stores_ready, content_checksum_for_layout, + detect_layout, mix_content_checksum, store_dir_for, ConvertedLayout, DatasetKind, + }, + db::{connect_eval_db, default_database, default_namespace, namespace_has_corpus}, + slice::{self, ledger_target}, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct EvalStatus { + pub dataset: String, + pub slice: Option, + pub converted: ConvertedStatus, + pub slice_ledger: SliceLedgerStatus, + pub corpus_cache: CorpusCacheStatus, + pub namespace: NamespaceStatus, + pub query_ready: bool, + pub notes: Vec, +} + +#[derive(Debug, Clone, Serialize)] +pub struct ConvertedStatus { + pub layout: String, + pub path: String, + pub ready: bool, + pub partial_load_eligible: bool, + pub checksum: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub struct SliceLedgerStatus { + pub ready: bool, + pub path: Option, + pub cases: Option, + pub positives: Option, + pub negatives: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub struct CorpusCacheStatus { + pub ready: bool, + pub path: Option, + pub manifest_present: bool, +} + +#[derive(Debug, Clone, Serialize)] +pub struct NamespaceStatus { + pub namespace: String, + pub database: String, + pub seeded: bool, + pub namespace_seed_recorded: bool, +} + +#[allow(clippy::too_many_lines)] +pub async fn 
collect_status(config: &Config) -> Result { + let mut notes = Vec::new(); + let is_beir_mix = config.dataset == DatasetKind::Beir; + let converted_path = &config.converted_dataset_path; + let layout = if is_beir_mix { + ConvertedLayout::Missing + } else { + detect_layout(converted_path) + }; + let layout_label = if is_beir_mix { + "beir-mix-subset-stores" + } else { + match layout { + ConvertedLayout::ShardedStore => "sharded-store", + ConvertedLayout::Missing => "missing", + } + }; + + let store_dir = store_dir_for(converted_path); + let display_path = if is_beir_mix { + beir_subset_store_summary()? + .into_iter() + .map(|(subset, paragraphs, questions)| { + format!("{subset}-minne ({paragraphs} paragraphs, {questions} questions)") + }) + .collect::>() + .join("; ") + } else { + store_dir.display().to_string() + }; + + let manifest_path = slice::cached_manifest_path(config); + let slice_config = slice::slice_config_with_limit(config, ledger_target(config)); + let slice_manifest = manifest_path + .as_ref() + .and_then(|path| slice::read_manifest_if_exists(path).ok().flatten()); + + let slice_ledger = SliceLedgerStatus { + ready: slice_manifest + .as_ref() + .is_some_and(|manifest| slice::manifest_is_complete(manifest, &slice_config)), + path: manifest_path.as_ref().map(|path| path.display().to_string()), + cases: slice_manifest.as_ref().map(|manifest| manifest.case_count), + positives: slice_manifest.as_ref().map(|manifest| manifest.positive_paragraphs), + negatives: slice_manifest.as_ref().map(|manifest| manifest.negative_paragraphs), + }; + + let beir_paragraph_ids = slice_manifest.as_ref().map(|manifest| { + manifest + .paragraphs + .iter() + .map(|entry| entry.id.clone()) + .collect::>() + }); + + let converted_ready = if is_beir_mix { + slice_ledger.ready + && beir_paragraph_ids + .as_ref() + .is_some_and(|ids| beir_subset_stores_ready(ids).unwrap_or(false)) + } else { + layout == ConvertedLayout::ShardedStore + }; + + let checksum = if is_beir_mix { + 
beir_paragraph_ids + .as_ref() + .and_then(|ids| mix_content_checksum(ids).ok()) + } else if layout == ConvertedLayout::ShardedStore { + content_checksum_for_layout(converted_path).ok() + } else { + None + }; + + let partial_load_eligible = slice_ledger.ready && config.slice.is_some(); + + let corpus_cache = if let Some(manifest) = slice_manifest.as_ref() { + let cache_settings = CorpusCacheConfig::from(config); + let base_dir = corpus::cached_corpus_dir( + &cache_settings, + config.dataset.id(), + manifest.slice_id.as_str(), + ); + let manifest_present = corpus::load_cached_manifest(&base_dir)?.is_some(); + CorpusCacheStatus { + ready: manifest_present, + path: Some(base_dir.display().to_string()), + manifest_present, + } + } else { + CorpusCacheStatus { + ready: false, + path: None, + manifest_present: false, + } + }; + + let namespace = config + .database + .db_namespace + .clone() + .unwrap_or_else(|| { + default_namespace( + config.dataset.id(), + config.limit, + config.slice.as_deref(), + ) + }); + let database = config + .database + .db_database + .clone() + .unwrap_or_else(default_database); + + let namespace_seed = corpus_cache.path.as_ref().and_then(|path| { + corpus::load_cached_manifest(Path::new(path)) + .ok() + .flatten() + .and_then(|manifest| manifest.metadata.namespace_seed) + }); + + let (seeded, namespace_seed_recorded) = match connect_eval_db(config, &namespace, &database).await { + Ok(db) => { + let has_corpus = namespace_has_corpus(&db).await.unwrap_or(false); + (has_corpus, namespace_seed.is_some()) + } + Err(err) => { + notes.push(format!("SurrealDB unavailable: {err}")); + (false, false) + } + }; + + let query_ready = converted_ready + && slice_ledger.ready + && corpus_cache.ready + && seeded + && namespace_seed_recorded; + + if !query_ready { + notes.push("Run `cargo eval --warm --slice ` to prepare corpus and namespace.".into()); + } + + Ok(EvalStatus { + dataset: config.dataset.id().to_string(), + slice: config.slice.clone(), + 
converted: ConvertedStatus { + layout: layout_label.to_string(), + path: display_path, + ready: converted_ready, + partial_load_eligible, + checksum, + }, + slice_ledger, + corpus_cache, + namespace: NamespaceStatus { + namespace, + database, + seeded, + namespace_seed_recorded, + }, + query_ready, + notes, + }) +} + +pub fn print_status(status: &EvalStatus) { + println!("Evaluation status for dataset `{}`", status.dataset); + if let Some(slice) = &status.slice { + println!("Slice: {slice}"); + } + println!( + "Converted: {} ({})", + if status.converted.ready { + "ready" + } else { + "missing" + }, + status.converted.layout + ); + println!("Converted path: {}", status.converted.path); + if status.converted.partial_load_eligible { + println!("Slice-first loading: eligible"); + } + println!( + "Slice ledger: {}", + if status.slice_ledger.ready { + format!( + "ready ({} cases, {} positives, {} negatives)", + status.slice_ledger.cases.unwrap_or(0), + status.slice_ledger.positives.unwrap_or(0), + status.slice_ledger.negatives.unwrap_or(0) + ) + } else { + "missing or incomplete".to_string() + } + ); + if let Some(path) = &status.slice_ledger.path { + println!("Slice ledger path: {path}"); + } + println!( + "Corpus cache: {}", + if status.corpus_cache.ready { + "ready" + } else { + "missing" + } + ); + if let Some(path) = &status.corpus_cache.path { + println!("Corpus cache path: {path}"); + } + println!( + "Namespace `{}` / `{}`: seeded={}, namespace_seed_recorded={}", + status.namespace.namespace, + status.namespace.database, + status.namespace.seeded, + status.namespace.namespace_seed_recorded + ); + println!( + "Query-ready: {}", + if status.query_ready { + "yes" + } else { + "no" + } + ); + for note in &status.notes { + println!("Note: {note}"); + } +} + +pub async fn warm(config: &Config) -> Result<()> { + let loaded = + crate::datasets::prepare_dataset(config.dataset, config).context("preparing dataset")?; + crate::pipeline::warm_evaluation(&loaded.dataset, 
config, &loaded.content_checksum) + .await + .context("warming evaluation corpus and namespace")?; + let status = collect_status(config).await?; + print_status(&status); + Ok(()) +} + +pub async fn ensure_query_ready(config: &Config) -> Result<()> { + let status = collect_status(config).await?; + if status.query_ready { + return Ok(()); + } + print_status(&status); + anyhow::bail!( + "evaluation is not query-ready; run `cargo eval --warm --slice {}` first", + config.slice.as_deref().unwrap_or("") + ); +} diff --git a/evaluations/src/context_stats.rs b/evaluations/src/context_stats.rs new file mode 100644 index 0000000..ffcd425 --- /dev/null +++ b/evaluations/src/context_stats.rs @@ -0,0 +1,177 @@ +use serde::{Deserialize, Serialize}; + +use common::storage::types::StoredObject; + +use crate::types::EvaluationCandidate; + +const TOKENIZER_LABEL: &str = "estimated (~chars/4; ingestion uses bert-base-cased)"; + +#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)] +pub struct RetrievedContextStats { + pub chunk_count: usize, + pub char_count: usize, + pub token_count: usize, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct RetrievalContextStats { + pub tokenizer: String, + pub queries: usize, + pub total_chunks: usize, + pub total_chars: usize, + pub total_tokens: usize, + pub avg_chunks_per_query: f64, + pub avg_chars_per_query: f64, + pub avg_tokens_per_query: f64, + pub p50_tokens_per_query: usize, + pub p95_tokens_per_query: usize, + pub max_tokens_per_query: usize, +} + +pub fn stats_for_candidates(candidates: &[EvaluationCandidate]) -> RetrievedContextStats { + let mut seen_chunk_ids = std::collections::HashSet::new(); + let mut stats = RetrievedContextStats::default(); + + for candidate in candidates { + for chunk in &candidate.chunks { + let chunk_id = chunk.chunk.id().to_string(); + if !seen_chunk_ids.insert(chunk_id) { + continue; + } + let text = chunk.chunk.chunk.as_str(); + stats.chunk_count += 1; + 
stats.char_count += text.chars().count(); + stats.token_count += estimate_ingestion_tokens(text); + } + } + + stats +} + +pub fn aggregate_context_stats(per_query: &[RetrievedContextStats]) -> RetrievalContextStats { + let queries = per_query.len(); + if queries == 0 { + return RetrievalContextStats { + tokenizer: TOKENIZER_LABEL.to_string(), + queries: 0, + total_chunks: 0, + total_chars: 0, + total_tokens: 0, + avg_chunks_per_query: 0.0, + avg_chars_per_query: 0.0, + avg_tokens_per_query: 0.0, + p50_tokens_per_query: 0, + p95_tokens_per_query: 0, + max_tokens_per_query: 0, + }; + } + + let total_chunks: usize = per_query.iter().map(|stats| stats.chunk_count).sum(); + let total_chars: usize = per_query.iter().map(|stats| stats.char_count).sum(); + let total_tokens: usize = per_query.iter().map(|stats| stats.token_count).sum(); + let mut tokens_per_query: Vec = per_query.iter().map(|stats| stats.token_count).collect(); + tokens_per_query.sort_unstable(); + let max_tokens_per_query = *tokens_per_query.last().unwrap_or(&0); + + RetrievalContextStats { + tokenizer: TOKENIZER_LABEL.to_string(), + queries, + total_chunks, + total_chars, + total_tokens, + avg_chunks_per_query: total_chunks as f64 / queries as f64, + avg_chars_per_query: total_chars as f64 / queries as f64, + avg_tokens_per_query: total_tokens as f64 / queries as f64, + p50_tokens_per_query: percentile_usize(&tokens_per_query, 0.50), + p95_tokens_per_query: percentile_usize(&tokens_per_query, 0.95), + max_tokens_per_query, + } +} + +fn estimate_ingestion_tokens(text: &str) -> usize { + let chars = text.chars().count(); + if chars == 0 { + return 0; + } + chars.div_ceil(4) +} + +#[allow(clippy::cast_precision_loss, clippy::indexing_slicing, clippy::arithmetic_side_effects)] +fn percentile_usize(sorted: &[usize], fraction: f64) -> usize { + if sorted.is_empty() { + return 0; + } + let clamped = fraction.clamp(0.0, 1.0); + let index = ((sorted.len() - 1) as f64 * clamped).round() as usize; + 
sorted[index.min(sorted.len() - 1)] +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::*; + use common::storage::types::text_chunk::TextChunk; + use retrieval_pipeline::RetrievedChunk; + + #[test] + fn deduplicates_chunks_when_counting_context() { + let shared = Arc::new(TextChunk::new( + "src".into(), + "hello world".into(), + "user".into(), + )); + let candidates = vec![ + EvaluationCandidate { + entity_id: "a".into(), + source_id: "src".into(), + entity_name: "A".into(), + entity_description: None, + entity_category: None, + score: 1.0, + chunks: vec![RetrievedChunk { + chunk: Arc::clone(&shared), + score: 1.0, + }], + }, + EvaluationCandidate { + entity_id: "b".into(), + source_id: "src".into(), + entity_name: "B".into(), + entity_description: None, + entity_category: None, + score: 0.9, + chunks: vec![RetrievedChunk { + chunk: shared, + score: 0.9, + }], + }, + ]; + let stats = stats_for_candidates(&candidates); + assert_eq!(stats.chunk_count, 1); + assert_eq!(stats.char_count, "hello world".chars().count()); + assert_eq!(stats.token_count, 3); + } + + #[test] + fn aggregates_per_query_token_totals() { + let per_query = vec![ + RetrievedContextStats { + chunk_count: 2, + char_count: 100, + token_count: 40, + }, + RetrievedContextStats { + chunk_count: 5, + char_count: 250, + token_count: 100, + }, + ]; + let aggregate = aggregate_context_stats(&per_query); + assert_eq!(aggregate.queries, 2); + assert_eq!(aggregate.total_chunks, 7); + assert_eq!(aggregate.total_tokens, 140); + assert_eq!(aggregate.max_tokens_per_query, 100); + assert!((aggregate.avg_tokens_per_query - 70.0).abs() < f64::EPSILON); + } +} diff --git a/evaluations/src/corpus/config.rs b/evaluations/src/corpus/config.rs index a7e6045..880771b 100644 --- a/evaluations/src/corpus/config.rs +++ b/evaluations/src/corpus/config.rs @@ -11,32 +11,14 @@ pub struct CorpusCacheConfig { pub ingestion_max_retries: usize, } -impl CorpusCacheConfig { - pub fn new( - ingestion_cache_dir: impl 
Into, - force_refresh: bool, - refresh_embeddings_only: bool, - ingestion_batch_size: usize, - ingestion_max_retries: usize, - ) -> Self { +impl From<&Config> for CorpusCacheConfig { + fn from(config: &Config) -> Self { Self { - ingestion_cache_dir: ingestion_cache_dir.into(), - force_refresh, - refresh_embeddings_only, - ingestion_batch_size, - ingestion_max_retries, + ingestion_cache_dir: config.ingest.ingestion_cache_dir.clone(), + force_refresh: config.force_convert || config.ingest.slice_reset_ingestion, + refresh_embeddings_only: config.ingest.refresh_embeddings_only, + ingestion_batch_size: config.ingest.ingestion_batch_size, + ingestion_max_retries: config.ingest.ingestion_max_retries, } } } - -impl From<&Config> for CorpusCacheConfig { - fn from(config: &Config) -> Self { - CorpusCacheConfig::new( - config.ingest.ingestion_cache_dir.clone(), - config.force_convert || config.ingest.slice_reset_ingestion, - config.ingest.refresh_embeddings_only, - config.ingest.ingestion_batch_size, - config.ingest.ingestion_max_retries, - ) - } -} diff --git a/evaluations/src/corpus/mod.rs b/evaluations/src/corpus/mod.rs index 5804384..ba41444 100644 --- a/evaluations/src/corpus/mod.rs +++ b/evaluations/src/corpus/mod.rs @@ -5,11 +5,11 @@ pub(crate) mod store; pub use config::CorpusCacheConfig; pub use orchestrator::{ cached_corpus_dir, compute_ingestion_fingerprint, corpus_handle_from_manifest, ensure_corpus, - load_cached_manifest, + load_cached_manifest, persist_corpus_manifest, }; pub use store::{ seed_manifest_into_db, window_manifest, CorpusHandle, CorpusManifest, CorpusMetadata, - CorpusQuestion, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION, + CorpusQuestion, NamespaceSeedRecord, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION, }; pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig { @@ -20,6 +20,6 @@ pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline chunk_overlap_tokens: 
config.ingest.ingest_chunk_overlap_tokens, ..Default::default() }, - chunk_only: config.ingest.ingest_chunks_only, + chunk_only: !config.ingest.include_entities, } } diff --git a/evaluations/src/corpus/orchestrator.rs b/evaluations/src/corpus/orchestrator.rs index a445f9e..3575090 100644 --- a/evaluations/src/corpus/orchestrator.rs +++ b/evaluations/src/corpus/orchestrator.rs @@ -9,8 +9,6 @@ use std::{ use anyhow::{anyhow, Context, Result}; use async_openai::Client; use chrono::Utc; -#[cfg(not(test))] -use common::utils::config::get_config; use common::{ storage::{ db::SurrealDbClient, @@ -125,10 +123,14 @@ pub async fn ensure_corpus( openai: Arc, user_id: &str, converted_path: &Path, + precomputed_checksum: Option<&str>, ingestion_config: IngestionConfig, ) -> Result { - let checksum = compute_file_checksum(converted_path) - .with_context(|| format!("computing checksum for {}", converted_path.display()))?; + let checksum = match precomputed_checksum { + Some(value) => value.to_string(), + None => crate::datasets::content_checksum_for_layout(converted_path) + .with_context(|| format!("computing checksum for {}", converted_path.display()))?, + }; let ingestion_fingerprint = build_ingestion_fingerprint(dataset, slice, &checksum, &ingestion_config); @@ -381,6 +383,7 @@ pub async fn ensure_corpus( chunk_min_tokens: ingestion_config.tuning.chunk_min_tokens, chunk_max_tokens: ingestion_config.tuning.chunk_max_tokens, chunk_only: ingestion_config.chunk_only, + namespace_seed: None, }, paragraphs: corpus_paragraphs, questions: corpus_questions, @@ -415,7 +418,7 @@ pub async fn ensure_corpus( negative_ingested: stats.negative_ingested, }; - persist_manifest(&handle).context("persisting corpus manifest")?; + persist_corpus_manifest(&handle).context("persisting corpus manifest")?; Ok(handle) } @@ -501,7 +504,6 @@ async fn ingest_paragraph_batch( Ok(shards) } -#[cfg(test)] async fn create_ingest_db(namespace: &str) -> Result> { let db = SurrealDbClient::memory(namespace, 
"corpus") .await @@ -509,21 +511,6 @@ async fn create_ingest_db(namespace: &str) -> Result> { Ok(Arc::new(db)) } -#[cfg(not(test))] -async fn create_ingest_db(namespace: &str) -> Result> { - let config = get_config().context("loading app config for ingestion database")?; - let db = SurrealDbClient::new( - &config.surrealdb_address, - &config.surrealdb_username, - &config.surrealdb_password, - namespace, - "corpus", - ) - .await - .context("creating surrealdb database for ingestion")?; - Ok(Arc::new(db)) -} - #[allow(clippy::too_many_arguments)] async fn ingest_single_paragraph( pipeline: Arc, @@ -631,8 +618,12 @@ pub fn compute_ingestion_fingerprint( slice: &ResolvedSlice<'_>, converted_path: &Path, ingestion_config: &IngestionConfig, + precomputed_checksum: Option<&str>, ) -> Result { - let checksum = compute_file_checksum(converted_path)?; + let checksum = match precomputed_checksum { + Some(value) => value.to_string(), + None => crate::datasets::content_checksum_for_layout(converted_path)?, + }; Ok(build_ingestion_fingerprint( dataset, slice, @@ -641,7 +632,7 @@ pub fn compute_ingestion_fingerprint( )) } -pub fn load_cached_manifest(base_dir: &Path) -> Result> { +pub fn load_cached_manifest(base_dir: &std::path::Path) -> Result> { let path = base_dir.join("manifest.json"); if !path.exists() { return Ok(None); @@ -656,7 +647,7 @@ pub fn load_cached_manifest(base_dir: &Path) -> Result> { Ok(Some(manifest)) } -fn persist_manifest(handle: &CorpusHandle) -> Result<()> { +pub fn persist_corpus_manifest(handle: &CorpusHandle) -> Result<()> { let path = handle.path.join("manifest.json"); if let Some(parent) = path.parent() { fs::create_dir_all(parent) @@ -685,24 +676,6 @@ pub fn corpus_handle_from_manifest(manifest: CorpusManifest, base_dir: PathBuf) } } -#[allow(clippy::indexing_slicing)] -fn compute_file_checksum(path: &Path) -> Result { - let mut file = fs::File::open(path) - .with_context(|| format!("opening file {} for checksum", path.display()))?; - let mut hasher 
= Sha256::new(); - let mut buffer = [0u8; 8192]; - loop { - let read = file - .read(&mut buffer) - .with_context(|| format!("reading {} for checksum", path.display()))?; - if read == 0 { - break; - } - hasher.update(&buffer[..read]); - } - Ok(format!("{:x}", hasher.finalize())) -} - #[cfg(test)] mod tests { use super::*; @@ -731,7 +704,6 @@ mod tests { metadata: crate::datasets::DatasetMetadata::for_kind( DatasetKind::default(), false, - None, ), source: "src".to_string(), paragraphs: vec![paragraph], diff --git a/evaluations/src/corpus/store.rs b/evaluations/src/corpus/store.rs index f219251..294ceed 100644 --- a/evaluations/src/corpus/store.rs +++ b/evaluations/src/corpus/store.rs @@ -42,7 +42,7 @@ fn default_chunk_max_tokens() -> usize { } fn default_chunk_only() -> bool { - false + true } // Reuse the pipeline's canonical embedded-artifact types so the on-disk corpus @@ -122,6 +122,14 @@ pub struct CorpusManifest { pub questions: Vec, } +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct NamespaceSeedRecord { + pub namespace: String, + pub database: String, + pub slice_case_count: usize, + pub seeded_at: DateTime, +} + #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] pub struct CorpusMetadata { pub dataset_id: String, @@ -144,6 +152,8 @@ pub struct CorpusMetadata { pub chunk_max_tokens: usize, #[serde(default = "default_chunk_only")] pub chunk_only: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub namespace_seed: Option, } #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] @@ -629,6 +639,7 @@ mod tests { chunk_min_tokens: 1, chunk_max_tokens: 10, chunk_only: false, + namespace_seed: None, }, paragraphs: vec![paragraph_one, paragraph_two], questions: vec![question], diff --git a/evaluations/src/datasets/beir.rs b/evaluations/src/datasets/beir.rs index a06a529..be355a0 100644 --- a/evaluations/src/datasets/beir.rs +++ b/evaluations/src/datasets/beir.rs @@ -1,5 +1,5 @@ use std::{ - 
collections::{BTreeMap, HashMap}, + collections::{BTreeMap, HashMap, HashSet}, fs::File, io::{BufRead, BufReader}, path::{Path, PathBuf}, @@ -47,20 +47,71 @@ struct QrelEntry { score: i32, } +/// Convert only documents that appear in qrels (the BEIR evaluation closed world). #[allow(clippy::arithmetic_side_effects, clippy::indexing_slicing)] pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result> { + convert_beir_documents(raw_dir, dataset, None) +} + +/// Convert a subset of qrels-world documents. `doc_ids` use corpus ids (unprefixed). +#[allow( + clippy::too_many_lines, + clippy::arithmetic_side_effects, + clippy::indexing_slicing +)] +pub fn convert_beir_documents( + raw_dir: &Path, + dataset: DatasetKind, + doc_ids: Option<&HashSet>, +) -> Result> { let corpus_path = raw_dir.join("corpus.jsonl"); let queries_path = raw_dir.join("queries.jsonl"); let qrels_path = resolve_qrels_path(raw_dir)?; - let corpus = load_corpus(&corpus_path)?; let queries = load_queries(&queries_path)?; let qrels = load_qrels(&qrels_path)?; - let mut paragraphs = Vec::with_capacity(corpus.len()); + let mut qrels_doc_ids = HashSet::new(); + for entries in qrels.values() { + for entry in entries { + qrels_doc_ids.insert(entry.doc_id.clone()); + } + } + + let target_doc_ids: HashSet = match doc_ids { + Some(ids) => ids + .iter() + .filter(|id| qrels_doc_ids.contains(*id)) + .cloned() + .collect(), + None => qrels_doc_ids.clone(), + }; + + if target_doc_ids.is_empty() { + return Err(anyhow!( + "no qrels documents to convert for {} at {}", + dataset.id(), + raw_dir.display() + )); + } + + let corpus = load_corpus_filtered(&corpus_path, &target_doc_ids)?; + + let mut doc_ids_sorted: Vec = target_doc_ids.into_iter().collect(); + doc_ids_sorted.sort(); + + let mut paragraphs = Vec::with_capacity(doc_ids_sorted.len()); let mut paragraph_index = HashMap::new(); - for (doc_id, entry) in &corpus { + for doc_id in &doc_ids_sorted { + let Some(entry) = corpus.get(doc_id) else { + warn!( + 
doc_id = %doc_id, + dataset = %dataset.id(), + "Skipping qrels document missing from corpus" + ); + continue; + }; let paragraph_id = format!("{}-{doc_id}", dataset.source_prefix()); let paragraph = ConvertedParagraph { id: paragraph_id.clone(), @@ -87,6 +138,12 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result Result Result Result 0 { warn!( missing_queries, - missing_docs, skipped_answers, "Skipped some BEIR qrels entries during conversion" + missing_docs, + skipped_answers, + dataset = %dataset.id(), + "Skipped some BEIR qrels entries during conversion" ); } Ok(paragraphs) } +pub fn corpus_doc_id(paragraph_id: &str, dataset: DatasetKind) -> Option { + let prefix = format!("{}-", dataset.source_prefix()); + paragraph_id + .strip_prefix(&prefix) + .map(str::to_string) +} + fn resolve_qrels_path(raw_dir: &Path) -> Result { let qrels_dir = raw_dir.join("qrels"); let candidates = ["test.tsv", "dev.tsv", "train.tsv"]; @@ -148,7 +214,10 @@ fn resolve_qrels_path(raw_dir: &Path) -> Result { } #[allow(clippy::arithmetic_side_effects)] -fn load_corpus(path: &Path) -> Result> { +fn load_corpus_filtered( + path: &Path, + doc_ids: &HashSet, +) -> Result> { let file = File::open(path).with_context(|| format!("opening BEIR corpus at {}", path.display()))?; let reader = BufReader::new(file); @@ -167,6 +236,9 @@ fn load_corpus(path: &Path) -> Result> { path.display() ) })?; + if !doc_ids.contains(&corpus_row.id) { + continue; + } let title = corpus_row.title.unwrap_or_else(|| corpus_row.id.clone()); let text = corpus_row.text.unwrap_or_default(); let context = build_context(&title, &text); @@ -296,10 +368,8 @@ mod tests { use std::fs; use tempfile::tempdir; - #[test] - #[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)] - fn converts_basic_beir_layout() { - let dir = tempdir().unwrap(); + #[allow(clippy::unwrap_used)] + fn write_fixture(dir: &tempfile::TempDir) { let corpus = r#" {"_id":"d1","title":"Doc 1","text":"Doc one has some 
text for testing."} {"_id":"d2","title":"Doc 2","text":"Second document content."} @@ -313,24 +383,34 @@ mod tests { fs::write(dir.path().join("queries.jsonl"), queries.trim()).unwrap(); fs::create_dir_all(dir.path().join("qrels")).unwrap(); fs::write(dir.path().join("qrels/test.tsv"), qrels).unwrap(); + } + + #[test] + #[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)] + fn converts_qrels_world_only() { + let dir = tempdir().unwrap(); + write_fixture(&dir); let paragraphs = convert_beir(dir.path(), DatasetKind::Fever).unwrap(); - assert_eq!(paragraphs.len(), 2); - let doc_one = paragraphs - .iter() - .find(|p| p.id == "fever-d1") - .expect("missing paragraph for d1"); + assert_eq!(paragraphs.len(), 1); + let doc_one = ¶graphs[0]; + assert_eq!(doc_one.id, "fever-d1"); assert_eq!(doc_one.questions.len(), 1); - let question = &doc_one.questions[0]; - assert_eq!(question.id, "fever-q1"); - assert!(!question.answers.is_empty()); - assert!(doc_one.context.contains(&question.answers[0])); + assert_eq!(doc_one.questions[0].id, "fever-q1"); + } - let doc_two = paragraphs - .iter() - .find(|p| p.id == "fever-d2") - .expect("missing paragraph for d2"); - assert!(doc_two.questions.is_empty()); + #[test] + #[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)] + fn converts_filtered_doc_ids() { + let dir = tempdir().unwrap(); + write_fixture(&dir); + + let mut ids = HashSet::new(); + ids.insert("d1".to_string()); + let paragraphs = + convert_beir_documents(dir.path(), DatasetKind::Fever, Some(&ids)).unwrap(); + assert_eq!(paragraphs.len(), 1); + assert_eq!(paragraphs[0].id, "fever-d1"); } } diff --git a/evaluations/src/datasets/beir_mix.rs b/evaluations/src/datasets/beir_mix.rs new file mode 100644 index 0000000..45a8e66 --- /dev/null +++ b/evaluations/src/datasets/beir_mix.rs @@ -0,0 +1,262 @@ +use std::collections::{HashMap, HashSet}; + +use anyhow::{anyhow, Context, Result}; +use sha2::{Digest, Sha256}; +use tracing::info; + 
+use super::{ + beir, + checksum::hash_file, + store::{ + self, build_dataset_from_catalog, paragraph_path, read_meta, store_dir_for, + upsert_sharded_paragraphs, write_sharded, + }, + BEIR_DATASETS, ConvertedDataset, DatasetKind, DatasetMetadata, +}; +use crate::{ + args::Config, + slice, +}; + +pub fn subset_for_paragraph_id(paragraph_id: &str) -> Option { + let mut kinds: Vec = BEIR_DATASETS.to_vec(); + kinds.sort_by_key(|kind| std::cmp::Reverse(kind.source_prefix().len())); + for kind in kinds { + let prefix = format!("{}-", kind.source_prefix()); + if paragraph_id.starts_with(&prefix) { + return Some(kind); + } + } + None +} + +pub fn build_beir_mix_qrels_dataset(include_unanswerable: bool) -> Result { + if include_unanswerable { + tracing::warn!("BEIR mix ignores include_unanswerable flag; all questions are answerable"); + } + + let mut paragraphs = Vec::new(); + for subset in BEIR_DATASETS { + let entry = super::dataset_entry_for_kind(subset)?; + let subset_paragraphs = beir::convert_beir(&entry.raw_path, subset)?; + paragraphs.extend(subset_paragraphs); + } + + Ok(ConvertedDataset { + generated_at: super::base_timestamp(), + metadata: DatasetMetadata::for_kind(DatasetKind::Beir, include_unanswerable), + source: "beir-mix".to_string(), + paragraphs, + }) +} + +pub fn prepare_beir_mix(config: &Config) -> Result { + let virtual_ds = build_beir_mix_qrels_dataset(config.llm_mode)?; + let slice_config = slice::slice_config_with_limit(config, slice::ledger_target(config)); + let resolved = slice::resolve_slice(&virtual_ds, &slice_config).context( + "resolving BEIR mix slice ledger (check --slice and --limit match your intent)", + )?; + + let unique: HashSet = resolved + .manifest + .paragraphs + .iter() + .map(|entry| entry.id.clone()) + .collect(); + + materialize_subset_stores(&unique, config.force_convert)?; + + let dataset = load_beir_mix_from_subsets(&unique)?; + let checksum = mix_content_checksum(&unique)?; + + info!( + slice = 
resolved.manifest.slice_id.as_str(), + paragraphs = unique.len(), + checksum = %checksum, + "Prepared BEIR mix from per-subset converted stores" + ); + + Ok(super::loader::LoadedDataset { + dataset, + content_checksum: checksum, + partial: true, + }) +} + +pub fn materialize_subset_stores( + paragraph_ids: &HashSet, + force: bool, +) -> Result<()> { + let mut by_subset: HashMap> = HashMap::new(); + for paragraph_id in paragraph_ids { + let kind = subset_for_paragraph_id(paragraph_id).with_context(|| { + format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store") + })?; + by_subset.entry(kind).or_default().push(paragraph_id.clone()); + } + + for (kind, ids) in by_subset { + let entry = super::dataset_entry_for_kind(kind)?; + let store_dir = store_dir_for(&entry.converted_path); + let existing = if store_dir.join("meta.json").is_file() { + store::load_paragraph_ids_set(&store_dir)? + } else { + HashSet::new() + }; + + let missing: Vec = if force { + ids + } else { + ids.into_iter() + .filter(|paragraph_id| !existing.contains(paragraph_id)) + .collect() + }; + + if missing.is_empty() { + continue; + } + + let corpus_ids: HashSet = missing + .iter() + .filter_map(|paragraph_id| beir::corpus_doc_id(paragraph_id, kind)) + .collect(); + let paragraphs = beir::convert_beir_documents( + &entry.raw_path, + kind, + Some(&corpus_ids), + )?; + + if store_dir.join("meta.json").is_file() { + upsert_sharded_paragraphs(&store_dir, ¶graphs)?; + } else { + let question_count = paragraphs + .iter() + .map(|paragraph| paragraph.questions.len()) + .sum::(); + let dataset = ConvertedDataset { + generated_at: super::base_timestamp(), + metadata: DatasetMetadata::for_kind(kind, false), + source: entry.raw_path.display().to_string(), + paragraphs, + }; + write_sharded(&dataset, &store_dir)?; + info!( + subset = kind.id(), + store = %store_dir.display(), + paragraphs = dataset.paragraphs.len(), + questions = question_count, + "Created subset converted store for BEIR mix" + ); + 
} + } + + Ok(()) +} + +pub fn load_beir_mix_from_subsets(paragraph_ids: &HashSet) -> Result { + let mut by_subset: HashMap> = HashMap::new(); + for paragraph_id in paragraph_ids { + let kind = subset_for_paragraph_id(paragraph_id).with_context(|| { + format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store") + })?; + by_subset + .entry(kind) + .or_default() + .insert(paragraph_id.clone()); + } + + let mut paragraphs = Vec::with_capacity(paragraph_ids.len()); + for (kind, subset_ids) in by_subset { + let entry = super::dataset_entry_for_kind(kind)?; + let store_dir = store_dir_for(&entry.converted_path); + let partial = build_dataset_from_catalog(&store_dir, &subset_ids)?; + paragraphs.extend(partial.paragraphs); + } + + paragraphs.sort_by(|left, right| left.id.cmp(&right.id)); + + Ok(ConvertedDataset { + generated_at: super::base_timestamp(), + metadata: DatasetMetadata::for_kind(DatasetKind::Beir, false), + source: "beir-mix".to_string(), + paragraphs, + }) +} + +pub fn mix_content_checksum(paragraph_ids: &HashSet) -> Result { + let mut ids: Vec = paragraph_ids.iter().cloned().collect(); + ids.sort(); + + let mut hasher = Sha256::new(); + for paragraph_id in ids { + let kind = subset_for_paragraph_id(¶graph_id) + .ok_or_else(|| anyhow!("unknown BEIR subset for paragraph '{paragraph_id}'"))?; + let entry = super::dataset_entry_for_kind(kind)?; + let store_dir = store_dir_for(&entry.converted_path); + let path = paragraph_path(&store_dir, ¶graph_id); + if !path.is_file() { + return Err(anyhow!( + "missing converted paragraph {} at {}", + paragraph_id, + path.display() + )); + } + hasher.update(paragraph_id.as_bytes()); + hasher.update([0]); + hasher.update(hash_file(&path)?.as_bytes()); + } + + Ok(format!("{:x}", hasher.finalize())) +} + +pub fn beir_subset_stores_ready(paragraph_ids: &HashSet) -> Result { + for paragraph_id in paragraph_ids { + let kind = subset_for_paragraph_id(paragraph_id).with_context(|| { + format!("routing BEIR mix paragraph id 
'{paragraph_id}' to subset store") + })?; + let entry = super::dataset_entry_for_kind(kind)?; + let store_dir = store_dir_for(&entry.converted_path); + if !store_dir.join("meta.json").is_file() { + return Ok(false); + } + if !paragraph_path(&store_dir, paragraph_id).is_file() { + return Ok(false); + } + } + Ok(true) +} + +pub fn beir_subset_store_summary() -> Result> { + let mut summary = Vec::new(); + for kind in BEIR_DATASETS { + let entry = super::dataset_entry_for_kind(kind)?; + let store_dir = store_dir_for(&entry.converted_path); + if store_dir.join("meta.json").is_file() { + let meta = read_meta(&store_dir)?; + summary.push((kind.id().to_string(), meta.paragraph_count, meta.question_count)); + } + } + Ok(summary) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn routes_prefixed_paragraph_ids() { + assert_eq!( + subset_for_paragraph_id("fever-doc-1"), + Some(DatasetKind::Fever) + ); + assert_eq!( + subset_for_paragraph_id("nq-beir-doc-1"), + Some(DatasetKind::NqBeir) + ); + assert_eq!( + subset_for_paragraph_id("trec-covid-doc-1"), + Some(DatasetKind::TrecCovid) + ); + assert!(subset_for_paragraph_id("unknown-doc").is_none()); + } +} diff --git a/evaluations/src/datasets/checksum.rs b/evaluations/src/datasets/checksum.rs new file mode 100644 index 0000000..331457a --- /dev/null +++ b/evaluations/src/datasets/checksum.rs @@ -0,0 +1,216 @@ +use std::{ + fs::{self, File}, + io::Read, + path::Path, +}; + +#[cfg(test)] +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; + +const SIDECAR_VERSION: u32 = 1; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ChecksumSidecar { + pub version: u32, + pub sha256: String, + pub size_bytes: u64, + #[serde(default)] + pub modified_unix_secs: u64, +} + +impl ChecksumSidecar { + #[cfg(test)] + pub fn sidecar_path(content_path: &Path) -> PathBuf { + content_path.with_extension("sha256") + } + + #[cfg(test)] + pub fn 
is_valid_for(&self, content_path: &Path) -> bool { + if self.version != SIDECAR_VERSION { + return false; + } + let Ok(metadata) = fs::metadata(content_path) else { + return false; + }; + if metadata.len() != self.size_bytes { + return false; + } + if self.modified_unix_secs != 0 { + let Ok(modified) = metadata.modified() else { + return true; + }; + let Ok(secs) = modified.duration_since(std::time::UNIX_EPOCH) else { + return true; + }; + if secs.as_secs() != self.modified_unix_secs { + return false; + } + } + true + } +} + +#[allow(clippy::indexing_slicing)] +pub fn hash_file(path: &Path) -> Result { + let mut file = + File::open(path).with_context(|| format!("opening file {} for checksum", path.display()))?; + let mut hasher = Sha256::new(); + let mut buffer = vec![0u8; 65_536]; + loop { + let read = file + .read(&mut buffer) + .with_context(|| format!("reading {} for checksum", path.display()))?; + if read == 0 { + break; + } + hasher.update(&buffer[..read]); + } + Ok(format!("{:x}", hasher.finalize())) +} + +pub fn read_sidecar(path: &Path) -> Result> { + if !path.exists() { + return Ok(None); + } + let raw = fs::read_to_string(path) + .with_context(|| format!("reading checksum sidecar {}", path.display()))?; + let sidecar: ChecksumSidecar = serde_json::from_str(&raw) + .with_context(|| format!("parsing checksum sidecar {}", path.display()))?; + Ok(Some(sidecar)) +} + +#[cfg(test)] +pub fn write_sidecar(content_path: &Path, sha256: &str) -> Result<()> { + let metadata = fs::metadata(content_path) + .with_context(|| format!("reading metadata for {}", content_path.display()))?; + let modified_unix_secs = metadata + .modified() + .ok() + .and_then(|time| time.duration_since(std::time::UNIX_EPOCH).ok()) + .map_or(0, |duration| duration.as_secs()); + let sidecar = ChecksumSidecar { + version: SIDECAR_VERSION, + sha256: sha256.to_string(), + size_bytes: metadata.len(), + modified_unix_secs, + }; + let path = ChecksumSidecar::sidecar_path(content_path); + if let 
Some(parent) = path.parent() { + fs::create_dir_all(parent) + .with_context(|| format!("creating checksum sidecar directory {}", parent.display()))?; + } + let blob = serde_json::to_vec_pretty(&sidecar).context("serialising checksum sidecar")?; + fs::write(&path, blob) + .with_context(|| format!("writing checksum sidecar {}", path.display()))?; + Ok(()) +} + +#[cfg(test)] +pub fn content_checksum(content_path: &Path) -> Result { + let sidecar_path = ChecksumSidecar::sidecar_path(content_path); + if let Some(sidecar) = read_sidecar(&sidecar_path)? { + if sidecar.is_valid_for(content_path) { + return Ok(sidecar.sha256); + } + } + let sha256 = hash_file(content_path)?; + write_sidecar(content_path, &sha256)?; + Ok(sha256) +} + +pub fn store_aggregate_checksum(store_dir: &Path) -> Result { + let marker = store_dir.join("checksum.sha256"); + let meta = store_dir.join("meta.json"); + if marker.is_file() && meta.is_file() { + if let (Ok(marker_meta), Ok(meta_meta)) = (marker.metadata(), meta.metadata()) { + if marker_meta + .modified() + .ok() + .zip(meta_meta.modified().ok()) + .is_some_and(|(marker_modified, meta_modified)| marker_modified >= meta_modified) + { + if let Some(sidecar) = read_sidecar(&marker)? 
{ + return Ok(sidecar.sha256); + } + } + } + } + + let mut entries = Vec::new(); + collect_store_files(store_dir, store_dir, &mut entries)?; + entries.sort(); + + let mut hasher = Sha256::new(); + for relative in &entries { + let path = store_dir.join(relative); + if path == marker { + continue; + } + hasher.update(relative.as_bytes()); + hasher.update([0]); + let file_hash = hash_file(&path)?; + hasher.update(file_hash.as_bytes()); + } + let digest = format!("{:x}", hasher.finalize()); + + let sidecar = ChecksumSidecar { + version: SIDECAR_VERSION, + sha256: digest.clone(), + size_bytes: entries.len() as u64, + modified_unix_secs: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_or(0, |duration| duration.as_secs()), + }; + if let Some(parent) = marker.parent() { + fs::create_dir_all(parent)?; + } + fs::write(&marker, serde_json::to_vec_pretty(&sidecar)?)?; + Ok(digest) +} + +fn collect_store_files(base: &Path, current: &Path, entries: &mut Vec) -> Result<()> { + for entry in fs::read_dir(current)? 
{ + let entry = entry?; + let path = entry.path(); + if path.file_name().is_some_and(|name| name == "checksum.sha256") { + continue; + } + if path.is_dir() { + collect_store_files(base, &path, entries)?; + } else if path.is_file() { + let relative = path + .strip_prefix(base) + .unwrap_or(&path) + .to_string_lossy() + .replace('\\', "/"); + entries.push(relative); + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + #[test] + fn sidecar_round_trip() -> Result<()> { + let dir = tempdir()?; + let file = dir.path().join("sample.json"); + fs::write(&file, br#"{"hello":"world"}"#)?; + + let first = content_checksum(&file)?; + let second = content_checksum(&file)?; + assert_eq!(first, second); + + fs::write(&file, br#"{"hello":"world!"}"#)?; + let third = content_checksum(&file)?; + assert_ne!(first, third); + Ok(()) + } +} diff --git a/evaluations/src/datasets/loader.rs b/evaluations/src/datasets/loader.rs new file mode 100644 index 0000000..752ad93 --- /dev/null +++ b/evaluations/src/datasets/loader.rs @@ -0,0 +1,197 @@ +use std::collections::HashSet; + +use anyhow::{Context, Result}; +use tracing::info; + +use super::{ + catalog, + store::{ + self, build_dataset_from_catalog, detect_layout, read_meta, store_dir_for, write_sharded, + ConvertedLayout, + }, + ConvertedDataset, DatasetKind, +}; +use crate::{ + args::Config, + slice::{self, SliceConfig}, +}; + +#[derive(Debug, Clone)] +pub struct LoadedDataset { + pub dataset: ConvertedDataset, + pub content_checksum: String, + pub partial: bool, +} + +pub fn prepare_dataset(dataset_kind: DatasetKind, config: &Config) -> Result { + if dataset_kind == DatasetKind::Beir { + return super::beir_mix::prepare_beir_mix(config); + } + + let converted_path = &config.converted_dataset_path; + let layout = detect_layout(converted_path); + let store_dir = store_dir_for(converted_path); + + if layout == ConvertedLayout::Missing || config.force_convert { + return convert_and_load(dataset_kind, 
config); + } + + load_from_store(dataset_kind, config, &store_dir, true) +} + +fn convert_and_load(dataset_kind: DatasetKind, config: &Config) -> Result { + let dataset = super::convert( + config.raw_dataset_path.as_path(), + dataset_kind, + config.llm_mode, + ) + .with_context(|| format!("converting {} dataset", dataset_kind.label()))?; + + let store_dir = store_dir_for(&config.converted_dataset_path); + write_sharded(&dataset, &store_dir)?; + prebuild_catalog_slices(&dataset, config)?; + let checksum = crate::datasets::store_aggregate_checksum(&store_dir)?; + + Ok(LoadedDataset { + dataset, + content_checksum: checksum, + partial: false, + }) +} + +fn load_from_store( + dataset_kind: DatasetKind, + config: &Config, + store_dir: &std::path::Path, + allow_partial: bool, +) -> Result { + let checksum = crate::datasets::store_aggregate_checksum(store_dir)?; + let meta = read_meta(store_dir)?; + validate_metadata_fields(&meta.metadata, dataset_kind, config)?; + + if allow_partial { + if let Some(paragraph_ids) = slice_paragraph_ids_for_fast_path(config)? { + let unique: HashSet = paragraph_ids.into_iter().collect(); + info!( + paragraphs = unique.len(), + store = %store_dir.display(), + "Loading slice-addressed paragraphs from sharded converted store" + ); + let dataset = build_dataset_from_catalog(store_dir, &unique)?; + return Ok(LoadedDataset { + dataset, + content_checksum: checksum, + partial: true, + }); + } + } + + info!( + store = %store_dir.display(), + paragraphs = meta.paragraph_count, + "Loading full sharded converted store" + ); + let dataset = store::load_sharded_full(store_dir)?; + Ok(LoadedDataset { + dataset, + content_checksum: checksum, + partial: false, + }) +} + +fn slice_paragraph_ids_for_fast_path(config: &Config) -> Result>> { + let Some(manifest_path) = slice::cached_manifest_path(config) else { + return Ok(None); + }; + let Some(manifest) = slice::read_manifest_if_exists(&manifest_path)? 
else { + return Ok(None); + }; + let slice_config = slice::slice_config_with_limit(config, slice::ledger_target(config)); + if !slice::manifest_is_complete(&manifest, &slice_config) { + return Ok(None); + } + Ok(Some( + manifest + .paragraphs + .iter() + .map(|entry| entry.id.clone()) + .collect(), + )) +} + +fn validate_metadata_fields( + metadata: &super::DatasetMetadata, + dataset_kind: DatasetKind, + config: &Config, +) -> Result<()> { + if metadata.id != dataset_kind.id() { + anyhow::bail!( + "converted dataset targets '{}', expected '{}'", + metadata.id, + dataset_kind.id() + ); + } + if metadata.include_unanswerable != config.llm_mode { + anyhow::bail!( + "converted dataset include_unanswerable mismatch (expected {}, found {})", + config.llm_mode, + metadata.include_unanswerable + ); + } + Ok(()) +} + +pub fn prebuild_catalog_slices(dataset: &ConvertedDataset, config: &Config) -> Result<()> { + let catalog = catalog()?; + let entry = catalog.dataset(dataset.metadata.id.as_str())?; + if entry.slices.is_empty() { + return Ok(()); + } + + info!( + dataset = dataset.metadata.id.as_str(), + slices = entry.slices.len(), + "Prebuilding catalog slice ledgers" + ); + + for slice_entry in &entry.slices { + let slice_config = slice_config_for_catalog_entry(config, slice_entry); + match slice::resolve_slice(dataset, &slice_config) { + Ok(resolved) => info!( + slice = resolved.manifest.slice_id.as_str(), + cases = resolved.manifest.case_count, + positives = resolved.manifest.positive_paragraphs, + negatives = resolved.manifest.negative_paragraphs, + "Prebuilt catalog slice ledger" + ), + Err(err) => tracing::warn!( + slice = slice_entry.id.as_str(), + error = %err, + "Failed to prebuild catalog slice ledger" + ), + } + } + + Ok(()) +} + +fn slice_config_for_catalog_entry<'a>( + config: &'a Config, + slice_entry: &'a super::SliceEntry, +) -> SliceConfig<'a> { + SliceConfig { + cache_dir: config.cache_dir.as_path(), + force_convert: config.force_convert, + explicit_slice: 
Some(slice_entry.id.as_str()), + limit: slice_entry.limit, + corpus_limit: slice_entry.corpus_limit, + slice_seed: slice_entry.seed.unwrap_or(config.slice_seed), + llm_mode: slice_entry + .include_unanswerable + .unwrap_or(config.llm_mode), + negative_multiplier: slice_entry + .negative_multiplier + .unwrap_or(config.negative_multiplier), + require_verified_chunks: config.retrieval.require_verified_chunks, + } +} diff --git a/evaluations/src/datasets/mod.rs b/evaluations/src/datasets/mod.rs index 1274c5e..7380d77 100644 --- a/evaluations/src/datasets/mod.rs +++ b/evaluations/src/datasets/mod.rs @@ -1,6 +1,10 @@ mod beir; +mod beir_mix; +mod checksum; +mod loader; mod nq; mod squad; +mod store; use std::{ collections::{BTreeMap, HashMap}, @@ -20,38 +24,31 @@ const MANIFEST_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/manifest.yaml" static DATASET_CATALOG: OnceCell = OnceCell::new(); #[derive(Debug, Clone)] -#[allow(dead_code)] pub struct DatasetCatalog { datasets: BTreeMap, slices: HashMap, - default_dataset: String, } #[derive(Debug, Clone)] -#[allow(dead_code)] pub struct DatasetEntry { pub metadata: DatasetMetadata, pub raw_path: PathBuf, pub converted_path: PathBuf, - pub include_unanswerable: bool, pub slices: Vec, } #[derive(Debug, Clone)] -#[allow(dead_code)] pub struct SliceEntry { pub id: String, pub dataset_id: String, - pub label: String, - pub description: Option, pub limit: Option, pub corpus_limit: Option, pub include_unanswerable: Option, pub seed: Option, + pub negative_multiplier: Option, } #[derive(Debug, Clone)] -#[allow(dead_code)] struct SliceLocation { dataset_id: String, slice_index: usize, @@ -59,7 +56,6 @@ struct SliceLocation { #[derive(Debug, Deserialize)] struct ManifestFile { - default_dataset: Option, datasets: Vec, } @@ -81,6 +77,7 @@ struct ManifestDataset { } #[derive(Debug, Deserialize)] +#[allow(dead_code)] struct ManifestSlice { id: String, label: String, @@ -94,6 +91,8 @@ struct ManifestSlice { include_unanswerable: Option, 
#[serde(default)] seed: Option, + #[serde(default)] + negative_multiplier: Option, } impl DatasetCatalog { @@ -111,18 +110,19 @@ impl DatasetCatalog { let raw_path = resolve_path(root, &dataset.raw); let converted_path = resolve_path(root, &dataset.converted); - if !raw_path.exists() { + if !raw_path.exists() && dataset.id != "beir" { bail!( "dataset '{}' raw file missing at {}", dataset.id, raw_path.display() ); } - if !converted_path.exists() { + let store_dir = store::store_dir_for(&converted_path); + if !converted_path.exists() && !store_dir.join("meta.json").is_file() { warn!( - "dataset '{}' converted file missing at {}; the next conversion run will regenerate it", + "dataset '{}' converted store missing at {}; the next conversion run will regenerate it", dataset.id, - converted_path.display() + store_dir.display() ); } @@ -139,7 +139,6 @@ impl DatasetCatalog { .clone() .unwrap_or_else(|| dataset.id.clone()), include_unanswerable: dataset.include_unanswerable, - context_token_limit: None, }; let mut entry_slices = Vec::with_capacity(dataset.slices.len()); @@ -154,12 +153,11 @@ impl DatasetCatalog { entry_slices.push(SliceEntry { id: manifest_slice.id.clone(), dataset_id: dataset.id.clone(), - label: manifest_slice.label, - description: manifest_slice.description, limit: manifest_slice.limit, corpus_limit: manifest_slice.corpus_limit, include_unanswerable: manifest_slice.include_unanswerable, seed: manifest_slice.seed, + negative_multiplier: manifest_slice.negative_multiplier, }); slices.insert( manifest_slice.id, @@ -176,22 +174,16 @@ impl DatasetCatalog { metadata, raw_path, converted_path, - include_unanswerable: dataset.include_unanswerable, slices: entry_slices, }, ); } - let default_dataset = manifest - .default_dataset - .or_else(|| datasets.keys().next().cloned()) - .ok_or_else(|| anyhow!("dataset manifest does not include any datasets"))?; + if datasets.is_empty() { + bail!("dataset manifest does not include any datasets"); + } - Ok(Self { - datasets, 
- slices, - default_dataset, - }) + Ok(Self { datasets, slices }) } pub fn global() -> Result<&'static Self> { @@ -204,12 +196,6 @@ impl DatasetCatalog { .ok_or_else(|| anyhow!("unknown dataset '{id}' in manifest")) } - #[allow(dead_code)] - pub fn default_dataset(&self) -> Result<&DatasetEntry> { - self.dataset(&self.default_dataset) - } - - #[allow(dead_code)] pub fn slice(&self, slice_id: &str) -> Result<(&DatasetEntry, &SliceEntry)> { let location = self .slices @@ -236,20 +222,29 @@ fn resolve_path(root: &Path, value: &str) -> PathBuf { } } +pub use checksum::store_aggregate_checksum; +pub use beir_mix::{ + beir_subset_store_summary, beir_subset_stores_ready, mix_content_checksum, +}; +pub use loader::{prebuild_catalog_slices, prepare_dataset}; +pub use store::{ + content_checksum_for_layout, detect_layout, store_dir_for, write_sharded, ConvertedLayout, +}; + pub fn catalog() -> Result<&'static DatasetCatalog> { DatasetCatalog::global() } -fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> { +pub(crate) fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> { let catalog = catalog()?; catalog.dataset(kind.id()) } -#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Default)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, ValueEnum, Default)] pub enum DatasetKind { - #[default] SquadV2, NaturalQuestions, + #[default] Beir, #[value(name = "fever")] Fever, @@ -416,16 +411,10 @@ pub struct DatasetMetadata { pub source_prefix: String, #[serde(default)] pub include_unanswerable: bool, - #[serde(default)] - pub context_token_limit: Option, } impl DatasetMetadata { - pub fn for_kind( - kind: DatasetKind, - include_unanswerable: bool, - context_token_limit: Option, - ) -> Self { + pub fn for_kind(kind: DatasetKind, include_unanswerable: bool) -> Self { if let Ok(entry) = dataset_entry_for_kind(kind) { return Self { id: entry.metadata.id.clone(), @@ -434,7 +423,6 @@ impl DatasetMetadata { entity_suffix: 
entry.metadata.entity_suffix.clone(), source_prefix: entry.metadata.source_prefix.clone(), include_unanswerable, - context_token_limit, }; } @@ -445,13 +433,12 @@ impl DatasetMetadata { entity_suffix: kind.entity_suffix().to_string(), source_prefix: kind.source_prefix().to_string(), include_unanswerable, - context_token_limit, } } } fn default_metadata() -> DatasetMetadata { - DatasetMetadata::for_kind(DatasetKind::default(), false, None) + DatasetMetadata::for_kind(DatasetKind::default(), false) } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -483,14 +470,15 @@ pub fn convert( raw_path: &Path, dataset: DatasetKind, include_unanswerable: bool, - context_token_limit: Option, ) -> Result { let paragraphs = match dataset { DatasetKind::SquadV2 => squad::convert_squad(raw_path)?, - DatasetKind::NaturalQuestions => { - nq::convert_nq(raw_path, include_unanswerable, context_token_limit)? + DatasetKind::NaturalQuestions => nq::convert_nq(raw_path, include_unanswerable)?, + DatasetKind::Beir => { + bail!( + "BEIR mix is prepared via slice-first subset stores; use prepare_beir_mix instead of convert" + ); } - DatasetKind::Beir => convert_beir_mix(include_unanswerable, context_token_limit)?, DatasetKind::Fever | DatasetKind::Fiqa | DatasetKind::HotpotQa @@ -501,11 +489,6 @@ pub fn convert( | DatasetKind::NqBeir => beir::convert_beir(raw_path, dataset)?, }; - let metadata_limit = match dataset { - DatasetKind::NaturalQuestions => None, - _ => context_token_limit, - }; - let generated_at = match dataset { DatasetKind::Beir | DatasetKind::Fever @@ -526,100 +509,12 @@ pub fn convert( Ok(ConvertedDataset { generated_at, - metadata: DatasetMetadata::for_kind(dataset, include_unanswerable, metadata_limit), + metadata: DatasetMetadata::for_kind(dataset, include_unanswerable), source: source_label, paragraphs, }) } -fn convert_beir_mix( - include_unanswerable: bool, - _context_token_limit: Option, -) -> Result> { - if include_unanswerable { - warn!("BEIR mix ignores 
include_unanswerable flag; all questions are answerable"); - } - - let mut paragraphs = Vec::new(); - for subset in BEIR_DATASETS { - let entry = dataset_entry_for_kind(subset)?; - let subset_paragraphs = beir::convert_beir(&entry.raw_path, subset)?; - paragraphs.extend(subset_paragraphs); - } - - Ok(paragraphs) -} - -fn ensure_parent(path: &Path) -> Result<()> { - if let Some(parent) = path.parent() { - fs::create_dir_all(parent) - .with_context(|| format!("creating parent directory for {}", path.display()))?; - } - Ok(()) -} - -pub fn write_converted(dataset: &ConvertedDataset, converted_path: &Path) -> Result<()> { - ensure_parent(converted_path)?; - let json = - serde_json::to_string_pretty(dataset).context("serialising converted dataset to JSON")?; - fs::write(converted_path, json) - .with_context(|| format!("writing converted dataset to {}", converted_path.display())) -} - -pub fn read_converted(converted_path: &Path) -> Result { - let raw = fs::read_to_string(converted_path) - .with_context(|| format!("reading converted dataset at {}", converted_path.display()))?; - let mut dataset: ConvertedDataset = serde_json::from_str(&raw) - .with_context(|| format!("parsing converted dataset at {}", converted_path.display()))?; - if dataset.metadata.id.trim().is_empty() { - dataset.metadata = default_metadata(); - } - if dataset.source.is_empty() { - dataset.source = converted_path.display().to_string(); - } - Ok(dataset) -} - -pub fn ensure_converted( - dataset_kind: DatasetKind, - raw_path: &Path, - converted_path: &Path, - force: bool, - include_unanswerable: bool, - context_token_limit: Option, -) -> Result { - if force || !converted_path.exists() { - let dataset = convert( - raw_path, - dataset_kind, - include_unanswerable, - context_token_limit, - )?; - write_converted(&dataset, converted_path)?; - return Ok(dataset); - } - - match read_converted(converted_path) { - Ok(dataset) - if dataset.metadata.id == dataset_kind.id() - && 
dataset.metadata.include_unanswerable == include_unanswerable - && dataset.metadata.context_token_limit == context_token_limit => - { - Ok(dataset) - } - _ => { - let dataset = convert( - raw_path, - dataset_kind, - include_unanswerable, - context_token_limit, - )?; - write_converted(&dataset, converted_path)?; - Ok(dataset) - } - } -} - pub fn base_timestamp() -> DateTime { Utc.with_ymd_and_hms(2023, 1, 1, 0, 0, 0).unwrap() } diff --git a/evaluations/src/datasets/nq.rs b/evaluations/src/datasets/nq.rs index a39956d..2b72682 100644 --- a/evaluations/src/datasets/nq.rs +++ b/evaluations/src/datasets/nq.rs @@ -16,11 +16,7 @@ use super::{ConvertedParagraph, ConvertedQuestion}; clippy::arithmetic_side_effects, clippy::cast_sign_loss )] -pub fn convert_nq( - raw_path: &Path, - include_unanswerable: bool, - _context_token_limit: Option, -) -> Result> { +pub fn convert_nq(raw_path: &Path, include_unanswerable: bool) -> Result> { #[allow(dead_code)] #[derive(Debug, Deserialize)] struct NqExample { diff --git a/evaluations/src/datasets/store.rs b/evaluations/src/datasets/store.rs new file mode 100644 index 0000000..a4e85d9 --- /dev/null +++ b/evaluations/src/datasets/store.rs @@ -0,0 +1,410 @@ +use std::{ + collections::{HashMap, HashSet}, + fs::{self, File, OpenOptions}, + io::{BufRead, BufReader, Write}, + path::{Path, PathBuf}, +}; + +use anyhow::{anyhow, Context, Result}; +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use tracing::info; + +use super::{ + checksum::store_aggregate_checksum, + ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetMetadata, +}; +use crate::slice; + +pub const SHARDED_STORE_VERSION: u32 = 1; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ShardedMeta { + pub version: u32, + pub generated_at: DateTime, + pub metadata: DatasetMetadata, + pub source: String, + pub paragraph_count: usize, + pub question_count: usize, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct 
QuestionRecord { + paragraph_id: String, + #[serde(flatten)] + question: ConvertedQuestion, +} + +#[derive(Debug, Clone)] +pub struct QuestionCatalog { + pub entries: Vec, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ConvertedLayout { + ShardedStore, + Missing, +} + +pub fn store_dir_for(converted_path: &Path) -> PathBuf { + converted_path + .parent() + .unwrap_or_else(|| Path::new(".")) + .join( + converted_path + .file_stem() + .map_or_else(|| "dataset".to_string(), |stem| stem.to_string_lossy().into()), + ) +} + +pub fn detect_layout(converted_path: &Path) -> ConvertedLayout { + let store_dir = store_dir_for(converted_path); + if store_dir.join("meta.json").is_file() { + ConvertedLayout::ShardedStore + } else { + ConvertedLayout::Missing + } +} + +fn paragraph_file_name(paragraph_id: &str) -> String { + format!("{}.json", slice::paragraph_storage_key(paragraph_id)) +} + +pub fn paragraph_path(store_dir: &Path, paragraph_id: &str) -> PathBuf { + store_dir + .join("paragraphs") + .join(paragraph_file_name(paragraph_id)) +} + +pub fn write_sharded(dataset: &ConvertedDataset, store_dir: &Path) -> Result { + if store_dir.exists() { + fs::remove_dir_all(store_dir) + .with_context(|| format!("clearing sharded store {}", store_dir.display()))?; + } + fs::create_dir_all(store_dir.join("paragraphs")) + .with_context(|| format!("creating sharded store {}", store_dir.display()))?; + + let question_count = dataset + .paragraphs + .iter() + .map(|paragraph| paragraph.questions.len()) + .sum::(); + + let meta = ShardedMeta { + version: SHARDED_STORE_VERSION, + generated_at: dataset.generated_at, + metadata: dataset.metadata.clone(), + source: dataset.source.clone(), + paragraph_count: dataset.paragraphs.len(), + question_count, + }; + let meta_path = store_dir.join("meta.json"); + fs::write( + &meta_path, + serde_json::to_vec_pretty(&meta).context("serialising sharded store metadata")?, + ) + .with_context(|| format!("writing sharded metadata {}", 
meta_path.display()))?; + + let mut questions_file = File::create(store_dir.join("questions.jsonl")) + .context("creating questions.jsonl for sharded store")?; + let mut paragraph_ids_file = File::create(store_dir.join("paragraph_ids.jsonl")) + .context("creating paragraph_ids.jsonl for sharded store")?; + + for paragraph in &dataset.paragraphs { + writeln!(paragraph_ids_file, "{}", paragraph.id) + .context("writing paragraph id to paragraph_ids.jsonl")?; + for question in &paragraph.questions { + let record = QuestionRecord { + paragraph_id: paragraph.id.clone(), + question: question.clone(), + }; + serde_json::to_writer(&mut questions_file, &record) + .context("writing question record to questions.jsonl")?; + questions_file.write_all(b"\n")?; + } + + let path = paragraph_path(store_dir, &paragraph.id); + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + fs::write( + &path, + serde_json::to_vec(paragraph).context("serialising sharded paragraph")?, + ) + .with_context(|| format!("writing sharded paragraph {}", path.display()))?; + } + + let digest = store_aggregate_checksum(store_dir)?; + info!( + store = %store_dir.display(), + paragraphs = dataset.paragraphs.len(), + questions = question_count, + checksum = %digest, + "Wrote sharded converted dataset" + ); + Ok(digest) +} + +pub fn read_meta(store_dir: &Path) -> Result { + let path = store_dir.join("meta.json"); + let raw = fs::read_to_string(&path) + .with_context(|| format!("reading sharded metadata {}", path.display()))?; + serde_json::from_str(&raw) + .with_context(|| format!("parsing sharded metadata {}", path.display())) +} + +pub fn content_checksum_for_layout(converted_path: &Path) -> Result { + match detect_layout(converted_path) { + ConvertedLayout::ShardedStore => { + crate::datasets::store_aggregate_checksum(&store_dir_for(converted_path)) + } + ConvertedLayout::Missing => Err(anyhow!( + "converted dataset missing at {}", + converted_path.display() + )), + } +} + +fn 
load_paragraph(store_dir: &Path, paragraph_id: &str) -> Result { + let path = paragraph_path(store_dir, paragraph_id); + let raw = fs::read(&path) + .with_context(|| format!("reading sharded paragraph {}", path.display()))?; + serde_json::from_slice(&raw) + .with_context(|| format!("parsing sharded paragraph {}", path.display())) +} + +fn load_paragraphs(store_dir: &Path, paragraph_ids: &[String]) -> Result> { + paragraph_ids + .iter() + .map(|paragraph_id| load_paragraph(store_dir, paragraph_id)) + .collect() +} + +pub fn load_sharded_partial(store_dir: &Path, paragraph_ids: &[String]) -> Result { + let meta = read_meta(store_dir)?; + let mut paragraphs = load_paragraphs(store_dir, paragraph_ids)?; + paragraphs.sort_by(|left, right| left.id.cmp(&right.id)); + Ok(ConvertedDataset { + generated_at: meta.generated_at, + metadata: meta.metadata, + source: meta.source, + paragraphs, + }) +} + +pub fn load_sharded_full(store_dir: &Path) -> Result { + let meta = read_meta(store_dir)?; + let ids = load_paragraph_ids(store_dir)?; + let paragraphs = load_paragraphs(store_dir, &ids)?; + Ok(ConvertedDataset { + generated_at: meta.generated_at, + metadata: meta.metadata, + source: meta.source, + paragraphs, + }) +} + +pub fn load_paragraph_ids_set(store_dir: &Path) -> Result> { + Ok(load_paragraph_ids(store_dir)?.into_iter().collect()) +} + +#[allow(clippy::arithmetic_side_effects)] +pub fn upsert_sharded_paragraphs( + store_dir: &Path, + paragraphs: &[ConvertedParagraph], +) -> Result<()> { + if paragraphs.is_empty() { + return Ok(()); + } + if !store_dir.join("meta.json").is_file() { + return Err(anyhow!( + "cannot upsert into missing sharded store at {}", + store_dir.display() + )); + } + + fs::create_dir_all(store_dir.join("paragraphs")) + .with_context(|| format!("creating paragraphs directory in {}", store_dir.display()))?; + + let existing = load_paragraph_ids_set(store_dir)?; + let questions_path = store_dir.join("questions.jsonl"); + let mut questions_file = 
OpenOptions::new() + .create(true) + .append(true) + .open(&questions_path) + .with_context(|| format!("opening question catalog {}", questions_path.display()))?; + + let mut ids_file = None; + let mut new_paragraphs = 0usize; + let mut new_questions = 0usize; + + for paragraph in paragraphs { + let is_new = !existing.contains(&paragraph.id); + let path = paragraph_path(store_dir, &paragraph.id); + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + fs::write( + &path, + serde_json::to_vec(paragraph).context("serialising sharded paragraph")?, + ) + .with_context(|| format!("writing sharded paragraph {}", path.display()))?; + + if is_new { + if ids_file.is_none() { + ids_file = Some( + OpenOptions::new() + .create(true) + .append(true) + .open(store_dir.join("paragraph_ids.jsonl")) + .context("opening paragraph_ids.jsonl for append")?, + ); + } + if let Some(file) = ids_file.as_mut() { + writeln!(file, "{}", paragraph.id).context("appending paragraph id")?; + } + new_paragraphs += 1; + + for question in &paragraph.questions { + let record = QuestionRecord { + paragraph_id: paragraph.id.clone(), + question: question.clone(), + }; + serde_json::to_writer(&mut questions_file, &record) + .context("writing question record to questions.jsonl")?; + questions_file.write_all(b"\n")?; + new_questions += 1; + } + } + } + + if new_paragraphs > 0 || new_questions > 0 { + let meta = read_meta(store_dir)?; + let updated = ShardedMeta { + paragraph_count: meta.paragraph_count + new_paragraphs, + question_count: meta.question_count + new_questions, + ..meta + }; + fs::write( + store_dir.join("meta.json"), + serde_json::to_vec_pretty(&updated).context("serialising updated sharded metadata")?, + )?; + store_aggregate_checksum(store_dir)?; + info!( + store = %store_dir.display(), + new_paragraphs, + new_questions, + "Upserted paragraphs into sharded converted store" + ); + } + + Ok(()) +} + +pub fn load_paragraph_ids(store_dir: &Path) -> Result> { + let path = 
store_dir.join("paragraph_ids.jsonl"); + let file = File::open(&path) + .with_context(|| format!("opening paragraph id index {}", path.display()))?; + let reader = BufReader::new(file); + reader + .lines() + .map(|line| { + line.context("reading paragraph id index line") + .and_then(|value| { + let trimmed = value.trim(); + if trimmed.is_empty() { + Err(anyhow!("empty paragraph id in index")) + } else { + Ok(trimmed.to_string()) + } + }) + }) + .collect() +} + +pub fn load_question_catalog(store_dir: &Path) -> Result { + let path = store_dir.join("questions.jsonl"); + let file = File::open(&path) + .with_context(|| format!("opening question catalog {}", path.display()))?; + let reader = BufReader::new(file); + let mut entries = Vec::new(); + for line in reader.lines() { + let line = line.context("reading question catalog line")?; + if line.trim().is_empty() { + continue; + } + let record: QuestionRecord = serde_json::from_str(&line) + .context("parsing question catalog record")?; + entries.push(record); + } + Ok(QuestionCatalog { entries }) +} + +pub fn build_dataset_from_catalog( + store_dir: &Path, + paragraph_ids: &HashSet, +) -> Result { + let catalog = load_question_catalog(store_dir)?; + let mut questions_by_paragraph: HashMap> = HashMap::new(); + for entry in catalog.entries { + if paragraph_ids.contains(&entry.paragraph_id) { + questions_by_paragraph + .entry(entry.paragraph_id.clone()) + .or_default() + .push(entry.question); + } + } + + let mut dataset = load_sharded_partial( + store_dir, + &paragraph_ids.iter().cloned().collect::>(), + )?; + for paragraph in &mut dataset.paragraphs { + if let Some(questions) = questions_by_paragraph.remove(&paragraph.id) { + paragraph.questions = questions; + } else { + paragraph.questions.clear(); + } + } + + Ok(dataset) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::datasets::{DatasetKind, DatasetMetadata}; + + fn sample_dataset() -> ConvertedDataset { + ConvertedDataset { + generated_at: Utc::now(), + metadata: 
DatasetMetadata::for_kind(DatasetKind::SquadV2, false), + source: "test".to_string(), + paragraphs: vec![ConvertedParagraph { + id: "p1".to_string(), + title: "Title".to_string(), + context: "Body".to_string(), + questions: vec![ConvertedQuestion { + id: "q1".to_string(), + question: "Question?".to_string(), + answers: vec!["Answer".to_string()], + is_impossible: false, + }], + }], + } + } + + #[test] + #[allow(clippy::indexing_slicing)] + fn sharded_round_trip() -> Result<()> { + let dir = tempfile::tempdir()?; + let store_dir = dir.path().join("sample"); + let dataset = sample_dataset(); + write_sharded(&dataset, &store_dir)?; + + let loaded = load_sharded_full(&store_dir)?; + assert_eq!(loaded.paragraphs.len(), 1); + assert_eq!(loaded.paragraphs[0].questions[0].id, "q1"); + Ok(()) + } +} diff --git a/evaluations/src/namespace.rs b/evaluations/src/db/connect.rs similarity index 67% rename from evaluations/src/namespace.rs rename to evaluations/src/db/connect.rs index ff39a98..d200c5e 100644 --- a/evaluations/src/namespace.rs +++ b/evaluations/src/db/connect.rs @@ -1,22 +1,22 @@ -//! Database namespace management utilities. - use anyhow::{anyhow, Context, Result}; use chrono::Utc; -use common::storage::{ - db::SurrealDbClient, - types::user::{Theme, User}, - types::StoredObject, +use common::{ + storage::{ + db::SurrealDbClient, + types::user::{Theme, User}, + types::StoredObject, + }, + utils::embedding::EmbeddingProvider, }; use serde::Deserialize; use tracing::{info, warn}; use crate::{ args::Config, + corpus::{self, CorpusHandle, CorpusManifest, NamespaceSeedRecord}, datasets, - snapshot::{self, DbSnapshotState}, }; -/// Connect to the evaluation database with fallback auth strategies. pub(crate) async fn connect_eval_db( config: &Config, namespace: &str, @@ -73,7 +73,6 @@ pub(crate) async fn connect_eval_db( } } -/// Check if the namespace contains any corpus data. 
pub(crate) async fn namespace_has_corpus(db: &SurrealDbClient) -> Result { #[derive(Deserialize)] struct CountRow { @@ -89,41 +88,52 @@ pub(crate) async fn namespace_has_corpus(db: &SurrealDbClient) -> Result { Ok(rows.first().map_or(0, |row| row.count) > 0) } -/// Determine if we can reuse an existing namespace based on cached state. +fn manifest_matches_runtime( + manifest: &CorpusManifest, + embedding_provider: &EmbeddingProvider, + ingestion_fingerprint: &str, +) -> bool { + let metadata = &manifest.metadata; + metadata.ingestion_fingerprint == ingestion_fingerprint + && metadata.embedding_backend == embedding_provider.backend_label() + && metadata.embedding_model == embedding_provider.model_code() + && metadata.embedding_dimension == embedding_provider.dimension() +} + #[allow(clippy::too_many_arguments)] pub(crate) async fn can_reuse_namespace( db: &SurrealDbClient, - descriptor: &snapshot::Descriptor, + manifest: &CorpusManifest, + embedding_provider: &EmbeddingProvider, namespace: &str, database: &str, - dataset_id: &str, - slice_id: &str, ingestion_fingerprint: &str, slice_case_count: usize, ) -> Result { - let Some(state) = descriptor.load_db_state().await? 
else { - info!("No namespace state recorded; reseeding corpus from cached shards"); + if !manifest_matches_runtime(manifest, embedding_provider, ingestion_fingerprint) { + info!("Corpus manifest metadata mismatch; rebuilding namespace from cached shards"); + return Ok(false); + } + + let Some(seed) = manifest.metadata.namespace_seed.as_ref() else { + info!("No namespace seed recorded in corpus manifest; reseeding"); return Ok(false); }; - if state.slice_case_count != slice_case_count { + if seed.slice_case_count != slice_case_count { info!( requested_cases = slice_case_count, - stored_cases = state.slice_case_count, - "Skipping live namespace reuse; cached state does not match requested window" + stored_cases = seed.slice_case_count, + "Skipping namespace reuse; case window mismatch" ); return Ok(false); } - if state.dataset_id != dataset_id - || state.slice_id != slice_id - || state.ingestion_fingerprint != ingestion_fingerprint - || state.namespace.as_deref() != Some(namespace) - || state.database.as_deref() != Some(database) - { + if seed.namespace != namespace || seed.database != database { info!( namespace, - database, "Cached namespace metadata mismatch; rebuilding corpus from ingestion cache" + database, + "Corpus manifest namespace metadata mismatch; reseeding" ); return Ok(false); } @@ -140,28 +150,20 @@ pub(crate) async fn can_reuse_namespace( } } -/// Record the current namespace state to allow future reuse checks. 
-pub(crate) async fn record_namespace_state( - descriptor: &snapshot::Descriptor, - dataset_id: &str, - slice_id: &str, - ingestion_fingerprint: &str, +pub(crate) async fn record_namespace_seed( + handle: &mut CorpusHandle, namespace: &str, database: &str, slice_case_count: usize, ) { - let state = DbSnapshotState { - dataset_id: dataset_id.to_string(), - slice_id: slice_id.to_string(), - ingestion_fingerprint: ingestion_fingerprint.to_string(), - snapshot_hash: descriptor.metadata_hash().to_string(), - updated_at: Utc::now(), - namespace: Some(namespace.to_string()), - database: Some(database.to_string()), + handle.manifest.metadata.namespace_seed = Some(NamespaceSeedRecord { + namespace: namespace.to_string(), + database: database.to_string(), slice_case_count, - }; - if let Err(err) = descriptor.store_db_state(&state).await { - warn!(error = %err, "Failed to record namespace state"); + seeded_at: Utc::now(), + }); + if let Err(err) = corpus::persist_corpus_manifest(handle) { + warn!(error = %err, "Failed to record namespace seed in corpus manifest"); } } @@ -185,8 +187,17 @@ fn sanitize_identifier(input: &str) -> String { cleaned } -/// Generate a default namespace name based on dataset and limit. -pub(crate) fn default_namespace(dataset_id: &str, limit: Option) -> String { +pub(crate) fn default_namespace( + dataset_id: &str, + limit: Option, + slice_id: Option<&str>, +) -> String { + if let Some(slice_id) = slice_id { + let sanitized = sanitize_identifier(slice_id); + if !sanitized.is_empty() { + return format!("eval_{sanitized}"); + } + } let dataset_component = sanitize_identifier(dataset_id); let limit_component = match limit { Some(value) if value > 0 => format!("limit{value}"), @@ -195,12 +206,10 @@ pub(crate) fn default_namespace(dataset_id: &str, limit: Option) -> Strin format!("eval_{dataset_component}_{limit_component}") } -/// Generate the default database name for evaluations. 
pub(crate) fn default_database() -> String { "retrieval_eval".to_string() } -/// Ensure the evaluation user exists in the database. pub(crate) async fn ensure_eval_user(db: &SurrealDbClient) -> Result { let timestamp = datasets::base_timestamp(); let user = User { @@ -225,3 +234,7 @@ pub(crate) async fn ensure_eval_user(db: &SurrealDbClient) -> Result { .context("storing evaluation user")?; Ok(user) } + +pub(crate) fn sanitize_model_code(code: &str) -> String { + sanitize_identifier(code) +} diff --git a/evaluations/src/db_helpers.rs b/evaluations/src/db/lifecycle.rs similarity index 75% rename from evaluations/src/db_helpers.rs rename to evaluations/src/db/lifecycle.rs index e154b73..00ded78 100644 --- a/evaluations/src/db_helpers.rs +++ b/evaluations/src/db/lifecycle.rs @@ -2,13 +2,6 @@ use anyhow::{Context, Result}; use common::storage::{db::SurrealDbClient, indexes::ensure_runtime}; use tracing::info; -// Helper functions for index management during namespace reseed -pub async fn remove_all_indexes(db: &SurrealDbClient) -> Result<()> { - let _ = db; - info!("Removing ALL indexes before namespace reseed (no-op placeholder)"); - Ok(()) -} - pub async fn recreate_indexes(db: &SurrealDbClient, dimension: usize) -> Result<()> { info!("Recreating ALL indexes after namespace reseed via shared runtime helper"); ensure_runtime(db, dimension) @@ -34,14 +27,39 @@ pub async fn reset_namespace(db: &SurrealDbClient, namespace: &str, database: &s Ok(()) } -// // Test helper to force index dimension change -// #[allow(dead_code)] -// pub async fn change_embedding_length_in_hnsw_indexes( -// db: &SurrealDbClient, -// dimension: usize, -// ) -> Result<()> { -// recreate_indexes(db, dimension).await -// } +#[allow(clippy::cast_precision_loss)] +pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> Result<()> { + let dummy_embedding: Vec = (0..dimension).map(|i| (i as f32).sin()).collect(); + + info!("Warming HNSW caches with sample queries"); + + let _ = 
db + .client + .query( + r#"SELECT chunk_id + FROM text_chunk_embedding + WHERE embedding <|1,1|> $embedding + LIMIT 5"#, + ) + .bind(("embedding", dummy_embedding.clone())) + .await + .context("warming text chunk HNSW cache")?; + + let _ = db + .client + .query( + r#"SELECT entity_id + FROM knowledge_entity_embedding + WHERE embedding <|1,1|> $embedding + LIMIT 5"#, + ) + .bind(("embedding", dummy_embedding)) + .await + .context("warming knowledge entity HNSW cache")?; + + info!("HNSW cache warming completed"); + Ok(()) +} #[cfg(test)] mod tests { diff --git a/evaluations/src/db/mod.rs b/evaluations/src/db/mod.rs new file mode 100644 index 0000000..ee57459 --- /dev/null +++ b/evaluations/src/db/mod.rs @@ -0,0 +1,9 @@ +mod connect; +mod lifecycle; + +pub(crate) use connect::{ + can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user, + namespace_has_corpus, record_namespace_seed, sanitize_model_code, +}; +pub use lifecycle::{recreate_indexes, reset_namespace}; +pub(crate) use lifecycle::warm_hnsw_cache; diff --git a/evaluations/src/eval.rs b/evaluations/src/eval.rs deleted file mode 100644 index e0abf4b..0000000 --- a/evaluations/src/eval.rs +++ /dev/null @@ -1,128 +0,0 @@ -//! Evaluation utilities module - re-exports from focused submodules. 
- -// Re-export types from the root types module -pub use crate::types::*; - -// Re-export from focused modules at crate root (crate-internal only) -pub(crate) use crate::cases::{cases_from_manifest, SeededCase}; -pub(crate) use crate::namespace::{ - can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user, - record_namespace_state, -}; -pub(crate) use crate::settings::{enforce_system_settings, load_or_init_system_settings}; - -use std::path::Path; - -use anyhow::{Context, Result}; -use common::storage::db::SurrealDbClient; -use tokio::io::AsyncWriteExt; -use tracing::info; - -use crate::{ - args::{self, Config}, - datasets::ConvertedDataset, - slice::{self}, -}; - -/// Grow the slice ledger to contain the target number of cases. -pub fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> { - let ledger_limit = ledger_target(config); - let slice_settings = slice::slice_config_with_limit(config, ledger_limit); - let slice = - slice::resolve_slice(dataset, &slice_settings).context("resolving dataset slice")?; - info!( - slice = slice.manifest.slice_id.as_str(), - cases = slice.manifest.case_count, - positives = slice.manifest.positive_paragraphs, - negatives = slice.manifest.negative_paragraphs, - total_paragraphs = slice.manifest.total_paragraphs, - "Slice ledger ready" - ); - println!( - "Slice `{}` now contains {} questions ({} positives, {} negatives)", - slice.manifest.slice_id, - slice.manifest.case_count, - slice.manifest.positive_paragraphs, - slice.manifest.negative_paragraphs - ); - Ok(()) -} - -pub(crate) fn ledger_target(config: &Config) -> Option { - match (config.slice_grow, config.limit) { - (Some(grow), Some(limit)) => Some(limit.max(grow)), - (Some(grow), None) => Some(grow), - (None, limit) => limit, - } -} - -pub(crate) async fn write_chunk_diagnostics(path: &Path, cases: &[CaseDiagnostics]) -> Result<()> { - args::ensure_parent(path)?; - let mut file = tokio::fs::File::create(path) - .await - 
.with_context(|| format!("creating diagnostics file {}", path.display()))?; - for case in cases { - let line = serde_json::to_vec(case).context("serialising chunk diagnostics entry")?; - file.write_all(&line).await?; - file.write_all(b"\n").await?; - } - file.flush().await?; - Ok(()) -} - -#[allow(clippy::cast_precision_loss)] -pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> Result<()> { - let dummy_embedding: Vec = (0..dimension).map(|i| (i as f32).sin()).collect(); - - info!("Warming HNSW caches with sample queries"); - - // Warm up chunk embedding index - just query the embedding table to load HNSW index - let _ = db - .client - .query( - r#"SELECT chunk_id - FROM text_chunk_embedding - WHERE embedding <|1,1|> $embedding - LIMIT 5"#, - ) - .bind(("embedding", dummy_embedding.clone())) - .await - .context("warming text chunk HNSW cache")?; - - // Warm up entity embedding index - let _ = db - .client - .query( - r#"SELECT entity_id - FROM knowledge_entity_embedding - WHERE embedding <|1,1|> $embedding - LIMIT 5"#, - ) - .bind(("embedding", dummy_embedding)) - .await - .context("warming knowledge entity HNSW cache")?; - - info!("HNSW cache warming completed"); - Ok(()) -} - -use chrono::{DateTime, SecondsFormat, Utc}; - -pub fn format_timestamp(timestamp: &DateTime) -> String { - timestamp.to_rfc3339_opts(SecondsFormat::Secs, true) -} - -pub(crate) fn sanitize_model_code(code: &str) -> String { - code.chars() - .map(|ch| { - if ch.is_ascii_alphanumeric() { - ch.to_ascii_lowercase() - } else { - '_' - } - }) - .collect() -} - -// Re-export run_evaluation from the pipeline module at crate root -pub use crate::pipeline::run_evaluation; diff --git a/evaluations/src/inspection.rs b/evaluations/src/inspection.rs index ba71f0b..cca57b9 100644 --- a/evaluations/src/inspection.rs +++ b/evaluations/src/inspection.rs @@ -1,13 +1,13 @@ use std::{ collections::HashMap, fs, - path::{Path, PathBuf}, + path::Path, }; use anyhow::{anyhow, Context, 
Result}; use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk}; -use crate::{args::Config, corpus, eval::connect_eval_db, snapshot::DbSnapshotState}; +use crate::{args::Config, corpus, db::connect_eval_db}; pub async fn inspect_question(config: &Config) -> Result<()> { let question_id = config @@ -64,39 +64,26 @@ pub async fn inspect_question(config: &Config) -> Result<()> { ); } - let db_state_path = config - .database - .inspect_db_state - .clone() - .unwrap_or_else(|| default_state_path(config, &manifest)); - if let Some(state) = load_db_state(&db_state_path)? { - if let (Some(ns), Some(db_name)) = (state.namespace.as_deref(), state.database.as_deref()) { - match connect_eval_db(config, ns, db_name).await { - Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? { - MissingChunks::None => println!( - "All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'" - ), - MissingChunks::Missing(list) => println!( - "Missing chunks in namespace '{ns}', database '{db_name}': {list:?}" - ), - }, - Err(err) => { - println!( - "Failed to connect to SurrealDB namespace '{ns}' / database '{db_name}': {err}" - ); - } + if let Some(seed) = manifest.metadata.namespace_seed.as_ref() { + let ns = seed.namespace.as_str(); + let db_name = seed.database.as_str(); + match connect_eval_db(config, ns, db_name).await { + Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? 
{ + MissingChunks::None => println!( + "All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'" + ), + MissingChunks::Missing(list) => println!( + "Missing chunks in namespace '{ns}', database '{db_name}': {list:?}" + ), + }, + Err(err) => { + println!( + "Failed to connect to SurrealDB namespace '{ns}' / database '{db_name}': {err}" + ); } - } else { - println!( - "State file {} is missing namespace/database fields; skipping live DB validation", - db_state_path.display() - ); } } else { - println!( - "State file {} not found; skipping live DB validation", - db_state_path.display() - ); + println!("Corpus manifest has no namespace seed; skipping live DB validation"); } Ok(()) @@ -137,25 +124,6 @@ fn build_chunk_lookup(manifest: &corpus::CorpusManifest) -> HashMap PathBuf { - config - .cache_dir - .join("snapshots") - .join(&manifest.metadata.dataset_id) - .join(&manifest.metadata.slice_id) - .join("db/state.json") -} - -fn load_db_state(path: &Path) -> Result> { - if !path.exists() { - return Ok(None); - } - let bytes = fs::read(path).with_context(|| format!("reading db state {}", path.display()))?; - let state = serde_json::from_slice(&bytes) - .with_context(|| format!("parsing db state {}", path.display()))?; - Ok(Some(state)) -} - enum MissingChunks { None, Missing(Vec), diff --git a/evaluations/src/main.rs b/evaluations/src/main.rs index 196878c..087c73c 100644 --- a/evaluations/src/main.rs +++ b/evaluations/src/main.rs @@ -1,19 +1,17 @@ mod args; -mod cache; +mod context_stats; mod cases; +mod cli; mod corpus; mod datasets; -mod db_helpers; -mod eval; +mod db; mod inspection; -mod namespace; mod openai; mod perf; mod pipeline; mod report; mod settings; mod slice; -mod snapshot; mod types; use anyhow::Context; @@ -24,7 +22,6 @@ use tracing_subscriber::{fmt, EnvFilter}; /// Configure `SurrealDB` environment variables for optimal performance #[allow(clippy::arithmetic_side_effects, clippy::unwrap_used)] fn 
configure_surrealdb_performance(cpu_count: usize) { - // Set environment variables only if they're not already set let indexing_batch_size = std::env::var("SURREAL_INDEXING_BATCH_SIZE") .unwrap_or_else(|_| (cpu_count * 2).to_string()); std::env::set_var("SURREAL_INDEXING_BATCH_SIZE", indexing_batch_size); @@ -62,12 +59,11 @@ fn configure_surrealdb_performance(cpu_count: usize) { } fn main() -> anyhow::Result<()> { - // Create an explicit multi-threaded runtime with optimized configuration let runtime = Builder::new_multi_thread() .enable_all() .worker_threads(std::thread::available_parallelism()?.get()) .max_blocking_threads(std::thread::available_parallelism()?.get()) - .thread_stack_size(10 * 1024 * 1024) // 10MiB stack size + .thread_stack_size(10 * 1024 * 1024) .thread_name("eval-retrieval-worker") .build() .context("failed to create tokio runtime")?; @@ -77,7 +73,6 @@ fn main() -> anyhow::Result<()> { #[allow(clippy::too_many_lines)] async fn async_main() -> anyhow::Result<()> { - // Log runtime configuration let cpu_count = std::thread::available_parallelism()?.get(); info!( cpu_cores = cpu_count, @@ -87,7 +82,6 @@ async fn async_main() -> anyhow::Result<()> { "Started multi-threaded tokio runtime" ); - // Configure SurrealDB environment variables for better performance configure_surrealdb_performance(cpu_count); let filter = std::env::var("RUST_LOG").unwrap_or_else(|_| "info".to_string()); @@ -97,13 +91,22 @@ async fn async_main() -> anyhow::Result<()> { let parsed = args::parse()?; - // Clap handles help automatically, so we don't need to check for it manually - if parsed.config.inspect_question.is_some() { inspection::inspect_question(&parsed.config).await?; return Ok(()); } + if parsed.config.status { + let status = cli::collect_status(&parsed.config).await?; + cli::print_status(&status); + return Ok(()); + } + + if parsed.config.warm { + cli::warm(&parsed.config).await?; + return Ok(()); + } + let dataset_kind = parsed.config.dataset; if 
parsed.config.convert_only { @@ -115,7 +118,6 @@ async fn async_main() -> anyhow::Result<()> { parsed.config.raw_dataset_path.as_path(), dataset_kind, parsed.config.llm_mode, - parsed.config.context_token_limit(), ) .with_context(|| { format!( @@ -124,56 +126,56 @@ async fn async_main() -> anyhow::Result<()> { parsed.config.raw_dataset_path.display() ) })?; - crate::datasets::write_converted(&dataset, parsed.config.converted_dataset_path.as_path()) - .with_context(|| { - format!( - "writing converted dataset to {}", - parsed.config.converted_dataset_path.display() - ) - })?; + let store_dir = datasets::store_dir_for(&parsed.config.converted_dataset_path); + datasets::write_sharded(&dataset, &store_dir)?; + datasets::prebuild_catalog_slices(&dataset, &parsed.config)?; println!( - "Converted dataset written to {}", - parsed.config.converted_dataset_path.display() + "Converted dataset written under {}", + store_dir.display() ); return Ok(()); } + if parsed.config.require_ready { + cli::ensure_query_ready(&parsed.config).await?; + } + info!(dataset = dataset_kind.id(), "Preparing converted dataset"); - let dataset = crate::datasets::ensure_converted( - dataset_kind, - parsed.config.raw_dataset_path.as_path(), - parsed.config.converted_dataset_path.as_path(), - parsed.config.force_convert, - parsed.config.llm_mode, - parsed.config.context_token_limit(), - ) - .with_context(|| { - format!( - "preparing converted dataset at {}", - parsed.config.converted_dataset_path.display() - ) - })?; + let loaded = crate::datasets::prepare_dataset(dataset_kind, &parsed.config).with_context( + || { + format!( + "preparing converted dataset at {}", + parsed.config.converted_dataset_path.display() + ) + }, + )?; info!( - questions = dataset + questions = loaded + .dataset .paragraphs .iter() .map(|p| p.questions.len()) .sum::(), - paragraphs = dataset.paragraphs.len(), - dataset = dataset.metadata.id.as_str(), + paragraphs = loaded.dataset.paragraphs.len(), + partial = loaded.partial, + 
dataset = loaded.dataset.metadata.id.as_str(), "Dataset ready" ); if parsed.config.slice_grow.is_some() { - eval::grow_slice(&dataset, &parsed.config).context("growing slice ledger")?; + slice::grow_slice(&loaded.dataset, &parsed.config).context("growing slice ledger")?; return Ok(()); } info!("Running retrieval evaluation"); - let summary = eval::run_evaluation(&dataset, &parsed.config) - .await - .context("running retrieval evaluation")?; + let summary = pipeline::run_evaluation( + &loaded.dataset, + &parsed.config, + Some(loaded.content_checksum.as_str()), + ) + .await + .context("running retrieval evaluation")?; let report = report::write_reports( &summary, @@ -226,12 +228,17 @@ async fn async_main() -> anyhow::Result<()> { ); } else { println!( - "[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) → JSON: {json} | Markdown: {md} | History: {history}{perf_note}", + "[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) | Retrieved context: {chunks} chunks, {tokens} tokens ({tokenizer}, avg {avg_tokens:.0}/query, p95 {p95}) → JSON: {json} | Markdown: {md} | History: {history}{perf_note}", summary.dataset_label, k = summary.k, precision = summary.precision, correct = summary.correct, retrieval_total = summary.retrieval_cases, + chunks = summary.retrieved_context.total_chunks, + tokens = summary.retrieved_context.total_tokens, + tokenizer = summary.retrieved_context.tokenizer, + avg_tokens = summary.retrieved_context.avg_tokens_per_query, + p95 = summary.retrieved_context.p95_tokens_per_query, json = report.paths.json.display(), md = report.paths.markdown.display(), history = report.history_path.display(), diff --git a/evaluations/src/openai.rs b/evaluations/src/openai.rs index 7c5e644..1928dc6 100644 --- a/evaluations/src/openai.rs +++ b/evaluations/src/openai.rs @@ -1,9 +1,27 @@ +use std::sync::Arc; + use anyhow::{Context, Result}; use async_openai::{config::OpenAIConfig, Client}; const DEFAULT_BASE_URL: &str = 
"https://api.openai.com/v1"; -pub fn build_client_from_env() -> Result<(Client, String)> { +pub fn ingestion_openai_client( + include_entities: bool, +) -> Result<(Arc>, Option)> { + if include_entities { + let (client, base_url) = build_client_from_env().context( + "OPENAI_API_KEY must be set when --include-entities is enabled (entity extraction uses OpenAI)", + )?; + Ok((Arc::new(client), Some(base_url))) + } else { + Ok(( + Arc::new(Client::with_config(OpenAIConfig::default())), + None, + )) + } +} + +fn build_client_from_env() -> Result<(Client, String)> { let api_key = std::env::var("OPENAI_API_KEY") .context("OPENAI_API_KEY must be set to run retrieval evaluations")?; let base_url = diff --git a/evaluations/src/perf.rs b/evaluations/src/perf.rs index dc29036..de12810 100644 --- a/evaluations/src/perf.rs +++ b/evaluations/src/perf.rs @@ -7,8 +7,8 @@ use anyhow::{Context, Result}; use crate::{ args, - eval::EvaluationSummary, report::{self, EvaluationReport}, + types::EvaluationSummary, }; pub fn mirror_perf_outputs( @@ -91,23 +91,23 @@ fn format_duration(value: Option) -> String { #[cfg(test)] mod tests { use super::*; - use crate::eval::{EvaluationStageTimings, PerformanceTimings}; + use crate::types::{EvaluationStageTimings, PerformanceTimings, LatencyStats, StageLatency, StageLatencyBreakdown}; use chrono::Utc; use tempfile::tempdir; - fn sample_latency() -> crate::eval::LatencyStats { - crate::eval::LatencyStats { + fn sample_latency() -> LatencyStats { + LatencyStats { avg: 10.0, p50: 8, p95: 15, } } - fn sample_stage_latency() -> crate::eval::StageLatencyBreakdown { - crate::eval::StageLatencyBreakdown { + fn sample_stage_latency() -> StageLatencyBreakdown { + StageLatencyBreakdown { stages: ["embed", "search", "rerank", "resolve_entities", "assemble"] .into_iter() - .map(|stage| crate::eval::StageLatency { + .map(|stage| StageLatency { stage: stage.to_string(), stats: sample_latency(), }) @@ -206,6 +206,7 @@ mod tests { chunk_vector_take: 20, 
chunk_fts_take: 20, max_chunks_per_entity: 4, + retrieved_context: crate::context_stats::aggregate_context_stats(&[]), cases: Vec::new(), } } diff --git a/evaluations/src/pipeline/context.rs b/evaluations/src/pipeline/context.rs index 9f2fb9c..08d8723 100644 --- a/evaluations/src/pipeline/context.rs +++ b/evaluations/src/pipeline/context.rs @@ -20,11 +20,11 @@ use retrieval_pipeline::{ use crate::{ args::Config, - cache::EmbeddingCache, + cases::SeededCase, corpus, datasets::ConvertedDataset, - eval::{CaseDiagnostics, CaseSummary, EvaluationStageTimings, EvaluationSummary, SeededCase}, - slice, snapshot, + slice, + types::{CaseDiagnostics, CaseSummary, EvaluationStageTimings, EvaluationSummary}, }; #[allow(clippy::struct_excessive_bools)] @@ -41,12 +41,10 @@ pub(super) struct EvaluationContext<'a> { pub namespace: String, pub database: String, pub db: Option, - pub descriptor: Option, pub settings: Option, pub settings_missing: bool, pub must_reapply_settings: bool, pub embedding_provider: Option, - pub embedding_cache: Option, pub openai_client: Option>>, pub openai_base_url: Option, pub expected_fingerprint: Option, @@ -67,13 +65,19 @@ pub(super) struct EvaluationContext<'a> { pub summary: Option, pub diagnostics_path: Option, pub diagnostics_enabled: bool, + pub content_checksum: Option, } impl<'a> EvaluationContext<'a> { - pub fn new(dataset: &'a ConvertedDataset, config: &'a Config) -> Self { + pub fn new( + dataset: &'a ConvertedDataset, + config: &'a Config, + content_checksum: Option, + ) -> Self { Self { dataset, config, + content_checksum, stage_timings: EvaluationStageTimings::default(), ledger_limit: None, slice_settings: None, @@ -84,12 +88,10 @@ impl<'a> EvaluationContext<'a> { namespace: String::new(), database: String::new(), db: None, - descriptor: None, settings: None, settings_missing: false, must_reapply_settings: false, embedding_provider: None, - embedding_cache: None, openai_client: None, openai_base_url: None, expected_fingerprint: None, @@ 
-133,12 +135,6 @@ impl<'a> EvaluationContext<'a> { .ok_or_else(|| anyhow!("database connection missing")) } - pub fn descriptor(&self) -> Result<&snapshot::Descriptor> { - self.descriptor - .as_ref() - .ok_or_else(|| anyhow!("snapshot descriptor unavailable")) - } - pub fn embedding_provider(&self) -> Result<&EmbeddingProvider> { self.embedding_provider .as_ref() @@ -159,6 +155,10 @@ impl<'a> EvaluationContext<'a> { .ok_or_else(|| anyhow!("corpus handle missing")) } + pub fn content_checksum(&self) -> Option<&str> { + self.content_checksum.as_deref() + } + pub fn evaluation_user(&self) -> Result<&User> { self.eval_user .as_ref() diff --git a/evaluations/src/pipeline/diagnostics.rs b/evaluations/src/pipeline/diagnostics.rs new file mode 100644 index 0000000..94bf940 --- /dev/null +++ b/evaluations/src/pipeline/diagnostics.rs @@ -0,0 +1,20 @@ +use std::path::Path; + +use anyhow::{Context, Result}; +use tokio::io::AsyncWriteExt; + +use crate::{args, types::CaseDiagnostics}; + +pub(crate) async fn write_chunk_diagnostics(path: &Path, cases: &[CaseDiagnostics]) -> Result<()> { + args::ensure_parent(path)?; + let mut file = tokio::fs::File::create(path) + .await + .with_context(|| format!("creating diagnostics file {}", path.display()))?; + for case in cases { + let line = serde_json::to_vec(case).context("serialising chunk diagnostics entry")?; + file.write_all(&line).await?; + file.write_all(b"\n").await?; + } + file.flush().await?; + Ok(()) +} diff --git a/evaluations/src/pipeline/mod.rs b/evaluations/src/pipeline/mod.rs index d8386b5..f154e2a 100644 --- a/evaluations/src/pipeline/mod.rs +++ b/evaluations/src/pipeline/mod.rs @@ -1,6 +1,6 @@ mod context; +mod diagnostics; mod stages; -mod state; use anyhow::Result; @@ -8,20 +8,49 @@ use crate::{args::Config, datasets::ConvertedDataset, types::EvaluationSummary}; use context::EvaluationContext; +async fn run_through_namespace<'a>( + dataset: &'a ConvertedDataset, + config: &'a Config, + content_checksum: Option, +) -> 
Result> { + let mut ctx = EvaluationContext::new(dataset, config, content_checksum); + stages::prepare_slice(&mut ctx).await?; + stages::prepare_db(&mut ctx).await?; + stages::prepare_corpus(&mut ctx).await?; + stages::prepare_namespace(&mut ctx).await?; + Ok(ctx) +} + +pub async fn warm_evaluation( + dataset: &ConvertedDataset, + config: &Config, + content_checksum: &str, +) -> Result<()> { + let _ctx = run_through_namespace( + dataset, + config, + Some(content_checksum.to_string()), + ) + .await?; + Ok(()) +} + pub async fn run_evaluation( dataset: &ConvertedDataset, config: &Config, + content_checksum: Option<&str>, ) -> Result { - let mut ctx = EvaluationContext::new(dataset, config); - let machine = state::ready(); - - let machine = stages::prepare_slice(machine, &mut ctx).await?; - let machine = stages::prepare_db(machine, &mut ctx).await?; - let machine = stages::prepare_corpus(machine, &mut ctx).await?; - let machine = stages::prepare_namespace(machine, &mut ctx).await?; - let machine = stages::run_queries(machine, &mut ctx).await?; - let machine = stages::summarize(machine, &mut ctx).await?; - let _ = stages::finalize(machine, &mut ctx).await?; - + let mut ctx = EvaluationContext::new( + dataset, + config, + content_checksum.map(str::to_string), + ); + stages::prepare_slice(&mut ctx).await?; + stages::prepare_db(&mut ctx).await?; + stages::prepare_corpus(&mut ctx).await?; + stages::prepare_namespace(&mut ctx).await?; + stages::run_queries(&mut ctx).await?; + stages::summarize(&mut ctx).await?; + stages::finalize(&mut ctx).await?; ctx.into_summary() } diff --git a/evaluations/src/pipeline/stages/finalize.rs b/evaluations/src/pipeline/stages/finalize.rs index b54708b..82d8e53 100644 --- a/evaluations/src/pipeline/stages/finalize.rs +++ b/evaluations/src/pipeline/stages/finalize.rs @@ -3,18 +3,12 @@ use std::time::Instant; use anyhow::Context; use tracing::info; -use crate::eval::write_chunk_diagnostics; - use super::super::{ context::{EvalStage, 
EvaluationContext}, - state::{Completed, EvaluationMachine, Summarized}, + diagnostics::write_chunk_diagnostics, }; -use super::{map_guard_error, StageResult}; -pub(crate) async fn finalize( - machine: EvaluationMachine<(), Summarized>, - ctx: &mut EvaluationContext<'_>, -) -> StageResult { +pub(crate) async fn finalize(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> { let stage = EvalStage::Finalize; info!( evaluation_stage = stage.label(), @@ -22,13 +16,6 @@ pub(crate) async fn finalize( ); let started = Instant::now(); - if let Some(cache) = ctx.embedding_cache.as_ref() { - cache - .persist() - .await - .context("persisting embedding cache")?; - } - if let Some(path) = ctx.diagnostics_path.as_ref() { if ctx.diagnostics_enabled { write_chunk_diagnostics(path.as_path(), &ctx.diagnostics_output) @@ -53,7 +40,5 @@ pub(crate) async fn finalize( "completed evaluation stage" ); - machine - .finalize() - .map_err(|(_, guard)| map_guard_error("finalize", &guard)) + Ok(()) } diff --git a/evaluations/src/pipeline/stages/mod.rs b/evaluations/src/pipeline/stages/mod.rs index 356b532..99b35cc 100644 --- a/evaluations/src/pipeline/stages/mod.rs +++ b/evaluations/src/pipeline/stages/mod.rs @@ -13,14 +13,3 @@ pub(crate) use prepare_namespace::prepare_namespace; pub(crate) use prepare_slice::prepare_slice; pub(crate) use run_queries::run_queries; pub(crate) use summarize::summarize; - -use anyhow::Result; -use state_machines::core::GuardError; - -use super::state::EvaluationMachine; - -fn map_guard_error(event: &str, guard: &GuardError) -> anyhow::Error { - anyhow::anyhow!("invalid evaluation pipeline transition during {event}: {guard:?}") -} - -type StageResult = Result>; diff --git a/evaluations/src/pipeline/stages/prepare_corpus.rs b/evaluations/src/pipeline/stages/prepare_corpus.rs index a8a16f1..d5da651 100644 --- a/evaluations/src/pipeline/stages/prepare_corpus.rs +++ b/evaluations/src/pipeline/stages/prepare_corpus.rs @@ -3,19 +3,12 @@ use std::time::Instant; use 
anyhow::Context; use tracing::info; -use crate::{corpus, eval::can_reuse_namespace, slice, snapshot}; +use crate::{corpus, db::can_reuse_namespace, slice}; -use super::super::{ - context::{EvalStage, EvaluationContext}, - state::{CorpusReady, DbReady, EvaluationMachine}, -}; -use super::{map_guard_error, StageResult}; +use super::super::context::{EvalStage, EvaluationContext}; #[allow(clippy::too_many_lines)] -pub(crate) async fn prepare_corpus( - machine: EvaluationMachine<(), DbReady>, - ctx: &mut EvaluationContext<'_>, -) -> StageResult { +pub(crate) async fn prepare_corpus(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> { let stage = EvalStage::PrepareCorpus; info!( evaluation_stage = stage.label(), @@ -31,13 +24,13 @@ pub(crate) async fn prepare_corpus( let window = slice::select_window(slice, ctx.config().slice_offset, ctx.config().limit) .context("selecting slice window for corpus preparation")?; - let descriptor = snapshot::Descriptor::new(config, slice, ctx.embedding_provider()?); let ingestion_config = corpus::make_ingestion_config(config); let expected_fingerprint = corpus::compute_ingestion_fingerprint( ctx.dataset(), slice, config.converted_dataset_path.as_path(), &ingestion_config, + ctx.content_checksum(), )?; let base_dir = corpus::cached_corpus_dir( &cache_settings, @@ -47,19 +40,18 @@ pub(crate) async fn prepare_corpus( if !config.reseed_slice { let requested_cases = window.cases.len(); - if can_reuse_namespace( - ctx.db()?, - &descriptor, - &ctx.namespace, - &ctx.database, - ctx.dataset().metadata.id.as_str(), - slice.manifest.slice_id.as_str(), - expected_fingerprint.as_str(), - requested_cases, - ) - .await? - { - if let Some(manifest) = corpus::load_cached_manifest(&base_dir)? { + if let Some(manifest) = corpus::load_cached_manifest(&base_dir)? { + if can_reuse_namespace( + ctx.db()?, + &manifest, + &embedding_provider, + &ctx.namespace, + &ctx.database, + expected_fingerprint.as_str(), + requested_cases, + ) + .await? 
+ { info!( cache = %base_dir.display(), namespace = ctx.namespace.as_str(), @@ -70,7 +62,6 @@ pub(crate) async fn prepare_corpus( ctx.corpus_handle = Some(corpus_handle); ctx.expected_fingerprint = Some(expected_fingerprint); ctx.ingestion_duration_ms = 0; - ctx.descriptor = Some(descriptor); let elapsed = started.elapsed(); ctx.record_stage_duration(stage, elapsed); @@ -80,14 +71,8 @@ pub(crate) async fn prepare_corpus( "completed evaluation stage" ); - return machine - .prepare_corpus() - .map_err(|(_, guard)| map_guard_error("prepare_corpus", &guard)); + return Ok(()); } - info!( - cache = %base_dir.display(), - "Namespace reusable but cached manifest missing; regenerating corpus" - ); } } @@ -103,6 +88,7 @@ pub(crate) async fn prepare_corpus( openai_client, &eval_user_id, config.converted_dataset_path.as_path(), + ctx.content_checksum(), ingestion_config.clone(), ) .await @@ -126,7 +112,6 @@ pub(crate) async fn prepare_corpus( ctx.corpus_handle = Some(corpus_handle); ctx.expected_fingerprint = Some(expected_fingerprint); ctx.ingestion_duration_ms = ingestion_duration_ms; - ctx.descriptor = Some(descriptor); let elapsed = started.elapsed(); ctx.record_stage_duration(stage, elapsed); @@ -136,7 +121,5 @@ pub(crate) async fn prepare_corpus( "completed evaluation stage" ); - machine - .prepare_corpus() - .map_err(|(_, guard)| map_guard_error("prepare_corpus", &guard)) + Ok(()) } diff --git a/evaluations/src/pipeline/stages/prepare_db.rs b/evaluations/src/pipeline/stages/prepare_db.rs index 01eff64..3eb09cb 100644 --- a/evaluations/src/pipeline/stages/prepare_db.rs +++ b/evaluations/src/pipeline/stages/prepare_db.rs @@ -1,28 +1,19 @@ -use std::{sync::Arc, time::Instant}; +use std::time::Instant; use anyhow::{anyhow, Context}; use tracing::info; use crate::{ args::EmbeddingBackend, - cache::EmbeddingCache, - eval::{ - connect_eval_db, enforce_system_settings, load_or_init_system_settings, sanitize_model_code, - }, + db::{connect_eval_db, sanitize_model_code}, openai, 
+ settings::{enforce_system_settings, load_or_init_system_settings}, }; use common::utils::embedding::{default_embedding_pool_size, EmbeddingProvider}; -use super::super::{ - context::{EvalStage, EvaluationContext}, - state::{DbReady, EvaluationMachine, SlicePrepared}, -}; -use super::{map_guard_error, StageResult}; +use super::super::context::{EvalStage, EvaluationContext}; -pub(crate) async fn prepare_db( - machine: EvaluationMachine<(), SlicePrepared>, - ctx: &mut EvaluationContext<'_>, -) -> StageResult { +pub(crate) async fn prepare_db(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> { let stage = EvalStage::PrepareDb; info!( evaluation_stage = stage.label(), @@ -36,19 +27,18 @@ pub(crate) async fn prepare_db( let db = connect_eval_db(config, &namespace, &database).await?; - let (raw_openai_client, openai_base_url) = - openai::build_client_from_env().context("building OpenAI client")?; - let openai_client = Arc::new(raw_openai_client); + let (openai_client, openai_base_url) = + openai::ingestion_openai_client(config.ingest.include_entities) + .context("building OpenAI client for ingestion")?; - // Create embedding provider directly from config (eval only supports FastEmbed and Hashed) let embedding_provider = match config.embedding_backend { - crate::args::EmbeddingBackend::FastEmbed => EmbeddingProvider::new_fastembed( + EmbeddingBackend::FastEmbed => EmbeddingProvider::new_fastembed( config.embedding_model.clone(), default_embedding_pool_size(), ) .await .context("creating FastEmbed provider")?, - crate::args::EmbeddingBackend::Hashed => { + EmbeddingBackend::Hashed => { EmbeddingProvider::new_hashed(1536).context("creating Hashed provider")? 
} }; @@ -68,12 +58,14 @@ pub(crate) async fn prepare_db( dimension = provider_dimension, "Embedding provider initialised" ); - info!(openai_base_url = %openai_base_url, "OpenAI client configured"); + if let Some(base_url) = &openai_base_url { + info!(openai_base_url = %base_url, "OpenAI client configured for entity ingestion"); + } let (mut settings, settings_missing) = load_or_init_system_settings(&db, provider_dimension).await?; - let embedding_cache = if config.embedding_backend == EmbeddingBackend::FastEmbed { + if config.embedding_backend == EmbeddingBackend::FastEmbed { if let Some(model_code) = embedding_provider.model_code() { let sanitized = sanitize_model_code(&model_code); let path = config.cache_dir.join(format!("{sanitized}.json")); @@ -83,15 +75,8 @@ pub(crate) async fn prepare_db( .with_context(|| format!("removing stale cache {}", path.display())) .ok(); } - let cache = EmbeddingCache::load(&path).await?; - info!(path = %path.display(), "Embedding cache ready"); - Some(cache) - } else { - None } - } else { - None - }; + } let must_reapply_settings = settings_missing; let defer_initial_enforce = settings_missing && !config.reseed_slice; @@ -104,9 +89,8 @@ pub(crate) async fn prepare_db( ctx.must_reapply_settings = must_reapply_settings; ctx.settings = Some(settings); ctx.embedding_provider = Some(embedding_provider); - ctx.embedding_cache = embedding_cache; ctx.openai_client = Some(openai_client); - ctx.openai_base_url = Some(openai_base_url); + ctx.openai_base_url = openai_base_url; let elapsed = started.elapsed(); ctx.record_stage_duration(stage, elapsed); @@ -116,7 +100,5 @@ pub(crate) async fn prepare_db( "completed evaluation stage" ); - machine - .prepare_db() - .map_err(|(_, guard)| map_guard_error("prepare_db", &guard)) + Ok(()) } diff --git a/evaluations/src/pipeline/stages/prepare_namespace.rs b/evaluations/src/pipeline/stages/prepare_namespace.rs index 1af78d0..8eb635d 100644 --- a/evaluations/src/pipeline/stages/prepare_namespace.rs +++ 
b/evaluations/src/pipeline/stages/prepare_namespace.rs @@ -5,25 +5,19 @@ use common::storage::types::system_settings::SystemSettings; use tracing::{info, warn}; use crate::{ + cases::cases_from_manifest, corpus, - db_helpers::{recreate_indexes, remove_all_indexes, reset_namespace}, - eval::{ - can_reuse_namespace, cases_from_manifest, enforce_system_settings, ensure_eval_user, - record_namespace_state, warm_hnsw_cache, + db::{ + can_reuse_namespace, ensure_eval_user, record_namespace_seed, recreate_indexes, + reset_namespace, warm_hnsw_cache, }, + settings::enforce_system_settings, }; -use super::super::{ - context::{EvalStage, EvaluationContext}, - state::{CorpusReady, EvaluationMachine, NamespaceReady}, -}; -use super::{map_guard_error, StageResult}; +use super::super::context::{EvalStage, EvaluationContext}; #[allow(clippy::too_many_lines)] -pub(crate) async fn prepare_namespace( - machine: EvaluationMachine<(), CorpusReady>, - ctx: &mut EvaluationContext<'_>, -) -> StageResult { +pub(crate) async fn prepare_namespace(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> { let stage = EvalStage::PrepareNamespace; info!( evaluation_stage = stage.label(), @@ -32,7 +26,6 @@ pub(crate) async fn prepare_namespace( let started = Instant::now(); let config = ctx.config(); - let dataset = ctx.dataset(); let expected_fingerprint = ctx .expected_fingerprint .as_deref() @@ -60,20 +53,16 @@ pub(crate) async fn prepare_namespace( let mut namespace_reused = false; if !config.reseed_slice { - namespace_reused = { - let slice = ctx.slice()?; - can_reuse_namespace( - ctx.db()?, - ctx.descriptor()?, - &namespace, - &database, - dataset.metadata.id.as_str(), - slice.manifest.slice_id.as_str(), - expected_fingerprint.as_str(), - requested_cases, - ) - .await? 
- }; + namespace_reused = can_reuse_namespace( + ctx.db()?, + base_manifest, + &embedding_provider, + &namespace, + &database, + expected_fingerprint.as_str(), + requested_cases, + ) + .await?; } let mut namespace_seed_ms = None; @@ -114,34 +103,20 @@ pub(crate) async fn prepare_namespace( "Seeding ingestion corpus into SurrealDB" ); } - let indexes_disabled = remove_all_indexes(ctx.db()?).await.is_ok(); - let seed_start = Instant::now(); corpus::seed_manifest_into_db(ctx.db()?, &manifest_for_seed) .await .context("seeding ingestion corpus from manifest")?; namespace_seed_ms = Some(seed_start.elapsed().as_millis()); - // Recreate indexes AFTER data is loaded (correct bulk loading pattern) - if indexes_disabled { - info!("Recreating indexes after seeding data"); - recreate_indexes(ctx.db()?, embedding_provider.dimension()) - .await - .context("recreating indexes with correct dimension")?; - warm_hnsw_cache(ctx.db()?, embedding_provider.dimension()).await?; - } - { - let slice = ctx.slice()?; - record_namespace_state( - ctx.descriptor()?, - dataset.metadata.id.as_str(), - slice.manifest.slice_id.as_str(), - expected_fingerprint.as_str(), - &namespace, - &database, - requested_cases, - ) - .await; + info!("Recreating indexes after seeding data"); + recreate_indexes(ctx.db()?, embedding_provider.dimension()) + .await + .context("recreating indexes with correct dimension")?; + warm_hnsw_cache(ctx.db()?, embedding_provider.dimension()).await?; + + if let Some(handle) = ctx.corpus_handle.as_mut() { + record_namespace_seed(handle, &namespace, &database, requested_cases).await; } } @@ -198,7 +173,5 @@ pub(crate) async fn prepare_namespace( "completed evaluation stage" ); - machine - .prepare_namespace() - .map_err(|(_, guard)| map_guard_error("prepare_namespace", &guard)) + Ok(()) } diff --git a/evaluations/src/pipeline/stages/prepare_slice.rs b/evaluations/src/pipeline/stages/prepare_slice.rs index 861c1c6..9b2493e 100644 --- 
a/evaluations/src/pipeline/stages/prepare_slice.rs +++ b/evaluations/src/pipeline/stages/prepare_slice.rs @@ -3,21 +3,11 @@ use std::time::Instant; use anyhow::Context; use tracing::info; -use crate::{ - eval::{default_database, default_namespace, ledger_target}, - slice, -}; +use crate::{db::{default_database, default_namespace}, slice}; -use super::super::{ - context::{EvalStage, EvaluationContext}, - state::{EvaluationMachine, Ready, SlicePrepared}, -}; -use super::{map_guard_error, StageResult}; +use super::super::context::{EvalStage, EvaluationContext}; -pub(crate) async fn prepare_slice( - machine: EvaluationMachine<(), Ready>, - ctx: &mut EvaluationContext<'_>, -) -> StageResult { +pub(crate) async fn prepare_slice(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> { let stage = EvalStage::PrepareSlice; info!( evaluation_stage = stage.label(), @@ -25,7 +15,7 @@ pub(crate) async fn prepare_slice( ); let started = Instant::now(); - let ledger_limit = ledger_target(ctx.config()); + let ledger_limit = slice::ledger_target(ctx.config()); let slice_settings = slice::slice_config_with_limit(ctx.config(), ledger_limit); let resolved_slice = slice::resolve_slice(ctx.dataset(), &slice_settings).context("resolving dataset slice")?; @@ -49,7 +39,11 @@ pub(crate) async fn prepare_slice( .db_namespace .clone() .unwrap_or_else(|| { - default_namespace(ctx.dataset().metadata.id.as_str(), ctx.config().limit) + default_namespace( + ctx.dataset().metadata.id.as_str(), + ctx.config().limit, + ctx.config().slice.as_deref(), + ) }); ctx.database = ctx .config() @@ -66,7 +60,5 @@ pub(crate) async fn prepare_slice( "completed evaluation stage" ); - machine - .prepare_slice() - .map_err(|(_, guard)| map_guard_error("prepare_slice", &guard)) + Ok(()) } diff --git a/evaluations/src/pipeline/stages/run_queries.rs b/evaluations/src/pipeline/stages/run_queries.rs index c8683f5..96fd948 100644 --- a/evaluations/src/pipeline/stages/run_queries.rs +++ 
b/evaluations/src/pipeline/stages/run_queries.rs @@ -5,9 +5,13 @@ use common::storage::types::StoredObject; use futures::stream::{self, StreamExt}; use tracing::{debug, info}; -use crate::eval::{ - adapt_retrieval_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics, - CaseSummary, RetrievedSummary, +use crate::{ + cases::SeededCase, + context_stats, + types::{ + adapt_retrieval_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics, + CaseSummary, RetrievedSummary, + }, }; use retrieval_pipeline::{ pipeline::{self, RetrievalConfig, StageTimings}, @@ -15,17 +19,10 @@ use retrieval_pipeline::{ }; use tokio::sync::Semaphore; -use super::super::{ - context::{EvalStage, EvaluationContext}, - state::{EvaluationMachine, NamespaceReady, QueriesFinished}, -}; -use super::{map_guard_error, StageResult}; +use super::super::context::{EvalStage, EvaluationContext}; #[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)] -pub(crate) async fn run_queries( - machine: EvaluationMachine<(), NamespaceReady>, - ctx: &mut EvaluationContext<'_>, -) -> StageResult { +pub(crate) async fn run_queries(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> { let stage = EvalStage::RunQueries; info!( evaluation_stage = stage.label(), @@ -153,7 +150,7 @@ pub(crate) async fn run_queries( .await .context("acquiring query semaphore permit")?; - let crate::eval::SeededCase { + let SeededCase { question_id, question, expected_source, @@ -197,6 +194,7 @@ pub(crate) async fn run_queries( let query_latency = query_start.elapsed().as_millis(); let candidates = adapt_retrieval_output(result_output); + let retrieved_context = context_stats::stats_for_candidates(&candidates); let mut retrieved = Vec::new(); let mut match_rank = None; let answers_lower: Vec = @@ -288,6 +286,7 @@ pub(crate) async fn run_queries( reciprocal_rank: Some(reciprocal_rank), ndcg: Some(ndcg), latency_ms: query_latency, + retrieved_context, retrieved, }; @@ -353,9 +352,7 @@ pub(crate) 
async fn run_queries( "completed evaluation stage" ); - machine - .run_queries() - .map_err(|(_, guard)| map_guard_error("run_queries", &guard)) + Ok(()) } #[allow(clippy::arithmetic_side_effects, clippy::cast_precision_loss)] diff --git a/evaluations/src/pipeline/stages/summarize.rs b/evaluations/src/pipeline/stages/summarize.rs index fa7f67f..9603439 100644 --- a/evaluations/src/pipeline/stages/summarize.rs +++ b/evaluations/src/pipeline/stages/summarize.rs @@ -3,25 +3,19 @@ use std::time::Instant; use chrono::Utc; use tracing::info; -use crate::eval::{ +use crate::types::{ build_stage_latency_breakdown, compute_latency_stats, EvaluationSummary, PerformanceTimings, + RetrievedContextStats, }; -use super::super::{ - context::{EvalStage, EvaluationContext}, - state::{EvaluationMachine, QueriesFinished, Summarized}, -}; -use super::{map_guard_error, StageResult}; +use super::super::context::{EvalStage, EvaluationContext}; #[allow( clippy::too_many_lines, clippy::arithmetic_side_effects, clippy::cast_precision_loss )] -pub(crate) async fn summarize( - machine: EvaluationMachine<(), QueriesFinished>, - ctx: &mut EvaluationContext<'_>, -) -> StageResult { +pub(crate) async fn summarize(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> { let stage = EvalStage::Summarize; info!( evaluation_stage = stage.label(), @@ -123,6 +117,12 @@ pub(crate) async fn summarize( sum_ndcg / (retrieval_cases as f64) }; + let per_query_context: Vec = summaries + .iter() + .map(|summary| summary.retrieved_context) + .collect(); + let retrieved_context = crate::context_stats::aggregate_context_stats(&per_query_context); + let active_tuning = ctx .retrieval_config .as_ref() @@ -133,7 +133,7 @@ pub(crate) async fn summarize( openai_base_url: ctx .openai_base_url .clone() - .unwrap_or_else(|| "".to_string()), + .unwrap_or_else(|| "n/a (chunk-only ingestion)".to_string()), ingestion_ms: ctx.ingestion_duration_ms, namespace_seed_ms: ctx.namespace_seed_ms, evaluation_stage_ms: 
ctx.stage_timings.clone(), @@ -217,11 +217,12 @@ pub(crate) async fn summarize( chunk_rrf_use_fts: active_tuning.flags.chunk_rrf_use_fts.as_bool(), ingest_chunk_min_tokens: config.ingest.ingest_chunk_min_tokens, ingest_chunk_max_tokens: config.ingest.ingest_chunk_max_tokens, - ingest_chunks_only: config.ingest.ingest_chunks_only, + ingest_chunks_only: !config.ingest.include_entities, ingest_chunk_overlap_tokens: config.ingest.ingest_chunk_overlap_tokens, chunk_vector_take: active_tuning.chunk_vector_take, chunk_fts_take: active_tuning.chunk_fts_take, max_chunks_per_entity: active_tuning.max_chunks_per_entity, + retrieved_context, cases: summaries, }); @@ -233,7 +234,5 @@ pub(crate) async fn summarize( "completed evaluation stage" ); - machine - .summarize() - .map_err(|(_, guard)| map_guard_error("summarize", &guard)) + Ok(()) } diff --git a/evaluations/src/pipeline/state.rs b/evaluations/src/pipeline/state.rs deleted file mode 100644 index aa9e753..0000000 --- a/evaluations/src/pipeline/state.rs +++ /dev/null @@ -1,31 +0,0 @@ -use state_machines::state_machine; - -state_machine! 
{ - name: EvaluationMachine, - state: EvaluationState, - initial: Ready, - states: [Ready, SlicePrepared, DbReady, CorpusReady, NamespaceReady, QueriesFinished, Summarized, Completed, Failed], - events { - prepare_slice { transition: { from: Ready, to: SlicePrepared } } - prepare_db { transition: { from: SlicePrepared, to: DbReady } } - prepare_corpus { transition: { from: DbReady, to: CorpusReady } } - prepare_namespace { transition: { from: CorpusReady, to: NamespaceReady } } - run_queries { transition: { from: NamespaceReady, to: QueriesFinished } } - summarize { transition: { from: QueriesFinished, to: Summarized } } - finalize { transition: { from: Summarized, to: Completed } } - abort { - transition: { from: Ready, to: Failed } - transition: { from: SlicePrepared, to: Failed } - transition: { from: DbReady, to: Failed } - transition: { from: CorpusReady, to: Failed } - transition: { from: NamespaceReady, to: Failed } - transition: { from: QueriesFinished, to: Failed } - transition: { from: Summarized, to: Failed } - transition: { from: Completed, to: Failed } - } - } -} - -pub fn ready() -> EvaluationMachine<(), Ready> { - EvaluationMachine::new(()) -} diff --git a/evaluations/src/report.rs b/evaluations/src/report.rs index 33b419b..e299567 100644 --- a/evaluations/src/report.rs +++ b/evaluations/src/report.rs @@ -7,12 +7,10 @@ use std::{ use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; -use crate::eval::{ +use crate::types::{ format_timestamp, CaseSummary, EvaluationStageTimings, EvaluationSummary, LatencyStats, - StageLatencyBreakdown, + RetrievalContextStats, StageLatencyBreakdown, }; -use chrono::Utc; -use tracing::warn; #[derive(Debug)] pub struct ReportPaths { @@ -108,6 +106,7 @@ pub struct RetrievalSection { pub ingest_chunk_max_tokens: usize, pub ingest_chunk_overlap_tokens: usize, pub ingest_chunks_only: bool, + pub retrieved_context: RetrievalContextStats, } const fn default_chunk_rrf_k() -> f32 { @@ -242,6 +241,7 @@ impl 
EvaluationReport { ingest_chunk_max_tokens: summary.ingest_chunk_max_tokens, ingest_chunk_overlap_tokens: summary.ingest_chunk_overlap_tokens, ingest_chunks_only: summary.ingest_chunks_only, + retrieved_context: summary.retrieved_context.clone(), }; let llm = if summary.llm_cases > 0 { @@ -345,7 +345,7 @@ impl LlmCaseEntry { } impl RetrievedSnippet { - fn from_summary(entry: &crate::eval::RetrievedSummary) -> Self { + fn from_summary(entry: &crate::types::RetrievedSummary) -> Self { Self { rank: entry.rank, source_id: entry.source_id.clone(), @@ -558,6 +558,65 @@ fn render_markdown(report: &EvaluationReport) -> String { } else { md.push_str("| Rerank | disabled |\\n"); } + write!( + md, + "| Chunk result cap | {} |\\n", + report.retrieval.chunk_result_cap + ) + .unwrap(); + + md.push_str("\\n## Retrieved Context Volume\\n\\n"); + md.push_str("| Metric | Value |\\n| --- | --- |\\n"); + write!( + md, + "| Tokenizer | {} |\\n", + report.retrieval.retrieved_context.tokenizer + ) + .unwrap(); + write!( + md, + "| Queries measured | {} |\\n", + report.retrieval.retrieved_context.queries + ) + .unwrap(); + write!( + md, + "| Total chunks returned | {} |\\n", + report.retrieval.retrieved_context.total_chunks + ) + .unwrap(); + write!( + md, + "| Total characters | {} |\\n", + report.retrieval.retrieved_context.total_chars + ) + .unwrap(); + write!( + md, + "| Total tokens | {} |\\n", + report.retrieval.retrieved_context.total_tokens + ) + .unwrap(); + write!( + md, + "| Avg chunks / query | {:.1} |\\n", + report.retrieval.retrieved_context.avg_chunks_per_query + ) + .unwrap(); + write!( + md, + "| Avg tokens / query | {:.1} |\\n", + report.retrieval.retrieved_context.avg_tokens_per_query + ) + .unwrap(); + write!( + md, + "| P50 / P95 / max tokens / query | {} / {} / {} |\\n", + report.retrieval.retrieved_context.p50_tokens_per_query, + report.retrieval.retrieved_context.p95_tokens_per_query, + report.retrieval.retrieved_context.max_tokens_per_query + ) + .unwrap(); if let 
Some(llm) = &report.llm { md.push_str("\\n## LLM Mode Metrics\\n\\n"); @@ -797,182 +856,6 @@ pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf { report_dir.join(sanitize_component(dataset_id)) } -#[derive(Debug, Serialize, Deserialize)] -struct LegacyHistoryEntry { - generated_at: String, - run_label: Option, - dataset_id: String, - dataset_label: String, - slice_id: String, - slice_seed: u64, - slice_window_offset: usize, - slice_window_length: usize, - slice_cases: usize, - slice_total_cases: usize, - k: usize, - limit: Option, - precision: f64, - precision_at_1: f64, - precision_at_2: f64, - precision_at_3: f64, - #[serde(default)] - mrr: f64, - #[serde(default)] - average_ndcg: f64, - #[serde(default)] - retrieval_cases: usize, - #[serde(default)] - retrieval_precision: f64, - #[serde(default)] - llm_cases: usize, - #[serde(default)] - llm_precision: f64, - duration_ms: u128, - latency_ms: LatencyStats, - embedding_backend: String, - embedding_model: Option, - ingestion_reused: bool, - ingestion_embeddings_reused: bool, - rerank_enabled: bool, - rerank_keep_top: usize, - rerank_pool_size: Option, - #[serde(default)] - chunk_result_cap: Option, - #[serde(default)] - ingest_chunk_min_tokens: Option, - #[serde(default)] - ingest_chunk_max_tokens: Option, - #[serde(default)] - ingest_chunk_overlap_tokens: Option, - #[serde(default)] - ingest_chunks_only: Option, - #[serde(default)] - delta: Option, - openai_base_url: String, - ingestion_ms: u128, - #[serde(default)] - namespace_seed_ms: Option, -} - -#[derive(Debug, Serialize, Deserialize)] -struct LegacyHistoryDelta { - precision: f64, - precision_at_1: f64, - latency_avg_ms: f64, -} - -#[allow(clippy::too_many_lines)] -fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport { - let overview = OverviewSection { - generated_at: entry.generated_at, - run_label: entry.run_label, - total_cases: entry.slice_cases, - filtered_questions: 0, - }; - - let dataset = DatasetSection { - 
id: entry.dataset_id, - label: entry.dataset_label, - source: String::new(), - includes_unanswerable: entry.llm_cases > 0, - require_verified_chunks: true, - embedding_backend: entry.embedding_backend, - embedding_model: entry.embedding_model, - embedding_dimension: 0, - }; - - let slice = SliceSection { - id: entry.slice_id, - seed: entry.slice_seed, - window_offset: entry.slice_window_offset, - window_length: entry.slice_window_length, - slice_cases: entry.slice_cases, - ledger_total_cases: entry.slice_total_cases, - positives: 0, - negatives: 0, - total_paragraphs: 0, - negative_multiplier: 0.0, - }; - - let retrieval_cases = if entry.retrieval_cases > 0 { - entry.retrieval_cases - } else { - entry.slice_cases.saturating_sub(entry.llm_cases) - }; - let retrieval_precision = if entry.retrieval_precision > 0.0 { - entry.retrieval_precision - } else { - entry.precision - }; - - let retrieval = RetrievalSection { - k: entry.k, - cases: retrieval_cases, - correct: 0, - precision: retrieval_precision, - precision_at_1: entry.precision_at_1, - precision_at_2: entry.precision_at_2, - precision_at_3: entry.precision_at_3, - mrr: entry.mrr, - average_ndcg: entry.average_ndcg, - latency: entry.latency_ms, - concurrency: 0, - resolve_entities: false, - rerank_enabled: entry.rerank_enabled, - rerank_pool_size: entry.rerank_pool_size, - rerank_keep_top: entry.rerank_keep_top, - chunk_result_cap: entry.chunk_result_cap.unwrap_or(5), - chunk_rrf_k: default_chunk_rrf_k(), - chunk_rrf_vector_weight: default_chunk_rrf_weight(), - chunk_rrf_fts_weight: default_chunk_rrf_weight(), - chunk_rrf_use_vector: default_chunk_rrf_use(), - chunk_rrf_use_fts: default_chunk_rrf_use(), - chunk_vector_take: 0, - chunk_fts_take: 0, - ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256), - ingest_chunk_max_tokens: entry.ingest_chunk_max_tokens.unwrap_or(512), - ingest_chunk_overlap_tokens: entry.ingest_chunk_overlap_tokens.unwrap_or(50), - ingest_chunks_only: 
entry.ingest_chunks_only.unwrap_or(false), - }; - - let llm = if entry.llm_cases > 0 { - Some(LlmSection { - cases: entry.llm_cases, - answered: 0, - precision: entry.llm_precision, - }) - } else { - None - }; - - let performance = PerformanceSection { - openai_base_url: entry.openai_base_url, - ingestion_ms: entry.ingestion_ms, - namespace_seed_ms: entry.namespace_seed_ms, - evaluation_stages_ms: EvaluationStageTimings::default(), - stage_latency: StageLatencyBreakdown::default(), - namespace_reused: false, - ingestion_reused: entry.ingestion_reused, - embeddings_reused: entry.ingestion_embeddings_reused, - ingestion_cache_path: String::new(), - corpus_paragraphs: 0, - positive_paragraphs_reused: 0, - negative_paragraphs_reused: 0, - }; - - EvaluationReport { - overview, - dataset, - slice, - retrieval, - llm, - performance, - misses: Vec::new(), - llm_cases: Vec::new(), - detailed_report: false, - } -} - fn load_history(path: &Path) -> Result> { if !path.exists() { return Ok(Vec::new()); @@ -981,34 +864,12 @@ fn load_history(path: &Path) -> Result> { let contents = fs::read(path).with_context(|| format!("reading evaluation log {}", path.display()))?; - if let Ok(entries) = serde_json::from_slice::>(&contents) { - return Ok(entries); - } - - match serde_json::from_slice::>(&contents) { - Ok(entries) => Ok(entries.into_iter().map(convert_legacy_entry).collect()), - Err(err) => { - let timestamp = Utc::now().format("%Y%m%dT%H%M%S"); - let backup_path = path - .parent() - .unwrap_or_else(|| Path::new(".")) - .join(format!("evaluations.json.corrupted.{timestamp}")); - warn!( - path = %path.display(), - backup = %backup_path.display(), - error = %err, - "Evaluation history file is corrupted; backing up and starting fresh" - ); - if let Err(e) = fs::rename(path, &backup_path) { - warn!( - path = %path.display(), - error = %e, - "Failed to backup corrupted evaluation history" - ); - } - Ok(Vec::new()) - } - } + serde_json::from_slice(&contents).with_context(|| { + 
format!( + "parsing evaluation history at {}; delete the file and re-run if upgrading from an older format", + path.display() + ) + }) } fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result { @@ -1024,9 +885,9 @@ fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result Result> { + let prefixes: Vec<&str> = BEIR_DATASETS + .iter() + .map(|kind| kind.source_prefix()) + .collect(); + + let mut grouped: HashMap<&str, Vec<(usize, usize)>> = HashMap::new(); + for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() { + for (q_idx, question) in paragraph.questions.iter().enumerate() { + let include = if params.include_impossible { + true + } else { + !question.is_impossible && !question.answers.is_empty() + }; + if !include { + continue; + } + + let Some(prefix) = question_prefix(&question.id) else { + warn!( + question_id = %question.id, + "Skipping BEIR question without expected prefix" + ); + continue; + }; + if !prefixes.contains(&prefix) { + warn!( + question_id = %question.id, + prefix = %prefix, + "Skipping BEIR question with unknown subset prefix" + ); + continue; + } + grouped.entry(prefix).or_default().push((p_idx, q_idx)); + } + } + + if grouped.values().all(std::vec::Vec::is_empty) { + return Err(anyhow!( + "no eligible BEIR questions found; cannot build slice" + )); + } + + for prefix in &prefixes { + if let Some(entries) = grouped.get_mut(prefix) { + let seed = mix_seed( + &format!("{}::{prefix}", dataset.metadata.id), + params.base_seed, + ); + let mut rng = StdRng::seed_from_u64(seed); + entries.shuffle(&mut rng); + } + } + + let dataset_count = prefixes.len().max(1); + let base_quota = target_cases / dataset_count; + let mut remainder = target_cases % dataset_count; + + let mut quotas: HashMap<&str, usize> = HashMap::new(); + for prefix in &prefixes { + let mut quota = base_quota; + if remainder > 0 { + quota += 1; + remainder -= 1; + } + quotas.insert(*prefix, quota); + } + + let mut take_counts: HashMap<&str, 
usize> = HashMap::new(); + let mut spare_slots: HashMap<&str, usize> = HashMap::new(); + let mut shortfall = 0usize; + + for prefix in &prefixes { + let available = grouped.get(prefix).map_or(0, std::vec::Vec::len); + let quota = *quotas.get(prefix).unwrap_or(&0); + let take = quota.min(available); + let missing = quota.saturating_sub(take); + shortfall += missing; + take_counts.insert(*prefix, take); + spare_slots.insert(*prefix, available.saturating_sub(take)); + } + + while shortfall > 0 { + let mut allocated = false; + for prefix in &prefixes { + if shortfall == 0 { + break; + } + let spare = spare_slots.get(prefix).copied().unwrap_or(0); + if spare == 0 { + continue; + } + if let Some(count) = take_counts.get_mut(prefix) { + *count += 1; + } + spare_slots.insert(*prefix, spare - 1); + shortfall = shortfall.saturating_sub(1); + allocated = true; + } + if !allocated { + break; + } + } + + let mut queues: Vec> = Vec::new(); + let mut total_selected = 0usize; + for prefix in &prefixes { + let take = *take_counts.get(prefix).unwrap_or(&0); + let mut deque = VecDeque::new(); + if let Some(entries) = grouped.get(prefix) { + for item in entries.iter().take(take) { + deque.push_back(*item); + total_selected += 1; + } + } + queues.push(deque); + } + + if total_selected < target_cases { + warn!( + requested = target_cases, + available = total_selected, + "BEIR mix requested more questions than available after balancing; continuing with capped set" + ); + } + + let mut output = Vec::with_capacity(total_selected); + loop { + let mut progressed = false; + for queue in &mut queues { + if let Some(item) = queue.pop_front() { + output.push(item); + progressed = true; + } + } + if !progressed { + break; + } + } + + if output.is_empty() { + return Err(anyhow!( + "no eligible BEIR questions found; cannot build slice" + )); + } + + Ok(output) +} + +pub(super) fn question_prefix(question_id: &str) -> Option<&'static str> { + for prefix in BEIR_DATASETS.iter().map(|kind| 
kind.source_prefix()) { + if let Some(rest) = question_id.strip_prefix(prefix) { + if rest.starts_with('-') { + return Some(prefix); + } + } + } + None +} diff --git a/evaluations/src/slice/build.rs b/evaluations/src/slice/build.rs new file mode 100644 index 0000000..8018ac7 --- /dev/null +++ b/evaluations/src/slice/build.rs @@ -0,0 +1,19 @@ +use sha2::{Digest, Sha256}; + +#[derive(Debug)] +pub(super) struct BuildParams { + pub include_impossible: bool, + pub base_seed: u64, + pub rng_seed: u64, +} + +#[allow(clippy::indexing_slicing)] +pub(super) fn mix_seed(dataset_id: &str, seed: u64) -> u64 { + let mut hasher = Sha256::new(); + hasher.update(dataset_id.as_bytes()); + hasher.update(seed.to_le_bytes()); + let digest = hasher.finalize(); + let mut bytes = [0u8; 8]; + bytes.copy_from_slice(&digest[..8]); + u64::from_le_bytes(bytes) +} diff --git a/evaluations/src/slice.rs b/evaluations/src/slice/mod.rs similarity index 83% rename from evaluations/src/slice.rs rename to evaluations/src/slice/mod.rs index d1da847..3b7d7d9 100644 --- a/evaluations/src/slice.rs +++ b/evaluations/src/slice/mod.rs @@ -1,5 +1,5 @@ use std::{ - collections::{HashMap, HashSet, VecDeque}, + collections::{HashMap, HashSet}, fmt::Write, fs, path::{Path, PathBuf}, @@ -12,10 +12,18 @@ use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use tracing::{info, warn}; -use crate::datasets::{ - ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind, BEIR_DATASETS, +use crate::{ + args::Config, + datasets::{ + ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind, + }, }; +mod beir; +mod build; + +use build::{mix_seed, BuildParams}; + const SLICE_VERSION: u32 = 2; pub const DEFAULT_NEGATIVE_MULTIPLIER: f32 = 4.0; @@ -80,8 +88,12 @@ pub enum SliceParagraphKind { Negative, } +pub fn paragraph_storage_key(paragraph_id: &str) -> String { + sanitize_identifier(paragraph_id) +} + pub(crate) fn default_shard_path(paragraph_id: &str) -> String { - let sanitized = 
sanitize_identifier(paragraph_id); + let sanitized = paragraph_storage_key(paragraph_id); format!("paragraphs/{sanitized}.json") } @@ -210,13 +222,6 @@ struct SliceKey<'a> { seed: u64, } -#[derive(Debug)] -struct BuildParams { - include_impossible: bool, - base_seed: u64, - rng_seed: u64, -} - #[allow(clippy::too_many_lines)] pub fn resolve_slice<'a>( dataset: &'a ConvertedDataset, @@ -225,15 +230,29 @@ pub fn resolve_slice<'a>( let index = DatasetIndex::build(dataset); if let Some(slice_arg) = config.explicit_slice { - let (path, manifest) = load_explicit_slice(dataset, &index, config, slice_arg)?; - let resolved = manifest_to_resolved(dataset, &index, manifest, path)?; + let path = explicit_slice_path(dataset, config, slice_arg); + if path.exists() { + let (path, manifest) = load_explicit_slice(dataset, &index, config, slice_arg)?; + let resolved = manifest_to_resolved(dataset, &index, manifest, path)?; + info!( + slice = %resolved.manifest.slice_id, + path = %resolved.path.display(), + cases = resolved.manifest.case_count, + positives = resolved.manifest.positive_paragraphs, + negatives = resolved.manifest.negative_paragraphs, + "Using explicitly selected slice" + ); + return Ok(resolved); + } + let resolved = + materialize_slice_ledger(dataset, config, &index, slice_arg, path)?; info!( slice = %resolved.manifest.slice_id, path = %resolved.path.display(), cases = resolved.manifest.case_count, positives = resolved.manifest.positive_paragraphs, negatives = resolved.manifest.negative_paragraphs, - "Using explicitly selected slice" + "Built catalog slice ledger" ); return Ok(resolved); } @@ -256,6 +275,82 @@ pub fn resolve_slice<'a>( .join("slices") .join(dataset.metadata.id.as_str()); let path = base.join(format!("{slice_id}.json")); + materialize_slice_ledger(dataset, config, &index, &slice_id, path) +} + +#[allow(clippy::indexing_slicing, clippy::arithmetic_side_effects)] +pub fn select_window<'a>( + resolved: &'a ResolvedSlice<'a>, + offset: usize, + limit: 
Option, +) -> Result> { + let total = resolved.manifest.case_count; + if total == 0 { + return Err(anyhow!( + "slice '{}' contains no cases", + resolved.manifest.slice_id + )); + } + if offset >= total { + return Err(anyhow!( + "slice offset {offset} exceeds available cases ({total})", + )); + } + let available = total - offset; + let requested = limit.unwrap_or(available).max(1); + let length = requested.min(available); + let cases = resolved.cases[offset..offset + length].to_vec(); + let mut seen = HashSet::new(); + let mut positive_ids = Vec::new(); + for case in &cases { + if seen.insert(case.paragraph.id.as_str()) { + positive_ids.push(case.paragraph.id.clone()); + } + } + Ok(SliceWindow { + offset, + length, + total_cases: total, + cases, + positive_paragraph_ids: positive_ids, + }) +} + +#[allow(dead_code)] +pub fn full_window<'a>(resolved: &'a ResolvedSlice<'a>) -> Result> { + select_window(resolved, 0, None) +} + +fn explicit_slice_path( + dataset: &ConvertedDataset, + config: &SliceConfig<'_>, + slice_arg: &str, +) -> PathBuf { + let explicit_path = Path::new(slice_arg); + if explicit_path.exists() { + explicit_path.to_path_buf() + } else { + config + .cache_dir + .join("slices") + .join(dataset.metadata.id.as_str()) + .join(format!("{slice_arg}.json")) + } +} + +#[allow(clippy::too_many_lines)] +fn materialize_slice_ledger<'a>( + dataset: &'a ConvertedDataset, + config: &SliceConfig<'_>, + index: &DatasetIndex, + slice_id: &str, + path: PathBuf, +) -> Result> { + let requested_corpus = config + .corpus_limit + .unwrap_or(dataset.paragraphs.len()) + .min(dataset.paragraphs.len()) + .max(1); let total_questions = dataset .paragraphs @@ -339,7 +434,7 @@ pub fn resolve_slice<'a>( let mut manifest = manifest.unwrap_or_else(|| { empty_manifest( dataset, - slice_id.clone(), + slice_id.to_string(), ¶ms, requested_corpus, config.negative_multiplier, @@ -396,52 +491,7 @@ pub fn resolve_slice<'a>( ); } - let resolved = manifest_to_resolved(dataset, &index, 
manifest.clone(), path)?; - - Ok(resolved) -} - -#[allow(clippy::indexing_slicing, clippy::arithmetic_side_effects)] -pub fn select_window<'a>( - resolved: &'a ResolvedSlice<'a>, - offset: usize, - limit: Option, -) -> Result> { - let total = resolved.manifest.case_count; - if total == 0 { - return Err(anyhow!( - "slice '{}' contains no cases", - resolved.manifest.slice_id - )); - } - if offset >= total { - return Err(anyhow!( - "slice offset {offset} exceeds available cases ({total})", - )); - } - let available = total - offset; - let requested = limit.unwrap_or(available).max(1); - let length = requested.min(available); - let cases = resolved.cases[offset..offset + length].to_vec(); - let mut seen = HashSet::new(); - let mut positive_ids = Vec::new(); - for case in &cases { - if seen.insert(case.paragraph.id.as_str()) { - positive_ids.push(case.paragraph.id.clone()); - } - } - Ok(SliceWindow { - offset, - length, - total_cases: total, - cases, - positive_paragraph_ids: positive_ids, - }) -} - -#[allow(dead_code)] -pub fn full_window<'a>(resolved: &'a ResolvedSlice<'a>) -> Result> { - select_window(resolved, 0, None) + manifest_to_resolved(dataset, index, manifest, path) } fn load_explicit_slice( @@ -450,16 +500,7 @@ fn load_explicit_slice( config: &SliceConfig<'_>, slice_arg: &str, ) -> Result<(PathBuf, SliceManifest)> { - let explicit_path = Path::new(slice_arg); - let candidate_path = if explicit_path.exists() { - explicit_path.to_path_buf() - } else { - config - .cache_dir - .join("slices") - .join(dataset.metadata.id.as_str()) - .join(format!("{slice_arg}.json")) - }; + let candidate_path = explicit_slice_path(dataset, config, slice_arg); let manifest = read_manifest(&candidate_path) .with_context(|| format!("reading slice manifest at {}", candidate_path.display()))?; @@ -613,7 +654,7 @@ fn ordered_question_refs( target_cases: usize, ) -> Result> { if dataset.metadata.id == DatasetKind::Beir.id() { - return ordered_question_refs_beir(dataset, params, 
target_cases); + return beir::ordered_question_refs_beir(dataset, params, target_cases); } let mut question_refs = Vec::new(); @@ -642,171 +683,6 @@ fn ordered_question_refs( Ok(question_refs) } -#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)] -fn ordered_question_refs_beir( - dataset: &ConvertedDataset, - params: &BuildParams, - target_cases: usize, -) -> Result> { - let prefixes: Vec<&str> = BEIR_DATASETS - .iter() - .map(|kind| kind.source_prefix()) - .collect(); - - let mut grouped: HashMap<&str, Vec<(usize, usize)>> = HashMap::new(); - for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() { - for (q_idx, question) in paragraph.questions.iter().enumerate() { - let include = if params.include_impossible { - true - } else { - !question.is_impossible && !question.answers.is_empty() - }; - if !include { - continue; - } - - let Some(prefix) = question_prefix(&question.id) else { - warn!( - question_id = %question.id, - "Skipping BEIR question without expected prefix" - ); - continue; - }; - if !prefixes.contains(&prefix) { - warn!( - question_id = %question.id, - prefix = %prefix, - "Skipping BEIR question with unknown subset prefix" - ); - continue; - } - grouped.entry(prefix).or_default().push((p_idx, q_idx)); - } - } - - if grouped.values().all(std::vec::Vec::is_empty) { - return Err(anyhow!( - "no eligible BEIR questions found; cannot build slice" - )); - } - - for prefix in &prefixes { - if let Some(entries) = grouped.get_mut(prefix) { - let seed = mix_seed( - &format!("{}::{prefix}", dataset.metadata.id), - params.base_seed, - ); - let mut rng = StdRng::seed_from_u64(seed); - entries.shuffle(&mut rng); - } - } - - let dataset_count = prefixes.len().max(1); - let base_quota = target_cases / dataset_count; - let mut remainder = target_cases % dataset_count; - - let mut quotas: HashMap<&str, usize> = HashMap::new(); - for prefix in &prefixes { - let mut quota = base_quota; - if remainder > 0 { - quota += 1; - remainder -= 1; - } - 
quotas.insert(*prefix, quota); - } - - let mut take_counts: HashMap<&str, usize> = HashMap::new(); - let mut spare_slots: HashMap<&str, usize> = HashMap::new(); - let mut shortfall = 0usize; - - for prefix in &prefixes { - let available = grouped.get(prefix).map_or(0, std::vec::Vec::len); - let quota = *quotas.get(prefix).unwrap_or(&0); - let take = quota.min(available); - let missing = quota.saturating_sub(take); - shortfall += missing; - take_counts.insert(*prefix, take); - spare_slots.insert(*prefix, available.saturating_sub(take)); - } - - while shortfall > 0 { - let mut allocated = false; - for prefix in &prefixes { - if shortfall == 0 { - break; - } - let spare = spare_slots.get(prefix).copied().unwrap_or(0); - if spare == 0 { - continue; - } - if let Some(count) = take_counts.get_mut(prefix) { - *count += 1; - } - spare_slots.insert(*prefix, spare - 1); - shortfall = shortfall.saturating_sub(1); - allocated = true; - } - if !allocated { - break; - } - } - - let mut queues: Vec> = Vec::new(); - let mut total_selected = 0usize; - for prefix in &prefixes { - let take = *take_counts.get(prefix).unwrap_or(&0); - let mut deque = VecDeque::new(); - if let Some(entries) = grouped.get(prefix) { - for item in entries.iter().take(take) { - deque.push_back(*item); - total_selected += 1; - } - } - queues.push(deque); - } - - if total_selected < target_cases { - warn!( - requested = target_cases, - available = total_selected, - "BEIR mix requested more questions than available after balancing; continuing with capped set" - ); - } - - let mut output = Vec::with_capacity(total_selected); - loop { - let mut progressed = false; - for queue in &mut queues { - if let Some(item) = queue.pop_front() { - output.push(item); - progressed = true; - } - } - if !progressed { - break; - } - } - - if output.is_empty() { - return Err(anyhow!( - "no eligible BEIR questions found; cannot build slice" - )); - } - - Ok(output) -} - -fn question_prefix(question_id: &str) -> Option<&'static 
str> { - for prefix in BEIR_DATASETS.iter().map(|kind| kind.source_prefix()) { - if let Some(rest) = question_id.strip_prefix(prefix) { - if rest.starts_with('-') { - return Some(prefix); - } - } - } - None -} - #[allow(clippy::indexing_slicing)] fn ensure_negative_pool( dataset: &ConvertedDataset, @@ -1028,15 +904,48 @@ fn compute_slice_id(key: &SliceKey<'_>) -> Result { })) } -#[allow(clippy::indexing_slicing)] -fn mix_seed(dataset_id: &str, seed: u64) -> u64 { - let mut hasher = Sha256::new(); - hasher.update(dataset_id.as_bytes()); - hasher.update(seed.to_le_bytes()); - let digest = hasher.finalize(); - let mut bytes = [0u8; 8]; - bytes.copy_from_slice(&digest[..8]); - u64::from_le_bytes(bytes) +pub fn read_manifest_if_exists(path: &Path) -> Result> { + if !path.exists() { + return Ok(None); + } + read_manifest(path).map(Some) +} + +pub fn cached_manifest_path(config: &crate::args::Config) -> Option { + let slice_arg = config.slice.as_deref()?; + let explicit_path = Path::new(slice_arg); + if explicit_path.exists() { + return Some(explicit_path.to_path_buf()); + } + Some( + config + .cache_dir + .join("slices") + .join(config.dataset.id()) + .join(format!("{slice_arg}.json")), + ) +} + +pub fn manifest_is_complete(manifest: &SliceManifest, config: &SliceConfig<'_>) -> bool { + let requested_limit = config + .limit + .unwrap_or(manifest.case_count.max(1)) + .max(1); + if manifest.case_count < requested_limit { + return false; + } + + let requested_corpus = config + .corpus_limit + .unwrap_or(manifest.total_paragraphs.max(1)) + .max(1); + let desired_negatives = desired_negative_target( + manifest.positive_paragraphs, + requested_corpus, + manifest.total_paragraphs.max(manifest.positive_paragraphs.max(1)), + config.negative_multiplier, + ); + manifest.negative_paragraphs >= desired_negatives } fn read_manifest(path: &Path) -> Result { @@ -1057,14 +966,38 @@ fn write_manifest(path: &Path, manifest: &SliceManifest) -> Result<()> { Ok(()) } -use crate::args::Config; 
- -impl<'a> From<&'a Config> for SliceConfig<'a> { - fn from(config: &'a Config) -> Self { - slice_config_with_limit(config, None) +pub fn ledger_target(config: &Config) -> Option { + match (config.slice_grow, config.limit) { + (Some(grow), Some(limit)) => Some(limit.max(grow)), + (Some(grow), None) => Some(grow), + (None, limit) => limit, } } +/// Grow the slice ledger to contain the target number of cases. +pub fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> { + let ledger_limit = ledger_target(config); + let slice_settings = slice_config_with_limit(config, ledger_limit); + let slice = + resolve_slice(dataset, &slice_settings).context("resolving dataset slice")?; + info!( + slice = slice.manifest.slice_id.as_str(), + cases = slice.manifest.case_count, + positives = slice.manifest.positive_paragraphs, + negatives = slice.manifest.negative_paragraphs, + total_paragraphs = slice.manifest.total_paragraphs, + "Slice ledger ready" + ); + println!( + "Slice `{}` now contains {} questions ({} positives, {} negatives)", + slice.manifest.slice_id, + slice.manifest.case_count, + slice.manifest.positive_paragraphs, + slice.manifest.negative_paragraphs + ); + Ok(()) +} + pub fn slice_config_with_limit(config: &Config, limit_override: Option) -> SliceConfig<'_> { SliceConfig { cache_dir: config.cache_dir.as_path(), @@ -1088,7 +1021,7 @@ mod tests { use tempfile::tempdir; fn sample_dataset() -> ConvertedDataset { - let metadata = DatasetMetadata::for_kind(DatasetKind::SquadV2, false, None); + let metadata = DatasetMetadata::for_kind(DatasetKind::SquadV2, false); ConvertedDataset { generated_at: Utc::now(), metadata, @@ -1226,7 +1159,7 @@ mod tests { } } - let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, false, None); + let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, false); let dataset = ConvertedDataset { generated_at: Utc::now(), metadata, @@ -1240,11 +1173,11 @@ mod tests { rng_seed: 0xBB, }; - let refs = 
ordered_question_refs_beir(&dataset, ¶ms, 8)?; + let refs = beir::ordered_question_refs_beir(&dataset, ¶ms, 8)?; let mut per_prefix: HashMap = HashMap::new(); for (p_idx, q_idx) in refs { let question = &dataset.paragraphs[p_idx].questions[q_idx]; - let prefix = question_prefix(&question.id).unwrap_or("unknown"); + let prefix = beir::question_prefix(&question.id).unwrap_or("unknown"); *per_prefix.entry(prefix.to_string()).or_default() += 1; } diff --git a/evaluations/src/snapshot.rs b/evaluations/src/snapshot.rs deleted file mode 100644 index 5b1a827..0000000 --- a/evaluations/src/snapshot.rs +++ /dev/null @@ -1,179 +0,0 @@ -use std::path::PathBuf; - -use anyhow::{Context, Result}; -use chrono::{DateTime, Utc}; -use serde::{Deserialize, Serialize}; -use sha2::{Digest, Sha256}; -use tokio::fs; - -use crate::{args::Config, slice}; -use common::utils::embedding::EmbeddingProvider; - -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct SnapshotMetadata { - pub dataset_id: String, - pub slice_id: String, - pub embedding_backend: String, - pub embedding_model: Option, - pub embedding_dimension: usize, - pub rerank_enabled: bool, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DbSnapshotState { - pub dataset_id: String, - pub slice_id: String, - pub ingestion_fingerprint: String, - pub snapshot_hash: String, - pub updated_at: DateTime, - #[serde(default)] - pub namespace: Option, - #[serde(default)] - pub database: Option, - #[serde(default)] - pub slice_case_count: usize, -} - -pub struct Descriptor { - #[allow(dead_code)] - metadata: SnapshotMetadata, - dir: PathBuf, - metadata_hash: String, -} - -impl Descriptor { - pub fn new( - config: &Config, - slice: &slice::ResolvedSlice<'_>, - embedding_provider: &EmbeddingProvider, - ) -> Self { - let metadata = SnapshotMetadata { - dataset_id: slice.manifest.dataset_id.clone(), - slice_id: slice.manifest.slice_id.clone(), - embedding_backend: embedding_provider.backend_label().to_string(), - 
embedding_model: embedding_provider.model_code(), - embedding_dimension: embedding_provider.dimension(), - rerank_enabled: config.retrieval.rerank, - }; - - let dir = config - .cache_dir - .join("snapshots") - .join(&metadata.dataset_id) - .join(&metadata.slice_id); - let metadata_hash = compute_hash(&metadata); - - Self { - metadata, - dir, - metadata_hash, - } - } - - pub fn metadata_hash(&self) -> &str { - &self.metadata_hash - } - - pub async fn load_db_state(&self) -> Result> { - let path = self.db_state_path(); - if !path.exists() { - return Ok(None); - } - let bytes = fs::read(&path) - .await - .with_context(|| format!("reading namespace state {}", path.display()))?; - let state = serde_json::from_slice(&bytes) - .with_context(|| format!("deserialising namespace state {}", path.display()))?; - Ok(Some(state)) - } - - pub async fn store_db_state(&self, state: &DbSnapshotState) -> Result<()> { - let path = self.db_state_path(); - if let Some(parent) = path.parent() { - fs::create_dir_all(parent).await.with_context(|| { - format!("creating namespace state directory {}", parent.display()) - })?; - } - let blob = - serde_json::to_vec_pretty(state).context("serialising namespace state payload")?; - fs::write(&path, blob) - .await - .with_context(|| format!("writing namespace state {}", path.display()))?; - Ok(()) - } - - fn db_dir(&self) -> PathBuf { - self.dir.join("db") - } - - fn db_state_path(&self) -> PathBuf { - self.db_dir().join("state.json") - } - - #[cfg(test)] - pub fn from_parts(metadata: SnapshotMetadata, dir: PathBuf) -> Self { - let metadata_hash = compute_hash(&metadata); - Self { - metadata, - dir, - metadata_hash, - } - } -} - -#[allow(clippy::expect_used)] -fn compute_hash(metadata: &SnapshotMetadata) -> String { - let mut hasher = Sha256::new(); - hasher.update( - serde_json::to_vec(metadata).expect("snapshot metadata serialisation should succeed"), - ); - format!("{:x}", hasher.finalize()) -} - -#[cfg(test)] -mod tests { - use super::*; - - 
#[tokio::test] - #[allow(clippy::unwrap_used, clippy::expect_used)] - async fn state_round_trip() { - let temp_dir = tempfile::tempdir().unwrap(); - let metadata = SnapshotMetadata { - dataset_id: "dataset".into(), - slice_id: "slice".into(), - embedding_backend: "hashed".into(), - embedding_model: None, - embedding_dimension: 128, - rerank_enabled: true, - }; - let descriptor = Descriptor::from_parts( - metadata, - temp_dir - .path() - .join("snapshots") - .join("dataset") - .join("slice"), - ); - - let state = DbSnapshotState { - dataset_id: "dataset".into(), - slice_id: "slice".into(), - ingestion_fingerprint: "fingerprint".into(), - snapshot_hash: descriptor.metadata_hash().to_string(), - updated_at: Utc::now(), - namespace: Some("ns".into()), - database: Some("db".into()), - slice_case_count: 42, - }; - descriptor.store_db_state(&state).await.unwrap(); - - let loaded = descriptor.load_db_state().await.unwrap().unwrap(); - assert_eq!(loaded.dataset_id, state.dataset_id); - assert_eq!(loaded.slice_id, state.slice_id); - assert_eq!(loaded.ingestion_fingerprint, state.ingestion_fingerprint); - assert_eq!(loaded.snapshot_hash, state.snapshot_hash); - assert_eq!(loaded.namespace, state.namespace); - assert_eq!(loaded.database, state.database); - assert_eq!(loaded.slice_case_count, state.slice_case_count); - } -} diff --git a/evaluations/src/types.rs b/evaluations/src/types.rs index 0f4950f..cb99e61 100644 --- a/evaluations/src/types.rs +++ b/evaluations/src/types.rs @@ -1,6 +1,6 @@ use std::collections::HashSet; -use chrono::{DateTime, Utc}; +use chrono::{DateTime, SecondsFormat, Utc}; use common::storage::types::StoredObject; use retrieval_pipeline::{ Diagnostics, RetrievalOutput, RetrievedChunk, RetrievedEntity, StageKind, StageTimings, @@ -8,6 +8,8 @@ use retrieval_pipeline::{ use serde::{Deserialize, Serialize}; use unicode_normalization::UnicodeNormalization; +pub use crate::context_stats::{RetrievalContextStats, RetrievedContextStats}; + 
#[allow(clippy::struct_excessive_bools)] #[derive(Debug, Serialize)] pub struct EvaluationSummary { @@ -83,6 +85,7 @@ pub struct EvaluationSummary { pub chunk_vector_take: usize, pub chunk_fts_take: usize, pub max_chunks_per_entity: usize, + pub retrieved_context: RetrievalContextStats, pub cases: Vec, } @@ -108,6 +111,7 @@ pub struct CaseSummary { #[serde(skip_serializing_if = "Option::is_none")] pub ndcg: Option, pub latency_ms: u128, + pub retrieved_context: RetrievedContextStats, pub retrieved: Vec, } @@ -483,3 +487,7 @@ pub fn build_case_diagnostics( pipeline: pipeline_stats, } } + +pub fn format_timestamp(timestamp: &DateTime) -> String { + timestamp.to_rfc3339_opts(SecondsFormat::Secs, true) +} diff --git a/html-router/assets/style.css b/html-router/assets/style.css index a9691cc..57fbde4 100644 --- a/html-router/assets/style.css +++ b/html-router/assets/style.css @@ -44,7 +44,6 @@ --leading-snug: 1.375; --leading-relaxed: 1.625; --ease-out: cubic-bezier(0, 0, 0.2, 1); - --ease-in-out: cubic-bezier(0.4, 0, 0.2, 1); --animate-pulse: pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite; --default-transition-duration: 150ms; --default-transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);