mirror of
https://github.com/perstarkse/minne.git
synced 2026-06-25 03:16:26 +02:00
evals: eval crate overhaul, simplification and performance improvements
This commit is contained in:
+1
-1
@@ -1,2 +1,2 @@
|
||||
[alias]
|
||||
eval = "run -p evaluations --"
|
||||
eval = "run -p evaluations --release --"
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# Changelog
|
||||
## Unreleased
|
||||
- Evaluations: simplified crate layout — linear pipeline, sharded-only converted store, in-memory ingestion, `db/` and `cli/` modules; namespace reuse state in corpus manifest (removed `cache/snapshots/`); no legacy JSON/history compatibility (re-run `--warm` after upgrade)
|
||||
- Performance: ingestion skips per-task index rebuild; worker runs scheduled `REBUILD INDEX` (default every 24h via `index_rebuild_interval_secs`, `0` disables)
|
||||
- Performance: ingestion persists all artifacts in a single SurrealDB transaction per task (atomic replace by task id)
|
||||
- Performance: entity embeddings during ingestion use batched `embed_batch`, matching chunk embedding
|
||||
|
||||
Generated
-94
@@ -165,12 +165,6 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anes"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
|
||||
|
||||
[[package]]
|
||||
name = "anstream"
|
||||
version = "0.6.21"
|
||||
@@ -1071,12 +1065,6 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cast"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
||||
|
||||
[[package]]
|
||||
name = "castaway"
|
||||
version = "0.2.4"
|
||||
@@ -1582,42 +1570,6 @@ dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "criterion"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
|
||||
dependencies = [
|
||||
"anes",
|
||||
"cast",
|
||||
"ciborium",
|
||||
"clap",
|
||||
"criterion-plot",
|
||||
"is-terminal",
|
||||
"itertools 0.10.5",
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"oorandom",
|
||||
"plotters",
|
||||
"rayon",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"serde_json",
|
||||
"tinytemplate",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "criterion-plot"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
|
||||
dependencies = [
|
||||
"cast",
|
||||
"itertools 0.10.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "critical-section"
|
||||
version = "1.2.0"
|
||||
@@ -2238,7 +2190,6 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"common",
|
||||
"criterion",
|
||||
"fastembed",
|
||||
"futures",
|
||||
"ingestion-pipeline",
|
||||
@@ -2250,7 +2201,6 @@ dependencies = [
|
||||
"serde_json",
|
||||
"serde_yaml",
|
||||
"sha2",
|
||||
"state-machines",
|
||||
"surrealdb",
|
||||
"tempfile",
|
||||
"text-splitter",
|
||||
@@ -4438,12 +4388,6 @@ dependencies = [
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "oorandom"
|
||||
version = "11.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
|
||||
|
||||
[[package]]
|
||||
name = "opaque-debug"
|
||||
version = "0.3.1"
|
||||
@@ -4836,34 +4780,6 @@ version = "0.3.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
|
||||
|
||||
[[package]]
|
||||
name = "plotters"
|
||||
version = "0.3.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"plotters-backend",
|
||||
"plotters-svg",
|
||||
"wasm-bindgen",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "plotters-backend"
|
||||
version = "0.3.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
|
||||
|
||||
[[package]]
|
||||
name = "plotters-svg"
|
||||
version = "0.3.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
|
||||
dependencies = [
|
||||
"plotters-backend",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "polling"
|
||||
version = "3.11.0"
|
||||
@@ -6940,16 +6856,6 @@ dependencies = [
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinytemplate"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.10.0"
|
||||
|
||||
+1
-1
@@ -9,7 +9,7 @@ members = [
|
||||
"json-stream-parser",
|
||||
"evaluations"
|
||||
]
|
||||
resolver = "2"
|
||||
resolver = "3"
|
||||
|
||||
[workspace.dependencies]
|
||||
anyhow = "1.0.94"
|
||||
|
||||
@@ -13,6 +13,8 @@ let
|
||||
else
|
||||
throw "pkgs.onnxruntime.version (${pkgs.onnxruntime.version}) must match ort-version (${ortVersion})";
|
||||
in {
|
||||
devenv.warnOnNewVersion = false;
|
||||
|
||||
cachix.enable = false;
|
||||
|
||||
packages = [
|
||||
|
||||
@@ -30,8 +30,6 @@ serde_json = { workspace = true }
|
||||
async-trait = { workspace = true }
|
||||
once_cell = "1.19"
|
||||
serde_yaml = "0.9"
|
||||
criterion = "0.5"
|
||||
state-machines = { workspace = true }
|
||||
clap = { version = "4.4", features = ["derive", "env"] }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
+71
-181
@@ -1,212 +1,102 @@
|
||||
# Evaluations
|
||||
|
||||
The `evaluations` crate provides a retrieval evaluation framework for benchmarking Minne's information retrieval pipeline against standard datasets.
|
||||
The `evaluations` crate benchmarks Minne's retrieval pipeline against standard datasets.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Run SQuAD v2.0 evaluation (vector-only, recommended)
|
||||
cargo run --package evaluations -- --ingest-chunks-only
|
||||
# One-time prep (convert, slice ledger, corpus cache, DB seed)
|
||||
cargo eval --warm --dataset beir --slice beir-mix-600
|
||||
|
||||
# Run a specific dataset
|
||||
cargo run --package evaluations -- --dataset fiqa --ingest-chunks-only
|
||||
# Check readiness
|
||||
cargo eval --status --dataset beir --slice beir-mix-600
|
||||
|
||||
# Convert dataset only (no evaluation)
|
||||
cargo run --package evaluations -- --convert-only
|
||||
# Run benchmark (steady state after warm)
|
||||
cargo eval --dataset beir --slice beir-mix-600 --require-ready
|
||||
```
|
||||
|
||||
Default dataset is `beir`. When `--slice` is omitted, the first catalog slice for the dataset is applied automatically (e.g. `beir-mix-600`).
|
||||
|
||||
Chunk-only ingestion is the default. Pass `--include-entities` to opt into entity extraction during ingestion (requires `OPENAI_API_KEY`).
|
||||
|
||||
### Custom slice sizes
|
||||
|
||||
`--slice` is a ledger id, not only a catalog name. You can use any id; `--limit` controls how many questions the ledger contains:
|
||||
|
||||
```bash
|
||||
# 200-case BEIR mix (default --limit is 200)
|
||||
cargo eval --warm --dataset beir --slice beir-mix-200
|
||||
cargo eval --dataset beir --slice beir-mix-200 --require-ready
|
||||
```
|
||||
|
||||
The catalog slice `beir-mix-600` in `manifest.yaml` is a preset with `limit: 600` and `negative_multiplier: 9.0`.
|
||||
|
||||
### BEIR mix layout
|
||||
|
||||
`beir` is a **virtual mix** across eight subset datasets (FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID, SciFact, NQ-BEIR). There is no monolithic `beir-minne/` store.
|
||||
|
||||
1. Build an in-memory qrels-world mix from raw subset data
|
||||
2. Resolve the slice ledger (`cache/slices/beir/<slice-id>.json`)
|
||||
3. Materialize only ledger paragraph ids into per-subset stores (`fever-minne/`, `fiqa-minne/`, …)
|
||||
4. Ingest the slice corpus and seed SurrealDB
|
||||
|
||||
Conversion is **qrels-closed**: only documents that appear in qrels are exported, not the full BEIR corpus.
|
||||
|
||||
Chunk-only mode may evaluate fewer cases than the slice ledger size when some questions are impossible or lack verifiable answer chunks.
|
||||
|
||||
Reports include a **Retrieved Context Volume** section: total characters and estimated tokens across all chunks returned per query (`~chars/4`, comparable across `--chunk-result-cap` sweeps). Use this to compare the cost of raising `--chunk-result-cap`.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
### 1. SurrealDB
|
||||
|
||||
Start a SurrealDB instance before running evaluations:
|
||||
### SurrealDB
|
||||
|
||||
```bash
|
||||
docker-compose up -d surrealdb
|
||||
```
|
||||
|
||||
Or using the default endpoint configuration:
|
||||
### Raw datasets
|
||||
|
||||
```bash
|
||||
surreal start --user root_user --pass root_password
|
||||
```
|
||||
Place raw datasets under `evaluations/data/raw/`. See [manifest.yaml](./manifest.yaml) for paths.
|
||||
|
||||
### 2. Download Raw Datasets
|
||||
BEIR subsets live in sibling directories (`data/raw/fever`, `data/raw/fiqa`, …). The `data/raw/beir` entry is a virtual catalog placeholder; warm uses the subset paths.
|
||||
|
||||
Raw datasets must be downloaded manually and placed in `evaluations/data/raw/`. See [Dataset Sources](#dataset-sources) below for links and formats.
|
||||
|
||||
## Directory Structure
|
||||
## Directory structure
|
||||
|
||||
```
|
||||
evaluations/
|
||||
├── data/
|
||||
│ ├── raw/ # Downloaded raw datasets (manual)
|
||||
│ │ ├── squad/ # SQuAD v2.0
|
||||
│ │ ├── nq-dev/ # Natural Questions
|
||||
│ │ ├── fiqa/ # BEIR: FiQA-2018
|
||||
│ │ ├── fever/ # BEIR: FEVER
|
||||
│ │ ├── hotpotqa/ # BEIR: HotpotQA
|
||||
│ │ └── ... # Other BEIR subsets
|
||||
│ └── converted/ # Auto-generated (Minne JSON format)
|
||||
├── cache/ # Ingestion and embedding caches
|
||||
├── reports/ # Evaluation output (JSON + Markdown)
|
||||
├── manifest.yaml # Dataset and slice definitions
|
||||
└── src/ # Evaluation source code
|
||||
│ ├── raw/ # Downloaded datasets (manual)
|
||||
│ │ ├── fever/ # BEIR subset raw dirs (corpus.jsonl, queries.jsonl, qrels/)
|
||||
│ │ ├── fiqa/
|
||||
│ │ └── …
|
||||
│ └── converted/ # Sharded stores (auto-generated)
|
||||
│ ├── fever-minne/ # per-BEIR-subset stores
|
||||
│ ├── fiqa-minne/
|
||||
│ └── … # BEIR mix loads from subset stores (no monolithic beir-minne/)
|
||||
├── cache/
|
||||
│ ├── slices/ # Slice ledgers
|
||||
│ └── ingested/ # Corpus ingestion caches (manifest includes namespace seed)
|
||||
├── reports/ # JSON + Markdown output from benchmark runs
|
||||
├── manifest.yaml
|
||||
└── src/
|
||||
```
|
||||
|
||||
## Dataset Sources
|
||||
**After upgrading:** delete old monolithic `*-minne.json` files, any legacy `beir-minne/` merged store, `cache/snapshots/` directories, and stale `reports/history/` artifacts, then re-run `--warm`.
|
||||
|
||||
### SQuAD v2.0
|
||||
|
||||
Download and place at `data/raw/squad/dev-v2.0.json`:
|
||||
|
||||
```bash
|
||||
mkdir -p evaluations/data/raw/squad
|
||||
curl -L https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json \
|
||||
-o evaluations/data/raw/squad/dev-v2.0.json
|
||||
```
|
||||
|
||||
### Natural Questions (NQ)
|
||||
|
||||
Download and place at `data/raw/nq-dev/dev-all.jsonl`:
|
||||
|
||||
```bash
|
||||
mkdir -p evaluations/data/raw/nq-dev
|
||||
# Download from Google's Natural Questions page or HuggingFace
|
||||
# File: dev-all.jsonl (simplified JSONL format)
|
||||
```
|
||||
|
||||
Source: [Google Natural Questions](https://ai.google.com/research/NaturalQuestions)
|
||||
|
||||
### BEIR Datasets
|
||||
|
||||
All BEIR datasets follow the same format structure:
|
||||
|
||||
```
|
||||
data/raw/<dataset>/
|
||||
├── corpus.jsonl # Document corpus
|
||||
├── queries.jsonl # Query set
|
||||
└── qrels/
|
||||
└── test.tsv # Relevance judgments (or dev.tsv)
|
||||
```
|
||||
|
||||
Download datasets from the [BEIR Benchmark repository](https://github.com/beir-cellar/beir). Each dataset zip extracts to the required directory structure.
|
||||
|
||||
| Dataset | Directory |
|
||||
|------------|---------------|
|
||||
| FEVER | `fever/` |
|
||||
| FiQA-2018 | `fiqa/` |
|
||||
| HotpotQA | `hotpotqa/` |
|
||||
| NFCorpus | `nfcorpus/` |
|
||||
| Quora | `quora/` |
|
||||
| TREC-COVID | `trec-covid/` |
|
||||
| SciFact | `scifact/` |
|
||||
| NQ (BEIR) | `nq/` |
|
||||
|
||||
Example download:
|
||||
|
||||
```bash
|
||||
cd evaluations/data/raw
|
||||
curl -L https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip -o fiqa.zip
|
||||
unzip fiqa.zip && rm fiqa.zip
|
||||
```
|
||||
|
||||
## Dataset Conversion
|
||||
|
||||
Raw datasets are automatically converted to Minne's internal JSON format on first run. To force reconversion:
|
||||
|
||||
```bash
|
||||
cargo run --package evaluations -- --force-convert
|
||||
```
|
||||
|
||||
Converted files are saved to `data/converted/` and cached for subsequent runs.
|
||||
|
||||
## CLI Reference
|
||||
|
||||
### Common Options
|
||||
## Common flags
|
||||
|
||||
| Flag | Description | Default |
|
||||
|------|-------------|---------|
|
||||
| `--dataset <NAME>` | Dataset to evaluate | `squad-v2` |
|
||||
| `--limit <N>` | Max questions to evaluate (0 = all) | `200` |
|
||||
| `--k <N>` | Precision@k cutoff | `5` |
|
||||
| `--slice <ID>` | Use a predefined slice from manifest | — |
|
||||
| `--rerank` | Enable FastEmbed reranking stage | disabled |
|
||||
| `--embedding-backend <BE>` | `fastembed` or `hashed` | `fastembed` |
|
||||
| `--ingest-chunks-only` | Skip entity extraction, ingest only text chunks | disabled |
|
||||
| `--dataset` | Dataset to evaluate | `beir` |
|
||||
| `--slice` | Slice ledger id (catalog or custom) | first catalog slice |
|
||||
| `--limit` | Max questions in the slice ledger | `200` |
|
||||
| `--warm` | Prepare without running queries | — |
|
||||
| `--status` | Print readiness | — |
|
||||
| `--require-ready` | Fail if not warmed | — |
|
||||
| `--include-entities` | Entity extraction during ingestion | off |
|
||||
| `--force-convert` | Rebuild converted store | — |
|
||||
| `--chunk-result-cap` | Max chunks returned per query (raise it together with `--k`) | `5` |
|
||||
| `--perf-log-console` | Print per-stage timings after a run | off |
|
||||
| `--label` | Label stored in JSON/Markdown reports | — |
|
||||
|
||||
> [!TIP]
|
||||
> Use `--ingest-chunks-only` when evaluating vector-only retrieval strategies. This skips the LLM-based entity extraction and graph generation, significantly speeding up ingestion while focusing on pure chunk-based vector search.
|
||||
|
||||
### Available Datasets
|
||||
|
||||
```
|
||||
squad-v2, natural-questions, beir, fever, fiqa, hotpotqa,
|
||||
nfcorpus, quora, trec-covid, scifact, nq-beir
|
||||
```
|
||||
|
||||
### Database Configuration
|
||||
|
||||
| Flag | Environment | Default |
|
||||
|------|-------------|---------|
|
||||
| `--db-endpoint` | `EVAL_DB_ENDPOINT` | `ws://127.0.0.1:8000` |
|
||||
| `--db-username` | `EVAL_DB_USERNAME` | `root_user` |
|
||||
| `--db-password` | `EVAL_DB_PASSWORD` | `root_password` |
|
||||
| `--db-namespace` | `EVAL_DB_NAMESPACE` | auto-generated |
|
||||
| `--db-database` | `EVAL_DB_DATABASE` | auto-generated |
|
||||
|
||||
### Example Runs
|
||||
|
||||
```bash
|
||||
# Vector-only evaluation (recommended for benchmarking)
|
||||
cargo run --package evaluations -- \
|
||||
--dataset fiqa \
|
||||
--ingest-chunks-only \
|
||||
--limit 200
|
||||
|
||||
# Full FiQA evaluation with reranking
|
||||
cargo run --package evaluations -- \
|
||||
--dataset fiqa \
|
||||
--ingest-chunks-only \
|
||||
--limit 500 \
|
||||
--rerank \
|
||||
--k 10
|
||||
|
||||
# Use a predefined slice for reproducibility
|
||||
cargo run --package evaluations -- --slice fiqa-test-200 --ingest-chunks-only
|
||||
|
||||
# Run the mixed BEIR benchmark
|
||||
cargo run --package evaluations -- --dataset beir --slice beir-mix-600 --ingest-chunks-only
|
||||
```
|
||||
|
||||
## Slices
|
||||
|
||||
Slices are predefined, reproducible subsets defined in `manifest.yaml`. Each slice specifies:
|
||||
|
||||
- **limit**: Number of questions
|
||||
- **corpus_limit**: Maximum corpus size
|
||||
- **seed**: Fixed RNG seed for reproducibility
|
||||
|
||||
View available slices in [manifest.yaml](./manifest.yaml).
|
||||
|
||||
## Reports
|
||||
|
||||
Evaluations generate reports in `reports/`:
|
||||
|
||||
- **JSON**: Full structured results (`*-report.json`)
|
||||
- **Markdown**: Human-readable summary with sample mismatches (`*-report.md`)
|
||||
- **History**: Timestamped run history (`history/`)
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
```bash
|
||||
# Log per-stage performance timings
|
||||
cargo run --package evaluations -- --perf-log-console
|
||||
|
||||
# Save telemetry to file
|
||||
cargo run --package evaluations -- --perf-log-json ./perf.json
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
See [../LICENSE](../LICENSE).
|
||||
See [REFACTOR.md](./REFACTOR.md) for architecture notes.
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
# Evaluations crate refactor plan
|
||||
|
||||
This document records the architecture review and the simplification work applied to the
|
||||
`evaluations` crate. **No backwards compatibility** is maintained for converted JSON layouts,
|
||||
legacy report history, or old cache artifact formats.
|
||||
|
||||
## Goals
|
||||
|
||||
- Smaller, linear pipeline (no state machine ceremony)
|
||||
- Sharded converted store for **all** datasets (memory-efficient partial loading)
|
||||
- Slice-first loading when a catalog slice is selected
|
||||
- In-memory SurrealDB for ingestion (no ephemeral server namespaces)
|
||||
- Single DB lifecycle module (`db/`)
|
||||
- CLI helpers under `cli/`
|
||||
|
||||
## Primary workflow
|
||||
|
||||
```bash
|
||||
# One-time prep (converts raw data if needed, builds slice ledger, corpus cache, DB seed)
|
||||
cargo eval --warm --dataset beir --slice beir-mix-600
|
||||
|
||||
# Check readiness
|
||||
cargo eval --status --dataset beir --slice beir-mix-600
|
||||
|
||||
# Steady-state benchmark
|
||||
cargo eval --dataset beir --slice beir-mix-600 --require-ready
|
||||
```
|
||||
|
||||
Default dataset is `beir`. Chunk-only ingestion is the default; pass `--include-entities` to
|
||||
opt into entity extraction (requires `OPENAI_API_KEY`). Slice tuning such as
|
||||
`negative_multiplier` lives in `manifest.yaml` (e.g. `beir-mix-600` uses `9.0`).
|
||||
|
||||
## Cache layers (after refactor)
|
||||
|
||||
| Layer | Location | Purpose |
|
||||
|-------|----------|---------|
|
||||
| Converted store | `data/converted/<name>/` | Sharded paragraphs + question catalog |
|
||||
| Slice ledger | `cache/slices/<dataset>/<slice-id>.json` | Deterministic questions + paragraph set |
|
||||
| Corpus cache | `cache/ingested/<dataset>/<slice-id>/` | Ingestion paragraph shards, manifest, and namespace reuse seed |
|
||||
|
||||
Namespace reuse state lives in the corpus manifest (`metadata.namespace_seed`), not a separate
|
||||
`snapshots/` tree. After upgrading, delete old `*-minne.json` monolithic files, any
|
||||
`cache/snapshots/` directories, and re-run `--warm`.
|
||||
|
||||
## Phases applied
|
||||
|
||||
### Phase 0 — dead code
|
||||
|
||||
- Removed unused `criterion` dependency
|
||||
- Removed unused `EmbeddingCache`
|
||||
- Updated README for current CLI
|
||||
|
||||
### Phase 1 — structure
|
||||
|
||||
- Flattened pipeline to linear `async fn` stages
|
||||
- Removed `eval.rs` hub; imports go to owning modules
|
||||
- Merged `namespace.rs`, `db_helpers.rs` → `db/`; dropped standalone `snapshot.rs`
|
||||
- Moved `status.rs` → `cli/status.rs`
|
||||
- Fixed catalog slice bootstrap (build ledger when explicit slice manifest is missing)
|
||||
|
||||
### Phase 2 — no legacy paths
|
||||
|
||||
- All datasets use sharded converted store only
|
||||
- Removed legacy JSON layout and migration
|
||||
- Removed legacy report history format
|
||||
- Auto-apply first catalog slice when `--slice` omitted
|
||||
- Namespace seed folded into corpus manifest (removed `cache/snapshots/`)
|
||||
|
||||
### Phase 3 — performance
|
||||
|
||||
- Ingestion always uses in-memory SurrealDB
|
||||
- Slice-first partial load when ledger is complete
|
||||
- Default catalog slice for dataset when `--slice` not passed
|
||||
- Split `slice/` into `mod.rs`, `build.rs`, and `beir.rs`
|
||||
|
||||
### Phase 4 — BEIR mix slice-first
|
||||
|
||||
- `beir` is a virtual mix: slice ledger references prefixed ids (`fever-…`, `fiqa-…`, …)
|
||||
- Conversion is **qrels-closed** per subset (only documents appearing in qrels, not full corpus)
|
||||
- Slice ledger is resolved for the requested `--slice` (catalog preset or custom id + `--limit`)
|
||||
- Only ledger paragraph ids are materialized into per-subset stores (`fever-minne/`, `fiqa-minne/`, …)
|
||||
- No monolithic `beir-minne/` merged store
|
||||
- Raw BEIR data lives in per-subset dirs under `data/raw/`; `data/raw/beir` is a catalog placeholder
|
||||
|
||||
## Do not re-introduce
|
||||
|
||||
- Monolithic `*-minne.json` converted files
|
||||
- Monolithic `beir-minne/` merged converted store (use per-subset stores + virtual mix loader)
|
||||
- `state-machines` pipeline for this linear flow
|
||||
- `eval.rs` re-export hub
|
||||
- Legacy history migration in reports
|
||||
- Ephemeral `ingest_eval_*` namespaces on the shared SurrealDB server
|
||||
- Separate `cache/snapshots/` namespace state files
|
||||
|
||||
## Open follow-ups
|
||||
|
||||
- Generate `DatasetKind` from `manifest.yaml` at build time
|
||||
- Split `report.rs` when touching reporting again
|
||||
@@ -1,4 +1,4 @@
|
||||
default_dataset: squad-v2
|
||||
default_dataset: beir
|
||||
datasets:
|
||||
- id: squad-v2
|
||||
label: "SQuAD v2.0"
|
||||
@@ -45,6 +45,7 @@ datasets:
|
||||
description: "Balanced slice across FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID, SciFact, NQ-BEIR"
|
||||
limit: 600
|
||||
corpus_limit: 6000
|
||||
negative_multiplier: 9.0
|
||||
seed: 0x5eed2025
|
||||
- id: fever
|
||||
label: "FEVER (BEIR)"
|
||||
|
||||
+66
-18
@@ -137,9 +137,9 @@ pub struct IngestConfig {
|
||||
#[arg(long, default_value_t = 50)]
|
||||
pub ingest_chunk_overlap_tokens: usize,
|
||||
|
||||
/// Run ingestion in chunk-only mode (skip analyzer/graph generation)
|
||||
/// Include entity extraction and graph generation during ingestion (uses LLM tokens)
|
||||
#[arg(long)]
|
||||
pub ingest_chunks_only: bool,
|
||||
pub include_entities: bool,
|
||||
|
||||
/// Number of paragraphs to ingest concurrently
|
||||
#[arg(long, default_value_t = 10)]
|
||||
@@ -159,6 +159,7 @@ pub struct IngestConfig {
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Args)]
|
||||
#[allow(clippy::struct_field_names)]
|
||||
pub struct DatabaseArgs {
|
||||
/// `SurrealDB` server endpoint
|
||||
#[arg(long, default_value = "ws://127.0.0.1:8000", env = "EVAL_DB_ENDPOINT")]
|
||||
@@ -179,10 +180,6 @@ pub struct DatabaseArgs {
|
||||
/// Override the database used on the `SurrealDB` server
|
||||
#[arg(long, env = "EVAL_DB_DATABASE")]
|
||||
pub db_database: Option<String>,
|
||||
|
||||
/// Path to inspect DB state
|
||||
#[arg(long)]
|
||||
pub inspect_db_state: Option<PathBuf>,
|
||||
}
|
||||
|
||||
#[derive(Parser, Debug, Clone)]
|
||||
@@ -233,10 +230,6 @@ pub struct Config {
|
||||
#[arg(long, default_value_t = 5)]
|
||||
pub sample: usize,
|
||||
|
||||
/// Disable context cropping when converting datasets (ingest entire documents)
|
||||
#[arg(long)]
|
||||
pub full_context: bool,
|
||||
|
||||
#[command(flatten)]
|
||||
pub retrieval: RetrievalSettings,
|
||||
|
||||
@@ -322,6 +315,18 @@ pub struct Config {
|
||||
#[command(flatten)]
|
||||
pub database: DatabaseArgs,
|
||||
|
||||
/// Require warmed corpus/namespace before running queries
|
||||
#[arg(long)]
|
||||
pub require_ready: bool,
|
||||
|
||||
/// Prepare converted data, slice, corpus, and namespace without running queries
|
||||
#[arg(long, conflicts_with = "status")]
|
||||
pub warm: bool,
|
||||
|
||||
/// Print readiness of converted data, slice, corpus, and namespace
|
||||
#[arg(long, conflicts_with = "warm")]
|
||||
pub status: bool,
|
||||
|
||||
// Computed fields (not arguments)
|
||||
#[arg(skip)]
|
||||
pub raw_dataset_path: PathBuf,
|
||||
@@ -334,11 +339,6 @@ pub struct Config {
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Returns the context token limit for dataset conversion.
///
/// This implementation unconditionally returns `None` and never reads `self`,
/// which is why the `clippy::unused_self` lint is silenced on this method.
#[allow(clippy::unused_self)]
|
||||
pub fn context_token_limit(&self) -> Option<usize> {
|
||||
None
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_lines)]
|
||||
pub fn finalize(&mut self) -> Result<()> {
|
||||
// Handle dataset paths
|
||||
@@ -367,9 +367,7 @@ impl Config {
|
||||
// Handle retrieval settings
|
||||
self.retrieval.require_verified_chunks = !self.llm_mode;
|
||||
|
||||
if self.dataset == DatasetKind::Beir {
|
||||
self.negative_multiplier = 9.0;
|
||||
}
|
||||
self.apply_catalog_slice_defaults()?;
|
||||
|
||||
// Validations
|
||||
if self.ingest.ingest_chunk_min_tokens == 0
|
||||
@@ -477,6 +475,56 @@ impl Config {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn apply_catalog_slice_defaults(&mut self) -> Result<()> {
|
||||
let catalog = crate::datasets::catalog()?;
|
||||
let entry = catalog.dataset(self.dataset.id())?;
|
||||
|
||||
if self.slice.is_none() {
|
||||
if let Some(default_slice) = entry.slices.first() {
|
||||
self.slice = Some(default_slice.id.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let Some(slice_id) = self.slice.as_deref() else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let Ok((_, slice)) = catalog.slice(slice_id) else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
if slice.dataset_id != self.dataset.id() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if let Some(limit) = slice.limit {
|
||||
if self.limit_arg == 200 {
|
||||
self.limit_arg = limit;
|
||||
self.limit = Some(limit);
|
||||
}
|
||||
}
|
||||
if self.corpus_limit.is_none() {
|
||||
self.corpus_limit = slice.corpus_limit;
|
||||
}
|
||||
if let Some(seed) = slice.seed {
|
||||
self.slice_seed = seed;
|
||||
}
|
||||
if let Some(include_unanswerable) = slice.include_unanswerable {
|
||||
self.llm_mode = include_unanswerable;
|
||||
self.retrieval.require_verified_chunks = !include_unanswerable;
|
||||
}
|
||||
if let Some(multiplier) = slice.negative_multiplier {
|
||||
if negative_multiplier_is_default(self.negative_multiplier) {
|
||||
self.negative_multiplier = multiplier;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn negative_multiplier_is_default(value: f32) -> bool {
|
||||
(value - crate::slice::DEFAULT_NEGATIVE_MULTIPLIER).abs() < f32::EPSILON
|
||||
}
|
||||
|
||||
pub struct ParsedArgs {
|
||||
|
||||
@@ -1,88 +0,0 @@
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
path::Path,
|
||||
sync::{
|
||||
atomic::{AtomicBool, Ordering},
|
||||
Arc,
|
||||
},
|
||||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
/// Serializable payload of the embedding cache: the data that `EmbeddingCache`
/// parses from and writes to its JSON file.
#[derive(Debug, Default, Serialize, Deserialize)]
|
||||
struct EmbeddingCacheData {
|
||||
/// Entity embeddings keyed by the string id passed to `insert_entity`.
entities: HashMap<String, Vec<f32>>,
|
||||
/// Chunk embeddings keyed by the string id passed to `insert_chunk`.
chunks: HashMap<String, Vec<f32>>,
|
||||
}
|
||||
|
||||
/// Cloneable handle to a JSON-file-backed embedding cache.
///
/// All fields are `Arc`s, so clones share the same path, data and dirty flag.
#[derive(Clone)]
|
||||
pub struct EmbeddingCache {
|
||||
/// File the cache is read from in `load` and written to in `persist`.
path: Arc<Path>,
|
||||
/// Cached embeddings behind an async mutex.
data: Arc<Mutex<EmbeddingCacheData>>,
|
||||
/// Set by the insert methods and cleared after a successful `persist`.
dirty: Arc<AtomicBool>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl EmbeddingCache {
|
||||
pub async fn load(path: impl AsRef<Path>) -> Result<Self> {
|
||||
let path = path.as_ref().to_path_buf();
|
||||
let data = if path.exists() {
|
||||
let raw = tokio::fs::read(&path)
|
||||
.await
|
||||
.with_context(|| format!("reading embedding cache {}", path.display()))?;
|
||||
serde_json::from_slice(&raw)
|
||||
.with_context(|| format!("parsing embedding cache {}", path.display()))?
|
||||
} else {
|
||||
EmbeddingCacheData::default()
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
path: Arc::from(path.as_path()),
|
||||
data: Arc::new(Mutex::new(data)),
|
||||
dirty: Arc::new(AtomicBool::new(false)),
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_entity(&self, id: &str) -> Option<Vec<f32>> {
|
||||
let guard = self.data.lock().await;
|
||||
guard.entities.get(id).cloned()
|
||||
}
|
||||
|
||||
pub async fn insert_entity(&self, id: String, embedding: Vec<f32>) {
|
||||
let mut guard = self.data.lock().await;
|
||||
guard.entities.insert(id, embedding);
|
||||
self.dirty.store(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub async fn get_chunk(&self, id: &str) -> Option<Vec<f32>> {
|
||||
let guard = self.data.lock().await;
|
||||
guard.chunks.get(id).cloned()
|
||||
}
|
||||
|
||||
pub async fn insert_chunk(&self, id: String, embedding: Vec<f32>) {
|
||||
let mut guard = self.data.lock().await;
|
||||
guard.chunks.insert(id, embedding);
|
||||
self.dirty.store(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub async fn persist(&self) -> Result<()> {
|
||||
if !self.dirty.load(Ordering::Relaxed) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let guard = self.data.lock().await;
|
||||
let body = serde_json::to_vec_pretty(&*guard).context("serialising embedding cache")?;
|
||||
if let Some(parent) = self.path.parent() {
|
||||
tokio::fs::create_dir_all(parent)
|
||||
.await
|
||||
.with_context(|| format!("creating cache directory {}", parent.display()))?;
|
||||
}
|
||||
tokio::fs::write(&*self.path, body)
|
||||
.await
|
||||
.with_context(|| format!("writing embedding cache {}", self.path.display()))?;
|
||||
self.dirty.store(false, Ordering::Relaxed);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -156,6 +156,7 @@ mod tests {
|
||||
chunk_min_tokens: 1,
|
||||
chunk_max_tokens: 10,
|
||||
chunk_only: false,
|
||||
namespace_seed: None,
|
||||
},
|
||||
paragraphs,
|
||||
questions,
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
pub mod status;
|
||||
|
||||
pub use status::{collect_status, ensure_query_ready, print_status, warm};
|
||||
@@ -0,0 +1,316 @@
|
||||
#![allow(clippy::module_name_repetitions)]
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::{
|
||||
args::Config,
|
||||
corpus::{self, CorpusCacheConfig},
|
||||
datasets::{
|
||||
beir_subset_store_summary, beir_subset_stores_ready, content_checksum_for_layout,
|
||||
detect_layout, mix_content_checksum, store_dir_for, ConvertedLayout, DatasetKind,
|
||||
},
|
||||
db::{connect_eval_db, default_database, default_namespace, namespace_has_corpus},
|
||||
slice::{self, ledger_target},
|
||||
};
|
||||
|
||||
/// Readiness report returned by `collect_status`, with one sub-status per
/// preparation layer (converted data, slice ledger, corpus cache, namespace).
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct EvalStatus {
|
||||
/// Dataset the report refers to (presumably the dataset id — confirm against `collect_status`).
pub dataset: String,
|
||||
/// Slice the report refers to, if one is selected.
pub slice: Option<String>,
|
||||
/// State of the converted dataset store.
pub converted: ConvertedStatus,
|
||||
/// State of the slice ledger.
pub slice_ledger: SliceLedgerStatus,
|
||||
/// State of the corpus ingestion cache.
pub corpus_cache: CorpusCacheStatus,
|
||||
/// State of the database namespace.
pub namespace: NamespaceStatus,
|
||||
/// NOTE(review): presumably true only when every layer needed for querying is ready — confirm.
pub query_ready: bool,
|
||||
/// Free-form notes gathered while collecting the status.
pub notes: Vec<String>,
|
||||
}
|
||||
|
||||
/// Status of the converted dataset store.
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct ConvertedStatus {
|
||||
/// Layout label; presumably one of the labels computed in `collect_status`
/// (`"beir-mix-subset-stores"`, `"sharded-store"`, `"missing"`) — confirm.
pub layout: String,
|
||||
/// Human-readable location of the store (presumably the store directory, or a
/// per-subset summary for the BEIR mix — confirm).
pub path: String,
|
||||
/// Whether the converted data is ready for use.
pub ready: bool,
|
||||
/// NOTE(review): presumably whether the store can be loaded partially (slice-first) — confirm.
pub partial_load_eligible: bool,
|
||||
/// Content checksum of the converted data, when available.
pub checksum: Option<String>,
|
||||
}
|
||||
|
||||
/// Status of the slice ledger (cached slice manifest), as filled in by `collect_status`.
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct SliceLedgerStatus {
|
||||
/// True when a cached manifest exists and `slice::manifest_is_complete` accepts it
/// for the current slice configuration.
pub ready: bool,
|
||||
/// Display form of the cached manifest path, when one is defined.
pub path: Option<String>,
|
||||
/// `case_count` from the manifest; `None` when no manifest could be read.
pub cases: Option<usize>,
|
||||
/// `positive_paragraphs` from the manifest; `None` when no manifest could be read.
pub positives: Option<usize>,
|
||||
/// `negative_paragraphs` from the manifest; `None` when no manifest could be read.
pub negatives: Option<usize>,
|
||||
}
|
||||
|
||||
/// Status of the corpus ingestion cache.
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct CorpusCacheStatus {
|
||||
/// Whether the corpus cache is ready for use.
pub ready: bool,
|
||||
/// Location of the corpus cache, when known.
pub path: Option<String>,
|
||||
/// NOTE(review): presumably whether the corpus manifest file exists on disk — confirm.
pub manifest_present: bool,
|
||||
}
|
||||
|
||||
/// Status of the database namespace used for evaluation queries.
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct NamespaceStatus {
|
||||
/// Namespace name the status refers to.
pub namespace: String,
|
||||
/// Database name the status refers to.
pub database: String,
|
||||
/// NOTE(review): presumably whether the namespace already contains the corpus — confirm.
pub seeded: bool,
|
||||
/// NOTE(review): presumably whether a namespace seed is recorded in the corpus manifest — confirm.
pub namespace_seed_recorded: bool,
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_lines)]
|
||||
pub async fn collect_status(config: &Config) -> Result<EvalStatus> {
|
||||
let mut notes = Vec::new();
|
||||
let is_beir_mix = config.dataset == DatasetKind::Beir;
|
||||
let converted_path = &config.converted_dataset_path;
|
||||
let layout = if is_beir_mix {
|
||||
ConvertedLayout::Missing
|
||||
} else {
|
||||
detect_layout(converted_path)
|
||||
};
|
||||
let layout_label = if is_beir_mix {
|
||||
"beir-mix-subset-stores"
|
||||
} else {
|
||||
match layout {
|
||||
ConvertedLayout::ShardedStore => "sharded-store",
|
||||
ConvertedLayout::Missing => "missing",
|
||||
}
|
||||
};
|
||||
|
||||
let store_dir = store_dir_for(converted_path);
|
||||
let display_path = if is_beir_mix {
|
||||
beir_subset_store_summary()?
|
||||
.into_iter()
|
||||
.map(|(subset, paragraphs, questions)| {
|
||||
format!("{subset}-minne ({paragraphs} paragraphs, {questions} questions)")
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("; ")
|
||||
} else {
|
||||
store_dir.display().to_string()
|
||||
};
|
||||
|
||||
let manifest_path = slice::cached_manifest_path(config);
|
||||
let slice_config = slice::slice_config_with_limit(config, ledger_target(config));
|
||||
let slice_manifest = manifest_path
|
||||
.as_ref()
|
||||
.and_then(|path| slice::read_manifest_if_exists(path).ok().flatten());
|
||||
|
||||
let slice_ledger = SliceLedgerStatus {
|
||||
ready: slice_manifest
|
||||
.as_ref()
|
||||
.is_some_and(|manifest| slice::manifest_is_complete(manifest, &slice_config)),
|
||||
path: manifest_path.as_ref().map(|path| path.display().to_string()),
|
||||
cases: slice_manifest.as_ref().map(|manifest| manifest.case_count),
|
||||
positives: slice_manifest.as_ref().map(|manifest| manifest.positive_paragraphs),
|
||||
negatives: slice_manifest.as_ref().map(|manifest| manifest.negative_paragraphs),
|
||||
};
|
||||
|
||||
let beir_paragraph_ids = slice_manifest.as_ref().map(|manifest| {
|
||||
manifest
|
||||
.paragraphs
|
||||
.iter()
|
||||
.map(|entry| entry.id.clone())
|
||||
.collect::<std::collections::HashSet<_>>()
|
||||
});
|
||||
|
||||
let converted_ready = if is_beir_mix {
|
||||
slice_ledger.ready
|
||||
&& beir_paragraph_ids
|
||||
.as_ref()
|
||||
.is_some_and(|ids| beir_subset_stores_ready(ids).unwrap_or(false))
|
||||
} else {
|
||||
layout == ConvertedLayout::ShardedStore
|
||||
};
|
||||
|
||||
let checksum = if is_beir_mix {
|
||||
beir_paragraph_ids
|
||||
.as_ref()
|
||||
.and_then(|ids| mix_content_checksum(ids).ok())
|
||||
} else if layout == ConvertedLayout::ShardedStore {
|
||||
content_checksum_for_layout(converted_path).ok()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let partial_load_eligible = slice_ledger.ready && config.slice.is_some();
|
||||
|
||||
let corpus_cache = if let Some(manifest) = slice_manifest.as_ref() {
|
||||
let cache_settings = CorpusCacheConfig::from(config);
|
||||
let base_dir = corpus::cached_corpus_dir(
|
||||
&cache_settings,
|
||||
config.dataset.id(),
|
||||
manifest.slice_id.as_str(),
|
||||
);
|
||||
let manifest_present = corpus::load_cached_manifest(&base_dir)?.is_some();
|
||||
CorpusCacheStatus {
|
||||
ready: manifest_present,
|
||||
path: Some(base_dir.display().to_string()),
|
||||
manifest_present,
|
||||
}
|
||||
} else {
|
||||
CorpusCacheStatus {
|
||||
ready: false,
|
||||
path: None,
|
||||
manifest_present: false,
|
||||
}
|
||||
};
|
||||
|
||||
let namespace = config
|
||||
.database
|
||||
.db_namespace
|
||||
.clone()
|
||||
.unwrap_or_else(|| {
|
||||
default_namespace(
|
||||
config.dataset.id(),
|
||||
config.limit,
|
||||
config.slice.as_deref(),
|
||||
)
|
||||
});
|
||||
let database = config
|
||||
.database
|
||||
.db_database
|
||||
.clone()
|
||||
.unwrap_or_else(default_database);
|
||||
|
||||
let namespace_seed = corpus_cache.path.as_ref().and_then(|path| {
|
||||
corpus::load_cached_manifest(Path::new(path))
|
||||
.ok()
|
||||
.flatten()
|
||||
.and_then(|manifest| manifest.metadata.namespace_seed)
|
||||
});
|
||||
|
||||
let (seeded, namespace_seed_recorded) = match connect_eval_db(config, &namespace, &database).await {
|
||||
Ok(db) => {
|
||||
let has_corpus = namespace_has_corpus(&db).await.unwrap_or(false);
|
||||
(has_corpus, namespace_seed.is_some())
|
||||
}
|
||||
Err(err) => {
|
||||
notes.push(format!("SurrealDB unavailable: {err}"));
|
||||
(false, false)
|
||||
}
|
||||
};
|
||||
|
||||
let query_ready = converted_ready
|
||||
&& slice_ledger.ready
|
||||
&& corpus_cache.ready
|
||||
&& seeded
|
||||
&& namespace_seed_recorded;
|
||||
|
||||
if !query_ready {
|
||||
notes.push("Run `cargo eval --warm --slice <id>` to prepare corpus and namespace.".into());
|
||||
}
|
||||
|
||||
Ok(EvalStatus {
|
||||
dataset: config.dataset.id().to_string(),
|
||||
slice: config.slice.clone(),
|
||||
converted: ConvertedStatus {
|
||||
layout: layout_label.to_string(),
|
||||
path: display_path,
|
||||
ready: converted_ready,
|
||||
partial_load_eligible,
|
||||
checksum,
|
||||
},
|
||||
slice_ledger,
|
||||
corpus_cache,
|
||||
namespace: NamespaceStatus {
|
||||
namespace,
|
||||
database,
|
||||
seeded,
|
||||
namespace_seed_recorded,
|
||||
},
|
||||
query_ready,
|
||||
notes,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn print_status(status: &EvalStatus) {
|
||||
println!("Evaluation status for dataset `{}`", status.dataset);
|
||||
if let Some(slice) = &status.slice {
|
||||
println!("Slice: {slice}");
|
||||
}
|
||||
println!(
|
||||
"Converted: {} ({})",
|
||||
if status.converted.ready {
|
||||
"ready"
|
||||
} else {
|
||||
"missing"
|
||||
},
|
||||
status.converted.layout
|
||||
);
|
||||
println!("Converted path: {}", status.converted.path);
|
||||
if status.converted.partial_load_eligible {
|
||||
println!("Slice-first loading: eligible");
|
||||
}
|
||||
println!(
|
||||
"Slice ledger: {}",
|
||||
if status.slice_ledger.ready {
|
||||
format!(
|
||||
"ready ({} cases, {} positives, {} negatives)",
|
||||
status.slice_ledger.cases.unwrap_or(0),
|
||||
status.slice_ledger.positives.unwrap_or(0),
|
||||
status.slice_ledger.negatives.unwrap_or(0)
|
||||
)
|
||||
} else {
|
||||
"missing or incomplete".to_string()
|
||||
}
|
||||
);
|
||||
if let Some(path) = &status.slice_ledger.path {
|
||||
println!("Slice ledger path: {path}");
|
||||
}
|
||||
println!(
|
||||
"Corpus cache: {}",
|
||||
if status.corpus_cache.ready {
|
||||
"ready"
|
||||
} else {
|
||||
"missing"
|
||||
}
|
||||
);
|
||||
if let Some(path) = &status.corpus_cache.path {
|
||||
println!("Corpus cache path: {path}");
|
||||
}
|
||||
println!(
|
||||
"Namespace `{}` / `{}`: seeded={}, namespace_seed_recorded={}",
|
||||
status.namespace.namespace,
|
||||
status.namespace.database,
|
||||
status.namespace.seeded,
|
||||
status.namespace.namespace_seed_recorded
|
||||
);
|
||||
println!(
|
||||
"Query-ready: {}",
|
||||
if status.query_ready {
|
||||
"yes"
|
||||
} else {
|
||||
"no"
|
||||
}
|
||||
);
|
||||
for note in &status.notes {
|
||||
println!("Note: {note}");
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn warm(config: &Config) -> Result<()> {
|
||||
let loaded =
|
||||
crate::datasets::prepare_dataset(config.dataset, config).context("preparing dataset")?;
|
||||
crate::pipeline::warm_evaluation(&loaded.dataset, config, &loaded.content_checksum)
|
||||
.await
|
||||
.context("warming evaluation corpus and namespace")?;
|
||||
let status = collect_status(config).await?;
|
||||
print_status(&status);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn ensure_query_ready(config: &Config) -> Result<()> {
|
||||
let status = collect_status(config).await?;
|
||||
if status.query_ready {
|
||||
return Ok(());
|
||||
}
|
||||
print_status(&status);
|
||||
anyhow::bail!(
|
||||
"evaluation is not query-ready; run `cargo eval --warm --slice {}` first",
|
||||
config.slice.as_deref().unwrap_or("<slice-id>")
|
||||
);
|
||||
}
|
||||
@@ -0,0 +1,177 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use common::storage::types::StoredObject;
|
||||
|
||||
use crate::types::EvaluationCandidate;
|
||||
|
||||
/// Label stored in [`RetrievalContextStats::tokenizer`]; flags that token
/// counts are a chars/4 estimate rather than real tokenizer output.
const TOKENIZER_LABEL: &str = "estimated (~chars/4; ingestion uses bert-base-cased)";
|
||||
|
||||
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct RetrievedContextStats {
|
||||
pub chunk_count: usize,
|
||||
pub char_count: usize,
|
||||
pub token_count: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct RetrievalContextStats {
|
||||
pub tokenizer: String,
|
||||
pub queries: usize,
|
||||
pub total_chunks: usize,
|
||||
pub total_chars: usize,
|
||||
pub total_tokens: usize,
|
||||
pub avg_chunks_per_query: f64,
|
||||
pub avg_chars_per_query: f64,
|
||||
pub avg_tokens_per_query: f64,
|
||||
pub p50_tokens_per_query: usize,
|
||||
pub p95_tokens_per_query: usize,
|
||||
pub max_tokens_per_query: usize,
|
||||
}
|
||||
|
||||
pub fn stats_for_candidates(candidates: &[EvaluationCandidate]) -> RetrievedContextStats {
|
||||
let mut seen_chunk_ids = std::collections::HashSet::new();
|
||||
let mut stats = RetrievedContextStats::default();
|
||||
|
||||
for candidate in candidates {
|
||||
for chunk in &candidate.chunks {
|
||||
let chunk_id = chunk.chunk.id().to_string();
|
||||
if !seen_chunk_ids.insert(chunk_id) {
|
||||
continue;
|
||||
}
|
||||
let text = chunk.chunk.chunk.as_str();
|
||||
stats.chunk_count += 1;
|
||||
stats.char_count += text.chars().count();
|
||||
stats.token_count += estimate_ingestion_tokens(text);
|
||||
}
|
||||
}
|
||||
|
||||
stats
|
||||
}
|
||||
|
||||
pub fn aggregate_context_stats(per_query: &[RetrievedContextStats]) -> RetrievalContextStats {
|
||||
let queries = per_query.len();
|
||||
if queries == 0 {
|
||||
return RetrievalContextStats {
|
||||
tokenizer: TOKENIZER_LABEL.to_string(),
|
||||
queries: 0,
|
||||
total_chunks: 0,
|
||||
total_chars: 0,
|
||||
total_tokens: 0,
|
||||
avg_chunks_per_query: 0.0,
|
||||
avg_chars_per_query: 0.0,
|
||||
avg_tokens_per_query: 0.0,
|
||||
p50_tokens_per_query: 0,
|
||||
p95_tokens_per_query: 0,
|
||||
max_tokens_per_query: 0,
|
||||
};
|
||||
}
|
||||
|
||||
let total_chunks: usize = per_query.iter().map(|stats| stats.chunk_count).sum();
|
||||
let total_chars: usize = per_query.iter().map(|stats| stats.char_count).sum();
|
||||
let total_tokens: usize = per_query.iter().map(|stats| stats.token_count).sum();
|
||||
let mut tokens_per_query: Vec<usize> = per_query.iter().map(|stats| stats.token_count).collect();
|
||||
tokens_per_query.sort_unstable();
|
||||
let max_tokens_per_query = *tokens_per_query.last().unwrap_or(&0);
|
||||
|
||||
RetrievalContextStats {
|
||||
tokenizer: TOKENIZER_LABEL.to_string(),
|
||||
queries,
|
||||
total_chunks,
|
||||
total_chars,
|
||||
total_tokens,
|
||||
avg_chunks_per_query: total_chunks as f64 / queries as f64,
|
||||
avg_chars_per_query: total_chars as f64 / queries as f64,
|
||||
avg_tokens_per_query: total_tokens as f64 / queries as f64,
|
||||
p50_tokens_per_query: percentile_usize(&tokens_per_query, 0.50),
|
||||
p95_tokens_per_query: percentile_usize(&tokens_per_query, 0.95),
|
||||
max_tokens_per_query,
|
||||
}
|
||||
}
|
||||
|
||||
/// Estimates the token count of `text` as one token per four characters,
/// rounded up. Empty text yields zero.
fn estimate_ingestion_tokens(text: &str) -> usize {
    // Characters are Unicode scalar values, not bytes; `div_ceil` maps 0 to 0.
    text.chars().count().div_ceil(4)
}
|
||||
|
||||
/// Returns the value at `fraction` (0.0..=1.0) of an ascending-sorted slice,
/// choosing the nearest rank. `fraction` is clamped into range, and an empty
/// slice yields 0.
// Float rank arithmetic; the index is clamped to the last element before use.
#[allow(clippy::cast_precision_loss, clippy::indexing_slicing, clippy::arithmetic_side_effects)]
fn percentile_usize(sorted: &[usize], fraction: f64) -> usize {
    // `checked_sub` doubles as the empty-slice guard.
    let Some(last_index) = sorted.len().checked_sub(1) else {
        return 0;
    };
    let clamped = fraction.clamp(0.0, 1.0);
    let index = (last_index as f64 * clamped).round() as usize;
    sorted[index.min(last_index)]
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use super::*;
    use common::storage::types::text_chunk::TextChunk;
    use retrieval_pipeline::RetrievedChunk;

    /// Two candidates pointing at the same chunk must contribute it only once.
    #[test]
    fn deduplicates_chunks_when_counting_context() {
        let shared = Arc::new(TextChunk::new(
            "src".into(),
            "hello world".into(),
            "user".into(),
        ));
        let candidates = vec![
            EvaluationCandidate {
                entity_id: "a".into(),
                source_id: "src".into(),
                entity_name: "A".into(),
                entity_description: None,
                entity_category: None,
                score: 1.0,
                chunks: vec![RetrievedChunk {
                    chunk: Arc::clone(&shared),
                    score: 1.0,
                }],
            },
            EvaluationCandidate {
                entity_id: "b".into(),
                source_id: "src".into(),
                entity_name: "B".into(),
                entity_description: None,
                entity_category: None,
                score: 0.9,
                chunks: vec![RetrievedChunk {
                    chunk: shared,
                    score: 0.9,
                }],
            },
        ];
        let stats = stats_for_candidates(&candidates);
        assert_eq!(stats.chunk_count, 1);
        assert_eq!(stats.char_count, "hello world".chars().count());
        // 11 characters at ~4 chars per token, rounded up.
        assert_eq!(stats.token_count, 3);
    }

    /// Totals, maximum and mean are derived from the per-query entries.
    #[test]
    fn aggregates_per_query_token_totals() {
        let per_query = vec![
            RetrievedContextStats {
                chunk_count: 2,
                char_count: 100,
                token_count: 40,
            },
            RetrievedContextStats {
                chunk_count: 5,
                char_count: 250,
                token_count: 100,
            },
        ];
        let aggregate = aggregate_context_stats(&per_query);
        assert_eq!(aggregate.queries, 2);
        assert_eq!(aggregate.total_chunks, 7);
        assert_eq!(aggregate.total_tokens, 140);
        assert_eq!(aggregate.max_tokens_per_query, 100);
        assert!((aggregate.avg_tokens_per_query - 70.0).abs() < f64::EPSILON);
    }
}
|
||||
@@ -11,32 +11,14 @@ pub struct CorpusCacheConfig {
|
||||
pub ingestion_max_retries: usize,
|
||||
}
|
||||
|
||||
impl CorpusCacheConfig {
|
||||
pub fn new(
|
||||
ingestion_cache_dir: impl Into<PathBuf>,
|
||||
force_refresh: bool,
|
||||
refresh_embeddings_only: bool,
|
||||
ingestion_batch_size: usize,
|
||||
ingestion_max_retries: usize,
|
||||
) -> Self {
|
||||
impl From<&Config> for CorpusCacheConfig {
|
||||
fn from(config: &Config) -> Self {
|
||||
Self {
|
||||
ingestion_cache_dir: ingestion_cache_dir.into(),
|
||||
force_refresh,
|
||||
refresh_embeddings_only,
|
||||
ingestion_batch_size,
|
||||
ingestion_max_retries,
|
||||
ingestion_cache_dir: config.ingest.ingestion_cache_dir.clone(),
|
||||
force_refresh: config.force_convert || config.ingest.slice_reset_ingestion,
|
||||
refresh_embeddings_only: config.ingest.refresh_embeddings_only,
|
||||
ingestion_batch_size: config.ingest.ingestion_batch_size,
|
||||
ingestion_max_retries: config.ingest.ingestion_max_retries,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&Config> for CorpusCacheConfig {
|
||||
fn from(config: &Config) -> Self {
|
||||
CorpusCacheConfig::new(
|
||||
config.ingest.ingestion_cache_dir.clone(),
|
||||
config.force_convert || config.ingest.slice_reset_ingestion,
|
||||
config.ingest.refresh_embeddings_only,
|
||||
config.ingest.ingestion_batch_size,
|
||||
config.ingest.ingestion_max_retries,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,11 +5,11 @@ pub(crate) mod store;
|
||||
pub use config::CorpusCacheConfig;
|
||||
pub use orchestrator::{
|
||||
cached_corpus_dir, compute_ingestion_fingerprint, corpus_handle_from_manifest, ensure_corpus,
|
||||
load_cached_manifest,
|
||||
load_cached_manifest, persist_corpus_manifest,
|
||||
};
|
||||
pub use store::{
|
||||
seed_manifest_into_db, window_manifest, CorpusHandle, CorpusManifest, CorpusMetadata,
|
||||
CorpusQuestion, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
|
||||
CorpusQuestion, NamespaceSeedRecord, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
|
||||
};
|
||||
|
||||
pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig {
|
||||
@@ -20,6 +20,6 @@ pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline
|
||||
chunk_overlap_tokens: config.ingest.ingest_chunk_overlap_tokens,
|
||||
..Default::default()
|
||||
},
|
||||
chunk_only: config.ingest.ingest_chunks_only,
|
||||
chunk_only: !config.ingest.include_entities,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,8 +9,6 @@ use std::{
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use async_openai::Client;
|
||||
use chrono::Utc;
|
||||
#[cfg(not(test))]
|
||||
use common::utils::config::get_config;
|
||||
use common::{
|
||||
storage::{
|
||||
db::SurrealDbClient,
|
||||
@@ -125,10 +123,14 @@ pub async fn ensure_corpus(
|
||||
openai: Arc<OpenAIClient>,
|
||||
user_id: &str,
|
||||
converted_path: &Path,
|
||||
precomputed_checksum: Option<&str>,
|
||||
ingestion_config: IngestionConfig,
|
||||
) -> Result<CorpusHandle> {
|
||||
let checksum = compute_file_checksum(converted_path)
|
||||
.with_context(|| format!("computing checksum for {}", converted_path.display()))?;
|
||||
let checksum = match precomputed_checksum {
|
||||
Some(value) => value.to_string(),
|
||||
None => crate::datasets::content_checksum_for_layout(converted_path)
|
||||
.with_context(|| format!("computing checksum for {}", converted_path.display()))?,
|
||||
};
|
||||
let ingestion_fingerprint =
|
||||
build_ingestion_fingerprint(dataset, slice, &checksum, &ingestion_config);
|
||||
|
||||
@@ -381,6 +383,7 @@ pub async fn ensure_corpus(
|
||||
chunk_min_tokens: ingestion_config.tuning.chunk_min_tokens,
|
||||
chunk_max_tokens: ingestion_config.tuning.chunk_max_tokens,
|
||||
chunk_only: ingestion_config.chunk_only,
|
||||
namespace_seed: None,
|
||||
},
|
||||
paragraphs: corpus_paragraphs,
|
||||
questions: corpus_questions,
|
||||
@@ -415,7 +418,7 @@ pub async fn ensure_corpus(
|
||||
negative_ingested: stats.negative_ingested,
|
||||
};
|
||||
|
||||
persist_manifest(&handle).context("persisting corpus manifest")?;
|
||||
persist_corpus_manifest(&handle).context("persisting corpus manifest")?;
|
||||
|
||||
Ok(handle)
|
||||
}
|
||||
@@ -501,7 +504,6 @@ async fn ingest_paragraph_batch(
|
||||
Ok(shards)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
async fn create_ingest_db(namespace: &str) -> Result<Arc<SurrealDbClient>> {
|
||||
let db = SurrealDbClient::memory(namespace, "corpus")
|
||||
.await
|
||||
@@ -509,21 +511,6 @@ async fn create_ingest_db(namespace: &str) -> Result<Arc<SurrealDbClient>> {
|
||||
Ok(Arc::new(db))
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
async fn create_ingest_db(namespace: &str) -> Result<Arc<SurrealDbClient>> {
|
||||
let config = get_config().context("loading app config for ingestion database")?;
|
||||
let db = SurrealDbClient::new(
|
||||
&config.surrealdb_address,
|
||||
&config.surrealdb_username,
|
||||
&config.surrealdb_password,
|
||||
namespace,
|
||||
"corpus",
|
||||
)
|
||||
.await
|
||||
.context("creating surrealdb database for ingestion")?;
|
||||
Ok(Arc::new(db))
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn ingest_single_paragraph(
|
||||
pipeline: Arc<IngestionPipeline>,
|
||||
@@ -631,8 +618,12 @@ pub fn compute_ingestion_fingerprint(
|
||||
slice: &ResolvedSlice<'_>,
|
||||
converted_path: &Path,
|
||||
ingestion_config: &IngestionConfig,
|
||||
precomputed_checksum: Option<&str>,
|
||||
) -> Result<String> {
|
||||
let checksum = compute_file_checksum(converted_path)?;
|
||||
let checksum = match precomputed_checksum {
|
||||
Some(value) => value.to_string(),
|
||||
None => crate::datasets::content_checksum_for_layout(converted_path)?,
|
||||
};
|
||||
Ok(build_ingestion_fingerprint(
|
||||
dataset,
|
||||
slice,
|
||||
@@ -641,7 +632,7 @@ pub fn compute_ingestion_fingerprint(
|
||||
))
|
||||
}
|
||||
|
||||
pub fn load_cached_manifest(base_dir: &Path) -> Result<Option<CorpusManifest>> {
|
||||
pub fn load_cached_manifest(base_dir: &std::path::Path) -> Result<Option<CorpusManifest>> {
|
||||
let path = base_dir.join("manifest.json");
|
||||
if !path.exists() {
|
||||
return Ok(None);
|
||||
@@ -656,7 +647,7 @@ pub fn load_cached_manifest(base_dir: &Path) -> Result<Option<CorpusManifest>> {
|
||||
Ok(Some(manifest))
|
||||
}
|
||||
|
||||
fn persist_manifest(handle: &CorpusHandle) -> Result<()> {
|
||||
pub fn persist_corpus_manifest(handle: &CorpusHandle) -> Result<()> {
|
||||
let path = handle.path.join("manifest.json");
|
||||
if let Some(parent) = path.parent() {
|
||||
fs::create_dir_all(parent)
|
||||
@@ -685,24 +676,6 @@ pub fn corpus_handle_from_manifest(manifest: CorpusManifest, base_dir: PathBuf)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::indexing_slicing)]
|
||||
fn compute_file_checksum(path: &Path) -> Result<String> {
|
||||
let mut file = fs::File::open(path)
|
||||
.with_context(|| format!("opening file {} for checksum", path.display()))?;
|
||||
let mut hasher = Sha256::new();
|
||||
let mut buffer = [0u8; 8192];
|
||||
loop {
|
||||
let read = file
|
||||
.read(&mut buffer)
|
||||
.with_context(|| format!("reading {} for checksum", path.display()))?;
|
||||
if read == 0 {
|
||||
break;
|
||||
}
|
||||
hasher.update(&buffer[..read]);
|
||||
}
|
||||
Ok(format!("{:x}", hasher.finalize()))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -731,7 +704,6 @@ mod tests {
|
||||
metadata: crate::datasets::DatasetMetadata::for_kind(
|
||||
DatasetKind::default(),
|
||||
false,
|
||||
None,
|
||||
),
|
||||
source: "src".to_string(),
|
||||
paragraphs: vec![paragraph],
|
||||
|
||||
@@ -42,7 +42,7 @@ fn default_chunk_max_tokens() -> usize {
|
||||
}
|
||||
|
||||
fn default_chunk_only() -> bool {
|
||||
false
|
||||
true
|
||||
}
|
||||
|
||||
// Reuse the pipeline's canonical embedded-artifact types so the on-disk corpus
|
||||
@@ -122,6 +122,14 @@ pub struct CorpusManifest {
|
||||
pub questions: Vec<CorpusQuestion>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct NamespaceSeedRecord {
|
||||
pub namespace: String,
|
||||
pub database: String,
|
||||
pub slice_case_count: usize,
|
||||
pub seeded_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct CorpusMetadata {
|
||||
pub dataset_id: String,
|
||||
@@ -144,6 +152,8 @@ pub struct CorpusMetadata {
|
||||
pub chunk_max_tokens: usize,
|
||||
#[serde(default = "default_chunk_only")]
|
||||
pub chunk_only: bool,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub namespace_seed: Option<NamespaceSeedRecord>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
@@ -629,6 +639,7 @@ mod tests {
|
||||
chunk_min_tokens: 1,
|
||||
chunk_max_tokens: 10,
|
||||
chunk_only: false,
|
||||
namespace_seed: None,
|
||||
},
|
||||
paragraphs: vec![paragraph_one, paragraph_two],
|
||||
questions: vec![question],
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::{
|
||||
collections::{BTreeMap, HashMap},
|
||||
collections::{BTreeMap, HashMap, HashSet},
|
||||
fs::File,
|
||||
io::{BufRead, BufReader},
|
||||
path::{Path, PathBuf},
|
||||
@@ -47,20 +47,71 @@ struct QrelEntry {
|
||||
score: i32,
|
||||
}
|
||||
|
||||
/// Convert only documents that appear in qrels (the BEIR evaluation closed world).
|
||||
#[allow(clippy::arithmetic_side_effects, clippy::indexing_slicing)]
|
||||
pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<ConvertedParagraph>> {
|
||||
convert_beir_documents(raw_dir, dataset, None)
|
||||
}
|
||||
|
||||
/// Convert a subset of qrels-world documents. `doc_ids` use corpus ids (unprefixed).
|
||||
#[allow(
|
||||
clippy::too_many_lines,
|
||||
clippy::arithmetic_side_effects,
|
||||
clippy::indexing_slicing
|
||||
)]
|
||||
pub fn convert_beir_documents(
|
||||
raw_dir: &Path,
|
||||
dataset: DatasetKind,
|
||||
doc_ids: Option<&HashSet<String>>,
|
||||
) -> Result<Vec<ConvertedParagraph>> {
|
||||
let corpus_path = raw_dir.join("corpus.jsonl");
|
||||
let queries_path = raw_dir.join("queries.jsonl");
|
||||
let qrels_path = resolve_qrels_path(raw_dir)?;
|
||||
|
||||
let corpus = load_corpus(&corpus_path)?;
|
||||
let queries = load_queries(&queries_path)?;
|
||||
let qrels = load_qrels(&qrels_path)?;
|
||||
|
||||
let mut paragraphs = Vec::with_capacity(corpus.len());
|
||||
let mut qrels_doc_ids = HashSet::new();
|
||||
for entries in qrels.values() {
|
||||
for entry in entries {
|
||||
qrels_doc_ids.insert(entry.doc_id.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let target_doc_ids: HashSet<String> = match doc_ids {
|
||||
Some(ids) => ids
|
||||
.iter()
|
||||
.filter(|id| qrels_doc_ids.contains(*id))
|
||||
.cloned()
|
||||
.collect(),
|
||||
None => qrels_doc_ids.clone(),
|
||||
};
|
||||
|
||||
if target_doc_ids.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"no qrels documents to convert for {} at {}",
|
||||
dataset.id(),
|
||||
raw_dir.display()
|
||||
));
|
||||
}
|
||||
|
||||
let corpus = load_corpus_filtered(&corpus_path, &target_doc_ids)?;
|
||||
|
||||
let mut doc_ids_sorted: Vec<String> = target_doc_ids.into_iter().collect();
|
||||
doc_ids_sorted.sort();
|
||||
|
||||
let mut paragraphs = Vec::with_capacity(doc_ids_sorted.len());
|
||||
let mut paragraph_index = HashMap::new();
|
||||
|
||||
for (doc_id, entry) in &corpus {
|
||||
for doc_id in &doc_ids_sorted {
|
||||
let Some(entry) = corpus.get(doc_id) else {
|
||||
warn!(
|
||||
doc_id = %doc_id,
|
||||
dataset = %dataset.id(),
|
||||
"Skipping qrels document missing from corpus"
|
||||
);
|
||||
continue;
|
||||
};
|
||||
let paragraph_id = format!("{}-{doc_id}", dataset.source_prefix());
|
||||
let paragraph = ConvertedParagraph {
|
||||
id: paragraph_id.clone(),
|
||||
@@ -87,6 +138,12 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
|
||||
continue;
|
||||
};
|
||||
|
||||
if let Some(filter) = doc_ids {
|
||||
if !filter.contains(&best.doc_id) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let Some(¶graph_slot) = paragraph_index.get(&best.doc_id) else {
|
||||
missing_docs += 1;
|
||||
warn!(
|
||||
@@ -106,7 +163,6 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
|
||||
);
|
||||
continue;
|
||||
};
|
||||
let answers = vec![snippet];
|
||||
|
||||
let question_id = format!("{}-{query_id}", dataset.source_prefix());
|
||||
paragraphs[paragraph_slot]
|
||||
@@ -114,7 +170,7 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
|
||||
.push(ConvertedQuestion {
|
||||
id: question_id,
|
||||
question: query.text.clone(),
|
||||
answers,
|
||||
answers: vec![snippet],
|
||||
is_impossible: false,
|
||||
});
|
||||
}
|
||||
@@ -122,13 +178,23 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
|
||||
if missing_queries + missing_docs + skipped_answers > 0 {
|
||||
warn!(
|
||||
missing_queries,
|
||||
missing_docs, skipped_answers, "Skipped some BEIR qrels entries during conversion"
|
||||
missing_docs,
|
||||
skipped_answers,
|
||||
dataset = %dataset.id(),
|
||||
"Skipped some BEIR qrels entries during conversion"
|
||||
);
|
||||
}
|
||||
|
||||
Ok(paragraphs)
|
||||
}
|
||||
|
||||
pub fn corpus_doc_id(paragraph_id: &str, dataset: DatasetKind) -> Option<String> {
|
||||
let prefix = format!("{}-", dataset.source_prefix());
|
||||
paragraph_id
|
||||
.strip_prefix(&prefix)
|
||||
.map(str::to_string)
|
||||
}
|
||||
|
||||
fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
|
||||
let qrels_dir = raw_dir.join("qrels");
|
||||
let candidates = ["test.tsv", "dev.tsv", "train.tsv"];
|
||||
@@ -148,7 +214,10 @@ fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
|
||||
}
|
||||
|
||||
#[allow(clippy::arithmetic_side_effects)]
|
||||
fn load_corpus(path: &Path) -> Result<BTreeMap<String, BeirParagraph>> {
|
||||
fn load_corpus_filtered(
|
||||
path: &Path,
|
||||
doc_ids: &HashSet<String>,
|
||||
) -> Result<BTreeMap<String, BeirParagraph>> {
|
||||
let file =
|
||||
File::open(path).with_context(|| format!("opening BEIR corpus at {}", path.display()))?;
|
||||
let reader = BufReader::new(file);
|
||||
@@ -167,6 +236,9 @@ fn load_corpus(path: &Path) -> Result<BTreeMap<String, BeirParagraph>> {
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
if !doc_ids.contains(&corpus_row.id) {
|
||||
continue;
|
||||
}
|
||||
let title = corpus_row.title.unwrap_or_else(|| corpus_row.id.clone());
|
||||
let text = corpus_row.text.unwrap_or_default();
|
||||
let context = build_context(&title, &text);
|
||||
@@ -296,10 +368,8 @@ mod tests {
|
||||
use std::fs;
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[test]
|
||||
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)]
|
||||
fn converts_basic_beir_layout() {
|
||||
let dir = tempdir().unwrap();
|
||||
#[allow(clippy::unwrap_used)]
|
||||
fn write_fixture(dir: &tempfile::TempDir) {
|
||||
let corpus = r#"
|
||||
{"_id":"d1","title":"Doc 1","text":"Doc one has some text for testing."}
|
||||
{"_id":"d2","title":"Doc 2","text":"Second document content."}
|
||||
@@ -313,24 +383,34 @@ mod tests {
|
||||
fs::write(dir.path().join("queries.jsonl"), queries.trim()).unwrap();
|
||||
fs::create_dir_all(dir.path().join("qrels")).unwrap();
|
||||
fs::write(dir.path().join("qrels/test.tsv"), qrels).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)]
|
||||
fn converts_qrels_world_only() {
|
||||
let dir = tempdir().unwrap();
|
||||
write_fixture(&dir);
|
||||
|
||||
let paragraphs = convert_beir(dir.path(), DatasetKind::Fever).unwrap();
|
||||
|
||||
assert_eq!(paragraphs.len(), 2);
|
||||
let doc_one = paragraphs
|
||||
.iter()
|
||||
.find(|p| p.id == "fever-d1")
|
||||
.expect("missing paragraph for d1");
|
||||
assert_eq!(paragraphs.len(), 1);
|
||||
let doc_one = ¶graphs[0];
|
||||
assert_eq!(doc_one.id, "fever-d1");
|
||||
assert_eq!(doc_one.questions.len(), 1);
|
||||
let question = &doc_one.questions[0];
|
||||
assert_eq!(question.id, "fever-q1");
|
||||
assert!(!question.answers.is_empty());
|
||||
assert!(doc_one.context.contains(&question.answers[0]));
|
||||
assert_eq!(doc_one.questions[0].id, "fever-q1");
|
||||
}
|
||||
|
||||
let doc_two = paragraphs
|
||||
.iter()
|
||||
.find(|p| p.id == "fever-d2")
|
||||
.expect("missing paragraph for d2");
|
||||
assert!(doc_two.questions.is_empty());
|
||||
#[test]
|
||||
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)]
|
||||
fn converts_filtered_doc_ids() {
|
||||
let dir = tempdir().unwrap();
|
||||
write_fixture(&dir);
|
||||
|
||||
let mut ids = HashSet::new();
|
||||
ids.insert("d1".to_string());
|
||||
let paragraphs =
|
||||
convert_beir_documents(dir.path(), DatasetKind::Fever, Some(&ids)).unwrap();
|
||||
assert_eq!(paragraphs.len(), 1);
|
||||
assert_eq!(paragraphs[0].id, "fever-d1");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,262 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use sha2::{Digest, Sha256};
|
||||
use tracing::info;
|
||||
|
||||
use super::{
|
||||
beir,
|
||||
checksum::hash_file,
|
||||
store::{
|
||||
self, build_dataset_from_catalog, paragraph_path, read_meta, store_dir_for,
|
||||
upsert_sharded_paragraphs, write_sharded,
|
||||
},
|
||||
BEIR_DATASETS, ConvertedDataset, DatasetKind, DatasetMetadata,
|
||||
};
|
||||
use crate::{
|
||||
args::Config,
|
||||
slice,
|
||||
};
|
||||
|
||||
pub fn subset_for_paragraph_id(paragraph_id: &str) -> Option<DatasetKind> {
|
||||
let mut kinds: Vec<DatasetKind> = BEIR_DATASETS.to_vec();
|
||||
kinds.sort_by_key(|kind| std::cmp::Reverse(kind.source_prefix().len()));
|
||||
for kind in kinds {
|
||||
let prefix = format!("{}-", kind.source_prefix());
|
||||
if paragraph_id.starts_with(&prefix) {
|
||||
return Some(kind);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub fn build_beir_mix_qrels_dataset(include_unanswerable: bool) -> Result<ConvertedDataset> {
|
||||
if include_unanswerable {
|
||||
tracing::warn!("BEIR mix ignores include_unanswerable flag; all questions are answerable");
|
||||
}
|
||||
|
||||
let mut paragraphs = Vec::new();
|
||||
for subset in BEIR_DATASETS {
|
||||
let entry = super::dataset_entry_for_kind(subset)?;
|
||||
let subset_paragraphs = beir::convert_beir(&entry.raw_path, subset)?;
|
||||
paragraphs.extend(subset_paragraphs);
|
||||
}
|
||||
|
||||
Ok(ConvertedDataset {
|
||||
generated_at: super::base_timestamp(),
|
||||
metadata: DatasetMetadata::for_kind(DatasetKind::Beir, include_unanswerable),
|
||||
source: "beir-mix".to_string(),
|
||||
paragraphs,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn prepare_beir_mix(config: &Config) -> Result<super::loader::LoadedDataset> {
|
||||
let virtual_ds = build_beir_mix_qrels_dataset(config.llm_mode)?;
|
||||
let slice_config = slice::slice_config_with_limit(config, slice::ledger_target(config));
|
||||
let resolved = slice::resolve_slice(&virtual_ds, &slice_config).context(
|
||||
"resolving BEIR mix slice ledger (check --slice and --limit match your intent)",
|
||||
)?;
|
||||
|
||||
let unique: HashSet<String> = resolved
|
||||
.manifest
|
||||
.paragraphs
|
||||
.iter()
|
||||
.map(|entry| entry.id.clone())
|
||||
.collect();
|
||||
|
||||
materialize_subset_stores(&unique, config.force_convert)?;
|
||||
|
||||
let dataset = load_beir_mix_from_subsets(&unique)?;
|
||||
let checksum = mix_content_checksum(&unique)?;
|
||||
|
||||
info!(
|
||||
slice = resolved.manifest.slice_id.as_str(),
|
||||
paragraphs = unique.len(),
|
||||
checksum = %checksum,
|
||||
"Prepared BEIR mix from per-subset converted stores"
|
||||
);
|
||||
|
||||
Ok(super::loader::LoadedDataset {
|
||||
dataset,
|
||||
content_checksum: checksum,
|
||||
partial: true,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn materialize_subset_stores(
|
||||
paragraph_ids: &HashSet<String>,
|
||||
force: bool,
|
||||
) -> Result<()> {
|
||||
let mut by_subset: HashMap<DatasetKind, Vec<String>> = HashMap::new();
|
||||
for paragraph_id in paragraph_ids {
|
||||
let kind = subset_for_paragraph_id(paragraph_id).with_context(|| {
|
||||
format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store")
|
||||
})?;
|
||||
by_subset.entry(kind).or_default().push(paragraph_id.clone());
|
||||
}
|
||||
|
||||
for (kind, ids) in by_subset {
|
||||
let entry = super::dataset_entry_for_kind(kind)?;
|
||||
let store_dir = store_dir_for(&entry.converted_path);
|
||||
let existing = if store_dir.join("meta.json").is_file() {
|
||||
store::load_paragraph_ids_set(&store_dir)?
|
||||
} else {
|
||||
HashSet::new()
|
||||
};
|
||||
|
||||
let missing: Vec<String> = if force {
|
||||
ids
|
||||
} else {
|
||||
ids.into_iter()
|
||||
.filter(|paragraph_id| !existing.contains(paragraph_id))
|
||||
.collect()
|
||||
};
|
||||
|
||||
if missing.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let corpus_ids: HashSet<String> = missing
|
||||
.iter()
|
||||
.filter_map(|paragraph_id| beir::corpus_doc_id(paragraph_id, kind))
|
||||
.collect();
|
||||
let paragraphs = beir::convert_beir_documents(
|
||||
&entry.raw_path,
|
||||
kind,
|
||||
Some(&corpus_ids),
|
||||
)?;
|
||||
|
||||
if store_dir.join("meta.json").is_file() {
|
||||
upsert_sharded_paragraphs(&store_dir, ¶graphs)?;
|
||||
} else {
|
||||
let question_count = paragraphs
|
||||
.iter()
|
||||
.map(|paragraph| paragraph.questions.len())
|
||||
.sum::<usize>();
|
||||
let dataset = ConvertedDataset {
|
||||
generated_at: super::base_timestamp(),
|
||||
metadata: DatasetMetadata::for_kind(kind, false),
|
||||
source: entry.raw_path.display().to_string(),
|
||||
paragraphs,
|
||||
};
|
||||
write_sharded(&dataset, &store_dir)?;
|
||||
info!(
|
||||
subset = kind.id(),
|
||||
store = %store_dir.display(),
|
||||
paragraphs = dataset.paragraphs.len(),
|
||||
questions = question_count,
|
||||
"Created subset converted store for BEIR mix"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn load_beir_mix_from_subsets(paragraph_ids: &HashSet<String>) -> Result<ConvertedDataset> {
|
||||
let mut by_subset: HashMap<DatasetKind, HashSet<String>> = HashMap::new();
|
||||
for paragraph_id in paragraph_ids {
|
||||
let kind = subset_for_paragraph_id(paragraph_id).with_context(|| {
|
||||
format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store")
|
||||
})?;
|
||||
by_subset
|
||||
.entry(kind)
|
||||
.or_default()
|
||||
.insert(paragraph_id.clone());
|
||||
}
|
||||
|
||||
let mut paragraphs = Vec::with_capacity(paragraph_ids.len());
|
||||
for (kind, subset_ids) in by_subset {
|
||||
let entry = super::dataset_entry_for_kind(kind)?;
|
||||
let store_dir = store_dir_for(&entry.converted_path);
|
||||
let partial = build_dataset_from_catalog(&store_dir, &subset_ids)?;
|
||||
paragraphs.extend(partial.paragraphs);
|
||||
}
|
||||
|
||||
paragraphs.sort_by(|left, right| left.id.cmp(&right.id));
|
||||
|
||||
Ok(ConvertedDataset {
|
||||
generated_at: super::base_timestamp(),
|
||||
metadata: DatasetMetadata::for_kind(DatasetKind::Beir, false),
|
||||
source: "beir-mix".to_string(),
|
||||
paragraphs,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn mix_content_checksum(paragraph_ids: &HashSet<String>) -> Result<String> {
|
||||
let mut ids: Vec<String> = paragraph_ids.iter().cloned().collect();
|
||||
ids.sort();
|
||||
|
||||
let mut hasher = Sha256::new();
|
||||
for paragraph_id in ids {
|
||||
let kind = subset_for_paragraph_id(¶graph_id)
|
||||
.ok_or_else(|| anyhow!("unknown BEIR subset for paragraph '{paragraph_id}'"))?;
|
||||
let entry = super::dataset_entry_for_kind(kind)?;
|
||||
let store_dir = store_dir_for(&entry.converted_path);
|
||||
let path = paragraph_path(&store_dir, ¶graph_id);
|
||||
if !path.is_file() {
|
||||
return Err(anyhow!(
|
||||
"missing converted paragraph {} at {}",
|
||||
paragraph_id,
|
||||
path.display()
|
||||
));
|
||||
}
|
||||
hasher.update(paragraph_id.as_bytes());
|
||||
hasher.update([0]);
|
||||
hasher.update(hash_file(&path)?.as_bytes());
|
||||
}
|
||||
|
||||
Ok(format!("{:x}", hasher.finalize()))
|
||||
}
|
||||
|
||||
pub fn beir_subset_stores_ready(paragraph_ids: &HashSet<String>) -> Result<bool> {
|
||||
for paragraph_id in paragraph_ids {
|
||||
let kind = subset_for_paragraph_id(paragraph_id).with_context(|| {
|
||||
format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store")
|
||||
})?;
|
||||
let entry = super::dataset_entry_for_kind(kind)?;
|
||||
let store_dir = store_dir_for(&entry.converted_path);
|
||||
if !store_dir.join("meta.json").is_file() {
|
||||
return Ok(false);
|
||||
}
|
||||
if !paragraph_path(&store_dir, paragraph_id).is_file() {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
pub fn beir_subset_store_summary() -> Result<Vec<(String, usize, usize)>> {
|
||||
let mut summary = Vec::new();
|
||||
for kind in BEIR_DATASETS {
|
||||
let entry = super::dataset_entry_for_kind(kind)?;
|
||||
let store_dir = store_dir_for(&entry.converted_path);
|
||||
if store_dir.join("meta.json").is_file() {
|
||||
let meta = read_meta(&store_dir)?;
|
||||
summary.push((kind.id().to_string(), meta.paragraph_count, meta.question_count));
|
||||
}
|
||||
}
|
||||
Ok(summary)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Known prefixes route to their subset; an unknown prefix routes nowhere.
    #[test]
    fn routes_prefixed_paragraph_ids() {
        let expectations = [
            ("fever-doc-1", Some(DatasetKind::Fever)),
            ("nq-beir-doc-1", Some(DatasetKind::NqBeir)),
            ("trec-covid-doc-1", Some(DatasetKind::TrecCovid)),
            ("unknown-doc", None),
        ];
        for (paragraph_id, expected) in expectations {
            assert_eq!(subset_for_paragraph_id(paragraph_id), expected);
        }
    }
}
|
||||
@@ -0,0 +1,216 @@
|
||||
use std::{
|
||||
fs::{self, File},
|
||||
io::Read,
|
||||
path::Path,
|
||||
};
|
||||
|
||||
#[cfg(test)]
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sha2::{Digest, Sha256};
|
||||
|
||||
/// Format version stamped into every [`ChecksumSidecar`] this module writes;
/// `ChecksumSidecar::is_valid_for` rejects sidecars carrying any other value.
const SIDECAR_VERSION: u32 = 1;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ChecksumSidecar {
|
||||
pub version: u32,
|
||||
pub sha256: String,
|
||||
pub size_bytes: u64,
|
||||
#[serde(default)]
|
||||
pub modified_unix_secs: u64,
|
||||
}
|
||||
|
||||
impl ChecksumSidecar {
|
||||
#[cfg(test)]
|
||||
pub fn sidecar_path(content_path: &Path) -> PathBuf {
|
||||
content_path.with_extension("sha256")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn is_valid_for(&self, content_path: &Path) -> bool {
|
||||
if self.version != SIDECAR_VERSION {
|
||||
return false;
|
||||
}
|
||||
let Ok(metadata) = fs::metadata(content_path) else {
|
||||
return false;
|
||||
};
|
||||
if metadata.len() != self.size_bytes {
|
||||
return false;
|
||||
}
|
||||
if self.modified_unix_secs != 0 {
|
||||
let Ok(modified) = metadata.modified() else {
|
||||
return true;
|
||||
};
|
||||
let Ok(secs) = modified.duration_since(std::time::UNIX_EPOCH) else {
|
||||
return true;
|
||||
};
|
||||
if secs.as_secs() != self.modified_unix_secs {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::indexing_slicing)]
|
||||
pub fn hash_file(path: &Path) -> Result<String> {
|
||||
let mut file =
|
||||
File::open(path).with_context(|| format!("opening file {} for checksum", path.display()))?;
|
||||
let mut hasher = Sha256::new();
|
||||
let mut buffer = vec![0u8; 65_536];
|
||||
loop {
|
||||
let read = file
|
||||
.read(&mut buffer)
|
||||
.with_context(|| format!("reading {} for checksum", path.display()))?;
|
||||
if read == 0 {
|
||||
break;
|
||||
}
|
||||
hasher.update(&buffer[..read]);
|
||||
}
|
||||
Ok(format!("{:x}", hasher.finalize()))
|
||||
}
|
||||
|
||||
pub fn read_sidecar(path: &Path) -> Result<Option<ChecksumSidecar>> {
|
||||
if !path.exists() {
|
||||
return Ok(None);
|
||||
}
|
||||
let raw = fs::read_to_string(path)
|
||||
.with_context(|| format!("reading checksum sidecar {}", path.display()))?;
|
||||
let sidecar: ChecksumSidecar = serde_json::from_str(&raw)
|
||||
.with_context(|| format!("parsing checksum sidecar {}", path.display()))?;
|
||||
Ok(Some(sidecar))
|
||||
}
|
||||
|
||||
/// Writes the sidecar for `content_path`, recording `sha256` together with
/// the file's current length and modification time (`0` when the mtime is
/// unavailable or precedes the Unix epoch).
///
/// The sidecar's parent directory is created if necessary.
#[cfg(test)]
pub fn write_sidecar(content_path: &Path, sha256: &str) -> Result<()> {
    let metadata = fs::metadata(content_path)
        .with_context(|| format!("reading metadata for {}", content_path.display()))?;
    let modified_unix_secs = match metadata.modified() {
        Ok(time) => time
            .duration_since(std::time::UNIX_EPOCH)
            .map_or(0, |elapsed| elapsed.as_secs()),
        Err(_) => 0,
    };

    let sidecar_file = ChecksumSidecar::sidecar_path(content_path);
    if let Some(parent) = sidecar_file.parent() {
        fs::create_dir_all(parent)
            .with_context(|| format!("creating checksum sidecar directory {}", parent.display()))?;
    }

    let sidecar = ChecksumSidecar {
        version: SIDECAR_VERSION,
        sha256: sha256.to_string(),
        size_bytes: metadata.len(),
        modified_unix_secs,
    };
    let blob = serde_json::to_vec_pretty(&sidecar).context("serialising checksum sidecar")?;
    fs::write(&sidecar_file, blob)
        .with_context(|| format!("writing checksum sidecar {}", sidecar_file.display()))
}
|
||||
|
||||
/// Returns the SHA-256 of `content_path`, reusing the sidecar's cached digest
/// when the sidecar is still valid for the file and otherwise hashing the
/// file and refreshing the sidecar.
#[cfg(test)]
pub fn content_checksum(content_path: &Path) -> Result<String> {
    let cached = read_sidecar(&ChecksumSidecar::sidecar_path(content_path))?
        .filter(|sidecar| sidecar.is_valid_for(content_path));

    match cached {
        Some(sidecar) => Ok(sidecar.sha256),
        None => {
            let sha256 = hash_file(content_path)?;
            write_sidecar(content_path, &sha256)?;
            Ok(sha256)
        }
    }
}
|
||||
|
||||
pub fn store_aggregate_checksum(store_dir: &Path) -> Result<String> {
|
||||
let marker = store_dir.join("checksum.sha256");
|
||||
let meta = store_dir.join("meta.json");
|
||||
if marker.is_file() && meta.is_file() {
|
||||
if let (Ok(marker_meta), Ok(meta_meta)) = (marker.metadata(), meta.metadata()) {
|
||||
if marker_meta
|
||||
.modified()
|
||||
.ok()
|
||||
.zip(meta_meta.modified().ok())
|
||||
.is_some_and(|(marker_modified, meta_modified)| marker_modified >= meta_modified)
|
||||
{
|
||||
if let Some(sidecar) = read_sidecar(&marker)? {
|
||||
return Ok(sidecar.sha256);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut entries = Vec::new();
|
||||
collect_store_files(store_dir, store_dir, &mut entries)?;
|
||||
entries.sort();
|
||||
|
||||
let mut hasher = Sha256::new();
|
||||
for relative in &entries {
|
||||
let path = store_dir.join(relative);
|
||||
if path == marker {
|
||||
continue;
|
||||
}
|
||||
hasher.update(relative.as_bytes());
|
||||
hasher.update([0]);
|
||||
let file_hash = hash_file(&path)?;
|
||||
hasher.update(file_hash.as_bytes());
|
||||
}
|
||||
let digest = format!("{:x}", hasher.finalize());
|
||||
|
||||
let sidecar = ChecksumSidecar {
|
||||
version: SIDECAR_VERSION,
|
||||
sha256: digest.clone(),
|
||||
size_bytes: entries.len() as u64,
|
||||
modified_unix_secs: std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.map_or(0, |duration| duration.as_secs()),
|
||||
};
|
||||
if let Some(parent) = marker.parent() {
|
||||
fs::create_dir_all(parent)?;
|
||||
}
|
||||
fs::write(&marker, serde_json::to_vec_pretty(&sidecar)?)?;
|
||||
Ok(digest)
|
||||
}
|
||||
|
||||
fn collect_store_files(base: &Path, current: &Path, entries: &mut Vec<String>) -> Result<()> {
|
||||
for entry in fs::read_dir(current)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.file_name().is_some_and(|name| name == "checksum.sha256") {
|
||||
continue;
|
||||
}
|
||||
if path.is_dir() {
|
||||
collect_store_files(base, &path, entries)?;
|
||||
} else if path.is_file() {
|
||||
let relative = path
|
||||
.strip_prefix(base)
|
||||
.unwrap_or(&path)
|
||||
.to_string_lossy()
|
||||
.replace('\\', "/");
|
||||
entries.push(relative);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// The cached checksum is stable for unchanged content and changes once
    /// the content does.
    #[test]
    fn sidecar_round_trip() -> Result<()> {
        let workdir = tempdir()?;
        let content = workdir.path().join("sample.json");

        fs::write(&content, br#"{"hello":"world"}"#)?;
        let initial = content_checksum(&content)?;
        let repeated = content_checksum(&content)?;
        assert_eq!(initial, repeated);

        fs::write(&content, br#"{"hello":"world!"}"#)?;
        let after_edit = content_checksum(&content)?;
        assert_ne!(initial, after_edit);

        Ok(())
    }
}
|
||||
@@ -0,0 +1,197 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use tracing::info;
|
||||
|
||||
use super::{
|
||||
catalog,
|
||||
store::{
|
||||
self, build_dataset_from_catalog, detect_layout, read_meta, store_dir_for, write_sharded,
|
||||
ConvertedLayout,
|
||||
},
|
||||
ConvertedDataset, DatasetKind,
|
||||
};
|
||||
use crate::{
|
||||
args::Config,
|
||||
slice::{self, SliceConfig},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LoadedDataset {
|
||||
pub dataset: ConvertedDataset,
|
||||
pub content_checksum: String,
|
||||
pub partial: bool,
|
||||
}
|
||||
|
||||
pub fn prepare_dataset(dataset_kind: DatasetKind, config: &Config) -> Result<LoadedDataset> {
|
||||
if dataset_kind == DatasetKind::Beir {
|
||||
return super::beir_mix::prepare_beir_mix(config);
|
||||
}
|
||||
|
||||
let converted_path = &config.converted_dataset_path;
|
||||
let layout = detect_layout(converted_path);
|
||||
let store_dir = store_dir_for(converted_path);
|
||||
|
||||
if layout == ConvertedLayout::Missing || config.force_convert {
|
||||
return convert_and_load(dataset_kind, config);
|
||||
}
|
||||
|
||||
load_from_store(dataset_kind, config, &store_dir, true)
|
||||
}
|
||||
|
||||
fn convert_and_load(dataset_kind: DatasetKind, config: &Config) -> Result<LoadedDataset> {
|
||||
let dataset = super::convert(
|
||||
config.raw_dataset_path.as_path(),
|
||||
dataset_kind,
|
||||
config.llm_mode,
|
||||
)
|
||||
.with_context(|| format!("converting {} dataset", dataset_kind.label()))?;
|
||||
|
||||
let store_dir = store_dir_for(&config.converted_dataset_path);
|
||||
write_sharded(&dataset, &store_dir)?;
|
||||
prebuild_catalog_slices(&dataset, config)?;
|
||||
let checksum = crate::datasets::store_aggregate_checksum(&store_dir)?;
|
||||
|
||||
Ok(LoadedDataset {
|
||||
dataset,
|
||||
content_checksum: checksum,
|
||||
partial: false,
|
||||
})
|
||||
}
|
||||
|
||||
fn load_from_store(
|
||||
dataset_kind: DatasetKind,
|
||||
config: &Config,
|
||||
store_dir: &std::path::Path,
|
||||
allow_partial: bool,
|
||||
) -> Result<LoadedDataset> {
|
||||
let checksum = crate::datasets::store_aggregate_checksum(store_dir)?;
|
||||
let meta = read_meta(store_dir)?;
|
||||
validate_metadata_fields(&meta.metadata, dataset_kind, config)?;
|
||||
|
||||
if allow_partial {
|
||||
if let Some(paragraph_ids) = slice_paragraph_ids_for_fast_path(config)? {
|
||||
let unique: HashSet<String> = paragraph_ids.into_iter().collect();
|
||||
info!(
|
||||
paragraphs = unique.len(),
|
||||
store = %store_dir.display(),
|
||||
"Loading slice-addressed paragraphs from sharded converted store"
|
||||
);
|
||||
let dataset = build_dataset_from_catalog(store_dir, &unique)?;
|
||||
return Ok(LoadedDataset {
|
||||
dataset,
|
||||
content_checksum: checksum,
|
||||
partial: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
store = %store_dir.display(),
|
||||
paragraphs = meta.paragraph_count,
|
||||
"Loading full sharded converted store"
|
||||
);
|
||||
let dataset = store::load_sharded_full(store_dir)?;
|
||||
Ok(LoadedDataset {
|
||||
dataset,
|
||||
content_checksum: checksum,
|
||||
partial: false,
|
||||
})
|
||||
}
|
||||
|
||||
fn slice_paragraph_ids_for_fast_path(config: &Config) -> Result<Option<Vec<String>>> {
|
||||
let Some(manifest_path) = slice::cached_manifest_path(config) else {
|
||||
return Ok(None);
|
||||
};
|
||||
let Some(manifest) = slice::read_manifest_if_exists(&manifest_path)? else {
|
||||
return Ok(None);
|
||||
};
|
||||
let slice_config = slice::slice_config_with_limit(config, slice::ledger_target(config));
|
||||
if !slice::manifest_is_complete(&manifest, &slice_config) {
|
||||
return Ok(None);
|
||||
}
|
||||
Ok(Some(
|
||||
manifest
|
||||
.paragraphs
|
||||
.iter()
|
||||
.map(|entry| entry.id.clone())
|
||||
.collect(),
|
||||
))
|
||||
}
|
||||
|
||||
fn validate_metadata_fields(
|
||||
metadata: &super::DatasetMetadata,
|
||||
dataset_kind: DatasetKind,
|
||||
config: &Config,
|
||||
) -> Result<()> {
|
||||
if metadata.id != dataset_kind.id() {
|
||||
anyhow::bail!(
|
||||
"converted dataset targets '{}', expected '{}'",
|
||||
metadata.id,
|
||||
dataset_kind.id()
|
||||
);
|
||||
}
|
||||
if metadata.include_unanswerable != config.llm_mode {
|
||||
anyhow::bail!(
|
||||
"converted dataset include_unanswerable mismatch (expected {}, found {})",
|
||||
config.llm_mode,
|
||||
metadata.include_unanswerable
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn prebuild_catalog_slices(dataset: &ConvertedDataset, config: &Config) -> Result<()> {
|
||||
let catalog = catalog()?;
|
||||
let entry = catalog.dataset(dataset.metadata.id.as_str())?;
|
||||
if entry.slices.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!(
|
||||
dataset = dataset.metadata.id.as_str(),
|
||||
slices = entry.slices.len(),
|
||||
"Prebuilding catalog slice ledgers"
|
||||
);
|
||||
|
||||
for slice_entry in &entry.slices {
|
||||
let slice_config = slice_config_for_catalog_entry(config, slice_entry);
|
||||
match slice::resolve_slice(dataset, &slice_config) {
|
||||
Ok(resolved) => info!(
|
||||
slice = resolved.manifest.slice_id.as_str(),
|
||||
cases = resolved.manifest.case_count,
|
||||
positives = resolved.manifest.positive_paragraphs,
|
||||
negatives = resolved.manifest.negative_paragraphs,
|
||||
"Prebuilt catalog slice ledger"
|
||||
),
|
||||
Err(err) => tracing::warn!(
|
||||
slice = slice_entry.id.as_str(),
|
||||
error = %err,
|
||||
"Failed to prebuild catalog slice ledger"
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn slice_config_for_catalog_entry<'a>(
|
||||
config: &'a Config,
|
||||
slice_entry: &'a super::SliceEntry,
|
||||
) -> SliceConfig<'a> {
|
||||
SliceConfig {
|
||||
cache_dir: config.cache_dir.as_path(),
|
||||
force_convert: config.force_convert,
|
||||
explicit_slice: Some(slice_entry.id.as_str()),
|
||||
limit: slice_entry.limit,
|
||||
corpus_limit: slice_entry.corpus_limit,
|
||||
slice_seed: slice_entry.seed.unwrap_or(config.slice_seed),
|
||||
llm_mode: slice_entry
|
||||
.include_unanswerable
|
||||
.unwrap_or(config.llm_mode),
|
||||
negative_multiplier: slice_entry
|
||||
.negative_multiplier
|
||||
.unwrap_or(config.negative_multiplier),
|
||||
require_verified_chunks: config.retrieval.require_verified_chunks,
|
||||
}
|
||||
}
|
||||
+38
-143
@@ -1,6 +1,10 @@
|
||||
mod beir;
|
||||
mod beir_mix;
|
||||
mod checksum;
|
||||
mod loader;
|
||||
mod nq;
|
||||
mod squad;
|
||||
mod store;
|
||||
|
||||
use std::{
|
||||
collections::{BTreeMap, HashMap},
|
||||
@@ -20,38 +24,31 @@ const MANIFEST_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/manifest.yaml"
|
||||
static DATASET_CATALOG: OnceCell<DatasetCatalog> = OnceCell::new();
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct DatasetCatalog {
|
||||
datasets: BTreeMap<String, DatasetEntry>,
|
||||
slices: HashMap<String, SliceLocation>,
|
||||
default_dataset: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct DatasetEntry {
|
||||
pub metadata: DatasetMetadata,
|
||||
pub raw_path: PathBuf,
|
||||
pub converted_path: PathBuf,
|
||||
pub include_unanswerable: bool,
|
||||
pub slices: Vec<SliceEntry>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
pub struct SliceEntry {
|
||||
pub id: String,
|
||||
pub dataset_id: String,
|
||||
pub label: String,
|
||||
pub description: Option<String>,
|
||||
pub limit: Option<usize>,
|
||||
pub corpus_limit: Option<usize>,
|
||||
pub include_unanswerable: Option<bool>,
|
||||
pub seed: Option<u64>,
|
||||
pub negative_multiplier: Option<f32>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[allow(dead_code)]
|
||||
struct SliceLocation {
|
||||
dataset_id: String,
|
||||
slice_index: usize,
|
||||
@@ -59,7 +56,6 @@ struct SliceLocation {
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct ManifestFile {
|
||||
default_dataset: Option<String>,
|
||||
datasets: Vec<ManifestDataset>,
|
||||
}
|
||||
|
||||
@@ -81,6 +77,7 @@ struct ManifestDataset {
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[allow(dead_code)]
|
||||
struct ManifestSlice {
|
||||
id: String,
|
||||
label: String,
|
||||
@@ -94,6 +91,8 @@ struct ManifestSlice {
|
||||
include_unanswerable: Option<bool>,
|
||||
#[serde(default)]
|
||||
seed: Option<u64>,
|
||||
#[serde(default)]
|
||||
negative_multiplier: Option<f32>,
|
||||
}
|
||||
|
||||
impl DatasetCatalog {
|
||||
@@ -111,18 +110,19 @@ impl DatasetCatalog {
|
||||
let raw_path = resolve_path(root, &dataset.raw);
|
||||
let converted_path = resolve_path(root, &dataset.converted);
|
||||
|
||||
if !raw_path.exists() {
|
||||
if !raw_path.exists() && dataset.id != "beir" {
|
||||
bail!(
|
||||
"dataset '{}' raw file missing at {}",
|
||||
dataset.id,
|
||||
raw_path.display()
|
||||
);
|
||||
}
|
||||
if !converted_path.exists() {
|
||||
let store_dir = store::store_dir_for(&converted_path);
|
||||
if !converted_path.exists() && !store_dir.join("meta.json").is_file() {
|
||||
warn!(
|
||||
"dataset '{}' converted file missing at {}; the next conversion run will regenerate it",
|
||||
"dataset '{}' converted store missing at {}; the next conversion run will regenerate it",
|
||||
dataset.id,
|
||||
converted_path.display()
|
||||
store_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
@@ -139,7 +139,6 @@ impl DatasetCatalog {
|
||||
.clone()
|
||||
.unwrap_or_else(|| dataset.id.clone()),
|
||||
include_unanswerable: dataset.include_unanswerable,
|
||||
context_token_limit: None,
|
||||
};
|
||||
|
||||
let mut entry_slices = Vec::with_capacity(dataset.slices.len());
|
||||
@@ -154,12 +153,11 @@ impl DatasetCatalog {
|
||||
entry_slices.push(SliceEntry {
|
||||
id: manifest_slice.id.clone(),
|
||||
dataset_id: dataset.id.clone(),
|
||||
label: manifest_slice.label,
|
||||
description: manifest_slice.description,
|
||||
limit: manifest_slice.limit,
|
||||
corpus_limit: manifest_slice.corpus_limit,
|
||||
include_unanswerable: manifest_slice.include_unanswerable,
|
||||
seed: manifest_slice.seed,
|
||||
negative_multiplier: manifest_slice.negative_multiplier,
|
||||
});
|
||||
slices.insert(
|
||||
manifest_slice.id,
|
||||
@@ -176,22 +174,16 @@ impl DatasetCatalog {
|
||||
metadata,
|
||||
raw_path,
|
||||
converted_path,
|
||||
include_unanswerable: dataset.include_unanswerable,
|
||||
slices: entry_slices,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
let default_dataset = manifest
|
||||
.default_dataset
|
||||
.or_else(|| datasets.keys().next().cloned())
|
||||
.ok_or_else(|| anyhow!("dataset manifest does not include any datasets"))?;
|
||||
if datasets.is_empty() {
|
||||
bail!("dataset manifest does not include any datasets");
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
datasets,
|
||||
slices,
|
||||
default_dataset,
|
||||
})
|
||||
Ok(Self { datasets, slices })
|
||||
}
|
||||
|
||||
pub fn global() -> Result<&'static Self> {
|
||||
@@ -204,12 +196,6 @@ impl DatasetCatalog {
|
||||
.ok_or_else(|| anyhow!("unknown dataset '{id}' in manifest"))
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn default_dataset(&self) -> Result<&DatasetEntry> {
|
||||
self.dataset(&self.default_dataset)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn slice(&self, slice_id: &str) -> Result<(&DatasetEntry, &SliceEntry)> {
|
||||
let location = self
|
||||
.slices
|
||||
@@ -236,20 +222,29 @@ fn resolve_path(root: &Path, value: &str) -> PathBuf {
|
||||
}
|
||||
}
|
||||
|
||||
pub use checksum::store_aggregate_checksum;
|
||||
pub use beir_mix::{
|
||||
beir_subset_store_summary, beir_subset_stores_ready, mix_content_checksum,
|
||||
};
|
||||
pub use loader::{prebuild_catalog_slices, prepare_dataset};
|
||||
pub use store::{
|
||||
content_checksum_for_layout, detect_layout, store_dir_for, write_sharded, ConvertedLayout,
|
||||
};
|
||||
|
||||
pub fn catalog() -> Result<&'static DatasetCatalog> {
|
||||
DatasetCatalog::global()
|
||||
}
|
||||
|
||||
fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> {
|
||||
pub(crate) fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> {
|
||||
let catalog = catalog()?;
|
||||
catalog.dataset(kind.id())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Default)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, ValueEnum, Default)]
|
||||
pub enum DatasetKind {
|
||||
#[default]
|
||||
SquadV2,
|
||||
NaturalQuestions,
|
||||
#[default]
|
||||
Beir,
|
||||
#[value(name = "fever")]
|
||||
Fever,
|
||||
@@ -416,16 +411,10 @@ pub struct DatasetMetadata {
|
||||
pub source_prefix: String,
|
||||
#[serde(default)]
|
||||
pub include_unanswerable: bool,
|
||||
#[serde(default)]
|
||||
pub context_token_limit: Option<usize>,
|
||||
}
|
||||
|
||||
impl DatasetMetadata {
|
||||
pub fn for_kind(
|
||||
kind: DatasetKind,
|
||||
include_unanswerable: bool,
|
||||
context_token_limit: Option<usize>,
|
||||
) -> Self {
|
||||
pub fn for_kind(kind: DatasetKind, include_unanswerable: bool) -> Self {
|
||||
if let Ok(entry) = dataset_entry_for_kind(kind) {
|
||||
return Self {
|
||||
id: entry.metadata.id.clone(),
|
||||
@@ -434,7 +423,6 @@ impl DatasetMetadata {
|
||||
entity_suffix: entry.metadata.entity_suffix.clone(),
|
||||
source_prefix: entry.metadata.source_prefix.clone(),
|
||||
include_unanswerable,
|
||||
context_token_limit,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -445,13 +433,12 @@ impl DatasetMetadata {
|
||||
entity_suffix: kind.entity_suffix().to_string(),
|
||||
source_prefix: kind.source_prefix().to_string(),
|
||||
include_unanswerable,
|
||||
context_token_limit,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn default_metadata() -> DatasetMetadata {
|
||||
DatasetMetadata::for_kind(DatasetKind::default(), false, None)
|
||||
DatasetMetadata::for_kind(DatasetKind::default(), false)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -483,14 +470,15 @@ pub fn convert(
|
||||
raw_path: &Path,
|
||||
dataset: DatasetKind,
|
||||
include_unanswerable: bool,
|
||||
context_token_limit: Option<usize>,
|
||||
) -> Result<ConvertedDataset> {
|
||||
let paragraphs = match dataset {
|
||||
DatasetKind::SquadV2 => squad::convert_squad(raw_path)?,
|
||||
DatasetKind::NaturalQuestions => {
|
||||
nq::convert_nq(raw_path, include_unanswerable, context_token_limit)?
|
||||
DatasetKind::NaturalQuestions => nq::convert_nq(raw_path, include_unanswerable)?,
|
||||
DatasetKind::Beir => {
|
||||
bail!(
|
||||
"BEIR mix is prepared via slice-first subset stores; use prepare_beir_mix instead of convert"
|
||||
);
|
||||
}
|
||||
DatasetKind::Beir => convert_beir_mix(include_unanswerable, context_token_limit)?,
|
||||
DatasetKind::Fever
|
||||
| DatasetKind::Fiqa
|
||||
| DatasetKind::HotpotQa
|
||||
@@ -501,11 +489,6 @@ pub fn convert(
|
||||
| DatasetKind::NqBeir => beir::convert_beir(raw_path, dataset)?,
|
||||
};
|
||||
|
||||
let metadata_limit = match dataset {
|
||||
DatasetKind::NaturalQuestions => None,
|
||||
_ => context_token_limit,
|
||||
};
|
||||
|
||||
let generated_at = match dataset {
|
||||
DatasetKind::Beir
|
||||
| DatasetKind::Fever
|
||||
@@ -526,100 +509,12 @@ pub fn convert(
|
||||
|
||||
Ok(ConvertedDataset {
|
||||
generated_at,
|
||||
metadata: DatasetMetadata::for_kind(dataset, include_unanswerable, metadata_limit),
|
||||
metadata: DatasetMetadata::for_kind(dataset, include_unanswerable),
|
||||
source: source_label,
|
||||
paragraphs,
|
||||
})
|
||||
}
|
||||
|
||||
fn convert_beir_mix(
|
||||
include_unanswerable: bool,
|
||||
_context_token_limit: Option<usize>,
|
||||
) -> Result<Vec<ConvertedParagraph>> {
|
||||
if include_unanswerable {
|
||||
warn!("BEIR mix ignores include_unanswerable flag; all questions are answerable");
|
||||
}
|
||||
|
||||
let mut paragraphs = Vec::new();
|
||||
for subset in BEIR_DATASETS {
|
||||
let entry = dataset_entry_for_kind(subset)?;
|
||||
let subset_paragraphs = beir::convert_beir(&entry.raw_path, subset)?;
|
||||
paragraphs.extend(subset_paragraphs);
|
||||
}
|
||||
|
||||
Ok(paragraphs)
|
||||
}
|
||||
|
||||
fn ensure_parent(path: &Path) -> Result<()> {
|
||||
if let Some(parent) = path.parent() {
|
||||
fs::create_dir_all(parent)
|
||||
.with_context(|| format!("creating parent directory for {}", path.display()))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write_converted(dataset: &ConvertedDataset, converted_path: &Path) -> Result<()> {
|
||||
ensure_parent(converted_path)?;
|
||||
let json =
|
||||
serde_json::to_string_pretty(dataset).context("serialising converted dataset to JSON")?;
|
||||
fs::write(converted_path, json)
|
||||
.with_context(|| format!("writing converted dataset to {}", converted_path.display()))
|
||||
}
|
||||
|
||||
pub fn read_converted(converted_path: &Path) -> Result<ConvertedDataset> {
|
||||
let raw = fs::read_to_string(converted_path)
|
||||
.with_context(|| format!("reading converted dataset at {}", converted_path.display()))?;
|
||||
let mut dataset: ConvertedDataset = serde_json::from_str(&raw)
|
||||
.with_context(|| format!("parsing converted dataset at {}", converted_path.display()))?;
|
||||
if dataset.metadata.id.trim().is_empty() {
|
||||
dataset.metadata = default_metadata();
|
||||
}
|
||||
if dataset.source.is_empty() {
|
||||
dataset.source = converted_path.display().to_string();
|
||||
}
|
||||
Ok(dataset)
|
||||
}
|
||||
|
||||
pub fn ensure_converted(
|
||||
dataset_kind: DatasetKind,
|
||||
raw_path: &Path,
|
||||
converted_path: &Path,
|
||||
force: bool,
|
||||
include_unanswerable: bool,
|
||||
context_token_limit: Option<usize>,
|
||||
) -> Result<ConvertedDataset> {
|
||||
if force || !converted_path.exists() {
|
||||
let dataset = convert(
|
||||
raw_path,
|
||||
dataset_kind,
|
||||
include_unanswerable,
|
||||
context_token_limit,
|
||||
)?;
|
||||
write_converted(&dataset, converted_path)?;
|
||||
return Ok(dataset);
|
||||
}
|
||||
|
||||
match read_converted(converted_path) {
|
||||
Ok(dataset)
|
||||
if dataset.metadata.id == dataset_kind.id()
|
||||
&& dataset.metadata.include_unanswerable == include_unanswerable
|
||||
&& dataset.metadata.context_token_limit == context_token_limit =>
|
||||
{
|
||||
Ok(dataset)
|
||||
}
|
||||
_ => {
|
||||
let dataset = convert(
|
||||
raw_path,
|
||||
dataset_kind,
|
||||
include_unanswerable,
|
||||
context_token_limit,
|
||||
)?;
|
||||
write_converted(&dataset, converted_path)?;
|
||||
Ok(dataset)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn base_timestamp() -> DateTime<Utc> {
|
||||
Utc.with_ymd_and_hms(2023, 1, 1, 0, 0, 0).unwrap()
|
||||
}
|
||||
|
||||
@@ -16,11 +16,7 @@ use super::{ConvertedParagraph, ConvertedQuestion};
|
||||
clippy::arithmetic_side_effects,
|
||||
clippy::cast_sign_loss
|
||||
)]
|
||||
pub fn convert_nq(
|
||||
raw_path: &Path,
|
||||
include_unanswerable: bool,
|
||||
_context_token_limit: Option<usize>,
|
||||
) -> Result<Vec<ConvertedParagraph>> {
|
||||
pub fn convert_nq(raw_path: &Path, include_unanswerable: bool) -> Result<Vec<ConvertedParagraph>> {
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct NqExample {
|
||||
|
||||
@@ -0,0 +1,410 @@
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
fs::{self, File, OpenOptions},
|
||||
io::{BufRead, BufReader, Write},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
|
||||
use super::{
|
||||
checksum::store_aggregate_checksum,
|
||||
ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetMetadata,
|
||||
};
|
||||
use crate::slice;
|
||||
|
||||
/// Version number recorded in the `version` field of a sharded store's `meta.json`.
pub const SHARDED_STORE_VERSION: u32 = 1;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ShardedMeta {
|
||||
pub version: u32,
|
||||
pub generated_at: DateTime<Utc>,
|
||||
pub metadata: DatasetMetadata,
|
||||
pub source: String,
|
||||
pub paragraph_count: usize,
|
||||
pub question_count: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub(crate) struct QuestionRecord {
|
||||
paragraph_id: String,
|
||||
#[serde(flatten)]
|
||||
question: ConvertedQuestion,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct QuestionCatalog {
|
||||
pub entries: Vec<QuestionRecord>,
|
||||
}
|
||||
|
||||
/// On-disk state of a converted dataset as reported by `detect_layout`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ConvertedLayout {
    /// The store directory contains a `meta.json` file.
    ShardedStore,
    /// No `meta.json` file was found in the store directory.
    Missing,
}
|
||||
|
||||
/// Derives the sharded store directory for a converted-dataset path.
///
/// The result is the path's parent directory joined with the path's file
/// stem (e.g. `cache/squad.json` -> `cache/squad`). A missing parent falls
/// back to `.` and a missing file stem falls back to `dataset`.
pub fn store_dir_for(converted_path: &Path) -> PathBuf {
    let base_dir = match converted_path.parent() {
        Some(dir) => dir,
        None => Path::new("."),
    };
    let leaf: String = match converted_path.file_stem() {
        Some(stem) => stem.to_string_lossy().into_owned(),
        None => "dataset".to_string(),
    };
    base_dir.join(leaf)
}
|
||||
|
||||
pub fn detect_layout(converted_path: &Path) -> ConvertedLayout {
|
||||
let store_dir = store_dir_for(converted_path);
|
||||
if store_dir.join("meta.json").is_file() {
|
||||
ConvertedLayout::ShardedStore
|
||||
} else {
|
||||
ConvertedLayout::Missing
|
||||
}
|
||||
}
|
||||
|
||||
fn paragraph_file_name(paragraph_id: &str) -> String {
|
||||
format!("{}.json", slice::paragraph_storage_key(paragraph_id))
|
||||
}
|
||||
|
||||
pub fn paragraph_path(store_dir: &Path, paragraph_id: &str) -> PathBuf {
|
||||
store_dir
|
||||
.join("paragraphs")
|
||||
.join(paragraph_file_name(paragraph_id))
|
||||
}
|
||||
|
||||
pub fn write_sharded(dataset: &ConvertedDataset, store_dir: &Path) -> Result<String> {
|
||||
if store_dir.exists() {
|
||||
fs::remove_dir_all(store_dir)
|
||||
.with_context(|| format!("clearing sharded store {}", store_dir.display()))?;
|
||||
}
|
||||
fs::create_dir_all(store_dir.join("paragraphs"))
|
||||
.with_context(|| format!("creating sharded store {}", store_dir.display()))?;
|
||||
|
||||
let question_count = dataset
|
||||
.paragraphs
|
||||
.iter()
|
||||
.map(|paragraph| paragraph.questions.len())
|
||||
.sum::<usize>();
|
||||
|
||||
let meta = ShardedMeta {
|
||||
version: SHARDED_STORE_VERSION,
|
||||
generated_at: dataset.generated_at,
|
||||
metadata: dataset.metadata.clone(),
|
||||
source: dataset.source.clone(),
|
||||
paragraph_count: dataset.paragraphs.len(),
|
||||
question_count,
|
||||
};
|
||||
let meta_path = store_dir.join("meta.json");
|
||||
fs::write(
|
||||
&meta_path,
|
||||
serde_json::to_vec_pretty(&meta).context("serialising sharded store metadata")?,
|
||||
)
|
||||
.with_context(|| format!("writing sharded metadata {}", meta_path.display()))?;
|
||||
|
||||
let mut questions_file = File::create(store_dir.join("questions.jsonl"))
|
||||
.context("creating questions.jsonl for sharded store")?;
|
||||
let mut paragraph_ids_file = File::create(store_dir.join("paragraph_ids.jsonl"))
|
||||
.context("creating paragraph_ids.jsonl for sharded store")?;
|
||||
|
||||
for paragraph in &dataset.paragraphs {
|
||||
writeln!(paragraph_ids_file, "{}", paragraph.id)
|
||||
.context("writing paragraph id to paragraph_ids.jsonl")?;
|
||||
for question in ¶graph.questions {
|
||||
let record = QuestionRecord {
|
||||
paragraph_id: paragraph.id.clone(),
|
||||
question: question.clone(),
|
||||
};
|
||||
serde_json::to_writer(&mut questions_file, &record)
|
||||
.context("writing question record to questions.jsonl")?;
|
||||
questions_file.write_all(b"\n")?;
|
||||
}
|
||||
|
||||
let path = paragraph_path(store_dir, ¶graph.id);
|
||||
if let Some(parent) = path.parent() {
|
||||
fs::create_dir_all(parent)?;
|
||||
}
|
||||
fs::write(
|
||||
&path,
|
||||
serde_json::to_vec(paragraph).context("serialising sharded paragraph")?,
|
||||
)
|
||||
.with_context(|| format!("writing sharded paragraph {}", path.display()))?;
|
||||
}
|
||||
|
||||
let digest = store_aggregate_checksum(store_dir)?;
|
||||
info!(
|
||||
store = %store_dir.display(),
|
||||
paragraphs = dataset.paragraphs.len(),
|
||||
questions = question_count,
|
||||
checksum = %digest,
|
||||
"Wrote sharded converted dataset"
|
||||
);
|
||||
Ok(digest)
|
||||
}
|
||||
|
||||
pub fn read_meta(store_dir: &Path) -> Result<ShardedMeta> {
|
||||
let path = store_dir.join("meta.json");
|
||||
let raw = fs::read_to_string(&path)
|
||||
.with_context(|| format!("reading sharded metadata {}", path.display()))?;
|
||||
serde_json::from_str(&raw)
|
||||
.with_context(|| format!("parsing sharded metadata {}", path.display()))
|
||||
}
|
||||
|
||||
pub fn content_checksum_for_layout(converted_path: &Path) -> Result<String> {
|
||||
match detect_layout(converted_path) {
|
||||
ConvertedLayout::ShardedStore => {
|
||||
crate::datasets::store_aggregate_checksum(&store_dir_for(converted_path))
|
||||
}
|
||||
ConvertedLayout::Missing => Err(anyhow!(
|
||||
"converted dataset missing at {}",
|
||||
converted_path.display()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn load_paragraph(store_dir: &Path, paragraph_id: &str) -> Result<ConvertedParagraph> {
|
||||
let path = paragraph_path(store_dir, paragraph_id);
|
||||
let raw = fs::read(&path)
|
||||
.with_context(|| format!("reading sharded paragraph {}", path.display()))?;
|
||||
serde_json::from_slice(&raw)
|
||||
.with_context(|| format!("parsing sharded paragraph {}", path.display()))
|
||||
}
|
||||
|
||||
fn load_paragraphs(store_dir: &Path, paragraph_ids: &[String]) -> Result<Vec<ConvertedParagraph>> {
|
||||
paragraph_ids
|
||||
.iter()
|
||||
.map(|paragraph_id| load_paragraph(store_dir, paragraph_id))
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn load_sharded_partial(store_dir: &Path, paragraph_ids: &[String]) -> Result<ConvertedDataset> {
|
||||
let meta = read_meta(store_dir)?;
|
||||
let mut paragraphs = load_paragraphs(store_dir, paragraph_ids)?;
|
||||
paragraphs.sort_by(|left, right| left.id.cmp(&right.id));
|
||||
Ok(ConvertedDataset {
|
||||
generated_at: meta.generated_at,
|
||||
metadata: meta.metadata,
|
||||
source: meta.source,
|
||||
paragraphs,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn load_sharded_full(store_dir: &Path) -> Result<ConvertedDataset> {
|
||||
let meta = read_meta(store_dir)?;
|
||||
let ids = load_paragraph_ids(store_dir)?;
|
||||
let paragraphs = load_paragraphs(store_dir, &ids)?;
|
||||
Ok(ConvertedDataset {
|
||||
generated_at: meta.generated_at,
|
||||
metadata: meta.metadata,
|
||||
source: meta.source,
|
||||
paragraphs,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn load_paragraph_ids_set(store_dir: &Path) -> Result<HashSet<String>> {
|
||||
Ok(load_paragraph_ids(store_dir)?.into_iter().collect())
|
||||
}
|
||||
|
||||
#[allow(clippy::arithmetic_side_effects)]
|
||||
pub fn upsert_sharded_paragraphs(
|
||||
store_dir: &Path,
|
||||
paragraphs: &[ConvertedParagraph],
|
||||
) -> Result<()> {
|
||||
if paragraphs.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
if !store_dir.join("meta.json").is_file() {
|
||||
return Err(anyhow!(
|
||||
"cannot upsert into missing sharded store at {}",
|
||||
store_dir.display()
|
||||
));
|
||||
}
|
||||
|
||||
fs::create_dir_all(store_dir.join("paragraphs"))
|
||||
.with_context(|| format!("creating paragraphs directory in {}", store_dir.display()))?;
|
||||
|
||||
let existing = load_paragraph_ids_set(store_dir)?;
|
||||
let questions_path = store_dir.join("questions.jsonl");
|
||||
let mut questions_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&questions_path)
|
||||
.with_context(|| format!("opening question catalog {}", questions_path.display()))?;
|
||||
|
||||
let mut ids_file = None;
|
||||
let mut new_paragraphs = 0usize;
|
||||
let mut new_questions = 0usize;
|
||||
|
||||
for paragraph in paragraphs {
|
||||
let is_new = !existing.contains(¶graph.id);
|
||||
let path = paragraph_path(store_dir, ¶graph.id);
|
||||
if let Some(parent) = path.parent() {
|
||||
fs::create_dir_all(parent)?;
|
||||
}
|
||||
fs::write(
|
||||
&path,
|
||||
serde_json::to_vec(paragraph).context("serialising sharded paragraph")?,
|
||||
)
|
||||
.with_context(|| format!("writing sharded paragraph {}", path.display()))?;
|
||||
|
||||
if is_new {
|
||||
if ids_file.is_none() {
|
||||
ids_file = Some(
|
||||
OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(store_dir.join("paragraph_ids.jsonl"))
|
||||
.context("opening paragraph_ids.jsonl for append")?,
|
||||
);
|
||||
}
|
||||
if let Some(file) = ids_file.as_mut() {
|
||||
writeln!(file, "{}", paragraph.id).context("appending paragraph id")?;
|
||||
}
|
||||
new_paragraphs += 1;
|
||||
|
||||
for question in ¶graph.questions {
|
||||
let record = QuestionRecord {
|
||||
paragraph_id: paragraph.id.clone(),
|
||||
question: question.clone(),
|
||||
};
|
||||
serde_json::to_writer(&mut questions_file, &record)
|
||||
.context("writing question record to questions.jsonl")?;
|
||||
questions_file.write_all(b"\n")?;
|
||||
new_questions += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if new_paragraphs > 0 || new_questions > 0 {
|
||||
let meta = read_meta(store_dir)?;
|
||||
let updated = ShardedMeta {
|
||||
paragraph_count: meta.paragraph_count + new_paragraphs,
|
||||
question_count: meta.question_count + new_questions,
|
||||
..meta
|
||||
};
|
||||
fs::write(
|
||||
store_dir.join("meta.json"),
|
||||
serde_json::to_vec_pretty(&updated).context("serialising updated sharded metadata")?,
|
||||
)?;
|
||||
store_aggregate_checksum(store_dir)?;
|
||||
info!(
|
||||
store = %store_dir.display(),
|
||||
new_paragraphs,
|
||||
new_questions,
|
||||
"Upserted paragraphs into sharded converted store"
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn load_paragraph_ids(store_dir: &Path) -> Result<Vec<String>> {
|
||||
let path = store_dir.join("paragraph_ids.jsonl");
|
||||
let file = File::open(&path)
|
||||
.with_context(|| format!("opening paragraph id index {}", path.display()))?;
|
||||
let reader = BufReader::new(file);
|
||||
reader
|
||||
.lines()
|
||||
.map(|line| {
|
||||
line.context("reading paragraph id index line")
|
||||
.and_then(|value| {
|
||||
let trimmed = value.trim();
|
||||
if trimmed.is_empty() {
|
||||
Err(anyhow!("empty paragraph id in index"))
|
||||
} else {
|
||||
Ok(trimmed.to_string())
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn load_question_catalog(store_dir: &Path) -> Result<QuestionCatalog> {
|
||||
let path = store_dir.join("questions.jsonl");
|
||||
let file = File::open(&path)
|
||||
.with_context(|| format!("opening question catalog {}", path.display()))?;
|
||||
let reader = BufReader::new(file);
|
||||
let mut entries = Vec::new();
|
||||
for line in reader.lines() {
|
||||
let line = line.context("reading question catalog line")?;
|
||||
if line.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
let record: QuestionRecord = serde_json::from_str(&line)
|
||||
.context("parsing question catalog record")?;
|
||||
entries.push(record);
|
||||
}
|
||||
Ok(QuestionCatalog { entries })
|
||||
}
|
||||
|
||||
pub fn build_dataset_from_catalog(
|
||||
store_dir: &Path,
|
||||
paragraph_ids: &HashSet<String>,
|
||||
) -> Result<ConvertedDataset> {
|
||||
let catalog = load_question_catalog(store_dir)?;
|
||||
let mut questions_by_paragraph: HashMap<String, Vec<ConvertedQuestion>> = HashMap::new();
|
||||
for entry in catalog.entries {
|
||||
if paragraph_ids.contains(&entry.paragraph_id) {
|
||||
questions_by_paragraph
|
||||
.entry(entry.paragraph_id.clone())
|
||||
.or_default()
|
||||
.push(entry.question);
|
||||
}
|
||||
}
|
||||
|
||||
let mut dataset = load_sharded_partial(
|
||||
store_dir,
|
||||
¶graph_ids.iter().cloned().collect::<Vec<_>>(),
|
||||
)?;
|
||||
for paragraph in &mut dataset.paragraphs {
|
||||
if let Some(questions) = questions_by_paragraph.remove(¶graph.id) {
|
||||
paragraph.questions = questions;
|
||||
} else {
|
||||
paragraph.questions.clear();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(dataset)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::datasets::{DatasetKind, DatasetMetadata};

    /// Builds a fixture with one paragraph (`p1`) holding one question (`q1`).
    fn sample_dataset() -> ConvertedDataset {
        let question = ConvertedQuestion {
            id: "q1".to_string(),
            question: "Question?".to_string(),
            answers: vec!["Answer".to_string()],
            is_impossible: false,
        };
        let paragraph = ConvertedParagraph {
            id: "p1".to_string(),
            title: "Title".to_string(),
            context: "Body".to_string(),
            questions: vec![question],
        };
        ConvertedDataset {
            generated_at: Utc::now(),
            metadata: DatasetMetadata::for_kind(DatasetKind::SquadV2, false),
            source: "test".to_string(),
            paragraphs: vec![paragraph],
        }
    }

    /// Writing a dataset and loading it back yields the same paragraph and
    /// question.
    #[test]
    #[allow(clippy::indexing_slicing)]
    fn sharded_round_trip() -> Result<()> {
        let scratch = tempfile::tempdir()?;
        let store_dir = scratch.path().join("sample");
        write_sharded(&sample_dataset(), &store_dir)?;

        let reloaded = load_sharded_full(&store_dir)?;
        assert_eq!(reloaded.paragraphs.len(), 1);
        assert_eq!(reloaded.paragraphs[0].questions[0].id, "q1");
        Ok(())
    }
}
|
||||
@@ -1,22 +1,22 @@
|
||||
//! Database namespace management utilities.
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use chrono::Utc;
|
||||
use common::storage::{
|
||||
db::SurrealDbClient,
|
||||
types::user::{Theme, User},
|
||||
types::StoredObject,
|
||||
use common::{
|
||||
storage::{
|
||||
db::SurrealDbClient,
|
||||
types::user::{Theme, User},
|
||||
types::StoredObject,
|
||||
},
|
||||
utils::embedding::EmbeddingProvider,
|
||||
};
|
||||
use serde::Deserialize;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::{
|
||||
args::Config,
|
||||
corpus::{self, CorpusHandle, CorpusManifest, NamespaceSeedRecord},
|
||||
datasets,
|
||||
snapshot::{self, DbSnapshotState},
|
||||
};
|
||||
|
||||
/// Connect to the evaluation database with fallback auth strategies.
|
||||
pub(crate) async fn connect_eval_db(
|
||||
config: &Config,
|
||||
namespace: &str,
|
||||
@@ -73,7 +73,6 @@ pub(crate) async fn connect_eval_db(
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if the namespace contains any corpus data.
|
||||
pub(crate) async fn namespace_has_corpus(db: &SurrealDbClient) -> Result<bool> {
|
||||
#[derive(Deserialize)]
|
||||
struct CountRow {
|
||||
@@ -89,41 +88,52 @@ pub(crate) async fn namespace_has_corpus(db: &SurrealDbClient) -> Result<bool> {
|
||||
Ok(rows.first().map_or(0, |row| row.count) > 0)
|
||||
}
|
||||
|
||||
/// Determine if we can reuse an existing namespace based on cached state.
|
||||
fn manifest_matches_runtime(
|
||||
manifest: &CorpusManifest,
|
||||
embedding_provider: &EmbeddingProvider,
|
||||
ingestion_fingerprint: &str,
|
||||
) -> bool {
|
||||
let metadata = &manifest.metadata;
|
||||
metadata.ingestion_fingerprint == ingestion_fingerprint
|
||||
&& metadata.embedding_backend == embedding_provider.backend_label()
|
||||
&& metadata.embedding_model == embedding_provider.model_code()
|
||||
&& metadata.embedding_dimension == embedding_provider.dimension()
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn can_reuse_namespace(
|
||||
db: &SurrealDbClient,
|
||||
descriptor: &snapshot::Descriptor,
|
||||
manifest: &CorpusManifest,
|
||||
embedding_provider: &EmbeddingProvider,
|
||||
namespace: &str,
|
||||
database: &str,
|
||||
dataset_id: &str,
|
||||
slice_id: &str,
|
||||
ingestion_fingerprint: &str,
|
||||
slice_case_count: usize,
|
||||
) -> Result<bool> {
|
||||
let Some(state) = descriptor.load_db_state().await? else {
|
||||
info!("No namespace state recorded; reseeding corpus from cached shards");
|
||||
if !manifest_matches_runtime(manifest, embedding_provider, ingestion_fingerprint) {
|
||||
info!("Corpus manifest metadata mismatch; rebuilding namespace from cached shards");
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let Some(seed) = manifest.metadata.namespace_seed.as_ref() else {
|
||||
info!("No namespace seed recorded in corpus manifest; reseeding");
|
||||
return Ok(false);
|
||||
};
|
||||
|
||||
if state.slice_case_count != slice_case_count {
|
||||
if seed.slice_case_count != slice_case_count {
|
||||
info!(
|
||||
requested_cases = slice_case_count,
|
||||
stored_cases = state.slice_case_count,
|
||||
"Skipping live namespace reuse; cached state does not match requested window"
|
||||
stored_cases = seed.slice_case_count,
|
||||
"Skipping namespace reuse; case window mismatch"
|
||||
);
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
if state.dataset_id != dataset_id
|
||||
|| state.slice_id != slice_id
|
||||
|| state.ingestion_fingerprint != ingestion_fingerprint
|
||||
|| state.namespace.as_deref() != Some(namespace)
|
||||
|| state.database.as_deref() != Some(database)
|
||||
{
|
||||
if seed.namespace != namespace || seed.database != database {
|
||||
info!(
|
||||
namespace,
|
||||
database, "Cached namespace metadata mismatch; rebuilding corpus from ingestion cache"
|
||||
database,
|
||||
"Corpus manifest namespace metadata mismatch; reseeding"
|
||||
);
|
||||
return Ok(false);
|
||||
}
|
||||
@@ -140,28 +150,20 @@ pub(crate) async fn can_reuse_namespace(
|
||||
}
|
||||
}
|
||||
|
||||
/// Record the current namespace state to allow future reuse checks.
|
||||
pub(crate) async fn record_namespace_state(
|
||||
descriptor: &snapshot::Descriptor,
|
||||
dataset_id: &str,
|
||||
slice_id: &str,
|
||||
ingestion_fingerprint: &str,
|
||||
pub(crate) async fn record_namespace_seed(
|
||||
handle: &mut CorpusHandle,
|
||||
namespace: &str,
|
||||
database: &str,
|
||||
slice_case_count: usize,
|
||||
) {
|
||||
let state = DbSnapshotState {
|
||||
dataset_id: dataset_id.to_string(),
|
||||
slice_id: slice_id.to_string(),
|
||||
ingestion_fingerprint: ingestion_fingerprint.to_string(),
|
||||
snapshot_hash: descriptor.metadata_hash().to_string(),
|
||||
updated_at: Utc::now(),
|
||||
namespace: Some(namespace.to_string()),
|
||||
database: Some(database.to_string()),
|
||||
handle.manifest.metadata.namespace_seed = Some(NamespaceSeedRecord {
|
||||
namespace: namespace.to_string(),
|
||||
database: database.to_string(),
|
||||
slice_case_count,
|
||||
};
|
||||
if let Err(err) = descriptor.store_db_state(&state).await {
|
||||
warn!(error = %err, "Failed to record namespace state");
|
||||
seeded_at: Utc::now(),
|
||||
});
|
||||
if let Err(err) = corpus::persist_corpus_manifest(handle) {
|
||||
warn!(error = %err, "Failed to record namespace seed in corpus manifest");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -185,8 +187,17 @@ fn sanitize_identifier(input: &str) -> String {
|
||||
cleaned
|
||||
}
|
||||
|
||||
/// Generate a default namespace name based on dataset and limit.
|
||||
pub(crate) fn default_namespace(dataset_id: &str, limit: Option<usize>) -> String {
|
||||
pub(crate) fn default_namespace(
|
||||
dataset_id: &str,
|
||||
limit: Option<usize>,
|
||||
slice_id: Option<&str>,
|
||||
) -> String {
|
||||
if let Some(slice_id) = slice_id {
|
||||
let sanitized = sanitize_identifier(slice_id);
|
||||
if !sanitized.is_empty() {
|
||||
return format!("eval_{sanitized}");
|
||||
}
|
||||
}
|
||||
let dataset_component = sanitize_identifier(dataset_id);
|
||||
let limit_component = match limit {
|
||||
Some(value) if value > 0 => format!("limit{value}"),
|
||||
@@ -195,12 +206,10 @@ pub(crate) fn default_namespace(dataset_id: &str, limit: Option<usize>) -> Strin
|
||||
format!("eval_{dataset_component}_{limit_component}")
|
||||
}
|
||||
|
||||
/// Generate the default database name for evaluations.
|
||||
/// Returns the default database name used for evaluations.
pub(crate) fn default_database() -> String {
    String::from("retrieval_eval")
}
|
||||
|
||||
/// Ensure the evaluation user exists in the database.
|
||||
pub(crate) async fn ensure_eval_user(db: &SurrealDbClient) -> Result<User> {
|
||||
let timestamp = datasets::base_timestamp();
|
||||
let user = User {
|
||||
@@ -225,3 +234,7 @@ pub(crate) async fn ensure_eval_user(db: &SurrealDbClient) -> Result<User> {
|
||||
.context("storing evaluation user")?;
|
||||
Ok(user)
|
||||
}
|
||||
|
||||
pub(crate) fn sanitize_model_code(code: &str) -> String {
|
||||
sanitize_identifier(code)
|
||||
}
|
||||
@@ -2,13 +2,6 @@ use anyhow::{Context, Result};
|
||||
use common::storage::{db::SurrealDbClient, indexes::ensure_runtime};
|
||||
use tracing::info;
|
||||
|
||||
// Helper functions for index management during namespace reseed
|
||||
pub async fn remove_all_indexes(db: &SurrealDbClient) -> Result<()> {
|
||||
let _ = db;
|
||||
info!("Removing ALL indexes before namespace reseed (no-op placeholder)");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn recreate_indexes(db: &SurrealDbClient, dimension: usize) -> Result<()> {
|
||||
info!("Recreating ALL indexes after namespace reseed via shared runtime helper");
|
||||
ensure_runtime(db, dimension)
|
||||
@@ -34,14 +27,39 @@ pub async fn reset_namespace(db: &SurrealDbClient, namespace: &str, database: &s
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// // Test helper to force index dimension change
|
||||
// #[allow(dead_code)]
|
||||
// pub async fn change_embedding_length_in_hnsw_indexes(
|
||||
// db: &SurrealDbClient,
|
||||
// dimension: usize,
|
||||
// ) -> Result<()> {
|
||||
// recreate_indexes(db, dimension).await
|
||||
// }
|
||||
#[allow(clippy::cast_precision_loss)]
|
||||
pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> Result<()> {
|
||||
let dummy_embedding: Vec<f32> = (0..dimension).map(|i| (i as f32).sin()).collect();
|
||||
|
||||
info!("Warming HNSW caches with sample queries");
|
||||
|
||||
let _ = db
|
||||
.client
|
||||
.query(
|
||||
r#"SELECT chunk_id
|
||||
FROM text_chunk_embedding
|
||||
WHERE embedding <|1,1|> $embedding
|
||||
LIMIT 5"#,
|
||||
)
|
||||
.bind(("embedding", dummy_embedding.clone()))
|
||||
.await
|
||||
.context("warming text chunk HNSW cache")?;
|
||||
|
||||
let _ = db
|
||||
.client
|
||||
.query(
|
||||
r#"SELECT entity_id
|
||||
FROM knowledge_entity_embedding
|
||||
WHERE embedding <|1,1|> $embedding
|
||||
LIMIT 5"#,
|
||||
)
|
||||
.bind(("embedding", dummy_embedding))
|
||||
.await
|
||||
.context("warming knowledge entity HNSW cache")?;
|
||||
|
||||
info!("HNSW cache warming completed");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
@@ -0,0 +1,9 @@
|
||||
mod connect;
|
||||
mod lifecycle;
|
||||
|
||||
pub(crate) use connect::{
|
||||
can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user,
|
||||
namespace_has_corpus, record_namespace_seed, sanitize_model_code,
|
||||
};
|
||||
pub use lifecycle::{recreate_indexes, reset_namespace};
|
||||
pub(crate) use lifecycle::warm_hnsw_cache;
|
||||
@@ -1,128 +0,0 @@
|
||||
//! Evaluation utilities module - re-exports from focused submodules.
|
||||
|
||||
// Re-export types from the root types module
|
||||
pub use crate::types::*;
|
||||
|
||||
// Re-export from focused modules at crate root (crate-internal only)
|
||||
pub(crate) use crate::cases::{cases_from_manifest, SeededCase};
|
||||
pub(crate) use crate::namespace::{
|
||||
can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user,
|
||||
record_namespace_state,
|
||||
};
|
||||
pub(crate) use crate::settings::{enforce_system_settings, load_or_init_system_settings};
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use common::storage::db::SurrealDbClient;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
args::{self, Config},
|
||||
datasets::ConvertedDataset,
|
||||
slice::{self},
|
||||
};
|
||||
|
||||
/// Grow the slice ledger to contain the target number of cases.
|
||||
pub fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> {
|
||||
let ledger_limit = ledger_target(config);
|
||||
let slice_settings = slice::slice_config_with_limit(config, ledger_limit);
|
||||
let slice =
|
||||
slice::resolve_slice(dataset, &slice_settings).context("resolving dataset slice")?;
|
||||
info!(
|
||||
slice = slice.manifest.slice_id.as_str(),
|
||||
cases = slice.manifest.case_count,
|
||||
positives = slice.manifest.positive_paragraphs,
|
||||
negatives = slice.manifest.negative_paragraphs,
|
||||
total_paragraphs = slice.manifest.total_paragraphs,
|
||||
"Slice ledger ready"
|
||||
);
|
||||
println!(
|
||||
"Slice `{}` now contains {} questions ({} positives, {} negatives)",
|
||||
slice.manifest.slice_id,
|
||||
slice.manifest.case_count,
|
||||
slice.manifest.positive_paragraphs,
|
||||
slice.manifest.negative_paragraphs
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn ledger_target(config: &Config) -> Option<usize> {
|
||||
match (config.slice_grow, config.limit) {
|
||||
(Some(grow), Some(limit)) => Some(limit.max(grow)),
|
||||
(Some(grow), None) => Some(grow),
|
||||
(None, limit) => limit,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn write_chunk_diagnostics(path: &Path, cases: &[CaseDiagnostics]) -> Result<()> {
|
||||
args::ensure_parent(path)?;
|
||||
let mut file = tokio::fs::File::create(path)
|
||||
.await
|
||||
.with_context(|| format!("creating diagnostics file {}", path.display()))?;
|
||||
for case in cases {
|
||||
let line = serde_json::to_vec(case).context("serialising chunk diagnostics entry")?;
|
||||
file.write_all(&line).await?;
|
||||
file.write_all(b"\n").await?;
|
||||
}
|
||||
file.flush().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::cast_precision_loss)]
|
||||
pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> Result<()> {
|
||||
let dummy_embedding: Vec<f32> = (0..dimension).map(|i| (i as f32).sin()).collect();
|
||||
|
||||
info!("Warming HNSW caches with sample queries");
|
||||
|
||||
// Warm up chunk embedding index - just query the embedding table to load HNSW index
|
||||
let _ = db
|
||||
.client
|
||||
.query(
|
||||
r#"SELECT chunk_id
|
||||
FROM text_chunk_embedding
|
||||
WHERE embedding <|1,1|> $embedding
|
||||
LIMIT 5"#,
|
||||
)
|
||||
.bind(("embedding", dummy_embedding.clone()))
|
||||
.await
|
||||
.context("warming text chunk HNSW cache")?;
|
||||
|
||||
// Warm up entity embedding index
|
||||
let _ = db
|
||||
.client
|
||||
.query(
|
||||
r#"SELECT entity_id
|
||||
FROM knowledge_entity_embedding
|
||||
WHERE embedding <|1,1|> $embedding
|
||||
LIMIT 5"#,
|
||||
)
|
||||
.bind(("embedding", dummy_embedding))
|
||||
.await
|
||||
.context("warming knowledge entity HNSW cache")?;
|
||||
|
||||
info!("HNSW cache warming completed");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
use chrono::{DateTime, SecondsFormat, Utc};
|
||||
|
||||
pub fn format_timestamp(timestamp: &DateTime<Utc>) -> String {
|
||||
timestamp.to_rfc3339_opts(SecondsFormat::Secs, true)
|
||||
}
|
||||
|
||||
/// Normalises a model code into an identifier-safe string.
///
/// ASCII letters and digits are kept (letters lower-cased); every other
/// character, including non-ASCII ones, becomes `_`.
pub(crate) fn sanitize_model_code(code: &str) -> String {
    let mut sanitized = String::with_capacity(code.len());
    for ch in code.chars() {
        let mapped = if ch.is_ascii_alphanumeric() {
            ch.to_ascii_lowercase()
        } else {
            '_'
        };
        sanitized.push(mapped);
    }
    sanitized
}
|
||||
|
||||
// Re-export run_evaluation from the pipeline module at crate root
|
||||
pub use crate::pipeline::run_evaluation;
|
||||
@@ -1,13 +1,13 @@
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
fs,
|
||||
path::{Path, PathBuf},
|
||||
path::Path,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk};
|
||||
|
||||
use crate::{args::Config, corpus, eval::connect_eval_db, snapshot::DbSnapshotState};
|
||||
use crate::{args::Config, corpus, db::connect_eval_db};
|
||||
|
||||
pub async fn inspect_question(config: &Config) -> Result<()> {
|
||||
let question_id = config
|
||||
@@ -64,39 +64,26 @@ pub async fn inspect_question(config: &Config) -> Result<()> {
|
||||
);
|
||||
}
|
||||
|
||||
let db_state_path = config
|
||||
.database
|
||||
.inspect_db_state
|
||||
.clone()
|
||||
.unwrap_or_else(|| default_state_path(config, &manifest));
|
||||
if let Some(state) = load_db_state(&db_state_path)? {
|
||||
if let (Some(ns), Some(db_name)) = (state.namespace.as_deref(), state.database.as_deref()) {
|
||||
match connect_eval_db(config, ns, db_name).await {
|
||||
Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? {
|
||||
MissingChunks::None => println!(
|
||||
"All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'"
|
||||
),
|
||||
MissingChunks::Missing(list) => println!(
|
||||
"Missing chunks in namespace '{ns}', database '{db_name}': {list:?}"
|
||||
),
|
||||
},
|
||||
Err(err) => {
|
||||
println!(
|
||||
"Failed to connect to SurrealDB namespace '{ns}' / database '{db_name}': {err}"
|
||||
);
|
||||
}
|
||||
if let Some(seed) = manifest.metadata.namespace_seed.as_ref() {
|
||||
let ns = seed.namespace.as_str();
|
||||
let db_name = seed.database.as_str();
|
||||
match connect_eval_db(config, ns, db_name).await {
|
||||
Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? {
|
||||
MissingChunks::None => println!(
|
||||
"All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'"
|
||||
),
|
||||
MissingChunks::Missing(list) => println!(
|
||||
"Missing chunks in namespace '{ns}', database '{db_name}': {list:?}"
|
||||
),
|
||||
},
|
||||
Err(err) => {
|
||||
println!(
|
||||
"Failed to connect to SurrealDB namespace '{ns}' / database '{db_name}': {err}"
|
||||
);
|
||||
}
|
||||
} else {
|
||||
println!(
|
||||
"State file {} is missing namespace/database fields; skipping live DB validation",
|
||||
db_state_path.display()
|
||||
);
|
||||
}
|
||||
} else {
|
||||
println!(
|
||||
"State file {} not found; skipping live DB validation",
|
||||
db_state_path.display()
|
||||
);
|
||||
println!("Corpus manifest has no namespace seed; skipping live DB validation");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -137,25 +124,6 @@ fn build_chunk_lookup(manifest: &corpus::CorpusManifest) -> HashMap<String, Chun
|
||||
lookup
|
||||
}
|
||||
|
||||
fn default_state_path(config: &Config, manifest: &corpus::CorpusManifest) -> PathBuf {
|
||||
config
|
||||
.cache_dir
|
||||
.join("snapshots")
|
||||
.join(&manifest.metadata.dataset_id)
|
||||
.join(&manifest.metadata.slice_id)
|
||||
.join("db/state.json")
|
||||
}
|
||||
|
||||
fn load_db_state(path: &Path) -> Result<Option<DbSnapshotState>> {
|
||||
if !path.exists() {
|
||||
return Ok(None);
|
||||
}
|
||||
let bytes = fs::read(path).with_context(|| format!("reading db state {}", path.display()))?;
|
||||
let state = serde_json::from_slice(&bytes)
|
||||
.with_context(|| format!("parsing db state {}", path.display()))?;
|
||||
Ok(Some(state))
|
||||
}
|
||||
|
||||
enum MissingChunks {
|
||||
None,
|
||||
Missing(Vec<String>),
|
||||
|
||||
+51
-44
@@ -1,19 +1,17 @@
|
||||
mod args;
|
||||
mod cache;
|
||||
mod context_stats;
|
||||
mod cases;
|
||||
mod cli;
|
||||
mod corpus;
|
||||
mod datasets;
|
||||
mod db_helpers;
|
||||
mod eval;
|
||||
mod db;
|
||||
mod inspection;
|
||||
mod namespace;
|
||||
mod openai;
|
||||
mod perf;
|
||||
mod pipeline;
|
||||
mod report;
|
||||
mod settings;
|
||||
mod slice;
|
||||
mod snapshot;
|
||||
mod types;
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -24,7 +22,6 @@ use tracing_subscriber::{fmt, EnvFilter};
|
||||
/// Configure `SurrealDB` environment variables for optimal performance
|
||||
#[allow(clippy::arithmetic_side_effects, clippy::unwrap_used)]
|
||||
fn configure_surrealdb_performance(cpu_count: usize) {
|
||||
// Set environment variables only if they're not already set
|
||||
let indexing_batch_size = std::env::var("SURREAL_INDEXING_BATCH_SIZE")
|
||||
.unwrap_or_else(|_| (cpu_count * 2).to_string());
|
||||
std::env::set_var("SURREAL_INDEXING_BATCH_SIZE", indexing_batch_size);
|
||||
@@ -62,12 +59,11 @@ fn configure_surrealdb_performance(cpu_count: usize) {
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Create an explicit multi-threaded runtime with optimized configuration
|
||||
let runtime = Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.worker_threads(std::thread::available_parallelism()?.get())
|
||||
.max_blocking_threads(std::thread::available_parallelism()?.get())
|
||||
.thread_stack_size(10 * 1024 * 1024) // 10MiB stack size
|
||||
.thread_stack_size(10 * 1024 * 1024)
|
||||
.thread_name("eval-retrieval-worker")
|
||||
.build()
|
||||
.context("failed to create tokio runtime")?;
|
||||
@@ -77,7 +73,6 @@ fn main() -> anyhow::Result<()> {
|
||||
|
||||
#[allow(clippy::too_many_lines)]
|
||||
async fn async_main() -> anyhow::Result<()> {
|
||||
// Log runtime configuration
|
||||
let cpu_count = std::thread::available_parallelism()?.get();
|
||||
info!(
|
||||
cpu_cores = cpu_count,
|
||||
@@ -87,7 +82,6 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
"Started multi-threaded tokio runtime"
|
||||
);
|
||||
|
||||
// Configure SurrealDB environment variables for better performance
|
||||
configure_surrealdb_performance(cpu_count);
|
||||
|
||||
let filter = std::env::var("RUST_LOG").unwrap_or_else(|_| "info".to_string());
|
||||
@@ -97,13 +91,22 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
|
||||
let parsed = args::parse()?;
|
||||
|
||||
// Clap handles help automatically, so we don't need to check for it manually
|
||||
|
||||
if parsed.config.inspect_question.is_some() {
|
||||
inspection::inspect_question(&parsed.config).await?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if parsed.config.status {
|
||||
let status = cli::collect_status(&parsed.config).await?;
|
||||
cli::print_status(&status);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if parsed.config.warm {
|
||||
cli::warm(&parsed.config).await?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let dataset_kind = parsed.config.dataset;
|
||||
|
||||
if parsed.config.convert_only {
|
||||
@@ -115,7 +118,6 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
parsed.config.raw_dataset_path.as_path(),
|
||||
dataset_kind,
|
||||
parsed.config.llm_mode,
|
||||
parsed.config.context_token_limit(),
|
||||
)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
@@ -124,56 +126,56 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
parsed.config.raw_dataset_path.display()
|
||||
)
|
||||
})?;
|
||||
crate::datasets::write_converted(&dataset, parsed.config.converted_dataset_path.as_path())
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"writing converted dataset to {}",
|
||||
parsed.config.converted_dataset_path.display()
|
||||
)
|
||||
})?;
|
||||
let store_dir = datasets::store_dir_for(&parsed.config.converted_dataset_path);
|
||||
datasets::write_sharded(&dataset, &store_dir)?;
|
||||
datasets::prebuild_catalog_slices(&dataset, &parsed.config)?;
|
||||
println!(
|
||||
"Converted dataset written to {}",
|
||||
parsed.config.converted_dataset_path.display()
|
||||
"Converted dataset written under {}",
|
||||
store_dir.display()
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if parsed.config.require_ready {
|
||||
cli::ensure_query_ready(&parsed.config).await?;
|
||||
}
|
||||
|
||||
info!(dataset = dataset_kind.id(), "Preparing converted dataset");
|
||||
let dataset = crate::datasets::ensure_converted(
|
||||
dataset_kind,
|
||||
parsed.config.raw_dataset_path.as_path(),
|
||||
parsed.config.converted_dataset_path.as_path(),
|
||||
parsed.config.force_convert,
|
||||
parsed.config.llm_mode,
|
||||
parsed.config.context_token_limit(),
|
||||
)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"preparing converted dataset at {}",
|
||||
parsed.config.converted_dataset_path.display()
|
||||
)
|
||||
})?;
|
||||
let loaded = crate::datasets::prepare_dataset(dataset_kind, &parsed.config).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"preparing converted dataset at {}",
|
||||
parsed.config.converted_dataset_path.display()
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
info!(
|
||||
questions = dataset
|
||||
questions = loaded
|
||||
.dataset
|
||||
.paragraphs
|
||||
.iter()
|
||||
.map(|p| p.questions.len())
|
||||
.sum::<usize>(),
|
||||
paragraphs = dataset.paragraphs.len(),
|
||||
dataset = dataset.metadata.id.as_str(),
|
||||
paragraphs = loaded.dataset.paragraphs.len(),
|
||||
partial = loaded.partial,
|
||||
dataset = loaded.dataset.metadata.id.as_str(),
|
||||
"Dataset ready"
|
||||
);
|
||||
|
||||
if parsed.config.slice_grow.is_some() {
|
||||
eval::grow_slice(&dataset, &parsed.config).context("growing slice ledger")?;
|
||||
slice::grow_slice(&loaded.dataset, &parsed.config).context("growing slice ledger")?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!("Running retrieval evaluation");
|
||||
let summary = eval::run_evaluation(&dataset, &parsed.config)
|
||||
.await
|
||||
.context("running retrieval evaluation")?;
|
||||
let summary = pipeline::run_evaluation(
|
||||
&loaded.dataset,
|
||||
&parsed.config,
|
||||
Some(loaded.content_checksum.as_str()),
|
||||
)
|
||||
.await
|
||||
.context("running retrieval evaluation")?;
|
||||
|
||||
let report = report::write_reports(
|
||||
&summary,
|
||||
@@ -226,12 +228,17 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
);
|
||||
} else {
|
||||
println!(
|
||||
"[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) → JSON: {json} | Markdown: {md} | History: {history}{perf_note}",
|
||||
"[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) | Retrieved context: {chunks} chunks, {tokens} tokens ({tokenizer}, avg {avg_tokens:.0}/query, p95 {p95}) → JSON: {json} | Markdown: {md} | History: {history}{perf_note}",
|
||||
summary.dataset_label,
|
||||
k = summary.k,
|
||||
precision = summary.precision,
|
||||
correct = summary.correct,
|
||||
retrieval_total = summary.retrieval_cases,
|
||||
chunks = summary.retrieved_context.total_chunks,
|
||||
tokens = summary.retrieved_context.total_tokens,
|
||||
tokenizer = summary.retrieved_context.tokenizer,
|
||||
avg_tokens = summary.retrieved_context.avg_tokens_per_query,
|
||||
p95 = summary.retrieved_context.p95_tokens_per_query,
|
||||
json = report.paths.json.display(),
|
||||
md = report.paths.markdown.display(),
|
||||
history = report.history_path.display(),
|
||||
|
||||
@@ -1,9 +1,27 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use async_openai::{config::OpenAIConfig, Client};
|
||||
|
||||
const DEFAULT_BASE_URL: &str = "https://api.openai.com/v1";
|
||||
|
||||
pub fn build_client_from_env() -> Result<(Client<OpenAIConfig>, String)> {
|
||||
pub fn ingestion_openai_client(
|
||||
include_entities: bool,
|
||||
) -> Result<(Arc<Client<OpenAIConfig>>, Option<String>)> {
|
||||
if include_entities {
|
||||
let (client, base_url) = build_client_from_env().context(
|
||||
"OPENAI_API_KEY must be set when --include-entities is enabled (entity extraction uses OpenAI)",
|
||||
)?;
|
||||
Ok((Arc::new(client), Some(base_url)))
|
||||
} else {
|
||||
Ok((
|
||||
Arc::new(Client::with_config(OpenAIConfig::default())),
|
||||
None,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
fn build_client_from_env() -> Result<(Client<OpenAIConfig>, String)> {
|
||||
let api_key = std::env::var("OPENAI_API_KEY")
|
||||
.context("OPENAI_API_KEY must be set to run retrieval evaluations")?;
|
||||
let base_url =
|
||||
|
||||
@@ -7,8 +7,8 @@ use anyhow::{Context, Result};
|
||||
|
||||
use crate::{
|
||||
args,
|
||||
eval::EvaluationSummary,
|
||||
report::{self, EvaluationReport},
|
||||
types::EvaluationSummary,
|
||||
};
|
||||
|
||||
pub fn mirror_perf_outputs(
|
||||
@@ -91,23 +91,23 @@ fn format_duration(value: Option<u128>) -> String {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::eval::{EvaluationStageTimings, PerformanceTimings};
|
||||
use crate::types::{EvaluationStageTimings, PerformanceTimings, LatencyStats, StageLatency, StageLatencyBreakdown};
|
||||
use chrono::Utc;
|
||||
use tempfile::tempdir;
|
||||
|
||||
fn sample_latency() -> crate::eval::LatencyStats {
|
||||
crate::eval::LatencyStats {
|
||||
fn sample_latency() -> LatencyStats {
|
||||
LatencyStats {
|
||||
avg: 10.0,
|
||||
p50: 8,
|
||||
p95: 15,
|
||||
}
|
||||
}
|
||||
|
||||
fn sample_stage_latency() -> crate::eval::StageLatencyBreakdown {
|
||||
crate::eval::StageLatencyBreakdown {
|
||||
fn sample_stage_latency() -> StageLatencyBreakdown {
|
||||
StageLatencyBreakdown {
|
||||
stages: ["embed", "search", "rerank", "resolve_entities", "assemble"]
|
||||
.into_iter()
|
||||
.map(|stage| crate::eval::StageLatency {
|
||||
.map(|stage| StageLatency {
|
||||
stage: stage.to_string(),
|
||||
stats: sample_latency(),
|
||||
})
|
||||
@@ -206,6 +206,7 @@ mod tests {
|
||||
chunk_vector_take: 20,
|
||||
chunk_fts_take: 20,
|
||||
max_chunks_per_entity: 4,
|
||||
retrieved_context: crate::context_stats::aggregate_context_stats(&[]),
|
||||
cases: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,11 +20,11 @@ use retrieval_pipeline::{
|
||||
|
||||
use crate::{
|
||||
args::Config,
|
||||
cache::EmbeddingCache,
|
||||
cases::SeededCase,
|
||||
corpus,
|
||||
datasets::ConvertedDataset,
|
||||
eval::{CaseDiagnostics, CaseSummary, EvaluationStageTimings, EvaluationSummary, SeededCase},
|
||||
slice, snapshot,
|
||||
slice,
|
||||
types::{CaseDiagnostics, CaseSummary, EvaluationStageTimings, EvaluationSummary},
|
||||
};
|
||||
|
||||
#[allow(clippy::struct_excessive_bools)]
|
||||
@@ -41,12 +41,10 @@ pub(super) struct EvaluationContext<'a> {
|
||||
pub namespace: String,
|
||||
pub database: String,
|
||||
pub db: Option<SurrealDbClient>,
|
||||
pub descriptor: Option<snapshot::Descriptor>,
|
||||
pub settings: Option<SystemSettings>,
|
||||
pub settings_missing: bool,
|
||||
pub must_reapply_settings: bool,
|
||||
pub embedding_provider: Option<EmbeddingProvider>,
|
||||
pub embedding_cache: Option<EmbeddingCache>,
|
||||
pub openai_client: Option<Arc<Client<async_openai::config::OpenAIConfig>>>,
|
||||
pub openai_base_url: Option<String>,
|
||||
pub expected_fingerprint: Option<String>,
|
||||
@@ -67,13 +65,19 @@ pub(super) struct EvaluationContext<'a> {
|
||||
pub summary: Option<EvaluationSummary>,
|
||||
pub diagnostics_path: Option<PathBuf>,
|
||||
pub diagnostics_enabled: bool,
|
||||
pub content_checksum: Option<String>,
|
||||
}
|
||||
|
||||
impl<'a> EvaluationContext<'a> {
|
||||
pub fn new(dataset: &'a ConvertedDataset, config: &'a Config) -> Self {
|
||||
pub fn new(
|
||||
dataset: &'a ConvertedDataset,
|
||||
config: &'a Config,
|
||||
content_checksum: Option<String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
dataset,
|
||||
config,
|
||||
content_checksum,
|
||||
stage_timings: EvaluationStageTimings::default(),
|
||||
ledger_limit: None,
|
||||
slice_settings: None,
|
||||
@@ -84,12 +88,10 @@ impl<'a> EvaluationContext<'a> {
|
||||
namespace: String::new(),
|
||||
database: String::new(),
|
||||
db: None,
|
||||
descriptor: None,
|
||||
settings: None,
|
||||
settings_missing: false,
|
||||
must_reapply_settings: false,
|
||||
embedding_provider: None,
|
||||
embedding_cache: None,
|
||||
openai_client: None,
|
||||
openai_base_url: None,
|
||||
expected_fingerprint: None,
|
||||
@@ -133,12 +135,6 @@ impl<'a> EvaluationContext<'a> {
|
||||
.ok_or_else(|| anyhow!("database connection missing"))
|
||||
}
|
||||
|
||||
pub fn descriptor(&self) -> Result<&snapshot::Descriptor> {
|
||||
self.descriptor
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("snapshot descriptor unavailable"))
|
||||
}
|
||||
|
||||
pub fn embedding_provider(&self) -> Result<&EmbeddingProvider> {
|
||||
self.embedding_provider
|
||||
.as_ref()
|
||||
@@ -159,6 +155,10 @@ impl<'a> EvaluationContext<'a> {
|
||||
.ok_or_else(|| anyhow!("corpus handle missing"))
|
||||
}
|
||||
|
||||
pub fn content_checksum(&self) -> Option<&str> {
|
||||
self.content_checksum.as_deref()
|
||||
}
|
||||
|
||||
pub fn evaluation_user(&self) -> Result<&User> {
|
||||
self.eval_user
|
||||
.as_ref()
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use tokio::io::AsyncWriteExt;
|
||||
|
||||
use crate::{args, types::CaseDiagnostics};
|
||||
|
||||
pub(crate) async fn write_chunk_diagnostics(path: &Path, cases: &[CaseDiagnostics]) -> Result<()> {
|
||||
args::ensure_parent(path)?;
|
||||
let mut file = tokio::fs::File::create(path)
|
||||
.await
|
||||
.with_context(|| format!("creating diagnostics file {}", path.display()))?;
|
||||
for case in cases {
|
||||
let line = serde_json::to_vec(case).context("serialising chunk diagnostics entry")?;
|
||||
file.write_all(&line).await?;
|
||||
file.write_all(b"\n").await?;
|
||||
}
|
||||
file.flush().await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
mod context;
|
||||
mod diagnostics;
|
||||
mod stages;
|
||||
mod state;
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
@@ -8,20 +8,49 @@ use crate::{args::Config, datasets::ConvertedDataset, types::EvaluationSummary};
|
||||
|
||||
use context::EvaluationContext;
|
||||
|
||||
async fn run_through_namespace<'a>(
|
||||
dataset: &'a ConvertedDataset,
|
||||
config: &'a Config,
|
||||
content_checksum: Option<String>,
|
||||
) -> Result<EvaluationContext<'a>> {
|
||||
let mut ctx = EvaluationContext::new(dataset, config, content_checksum);
|
||||
stages::prepare_slice(&mut ctx).await?;
|
||||
stages::prepare_db(&mut ctx).await?;
|
||||
stages::prepare_corpus(&mut ctx).await?;
|
||||
stages::prepare_namespace(&mut ctx).await?;
|
||||
Ok(ctx)
|
||||
}
|
||||
|
||||
pub async fn warm_evaluation(
|
||||
dataset: &ConvertedDataset,
|
||||
config: &Config,
|
||||
content_checksum: &str,
|
||||
) -> Result<()> {
|
||||
let _ctx = run_through_namespace(
|
||||
dataset,
|
||||
config,
|
||||
Some(content_checksum.to_string()),
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn run_evaluation(
|
||||
dataset: &ConvertedDataset,
|
||||
config: &Config,
|
||||
content_checksum: Option<&str>,
|
||||
) -> Result<EvaluationSummary> {
|
||||
let mut ctx = EvaluationContext::new(dataset, config);
|
||||
let machine = state::ready();
|
||||
|
||||
let machine = stages::prepare_slice(machine, &mut ctx).await?;
|
||||
let machine = stages::prepare_db(machine, &mut ctx).await?;
|
||||
let machine = stages::prepare_corpus(machine, &mut ctx).await?;
|
||||
let machine = stages::prepare_namespace(machine, &mut ctx).await?;
|
||||
let machine = stages::run_queries(machine, &mut ctx).await?;
|
||||
let machine = stages::summarize(machine, &mut ctx).await?;
|
||||
let _ = stages::finalize(machine, &mut ctx).await?;
|
||||
|
||||
let mut ctx = EvaluationContext::new(
|
||||
dataset,
|
||||
config,
|
||||
content_checksum.map(str::to_string),
|
||||
);
|
||||
stages::prepare_slice(&mut ctx).await?;
|
||||
stages::prepare_db(&mut ctx).await?;
|
||||
stages::prepare_corpus(&mut ctx).await?;
|
||||
stages::prepare_namespace(&mut ctx).await?;
|
||||
stages::run_queries(&mut ctx).await?;
|
||||
stages::summarize(&mut ctx).await?;
|
||||
stages::finalize(&mut ctx).await?;
|
||||
ctx.into_summary()
|
||||
}
|
||||
|
||||
@@ -3,18 +3,12 @@ use std::time::Instant;
|
||||
use anyhow::Context;
|
||||
use tracing::info;
|
||||
|
||||
use crate::eval::write_chunk_diagnostics;
|
||||
|
||||
use super::super::{
|
||||
context::{EvalStage, EvaluationContext},
|
||||
state::{Completed, EvaluationMachine, Summarized},
|
||||
diagnostics::write_chunk_diagnostics,
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
|
||||
pub(crate) async fn finalize(
|
||||
machine: EvaluationMachine<(), Summarized>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
) -> StageResult<Completed> {
|
||||
pub(crate) async fn finalize(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
|
||||
let stage = EvalStage::Finalize;
|
||||
info!(
|
||||
evaluation_stage = stage.label(),
|
||||
@@ -22,13 +16,6 @@ pub(crate) async fn finalize(
|
||||
);
|
||||
let started = Instant::now();
|
||||
|
||||
if let Some(cache) = ctx.embedding_cache.as_ref() {
|
||||
cache
|
||||
.persist()
|
||||
.await
|
||||
.context("persisting embedding cache")?;
|
||||
}
|
||||
|
||||
if let Some(path) = ctx.diagnostics_path.as_ref() {
|
||||
if ctx.diagnostics_enabled {
|
||||
write_chunk_diagnostics(path.as_path(), &ctx.diagnostics_output)
|
||||
@@ -53,7 +40,5 @@ pub(crate) async fn finalize(
|
||||
"completed evaluation stage"
|
||||
);
|
||||
|
||||
machine
|
||||
.finalize()
|
||||
.map_err(|(_, guard)| map_guard_error("finalize", &guard))
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -13,14 +13,3 @@ pub(crate) use prepare_namespace::prepare_namespace;
|
||||
pub(crate) use prepare_slice::prepare_slice;
|
||||
pub(crate) use run_queries::run_queries;
|
||||
pub(crate) use summarize::summarize;
|
||||
|
||||
use anyhow::Result;
|
||||
use state_machines::core::GuardError;
|
||||
|
||||
use super::state::EvaluationMachine;
|
||||
|
||||
fn map_guard_error(event: &str, guard: &GuardError) -> anyhow::Error {
|
||||
anyhow::anyhow!("invalid evaluation pipeline transition during {event}: {guard:?}")
|
||||
}
|
||||
|
||||
type StageResult<S> = Result<EvaluationMachine<(), S>>;
|
||||
|
||||
@@ -3,19 +3,12 @@ use std::time::Instant;
|
||||
use anyhow::Context;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{corpus, eval::can_reuse_namespace, slice, snapshot};
|
||||
use crate::{corpus, db::can_reuse_namespace, slice};
|
||||
|
||||
use super::super::{
|
||||
context::{EvalStage, EvaluationContext},
|
||||
state::{CorpusReady, DbReady, EvaluationMachine},
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
use super::super::context::{EvalStage, EvaluationContext};
|
||||
|
||||
#[allow(clippy::too_many_lines)]
|
||||
pub(crate) async fn prepare_corpus(
|
||||
machine: EvaluationMachine<(), DbReady>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
) -> StageResult<CorpusReady> {
|
||||
pub(crate) async fn prepare_corpus(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
|
||||
let stage = EvalStage::PrepareCorpus;
|
||||
info!(
|
||||
evaluation_stage = stage.label(),
|
||||
@@ -31,13 +24,13 @@ pub(crate) async fn prepare_corpus(
|
||||
let window = slice::select_window(slice, ctx.config().slice_offset, ctx.config().limit)
|
||||
.context("selecting slice window for corpus preparation")?;
|
||||
|
||||
let descriptor = snapshot::Descriptor::new(config, slice, ctx.embedding_provider()?);
|
||||
let ingestion_config = corpus::make_ingestion_config(config);
|
||||
let expected_fingerprint = corpus::compute_ingestion_fingerprint(
|
||||
ctx.dataset(),
|
||||
slice,
|
||||
config.converted_dataset_path.as_path(),
|
||||
&ingestion_config,
|
||||
ctx.content_checksum(),
|
||||
)?;
|
||||
let base_dir = corpus::cached_corpus_dir(
|
||||
&cache_settings,
|
||||
@@ -47,19 +40,18 @@ pub(crate) async fn prepare_corpus(
|
||||
|
||||
if !config.reseed_slice {
|
||||
let requested_cases = window.cases.len();
|
||||
if can_reuse_namespace(
|
||||
ctx.db()?,
|
||||
&descriptor,
|
||||
&ctx.namespace,
|
||||
&ctx.database,
|
||||
ctx.dataset().metadata.id.as_str(),
|
||||
slice.manifest.slice_id.as_str(),
|
||||
expected_fingerprint.as_str(),
|
||||
requested_cases,
|
||||
)
|
||||
.await?
|
||||
{
|
||||
if let Some(manifest) = corpus::load_cached_manifest(&base_dir)? {
|
||||
if let Some(manifest) = corpus::load_cached_manifest(&base_dir)? {
|
||||
if can_reuse_namespace(
|
||||
ctx.db()?,
|
||||
&manifest,
|
||||
&embedding_provider,
|
||||
&ctx.namespace,
|
||||
&ctx.database,
|
||||
expected_fingerprint.as_str(),
|
||||
requested_cases,
|
||||
)
|
||||
.await?
|
||||
{
|
||||
info!(
|
||||
cache = %base_dir.display(),
|
||||
namespace = ctx.namespace.as_str(),
|
||||
@@ -70,7 +62,6 @@ pub(crate) async fn prepare_corpus(
|
||||
ctx.corpus_handle = Some(corpus_handle);
|
||||
ctx.expected_fingerprint = Some(expected_fingerprint);
|
||||
ctx.ingestion_duration_ms = 0;
|
||||
ctx.descriptor = Some(descriptor);
|
||||
|
||||
let elapsed = started.elapsed();
|
||||
ctx.record_stage_duration(stage, elapsed);
|
||||
@@ -80,14 +71,8 @@ pub(crate) async fn prepare_corpus(
|
||||
"completed evaluation stage"
|
||||
);
|
||||
|
||||
return machine
|
||||
.prepare_corpus()
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_corpus", &guard));
|
||||
return Ok(());
|
||||
}
|
||||
info!(
|
||||
cache = %base_dir.display(),
|
||||
"Namespace reusable but cached manifest missing; regenerating corpus"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,6 +88,7 @@ pub(crate) async fn prepare_corpus(
|
||||
openai_client,
|
||||
&eval_user_id,
|
||||
config.converted_dataset_path.as_path(),
|
||||
ctx.content_checksum(),
|
||||
ingestion_config.clone(),
|
||||
)
|
||||
.await
|
||||
@@ -126,7 +112,6 @@ pub(crate) async fn prepare_corpus(
|
||||
ctx.corpus_handle = Some(corpus_handle);
|
||||
ctx.expected_fingerprint = Some(expected_fingerprint);
|
||||
ctx.ingestion_duration_ms = ingestion_duration_ms;
|
||||
ctx.descriptor = Some(descriptor);
|
||||
|
||||
let elapsed = started.elapsed();
|
||||
ctx.record_stage_duration(stage, elapsed);
|
||||
@@ -136,7 +121,5 @@ pub(crate) async fn prepare_corpus(
|
||||
"completed evaluation stage"
|
||||
);
|
||||
|
||||
machine
|
||||
.prepare_corpus()
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_corpus", &guard))
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,28 +1,19 @@
|
||||
use std::{sync::Arc, time::Instant};
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
args::EmbeddingBackend,
|
||||
cache::EmbeddingCache,
|
||||
eval::{
|
||||
connect_eval_db, enforce_system_settings, load_or_init_system_settings, sanitize_model_code,
|
||||
},
|
||||
db::{connect_eval_db, sanitize_model_code},
|
||||
openai,
|
||||
settings::{enforce_system_settings, load_or_init_system_settings},
|
||||
};
|
||||
use common::utils::embedding::{default_embedding_pool_size, EmbeddingProvider};
|
||||
|
||||
use super::super::{
|
||||
context::{EvalStage, EvaluationContext},
|
||||
state::{DbReady, EvaluationMachine, SlicePrepared},
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
use super::super::context::{EvalStage, EvaluationContext};
|
||||
|
||||
pub(crate) async fn prepare_db(
|
||||
machine: EvaluationMachine<(), SlicePrepared>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
) -> StageResult<DbReady> {
|
||||
pub(crate) async fn prepare_db(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
|
||||
let stage = EvalStage::PrepareDb;
|
||||
info!(
|
||||
evaluation_stage = stage.label(),
|
||||
@@ -36,19 +27,18 @@ pub(crate) async fn prepare_db(
|
||||
|
||||
let db = connect_eval_db(config, &namespace, &database).await?;
|
||||
|
||||
let (raw_openai_client, openai_base_url) =
|
||||
openai::build_client_from_env().context("building OpenAI client")?;
|
||||
let openai_client = Arc::new(raw_openai_client);
|
||||
let (openai_client, openai_base_url) =
|
||||
openai::ingestion_openai_client(config.ingest.include_entities)
|
||||
.context("building OpenAI client for ingestion")?;
|
||||
|
||||
// Create embedding provider directly from config (eval only supports FastEmbed and Hashed)
|
||||
let embedding_provider = match config.embedding_backend {
|
||||
crate::args::EmbeddingBackend::FastEmbed => EmbeddingProvider::new_fastembed(
|
||||
EmbeddingBackend::FastEmbed => EmbeddingProvider::new_fastembed(
|
||||
config.embedding_model.clone(),
|
||||
default_embedding_pool_size(),
|
||||
)
|
||||
.await
|
||||
.context("creating FastEmbed provider")?,
|
||||
crate::args::EmbeddingBackend::Hashed => {
|
||||
EmbeddingBackend::Hashed => {
|
||||
EmbeddingProvider::new_hashed(1536).context("creating Hashed provider")?
|
||||
}
|
||||
};
|
||||
@@ -68,12 +58,14 @@ pub(crate) async fn prepare_db(
|
||||
dimension = provider_dimension,
|
||||
"Embedding provider initialised"
|
||||
);
|
||||
info!(openai_base_url = %openai_base_url, "OpenAI client configured");
|
||||
if let Some(base_url) = &openai_base_url {
|
||||
info!(openai_base_url = %base_url, "OpenAI client configured for entity ingestion");
|
||||
}
|
||||
|
||||
let (mut settings, settings_missing) =
|
||||
load_or_init_system_settings(&db, provider_dimension).await?;
|
||||
|
||||
let embedding_cache = if config.embedding_backend == EmbeddingBackend::FastEmbed {
|
||||
if config.embedding_backend == EmbeddingBackend::FastEmbed {
|
||||
if let Some(model_code) = embedding_provider.model_code() {
|
||||
let sanitized = sanitize_model_code(&model_code);
|
||||
let path = config.cache_dir.join(format!("{sanitized}.json"));
|
||||
@@ -83,15 +75,8 @@ pub(crate) async fn prepare_db(
|
||||
.with_context(|| format!("removing stale cache {}", path.display()))
|
||||
.ok();
|
||||
}
|
||||
let cache = EmbeddingCache::load(&path).await?;
|
||||
info!(path = %path.display(), "Embedding cache ready");
|
||||
Some(cache)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
}
|
||||
|
||||
let must_reapply_settings = settings_missing;
|
||||
let defer_initial_enforce = settings_missing && !config.reseed_slice;
|
||||
@@ -104,9 +89,8 @@ pub(crate) async fn prepare_db(
|
||||
ctx.must_reapply_settings = must_reapply_settings;
|
||||
ctx.settings = Some(settings);
|
||||
ctx.embedding_provider = Some(embedding_provider);
|
||||
ctx.embedding_cache = embedding_cache;
|
||||
ctx.openai_client = Some(openai_client);
|
||||
ctx.openai_base_url = Some(openai_base_url);
|
||||
ctx.openai_base_url = openai_base_url;
|
||||
|
||||
let elapsed = started.elapsed();
|
||||
ctx.record_stage_duration(stage, elapsed);
|
||||
@@ -116,7 +100,5 @@ pub(crate) async fn prepare_db(
|
||||
"completed evaluation stage"
|
||||
);
|
||||
|
||||
machine
|
||||
.prepare_db()
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_db", &guard))
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -5,25 +5,19 @@ use common::storage::types::system_settings::SystemSettings;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::{
|
||||
cases::cases_from_manifest,
|
||||
corpus,
|
||||
db_helpers::{recreate_indexes, remove_all_indexes, reset_namespace},
|
||||
eval::{
|
||||
can_reuse_namespace, cases_from_manifest, enforce_system_settings, ensure_eval_user,
|
||||
record_namespace_state, warm_hnsw_cache,
|
||||
db::{
|
||||
can_reuse_namespace, ensure_eval_user, record_namespace_seed, recreate_indexes,
|
||||
reset_namespace, warm_hnsw_cache,
|
||||
},
|
||||
settings::enforce_system_settings,
|
||||
};
|
||||
|
||||
use super::super::{
|
||||
context::{EvalStage, EvaluationContext},
|
||||
state::{CorpusReady, EvaluationMachine, NamespaceReady},
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
use super::super::context::{EvalStage, EvaluationContext};
|
||||
|
||||
#[allow(clippy::too_many_lines)]
|
||||
pub(crate) async fn prepare_namespace(
|
||||
machine: EvaluationMachine<(), CorpusReady>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
) -> StageResult<NamespaceReady> {
|
||||
pub(crate) async fn prepare_namespace(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
|
||||
let stage = EvalStage::PrepareNamespace;
|
||||
info!(
|
||||
evaluation_stage = stage.label(),
|
||||
@@ -32,7 +26,6 @@ pub(crate) async fn prepare_namespace(
|
||||
let started = Instant::now();
|
||||
|
||||
let config = ctx.config();
|
||||
let dataset = ctx.dataset();
|
||||
let expected_fingerprint = ctx
|
||||
.expected_fingerprint
|
||||
.as_deref()
|
||||
@@ -60,20 +53,16 @@ pub(crate) async fn prepare_namespace(
|
||||
|
||||
let mut namespace_reused = false;
|
||||
if !config.reseed_slice {
|
||||
namespace_reused = {
|
||||
let slice = ctx.slice()?;
|
||||
can_reuse_namespace(
|
||||
ctx.db()?,
|
||||
ctx.descriptor()?,
|
||||
&namespace,
|
||||
&database,
|
||||
dataset.metadata.id.as_str(),
|
||||
slice.manifest.slice_id.as_str(),
|
||||
expected_fingerprint.as_str(),
|
||||
requested_cases,
|
||||
)
|
||||
.await?
|
||||
};
|
||||
namespace_reused = can_reuse_namespace(
|
||||
ctx.db()?,
|
||||
base_manifest,
|
||||
&embedding_provider,
|
||||
&namespace,
|
||||
&database,
|
||||
expected_fingerprint.as_str(),
|
||||
requested_cases,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
let mut namespace_seed_ms = None;
|
||||
@@ -114,34 +103,20 @@ pub(crate) async fn prepare_namespace(
|
||||
"Seeding ingestion corpus into SurrealDB"
|
||||
);
|
||||
}
|
||||
let indexes_disabled = remove_all_indexes(ctx.db()?).await.is_ok();
|
||||
|
||||
let seed_start = Instant::now();
|
||||
corpus::seed_manifest_into_db(ctx.db()?, &manifest_for_seed)
|
||||
.await
|
||||
.context("seeding ingestion corpus from manifest")?;
|
||||
namespace_seed_ms = Some(seed_start.elapsed().as_millis());
|
||||
|
||||
// Recreate indexes AFTER data is loaded (correct bulk loading pattern)
|
||||
if indexes_disabled {
|
||||
info!("Recreating indexes after seeding data");
|
||||
recreate_indexes(ctx.db()?, embedding_provider.dimension())
|
||||
.await
|
||||
.context("recreating indexes with correct dimension")?;
|
||||
warm_hnsw_cache(ctx.db()?, embedding_provider.dimension()).await?;
|
||||
}
|
||||
{
|
||||
let slice = ctx.slice()?;
|
||||
record_namespace_state(
|
||||
ctx.descriptor()?,
|
||||
dataset.metadata.id.as_str(),
|
||||
slice.manifest.slice_id.as_str(),
|
||||
expected_fingerprint.as_str(),
|
||||
&namespace,
|
||||
&database,
|
||||
requested_cases,
|
||||
)
|
||||
.await;
|
||||
info!("Recreating indexes after seeding data");
|
||||
recreate_indexes(ctx.db()?, embedding_provider.dimension())
|
||||
.await
|
||||
.context("recreating indexes with correct dimension")?;
|
||||
warm_hnsw_cache(ctx.db()?, embedding_provider.dimension()).await?;
|
||||
|
||||
if let Some(handle) = ctx.corpus_handle.as_mut() {
|
||||
record_namespace_seed(handle, &namespace, &database, requested_cases).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,7 +173,5 @@ pub(crate) async fn prepare_namespace(
|
||||
"completed evaluation stage"
|
||||
);
|
||||
|
||||
machine
|
||||
.prepare_namespace()
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_namespace", &guard))
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3,21 +3,11 @@ use std::time::Instant;
|
||||
use anyhow::Context;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
eval::{default_database, default_namespace, ledger_target},
|
||||
slice,
|
||||
};
|
||||
use crate::{db::{default_database, default_namespace}, slice};
|
||||
|
||||
use super::super::{
|
||||
context::{EvalStage, EvaluationContext},
|
||||
state::{EvaluationMachine, Ready, SlicePrepared},
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
use super::super::context::{EvalStage, EvaluationContext};
|
||||
|
||||
pub(crate) async fn prepare_slice(
|
||||
machine: EvaluationMachine<(), Ready>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
) -> StageResult<SlicePrepared> {
|
||||
pub(crate) async fn prepare_slice(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
|
||||
let stage = EvalStage::PrepareSlice;
|
||||
info!(
|
||||
evaluation_stage = stage.label(),
|
||||
@@ -25,7 +15,7 @@ pub(crate) async fn prepare_slice(
|
||||
);
|
||||
let started = Instant::now();
|
||||
|
||||
let ledger_limit = ledger_target(ctx.config());
|
||||
let ledger_limit = slice::ledger_target(ctx.config());
|
||||
let slice_settings = slice::slice_config_with_limit(ctx.config(), ledger_limit);
|
||||
let resolved_slice =
|
||||
slice::resolve_slice(ctx.dataset(), &slice_settings).context("resolving dataset slice")?;
|
||||
@@ -49,7 +39,11 @@ pub(crate) async fn prepare_slice(
|
||||
.db_namespace
|
||||
.clone()
|
||||
.unwrap_or_else(|| {
|
||||
default_namespace(ctx.dataset().metadata.id.as_str(), ctx.config().limit)
|
||||
default_namespace(
|
||||
ctx.dataset().metadata.id.as_str(),
|
||||
ctx.config().limit,
|
||||
ctx.config().slice.as_deref(),
|
||||
)
|
||||
});
|
||||
ctx.database = ctx
|
||||
.config()
|
||||
@@ -66,7 +60,5 @@ pub(crate) async fn prepare_slice(
|
||||
"completed evaluation stage"
|
||||
);
|
||||
|
||||
machine
|
||||
.prepare_slice()
|
||||
.map_err(|(_, guard)| map_guard_error("prepare_slice", &guard))
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -5,9 +5,13 @@ use common::storage::types::StoredObject;
|
||||
use futures::stream::{self, StreamExt};
|
||||
use tracing::{debug, info};
|
||||
|
||||
use crate::eval::{
|
||||
adapt_retrieval_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics,
|
||||
CaseSummary, RetrievedSummary,
|
||||
use crate::{
|
||||
cases::SeededCase,
|
||||
context_stats,
|
||||
types::{
|
||||
adapt_retrieval_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics,
|
||||
CaseSummary, RetrievedSummary,
|
||||
},
|
||||
};
|
||||
use retrieval_pipeline::{
|
||||
pipeline::{self, RetrievalConfig, StageTimings},
|
||||
@@ -15,17 +19,10 @@ use retrieval_pipeline::{
|
||||
};
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use super::super::{
|
||||
context::{EvalStage, EvaluationContext},
|
||||
state::{EvaluationMachine, NamespaceReady, QueriesFinished},
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
use super::super::context::{EvalStage, EvaluationContext};
|
||||
|
||||
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)]
|
||||
pub(crate) async fn run_queries(
|
||||
machine: EvaluationMachine<(), NamespaceReady>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
) -> StageResult<QueriesFinished> {
|
||||
pub(crate) async fn run_queries(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
|
||||
let stage = EvalStage::RunQueries;
|
||||
info!(
|
||||
evaluation_stage = stage.label(),
|
||||
@@ -153,7 +150,7 @@ pub(crate) async fn run_queries(
|
||||
.await
|
||||
.context("acquiring query semaphore permit")?;
|
||||
|
||||
let crate::eval::SeededCase {
|
||||
let SeededCase {
|
||||
question_id,
|
||||
question,
|
||||
expected_source,
|
||||
@@ -197,6 +194,7 @@ pub(crate) async fn run_queries(
|
||||
let query_latency = query_start.elapsed().as_millis();
|
||||
|
||||
let candidates = adapt_retrieval_output(result_output);
|
||||
let retrieved_context = context_stats::stats_for_candidates(&candidates);
|
||||
let mut retrieved = Vec::new();
|
||||
let mut match_rank = None;
|
||||
let answers_lower: Vec<String> =
|
||||
@@ -288,6 +286,7 @@ pub(crate) async fn run_queries(
|
||||
reciprocal_rank: Some(reciprocal_rank),
|
||||
ndcg: Some(ndcg),
|
||||
latency_ms: query_latency,
|
||||
retrieved_context,
|
||||
retrieved,
|
||||
};
|
||||
|
||||
@@ -353,9 +352,7 @@ pub(crate) async fn run_queries(
|
||||
"completed evaluation stage"
|
||||
);
|
||||
|
||||
machine
|
||||
.run_queries()
|
||||
.map_err(|(_, guard)| map_guard_error("run_queries", &guard))
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::arithmetic_side_effects, clippy::cast_precision_loss)]
|
||||
|
||||
@@ -3,25 +3,19 @@ use std::time::Instant;
|
||||
use chrono::Utc;
|
||||
use tracing::info;
|
||||
|
||||
use crate::eval::{
|
||||
use crate::types::{
|
||||
build_stage_latency_breakdown, compute_latency_stats, EvaluationSummary, PerformanceTimings,
|
||||
RetrievedContextStats,
|
||||
};
|
||||
|
||||
use super::super::{
|
||||
context::{EvalStage, EvaluationContext},
|
||||
state::{EvaluationMachine, QueriesFinished, Summarized},
|
||||
};
|
||||
use super::{map_guard_error, StageResult};
|
||||
use super::super::context::{EvalStage, EvaluationContext};
|
||||
|
||||
#[allow(
|
||||
clippy::too_many_lines,
|
||||
clippy::arithmetic_side_effects,
|
||||
clippy::cast_precision_loss
|
||||
)]
|
||||
pub(crate) async fn summarize(
|
||||
machine: EvaluationMachine<(), QueriesFinished>,
|
||||
ctx: &mut EvaluationContext<'_>,
|
||||
) -> StageResult<Summarized> {
|
||||
pub(crate) async fn summarize(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
|
||||
let stage = EvalStage::Summarize;
|
||||
info!(
|
||||
evaluation_stage = stage.label(),
|
||||
@@ -123,6 +117,12 @@ pub(crate) async fn summarize(
|
||||
sum_ndcg / (retrieval_cases as f64)
|
||||
};
|
||||
|
||||
let per_query_context: Vec<RetrievedContextStats> = summaries
|
||||
.iter()
|
||||
.map(|summary| summary.retrieved_context)
|
||||
.collect();
|
||||
let retrieved_context = crate::context_stats::aggregate_context_stats(&per_query_context);
|
||||
|
||||
let active_tuning = ctx
|
||||
.retrieval_config
|
||||
.as_ref()
|
||||
@@ -133,7 +133,7 @@ pub(crate) async fn summarize(
|
||||
openai_base_url: ctx
|
||||
.openai_base_url
|
||||
.clone()
|
||||
.unwrap_or_else(|| "<unknown>".to_string()),
|
||||
.unwrap_or_else(|| "n/a (chunk-only ingestion)".to_string()),
|
||||
ingestion_ms: ctx.ingestion_duration_ms,
|
||||
namespace_seed_ms: ctx.namespace_seed_ms,
|
||||
evaluation_stage_ms: ctx.stage_timings.clone(),
|
||||
@@ -217,11 +217,12 @@ pub(crate) async fn summarize(
|
||||
chunk_rrf_use_fts: active_tuning.flags.chunk_rrf_use_fts.as_bool(),
|
||||
ingest_chunk_min_tokens: config.ingest.ingest_chunk_min_tokens,
|
||||
ingest_chunk_max_tokens: config.ingest.ingest_chunk_max_tokens,
|
||||
ingest_chunks_only: config.ingest.ingest_chunks_only,
|
||||
ingest_chunks_only: !config.ingest.include_entities,
|
||||
ingest_chunk_overlap_tokens: config.ingest.ingest_chunk_overlap_tokens,
|
||||
chunk_vector_take: active_tuning.chunk_vector_take,
|
||||
chunk_fts_take: active_tuning.chunk_fts_take,
|
||||
max_chunks_per_entity: active_tuning.max_chunks_per_entity,
|
||||
retrieved_context,
|
||||
cases: summaries,
|
||||
});
|
||||
|
||||
@@ -233,7 +234,5 @@ pub(crate) async fn summarize(
|
||||
"completed evaluation stage"
|
||||
);
|
||||
|
||||
machine
|
||||
.summarize()
|
||||
.map_err(|(_, guard)| map_guard_error("summarize", &guard))
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
use state_machines::state_machine;
|
||||
|
||||
state_machine! {
|
||||
name: EvaluationMachine,
|
||||
state: EvaluationState,
|
||||
initial: Ready,
|
||||
states: [Ready, SlicePrepared, DbReady, CorpusReady, NamespaceReady, QueriesFinished, Summarized, Completed, Failed],
|
||||
events {
|
||||
prepare_slice { transition: { from: Ready, to: SlicePrepared } }
|
||||
prepare_db { transition: { from: SlicePrepared, to: DbReady } }
|
||||
prepare_corpus { transition: { from: DbReady, to: CorpusReady } }
|
||||
prepare_namespace { transition: { from: CorpusReady, to: NamespaceReady } }
|
||||
run_queries { transition: { from: NamespaceReady, to: QueriesFinished } }
|
||||
summarize { transition: { from: QueriesFinished, to: Summarized } }
|
||||
finalize { transition: { from: Summarized, to: Completed } }
|
||||
abort {
|
||||
transition: { from: Ready, to: Failed }
|
||||
transition: { from: SlicePrepared, to: Failed }
|
||||
transition: { from: DbReady, to: Failed }
|
||||
transition: { from: CorpusReady, to: Failed }
|
||||
transition: { from: NamespaceReady, to: Failed }
|
||||
transition: { from: QueriesFinished, to: Failed }
|
||||
transition: { from: Summarized, to: Failed }
|
||||
transition: { from: Completed, to: Failed }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn ready() -> EvaluationMachine<(), Ready> {
|
||||
EvaluationMachine::new(())
|
||||
}
|
||||
+81
-212
@@ -7,12 +7,10 @@ use std::{
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::eval::{
|
||||
use crate::types::{
|
||||
format_timestamp, CaseSummary, EvaluationStageTimings, EvaluationSummary, LatencyStats,
|
||||
StageLatencyBreakdown,
|
||||
RetrievalContextStats, StageLatencyBreakdown,
|
||||
};
|
||||
use chrono::Utc;
|
||||
use tracing::warn;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ReportPaths {
|
||||
@@ -108,6 +106,7 @@ pub struct RetrievalSection {
|
||||
pub ingest_chunk_max_tokens: usize,
|
||||
pub ingest_chunk_overlap_tokens: usize,
|
||||
pub ingest_chunks_only: bool,
|
||||
pub retrieved_context: RetrievalContextStats,
|
||||
}
|
||||
|
||||
const fn default_chunk_rrf_k() -> f32 {
|
||||
@@ -242,6 +241,7 @@ impl EvaluationReport {
|
||||
ingest_chunk_max_tokens: summary.ingest_chunk_max_tokens,
|
||||
ingest_chunk_overlap_tokens: summary.ingest_chunk_overlap_tokens,
|
||||
ingest_chunks_only: summary.ingest_chunks_only,
|
||||
retrieved_context: summary.retrieved_context.clone(),
|
||||
};
|
||||
|
||||
let llm = if summary.llm_cases > 0 {
|
||||
@@ -345,7 +345,7 @@ impl LlmCaseEntry {
|
||||
}
|
||||
|
||||
impl RetrievedSnippet {
|
||||
fn from_summary(entry: &crate::eval::RetrievedSummary) -> Self {
|
||||
fn from_summary(entry: &crate::types::RetrievedSummary) -> Self {
|
||||
Self {
|
||||
rank: entry.rank,
|
||||
source_id: entry.source_id.clone(),
|
||||
@@ -558,6 +558,65 @@ fn render_markdown(report: &EvaluationReport) -> String {
|
||||
} else {
|
||||
md.push_str("| Rerank | disabled |\\n");
|
||||
}
|
||||
write!(
|
||||
md,
|
||||
"| Chunk result cap | {} |\\n",
|
||||
report.retrieval.chunk_result_cap
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
md.push_str("\\n## Retrieved Context Volume\\n\\n");
|
||||
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
|
||||
write!(
|
||||
md,
|
||||
"| Tokenizer | {} |\\n",
|
||||
report.retrieval.retrieved_context.tokenizer
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Queries measured | {} |\\n",
|
||||
report.retrieval.retrieved_context.queries
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Total chunks returned | {} |\\n",
|
||||
report.retrieval.retrieved_context.total_chunks
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Total characters | {} |\\n",
|
||||
report.retrieval.retrieved_context.total_chars
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Total tokens | {} |\\n",
|
||||
report.retrieval.retrieved_context.total_tokens
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Avg chunks / query | {:.1} |\\n",
|
||||
report.retrieval.retrieved_context.avg_chunks_per_query
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| Avg tokens / query | {:.1} |\\n",
|
||||
report.retrieval.retrieved_context.avg_tokens_per_query
|
||||
)
|
||||
.unwrap();
|
||||
write!(
|
||||
md,
|
||||
"| P50 / P95 / max tokens / query | {} / {} / {} |\\n",
|
||||
report.retrieval.retrieved_context.p50_tokens_per_query,
|
||||
report.retrieval.retrieved_context.p95_tokens_per_query,
|
||||
report.retrieval.retrieved_context.max_tokens_per_query
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
if let Some(llm) = &report.llm {
|
||||
md.push_str("\\n## LLM Mode Metrics\\n\\n");
|
||||
@@ -797,182 +856,6 @@ pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf {
|
||||
report_dir.join(sanitize_component(dataset_id))
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct LegacyHistoryEntry {
|
||||
generated_at: String,
|
||||
run_label: Option<String>,
|
||||
dataset_id: String,
|
||||
dataset_label: String,
|
||||
slice_id: String,
|
||||
slice_seed: u64,
|
||||
slice_window_offset: usize,
|
||||
slice_window_length: usize,
|
||||
slice_cases: usize,
|
||||
slice_total_cases: usize,
|
||||
k: usize,
|
||||
limit: Option<usize>,
|
||||
precision: f64,
|
||||
precision_at_1: f64,
|
||||
precision_at_2: f64,
|
||||
precision_at_3: f64,
|
||||
#[serde(default)]
|
||||
mrr: f64,
|
||||
#[serde(default)]
|
||||
average_ndcg: f64,
|
||||
#[serde(default)]
|
||||
retrieval_cases: usize,
|
||||
#[serde(default)]
|
||||
retrieval_precision: f64,
|
||||
#[serde(default)]
|
||||
llm_cases: usize,
|
||||
#[serde(default)]
|
||||
llm_precision: f64,
|
||||
duration_ms: u128,
|
||||
latency_ms: LatencyStats,
|
||||
embedding_backend: String,
|
||||
embedding_model: Option<String>,
|
||||
ingestion_reused: bool,
|
||||
ingestion_embeddings_reused: bool,
|
||||
rerank_enabled: bool,
|
||||
rerank_keep_top: usize,
|
||||
rerank_pool_size: Option<usize>,
|
||||
#[serde(default)]
|
||||
chunk_result_cap: Option<usize>,
|
||||
#[serde(default)]
|
||||
ingest_chunk_min_tokens: Option<usize>,
|
||||
#[serde(default)]
|
||||
ingest_chunk_max_tokens: Option<usize>,
|
||||
#[serde(default)]
|
||||
ingest_chunk_overlap_tokens: Option<usize>,
|
||||
#[serde(default)]
|
||||
ingest_chunks_only: Option<bool>,
|
||||
#[serde(default)]
|
||||
delta: Option<LegacyHistoryDelta>,
|
||||
openai_base_url: String,
|
||||
ingestion_ms: u128,
|
||||
#[serde(default)]
|
||||
namespace_seed_ms: Option<u128>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct LegacyHistoryDelta {
|
||||
precision: f64,
|
||||
precision_at_1: f64,
|
||||
latency_avg_ms: f64,
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_lines)]
|
||||
fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport {
|
||||
let overview = OverviewSection {
|
||||
generated_at: entry.generated_at,
|
||||
run_label: entry.run_label,
|
||||
total_cases: entry.slice_cases,
|
||||
filtered_questions: 0,
|
||||
};
|
||||
|
||||
let dataset = DatasetSection {
|
||||
id: entry.dataset_id,
|
||||
label: entry.dataset_label,
|
||||
source: String::new(),
|
||||
includes_unanswerable: entry.llm_cases > 0,
|
||||
require_verified_chunks: true,
|
||||
embedding_backend: entry.embedding_backend,
|
||||
embedding_model: entry.embedding_model,
|
||||
embedding_dimension: 0,
|
||||
};
|
||||
|
||||
let slice = SliceSection {
|
||||
id: entry.slice_id,
|
||||
seed: entry.slice_seed,
|
||||
window_offset: entry.slice_window_offset,
|
||||
window_length: entry.slice_window_length,
|
||||
slice_cases: entry.slice_cases,
|
||||
ledger_total_cases: entry.slice_total_cases,
|
||||
positives: 0,
|
||||
negatives: 0,
|
||||
total_paragraphs: 0,
|
||||
negative_multiplier: 0.0,
|
||||
};
|
||||
|
||||
let retrieval_cases = if entry.retrieval_cases > 0 {
|
||||
entry.retrieval_cases
|
||||
} else {
|
||||
entry.slice_cases.saturating_sub(entry.llm_cases)
|
||||
};
|
||||
let retrieval_precision = if entry.retrieval_precision > 0.0 {
|
||||
entry.retrieval_precision
|
||||
} else {
|
||||
entry.precision
|
||||
};
|
||||
|
||||
let retrieval = RetrievalSection {
|
||||
k: entry.k,
|
||||
cases: retrieval_cases,
|
||||
correct: 0,
|
||||
precision: retrieval_precision,
|
||||
precision_at_1: entry.precision_at_1,
|
||||
precision_at_2: entry.precision_at_2,
|
||||
precision_at_3: entry.precision_at_3,
|
||||
mrr: entry.mrr,
|
||||
average_ndcg: entry.average_ndcg,
|
||||
latency: entry.latency_ms,
|
||||
concurrency: 0,
|
||||
resolve_entities: false,
|
||||
rerank_enabled: entry.rerank_enabled,
|
||||
rerank_pool_size: entry.rerank_pool_size,
|
||||
rerank_keep_top: entry.rerank_keep_top,
|
||||
chunk_result_cap: entry.chunk_result_cap.unwrap_or(5),
|
||||
chunk_rrf_k: default_chunk_rrf_k(),
|
||||
chunk_rrf_vector_weight: default_chunk_rrf_weight(),
|
||||
chunk_rrf_fts_weight: default_chunk_rrf_weight(),
|
||||
chunk_rrf_use_vector: default_chunk_rrf_use(),
|
||||
chunk_rrf_use_fts: default_chunk_rrf_use(),
|
||||
chunk_vector_take: 0,
|
||||
chunk_fts_take: 0,
|
||||
ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256),
|
||||
ingest_chunk_max_tokens: entry.ingest_chunk_max_tokens.unwrap_or(512),
|
||||
ingest_chunk_overlap_tokens: entry.ingest_chunk_overlap_tokens.unwrap_or(50),
|
||||
ingest_chunks_only: entry.ingest_chunks_only.unwrap_or(false),
|
||||
};
|
||||
|
||||
let llm = if entry.llm_cases > 0 {
|
||||
Some(LlmSection {
|
||||
cases: entry.llm_cases,
|
||||
answered: 0,
|
||||
precision: entry.llm_precision,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let performance = PerformanceSection {
|
||||
openai_base_url: entry.openai_base_url,
|
||||
ingestion_ms: entry.ingestion_ms,
|
||||
namespace_seed_ms: entry.namespace_seed_ms,
|
||||
evaluation_stages_ms: EvaluationStageTimings::default(),
|
||||
stage_latency: StageLatencyBreakdown::default(),
|
||||
namespace_reused: false,
|
||||
ingestion_reused: entry.ingestion_reused,
|
||||
embeddings_reused: entry.ingestion_embeddings_reused,
|
||||
ingestion_cache_path: String::new(),
|
||||
corpus_paragraphs: 0,
|
||||
positive_paragraphs_reused: 0,
|
||||
negative_paragraphs_reused: 0,
|
||||
};
|
||||
|
||||
EvaluationReport {
|
||||
overview,
|
||||
dataset,
|
||||
slice,
|
||||
retrieval,
|
||||
llm,
|
||||
performance,
|
||||
misses: Vec::new(),
|
||||
llm_cases: Vec::new(),
|
||||
detailed_report: false,
|
||||
}
|
||||
}
|
||||
|
||||
fn load_history(path: &Path) -> Result<Vec<EvaluationReport>> {
|
||||
if !path.exists() {
|
||||
return Ok(Vec::new());
|
||||
@@ -981,34 +864,12 @@ fn load_history(path: &Path) -> Result<Vec<EvaluationReport>> {
|
||||
let contents =
|
||||
fs::read(path).with_context(|| format!("reading evaluation log {}", path.display()))?;
|
||||
|
||||
if let Ok(entries) = serde_json::from_slice::<Vec<EvaluationReport>>(&contents) {
|
||||
return Ok(entries);
|
||||
}
|
||||
|
||||
match serde_json::from_slice::<Vec<LegacyHistoryEntry>>(&contents) {
|
||||
Ok(entries) => Ok(entries.into_iter().map(convert_legacy_entry).collect()),
|
||||
Err(err) => {
|
||||
let timestamp = Utc::now().format("%Y%m%dT%H%M%S");
|
||||
let backup_path = path
|
||||
.parent()
|
||||
.unwrap_or_else(|| Path::new("."))
|
||||
.join(format!("evaluations.json.corrupted.{timestamp}"));
|
||||
warn!(
|
||||
path = %path.display(),
|
||||
backup = %backup_path.display(),
|
||||
error = %err,
|
||||
"Evaluation history file is corrupted; backing up and starting fresh"
|
||||
);
|
||||
if let Err(e) = fs::rename(path, &backup_path) {
|
||||
warn!(
|
||||
path = %path.display(),
|
||||
error = %e,
|
||||
"Failed to backup corrupted evaluation history"
|
||||
);
|
||||
}
|
||||
Ok(Vec::new())
|
||||
}
|
||||
}
|
||||
serde_json::from_slice(&contents).with_context(|| {
|
||||
format!(
|
||||
"parsing evaluation history at {}; delete the file and re-run if upgrading from an older format",
|
||||
path.display()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result<PathBuf> {
|
||||
@@ -1024,9 +885,9 @@ fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result<PathBu
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::eval::{
|
||||
EvaluationStageTimings, PerformanceTimings, RetrievedSummary, StageLatency,
|
||||
StageLatencyBreakdown,
|
||||
use crate::types::{
|
||||
EvaluationStageTimings, PerformanceTimings, RetrievedContextStats, RetrievedSummary,
|
||||
StageLatency, StageLatencyBreakdown,
|
||||
};
|
||||
use chrono::Utc;
|
||||
use tempfile::tempdir;
|
||||
@@ -1101,6 +962,7 @@ mod tests {
|
||||
has_verified_chunks: !is_impossible,
|
||||
match_rank: if matched { Some(1) } else { None },
|
||||
latency_ms: 42,
|
||||
retrieved_context: RetrievedContextStats::default(),
|
||||
retrieved: vec![RetrievedSummary {
|
||||
rank: 1,
|
||||
entity_id: "entity1".into(),
|
||||
@@ -1199,6 +1061,13 @@ mod tests {
|
||||
chunk_vector_take: 50,
|
||||
chunk_fts_take: 50,
|
||||
max_chunks_per_entity: 4,
|
||||
retrieved_context: crate::context_stats::aggregate_context_stats(&[
|
||||
RetrievedContextStats {
|
||||
chunk_count: 1,
|
||||
char_count: 10,
|
||||
token_count: 3,
|
||||
},
|
||||
]),
|
||||
cases,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,174 @@
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};
|
||||
use tracing::warn;
|
||||
|
||||
use crate::datasets::{ConvertedDataset, BEIR_DATASETS};
|
||||
|
||||
use super::build::{mix_seed, BuildParams};
|
||||
|
||||
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)]
|
||||
pub(super) fn ordered_question_refs_beir(
|
||||
dataset: &ConvertedDataset,
|
||||
params: &BuildParams,
|
||||
target_cases: usize,
|
||||
) -> Result<Vec<(usize, usize)>> {
|
||||
let prefixes: Vec<&str> = BEIR_DATASETS
|
||||
.iter()
|
||||
.map(|kind| kind.source_prefix())
|
||||
.collect();
|
||||
|
||||
let mut grouped: HashMap<&str, Vec<(usize, usize)>> = HashMap::new();
|
||||
for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
|
||||
for (q_idx, question) in paragraph.questions.iter().enumerate() {
|
||||
let include = if params.include_impossible {
|
||||
true
|
||||
} else {
|
||||
!question.is_impossible && !question.answers.is_empty()
|
||||
};
|
||||
if !include {
|
||||
continue;
|
||||
}
|
||||
|
||||
let Some(prefix) = question_prefix(&question.id) else {
|
||||
warn!(
|
||||
question_id = %question.id,
|
||||
"Skipping BEIR question without expected prefix"
|
||||
);
|
||||
continue;
|
||||
};
|
||||
if !prefixes.contains(&prefix) {
|
||||
warn!(
|
||||
question_id = %question.id,
|
||||
prefix = %prefix,
|
||||
"Skipping BEIR question with unknown subset prefix"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
grouped.entry(prefix).or_default().push((p_idx, q_idx));
|
||||
}
|
||||
}
|
||||
|
||||
if grouped.values().all(std::vec::Vec::is_empty) {
|
||||
return Err(anyhow!(
|
||||
"no eligible BEIR questions found; cannot build slice"
|
||||
));
|
||||
}
|
||||
|
||||
for prefix in &prefixes {
|
||||
if let Some(entries) = grouped.get_mut(prefix) {
|
||||
let seed = mix_seed(
|
||||
&format!("{}::{prefix}", dataset.metadata.id),
|
||||
params.base_seed,
|
||||
);
|
||||
let mut rng = StdRng::seed_from_u64(seed);
|
||||
entries.shuffle(&mut rng);
|
||||
}
|
||||
}
|
||||
|
||||
let dataset_count = prefixes.len().max(1);
|
||||
let base_quota = target_cases / dataset_count;
|
||||
let mut remainder = target_cases % dataset_count;
|
||||
|
||||
let mut quotas: HashMap<&str, usize> = HashMap::new();
|
||||
for prefix in &prefixes {
|
||||
let mut quota = base_quota;
|
||||
if remainder > 0 {
|
||||
quota += 1;
|
||||
remainder -= 1;
|
||||
}
|
||||
quotas.insert(*prefix, quota);
|
||||
}
|
||||
|
||||
let mut take_counts: HashMap<&str, usize> = HashMap::new();
|
||||
let mut spare_slots: HashMap<&str, usize> = HashMap::new();
|
||||
let mut shortfall = 0usize;
|
||||
|
||||
for prefix in &prefixes {
|
||||
let available = grouped.get(prefix).map_or(0, std::vec::Vec::len);
|
||||
let quota = *quotas.get(prefix).unwrap_or(&0);
|
||||
let take = quota.min(available);
|
||||
let missing = quota.saturating_sub(take);
|
||||
shortfall += missing;
|
||||
take_counts.insert(*prefix, take);
|
||||
spare_slots.insert(*prefix, available.saturating_sub(take));
|
||||
}
|
||||
|
||||
while shortfall > 0 {
|
||||
let mut allocated = false;
|
||||
for prefix in &prefixes {
|
||||
if shortfall == 0 {
|
||||
break;
|
||||
}
|
||||
let spare = spare_slots.get(prefix).copied().unwrap_or(0);
|
||||
if spare == 0 {
|
||||
continue;
|
||||
}
|
||||
if let Some(count) = take_counts.get_mut(prefix) {
|
||||
*count += 1;
|
||||
}
|
||||
spare_slots.insert(*prefix, spare - 1);
|
||||
shortfall = shortfall.saturating_sub(1);
|
||||
allocated = true;
|
||||
}
|
||||
if !allocated {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let mut queues: Vec<VecDeque<(usize, usize)>> = Vec::new();
|
||||
let mut total_selected = 0usize;
|
||||
for prefix in &prefixes {
|
||||
let take = *take_counts.get(prefix).unwrap_or(&0);
|
||||
let mut deque = VecDeque::new();
|
||||
if let Some(entries) = grouped.get(prefix) {
|
||||
for item in entries.iter().take(take) {
|
||||
deque.push_back(*item);
|
||||
total_selected += 1;
|
||||
}
|
||||
}
|
||||
queues.push(deque);
|
||||
}
|
||||
|
||||
if total_selected < target_cases {
|
||||
warn!(
|
||||
requested = target_cases,
|
||||
available = total_selected,
|
||||
"BEIR mix requested more questions than available after balancing; continuing with capped set"
|
||||
);
|
||||
}
|
||||
|
||||
let mut output = Vec::with_capacity(total_selected);
|
||||
loop {
|
||||
let mut progressed = false;
|
||||
for queue in &mut queues {
|
||||
if let Some(item) = queue.pop_front() {
|
||||
output.push(item);
|
||||
progressed = true;
|
||||
}
|
||||
}
|
||||
if !progressed {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if output.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"no eligible BEIR questions found; cannot build slice"
|
||||
));
|
||||
}
|
||||
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
pub(super) fn question_prefix(question_id: &str) -> Option<&'static str> {
|
||||
for prefix in BEIR_DATASETS.iter().map(|kind| kind.source_prefix()) {
|
||||
if let Some(rest) = question_id.strip_prefix(prefix) {
|
||||
if rest.starts_with('-') {
|
||||
return Some(prefix);
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
use sha2::{Digest, Sha256};
|
||||
|
||||
/// Parameters passed to the slice question-ordering helpers
/// (for example `ordered_question_refs_beir`).
#[derive(Debug)]
|
||||
pub(super) struct BuildParams {
|
||||
/// When `true`, every question is eligible; otherwise the BEIR ordering skips
/// questions flagged `is_impossible` or that have no answers.
pub include_impossible: bool,
|
||||
/// Base seed; the BEIR ordering mixes it with the dataset id and subset prefix
/// through `mix_seed` to seed each subset's shuffle.
pub base_seed: u64,
|
||||
/// NOTE(review): not read by the code visible here; presumably seeds the
/// non-BEIR ordering path. Confirm against the other users of `BuildParams`.
pub rng_seed: u64,
|
||||
}
|
||||
|
||||
#[allow(clippy::indexing_slicing)]
|
||||
pub(super) fn mix_seed(dataset_id: &str, seed: u64) -> u64 {
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(dataset_id.as_bytes());
|
||||
hasher.update(seed.to_le_bytes());
|
||||
let digest = hasher.finalize();
|
||||
let mut bytes = [0u8; 8];
|
||||
bytes.copy_from_slice(&digest[..8]);
|
||||
u64::from_le_bytes(bytes)
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::{
|
||||
collections::{HashMap, HashSet, VecDeque},
|
||||
collections::{HashMap, HashSet},
|
||||
fmt::Write,
|
||||
fs,
|
||||
path::{Path, PathBuf},
|
||||
@@ -12,10 +12,18 @@ use serde::{Deserialize, Serialize};
|
||||
use sha2::{Digest, Sha256};
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::datasets::{
|
||||
ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind, BEIR_DATASETS,
|
||||
use crate::{
|
||||
args::Config,
|
||||
datasets::{
|
||||
ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind,
|
||||
},
|
||||
};
|
||||
|
||||
mod beir;
|
||||
mod build;
|
||||
|
||||
use build::{mix_seed, BuildParams};
|
||||
|
||||
const SLICE_VERSION: u32 = 2;
|
||||
pub const DEFAULT_NEGATIVE_MULTIPLIER: f32 = 4.0;
|
||||
|
||||
@@ -80,8 +88,12 @@ pub enum SliceParagraphKind {
|
||||
Negative,
|
||||
}
|
||||
|
||||
/// Returns the storage key for a paragraph id by passing it through
/// `sanitize_identifier`. `default_shard_path` uses this key as the shard file stem.
pub fn paragraph_storage_key(paragraph_id: &str) -> String {
|
||||
sanitize_identifier(paragraph_id)
|
||||
}
|
||||
|
||||
pub(crate) fn default_shard_path(paragraph_id: &str) -> String {
|
||||
let sanitized = sanitize_identifier(paragraph_id);
|
||||
let sanitized = paragraph_storage_key(paragraph_id);
|
||||
format!("paragraphs/{sanitized}.json")
|
||||
}
|
||||
|
||||
@@ -210,13 +222,6 @@ struct SliceKey<'a> {
|
||||
seed: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct BuildParams {
|
||||
include_impossible: bool,
|
||||
base_seed: u64,
|
||||
rng_seed: u64,
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_lines)]
|
||||
pub fn resolve_slice<'a>(
|
||||
dataset: &'a ConvertedDataset,
|
||||
@@ -225,15 +230,29 @@ pub fn resolve_slice<'a>(
|
||||
let index = DatasetIndex::build(dataset);
|
||||
|
||||
if let Some(slice_arg) = config.explicit_slice {
|
||||
let (path, manifest) = load_explicit_slice(dataset, &index, config, slice_arg)?;
|
||||
let resolved = manifest_to_resolved(dataset, &index, manifest, path)?;
|
||||
let path = explicit_slice_path(dataset, config, slice_arg);
|
||||
if path.exists() {
|
||||
let (path, manifest) = load_explicit_slice(dataset, &index, config, slice_arg)?;
|
||||
let resolved = manifest_to_resolved(dataset, &index, manifest, path)?;
|
||||
info!(
|
||||
slice = %resolved.manifest.slice_id,
|
||||
path = %resolved.path.display(),
|
||||
cases = resolved.manifest.case_count,
|
||||
positives = resolved.manifest.positive_paragraphs,
|
||||
negatives = resolved.manifest.negative_paragraphs,
|
||||
"Using explicitly selected slice"
|
||||
);
|
||||
return Ok(resolved);
|
||||
}
|
||||
let resolved =
|
||||
materialize_slice_ledger(dataset, config, &index, slice_arg, path)?;
|
||||
info!(
|
||||
slice = %resolved.manifest.slice_id,
|
||||
path = %resolved.path.display(),
|
||||
cases = resolved.manifest.case_count,
|
||||
positives = resolved.manifest.positive_paragraphs,
|
||||
negatives = resolved.manifest.negative_paragraphs,
|
||||
"Using explicitly selected slice"
|
||||
"Built catalog slice ledger"
|
||||
);
|
||||
return Ok(resolved);
|
||||
}
|
||||
@@ -256,6 +275,82 @@ pub fn resolve_slice<'a>(
|
||||
.join("slices")
|
||||
.join(dataset.metadata.id.as_str());
|
||||
let path = base.join(format!("{slice_id}.json"));
|
||||
materialize_slice_ledger(dataset, config, &index, &slice_id, path)
|
||||
}
|
||||
|
||||
#[allow(clippy::indexing_slicing, clippy::arithmetic_side_effects)]
|
||||
pub fn select_window<'a>(
|
||||
resolved: &'a ResolvedSlice<'a>,
|
||||
offset: usize,
|
||||
limit: Option<usize>,
|
||||
) -> Result<SliceWindow<'a>> {
|
||||
let total = resolved.manifest.case_count;
|
||||
if total == 0 {
|
||||
return Err(anyhow!(
|
||||
"slice '{}' contains no cases",
|
||||
resolved.manifest.slice_id
|
||||
));
|
||||
}
|
||||
if offset >= total {
|
||||
return Err(anyhow!(
|
||||
"slice offset {offset} exceeds available cases ({total})",
|
||||
));
|
||||
}
|
||||
let available = total - offset;
|
||||
let requested = limit.unwrap_or(available).max(1);
|
||||
let length = requested.min(available);
|
||||
let cases = resolved.cases[offset..offset + length].to_vec();
|
||||
let mut seen = HashSet::new();
|
||||
let mut positive_ids = Vec::new();
|
||||
for case in &cases {
|
||||
if seen.insert(case.paragraph.id.as_str()) {
|
||||
positive_ids.push(case.paragraph.id.clone());
|
||||
}
|
||||
}
|
||||
Ok(SliceWindow {
|
||||
offset,
|
||||
length,
|
||||
total_cases: total,
|
||||
cases,
|
||||
positive_paragraph_ids: positive_ids,
|
||||
})
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn full_window<'a>(resolved: &'a ResolvedSlice<'a>) -> Result<SliceWindow<'a>> {
|
||||
select_window(resolved, 0, None)
|
||||
}
|
||||
|
||||
fn explicit_slice_path(
|
||||
dataset: &ConvertedDataset,
|
||||
config: &SliceConfig<'_>,
|
||||
slice_arg: &str,
|
||||
) -> PathBuf {
|
||||
let explicit_path = Path::new(slice_arg);
|
||||
if explicit_path.exists() {
|
||||
explicit_path.to_path_buf()
|
||||
} else {
|
||||
config
|
||||
.cache_dir
|
||||
.join("slices")
|
||||
.join(dataset.metadata.id.as_str())
|
||||
.join(format!("{slice_arg}.json"))
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_lines)]
|
||||
fn materialize_slice_ledger<'a>(
|
||||
dataset: &'a ConvertedDataset,
|
||||
config: &SliceConfig<'_>,
|
||||
index: &DatasetIndex,
|
||||
slice_id: &str,
|
||||
path: PathBuf,
|
||||
) -> Result<ResolvedSlice<'a>> {
|
||||
let requested_corpus = config
|
||||
.corpus_limit
|
||||
.unwrap_or(dataset.paragraphs.len())
|
||||
.min(dataset.paragraphs.len())
|
||||
.max(1);
|
||||
|
||||
let total_questions = dataset
|
||||
.paragraphs
|
||||
@@ -339,7 +434,7 @@ pub fn resolve_slice<'a>(
|
||||
let mut manifest = manifest.unwrap_or_else(|| {
|
||||
empty_manifest(
|
||||
dataset,
|
||||
slice_id.clone(),
|
||||
slice_id.to_string(),
|
||||
¶ms,
|
||||
requested_corpus,
|
||||
config.negative_multiplier,
|
||||
@@ -396,52 +491,7 @@ pub fn resolve_slice<'a>(
|
||||
);
|
||||
}
|
||||
|
||||
let resolved = manifest_to_resolved(dataset, &index, manifest.clone(), path)?;
|
||||
|
||||
Ok(resolved)
|
||||
}
|
||||
|
||||
#[allow(clippy::indexing_slicing, clippy::arithmetic_side_effects)]
|
||||
pub fn select_window<'a>(
|
||||
resolved: &'a ResolvedSlice<'a>,
|
||||
offset: usize,
|
||||
limit: Option<usize>,
|
||||
) -> Result<SliceWindow<'a>> {
|
||||
let total = resolved.manifest.case_count;
|
||||
if total == 0 {
|
||||
return Err(anyhow!(
|
||||
"slice '{}' contains no cases",
|
||||
resolved.manifest.slice_id
|
||||
));
|
||||
}
|
||||
if offset >= total {
|
||||
return Err(anyhow!(
|
||||
"slice offset {offset} exceeds available cases ({total})",
|
||||
));
|
||||
}
|
||||
let available = total - offset;
|
||||
let requested = limit.unwrap_or(available).max(1);
|
||||
let length = requested.min(available);
|
||||
let cases = resolved.cases[offset..offset + length].to_vec();
|
||||
let mut seen = HashSet::new();
|
||||
let mut positive_ids = Vec::new();
|
||||
for case in &cases {
|
||||
if seen.insert(case.paragraph.id.as_str()) {
|
||||
positive_ids.push(case.paragraph.id.clone());
|
||||
}
|
||||
}
|
||||
Ok(SliceWindow {
|
||||
offset,
|
||||
length,
|
||||
total_cases: total,
|
||||
cases,
|
||||
positive_paragraph_ids: positive_ids,
|
||||
})
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn full_window<'a>(resolved: &'a ResolvedSlice<'a>) -> Result<SliceWindow<'a>> {
|
||||
select_window(resolved, 0, None)
|
||||
manifest_to_resolved(dataset, index, manifest, path)
|
||||
}
|
||||
|
||||
fn load_explicit_slice(
|
||||
@@ -450,16 +500,7 @@ fn load_explicit_slice(
|
||||
config: &SliceConfig<'_>,
|
||||
slice_arg: &str,
|
||||
) -> Result<(PathBuf, SliceManifest)> {
|
||||
let explicit_path = Path::new(slice_arg);
|
||||
let candidate_path = if explicit_path.exists() {
|
||||
explicit_path.to_path_buf()
|
||||
} else {
|
||||
config
|
||||
.cache_dir
|
||||
.join("slices")
|
||||
.join(dataset.metadata.id.as_str())
|
||||
.join(format!("{slice_arg}.json"))
|
||||
};
|
||||
let candidate_path = explicit_slice_path(dataset, config, slice_arg);
|
||||
|
||||
let manifest = read_manifest(&candidate_path)
|
||||
.with_context(|| format!("reading slice manifest at {}", candidate_path.display()))?;
|
||||
@@ -613,7 +654,7 @@ fn ordered_question_refs(
|
||||
target_cases: usize,
|
||||
) -> Result<Vec<(usize, usize)>> {
|
||||
if dataset.metadata.id == DatasetKind::Beir.id() {
|
||||
return ordered_question_refs_beir(dataset, params, target_cases);
|
||||
return beir::ordered_question_refs_beir(dataset, params, target_cases);
|
||||
}
|
||||
|
||||
let mut question_refs = Vec::new();
|
||||
@@ -642,171 +683,6 @@ fn ordered_question_refs(
|
||||
Ok(question_refs)
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)]
|
||||
fn ordered_question_refs_beir(
|
||||
dataset: &ConvertedDataset,
|
||||
params: &BuildParams,
|
||||
target_cases: usize,
|
||||
) -> Result<Vec<(usize, usize)>> {
|
||||
let prefixes: Vec<&str> = BEIR_DATASETS
|
||||
.iter()
|
||||
.map(|kind| kind.source_prefix())
|
||||
.collect();
|
||||
|
||||
let mut grouped: HashMap<&str, Vec<(usize, usize)>> = HashMap::new();
|
||||
for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
|
||||
for (q_idx, question) in paragraph.questions.iter().enumerate() {
|
||||
let include = if params.include_impossible {
|
||||
true
|
||||
} else {
|
||||
!question.is_impossible && !question.answers.is_empty()
|
||||
};
|
||||
if !include {
|
||||
continue;
|
||||
}
|
||||
|
||||
let Some(prefix) = question_prefix(&question.id) else {
|
||||
warn!(
|
||||
question_id = %question.id,
|
||||
"Skipping BEIR question without expected prefix"
|
||||
);
|
||||
continue;
|
||||
};
|
||||
if !prefixes.contains(&prefix) {
|
||||
warn!(
|
||||
question_id = %question.id,
|
||||
prefix = %prefix,
|
||||
"Skipping BEIR question with unknown subset prefix"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
grouped.entry(prefix).or_default().push((p_idx, q_idx));
|
||||
}
|
||||
}
|
||||
|
||||
if grouped.values().all(std::vec::Vec::is_empty) {
|
||||
return Err(anyhow!(
|
||||
"no eligible BEIR questions found; cannot build slice"
|
||||
));
|
||||
}
|
||||
|
||||
for prefix in &prefixes {
|
||||
if let Some(entries) = grouped.get_mut(prefix) {
|
||||
let seed = mix_seed(
|
||||
&format!("{}::{prefix}", dataset.metadata.id),
|
||||
params.base_seed,
|
||||
);
|
||||
let mut rng = StdRng::seed_from_u64(seed);
|
||||
entries.shuffle(&mut rng);
|
||||
}
|
||||
}
|
||||
|
||||
let dataset_count = prefixes.len().max(1);
|
||||
let base_quota = target_cases / dataset_count;
|
||||
let mut remainder = target_cases % dataset_count;
|
||||
|
||||
let mut quotas: HashMap<&str, usize> = HashMap::new();
|
||||
for prefix in &prefixes {
|
||||
let mut quota = base_quota;
|
||||
if remainder > 0 {
|
||||
quota += 1;
|
||||
remainder -= 1;
|
||||
}
|
||||
quotas.insert(*prefix, quota);
|
||||
}
|
||||
|
||||
let mut take_counts: HashMap<&str, usize> = HashMap::new();
|
||||
let mut spare_slots: HashMap<&str, usize> = HashMap::new();
|
||||
let mut shortfall = 0usize;
|
||||
|
||||
for prefix in &prefixes {
|
||||
let available = grouped.get(prefix).map_or(0, std::vec::Vec::len);
|
||||
let quota = *quotas.get(prefix).unwrap_or(&0);
|
||||
let take = quota.min(available);
|
||||
let missing = quota.saturating_sub(take);
|
||||
shortfall += missing;
|
||||
take_counts.insert(*prefix, take);
|
||||
spare_slots.insert(*prefix, available.saturating_sub(take));
|
||||
}
|
||||
|
||||
while shortfall > 0 {
|
||||
let mut allocated = false;
|
||||
for prefix in &prefixes {
|
||||
if shortfall == 0 {
|
||||
break;
|
||||
}
|
||||
let spare = spare_slots.get(prefix).copied().unwrap_or(0);
|
||||
if spare == 0 {
|
||||
continue;
|
||||
}
|
||||
if let Some(count) = take_counts.get_mut(prefix) {
|
||||
*count += 1;
|
||||
}
|
||||
spare_slots.insert(*prefix, spare - 1);
|
||||
shortfall = shortfall.saturating_sub(1);
|
||||
allocated = true;
|
||||
}
|
||||
if !allocated {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let mut queues: Vec<VecDeque<(usize, usize)>> = Vec::new();
|
||||
let mut total_selected = 0usize;
|
||||
for prefix in &prefixes {
|
||||
let take = *take_counts.get(prefix).unwrap_or(&0);
|
||||
let mut deque = VecDeque::new();
|
||||
if let Some(entries) = grouped.get(prefix) {
|
||||
for item in entries.iter().take(take) {
|
||||
deque.push_back(*item);
|
||||
total_selected += 1;
|
||||
}
|
||||
}
|
||||
queues.push(deque);
|
||||
}
|
||||
|
||||
if total_selected < target_cases {
|
||||
warn!(
|
||||
requested = target_cases,
|
||||
available = total_selected,
|
||||
"BEIR mix requested more questions than available after balancing; continuing with capped set"
|
||||
);
|
||||
}
|
||||
|
||||
let mut output = Vec::with_capacity(total_selected);
|
||||
loop {
|
||||
let mut progressed = false;
|
||||
for queue in &mut queues {
|
||||
if let Some(item) = queue.pop_front() {
|
||||
output.push(item);
|
||||
progressed = true;
|
||||
}
|
||||
}
|
||||
if !progressed {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if output.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"no eligible BEIR questions found; cannot build slice"
|
||||
));
|
||||
}
|
||||
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
fn question_prefix(question_id: &str) -> Option<&'static str> {
|
||||
for prefix in BEIR_DATASETS.iter().map(|kind| kind.source_prefix()) {
|
||||
if let Some(rest) = question_id.strip_prefix(prefix) {
|
||||
if rest.starts_with('-') {
|
||||
return Some(prefix);
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[allow(clippy::indexing_slicing)]
|
||||
fn ensure_negative_pool(
|
||||
dataset: &ConvertedDataset,
|
||||
@@ -1028,15 +904,48 @@ fn compute_slice_id(key: &SliceKey<'_>) -> Result<String> {
|
||||
}))
|
||||
}
|
||||
|
||||
#[allow(clippy::indexing_slicing)]
|
||||
fn mix_seed(dataset_id: &str, seed: u64) -> u64 {
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(dataset_id.as_bytes());
|
||||
hasher.update(seed.to_le_bytes());
|
||||
let digest = hasher.finalize();
|
||||
let mut bytes = [0u8; 8];
|
||||
bytes.copy_from_slice(&digest[..8]);
|
||||
u64::from_le_bytes(bytes)
|
||||
pub fn read_manifest_if_exists(path: &Path) -> Result<Option<SliceManifest>> {
|
||||
if !path.exists() {
|
||||
return Ok(None);
|
||||
}
|
||||
read_manifest(path).map(Some)
|
||||
}
|
||||
|
||||
pub fn cached_manifest_path(config: &crate::args::Config) -> Option<PathBuf> {
|
||||
let slice_arg = config.slice.as_deref()?;
|
||||
let explicit_path = Path::new(slice_arg);
|
||||
if explicit_path.exists() {
|
||||
return Some(explicit_path.to_path_buf());
|
||||
}
|
||||
Some(
|
||||
config
|
||||
.cache_dir
|
||||
.join("slices")
|
||||
.join(config.dataset.id())
|
||||
.join(format!("{slice_arg}.json")),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn manifest_is_complete(manifest: &SliceManifest, config: &SliceConfig<'_>) -> bool {
|
||||
let requested_limit = config
|
||||
.limit
|
||||
.unwrap_or(manifest.case_count.max(1))
|
||||
.max(1);
|
||||
if manifest.case_count < requested_limit {
|
||||
return false;
|
||||
}
|
||||
|
||||
let requested_corpus = config
|
||||
.corpus_limit
|
||||
.unwrap_or(manifest.total_paragraphs.max(1))
|
||||
.max(1);
|
||||
let desired_negatives = desired_negative_target(
|
||||
manifest.positive_paragraphs,
|
||||
requested_corpus,
|
||||
manifest.total_paragraphs.max(manifest.positive_paragraphs.max(1)),
|
||||
config.negative_multiplier,
|
||||
);
|
||||
manifest.negative_paragraphs >= desired_negatives
|
||||
}
|
||||
|
||||
fn read_manifest(path: &Path) -> Result<SliceManifest> {
|
||||
@@ -1057,14 +966,38 @@ fn write_manifest(path: &Path, manifest: &SliceManifest) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
use crate::args::Config;
|
||||
|
||||
impl<'a> From<&'a Config> for SliceConfig<'a> {
|
||||
fn from(config: &'a Config) -> Self {
|
||||
slice_config_with_limit(config, None)
|
||||
pub fn ledger_target(config: &Config) -> Option<usize> {
|
||||
match (config.slice_grow, config.limit) {
|
||||
(Some(grow), Some(limit)) => Some(limit.max(grow)),
|
||||
(Some(grow), None) => Some(grow),
|
||||
(None, limit) => limit,
|
||||
}
|
||||
}
|
||||
|
||||
/// Grow the slice ledger to contain the target number of cases.
|
||||
pub fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> {
|
||||
let ledger_limit = ledger_target(config);
|
||||
let slice_settings = slice_config_with_limit(config, ledger_limit);
|
||||
let slice =
|
||||
resolve_slice(dataset, &slice_settings).context("resolving dataset slice")?;
|
||||
info!(
|
||||
slice = slice.manifest.slice_id.as_str(),
|
||||
cases = slice.manifest.case_count,
|
||||
positives = slice.manifest.positive_paragraphs,
|
||||
negatives = slice.manifest.negative_paragraphs,
|
||||
total_paragraphs = slice.manifest.total_paragraphs,
|
||||
"Slice ledger ready"
|
||||
);
|
||||
println!(
|
||||
"Slice `{}` now contains {} questions ({} positives, {} negatives)",
|
||||
slice.manifest.slice_id,
|
||||
slice.manifest.case_count,
|
||||
slice.manifest.positive_paragraphs,
|
||||
slice.manifest.negative_paragraphs
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn slice_config_with_limit(config: &Config, limit_override: Option<usize>) -> SliceConfig<'_> {
|
||||
SliceConfig {
|
||||
cache_dir: config.cache_dir.as_path(),
|
||||
@@ -1088,7 +1021,7 @@ mod tests {
|
||||
use tempfile::tempdir;
|
||||
|
||||
fn sample_dataset() -> ConvertedDataset {
|
||||
let metadata = DatasetMetadata::for_kind(DatasetKind::SquadV2, false, None);
|
||||
let metadata = DatasetMetadata::for_kind(DatasetKind::SquadV2, false);
|
||||
ConvertedDataset {
|
||||
generated_at: Utc::now(),
|
||||
metadata,
|
||||
@@ -1226,7 +1159,7 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, false, None);
|
||||
let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, false);
|
||||
let dataset = ConvertedDataset {
|
||||
generated_at: Utc::now(),
|
||||
metadata,
|
||||
@@ -1240,11 +1173,11 @@ mod tests {
|
||||
rng_seed: 0xBB,
|
||||
};
|
||||
|
||||
let refs = ordered_question_refs_beir(&dataset, ¶ms, 8)?;
|
||||
let refs = beir::ordered_question_refs_beir(&dataset, ¶ms, 8)?;
|
||||
let mut per_prefix: HashMap<String, usize> = HashMap::new();
|
||||
for (p_idx, q_idx) in refs {
|
||||
let question = &dataset.paragraphs[p_idx].questions[q_idx];
|
||||
let prefix = question_prefix(&question.id).unwrap_or("unknown");
|
||||
let prefix = beir::question_prefix(&question.id).unwrap_or("unknown");
|
||||
*per_prefix.entry(prefix.to_string()).or_default() += 1;
|
||||
}
|
||||
|
||||
@@ -1,179 +0,0 @@
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sha2::{Digest, Sha256};
|
||||
use tokio::fs;
|
||||
|
||||
use crate::{args::Config, slice};
|
||||
use common::utils::embedding::EmbeddingProvider;
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct SnapshotMetadata {
|
||||
pub dataset_id: String,
|
||||
pub slice_id: String,
|
||||
pub embedding_backend: String,
|
||||
pub embedding_model: Option<String>,
|
||||
pub embedding_dimension: usize,
|
||||
pub rerank_enabled: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DbSnapshotState {
|
||||
pub dataset_id: String,
|
||||
pub slice_id: String,
|
||||
pub ingestion_fingerprint: String,
|
||||
pub snapshot_hash: String,
|
||||
pub updated_at: DateTime<Utc>,
|
||||
#[serde(default)]
|
||||
pub namespace: Option<String>,
|
||||
#[serde(default)]
|
||||
pub database: Option<String>,
|
||||
#[serde(default)]
|
||||
pub slice_case_count: usize,
|
||||
}
|
||||
|
||||
pub struct Descriptor {
|
||||
#[allow(dead_code)]
|
||||
metadata: SnapshotMetadata,
|
||||
dir: PathBuf,
|
||||
metadata_hash: String,
|
||||
}
|
||||
|
||||
impl Descriptor {
|
||||
pub fn new(
|
||||
config: &Config,
|
||||
slice: &slice::ResolvedSlice<'_>,
|
||||
embedding_provider: &EmbeddingProvider,
|
||||
) -> Self {
|
||||
let metadata = SnapshotMetadata {
|
||||
dataset_id: slice.manifest.dataset_id.clone(),
|
||||
slice_id: slice.manifest.slice_id.clone(),
|
||||
embedding_backend: embedding_provider.backend_label().to_string(),
|
||||
embedding_model: embedding_provider.model_code(),
|
||||
embedding_dimension: embedding_provider.dimension(),
|
||||
rerank_enabled: config.retrieval.rerank,
|
||||
};
|
||||
|
||||
let dir = config
|
||||
.cache_dir
|
||||
.join("snapshots")
|
||||
.join(&metadata.dataset_id)
|
||||
.join(&metadata.slice_id);
|
||||
let metadata_hash = compute_hash(&metadata);
|
||||
|
||||
Self {
|
||||
metadata,
|
||||
dir,
|
||||
metadata_hash,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn metadata_hash(&self) -> &str {
|
||||
&self.metadata_hash
|
||||
}
|
||||
|
||||
pub async fn load_db_state(&self) -> Result<Option<DbSnapshotState>> {
|
||||
let path = self.db_state_path();
|
||||
if !path.exists() {
|
||||
return Ok(None);
|
||||
}
|
||||
let bytes = fs::read(&path)
|
||||
.await
|
||||
.with_context(|| format!("reading namespace state {}", path.display()))?;
|
||||
let state = serde_json::from_slice(&bytes)
|
||||
.with_context(|| format!("deserialising namespace state {}", path.display()))?;
|
||||
Ok(Some(state))
|
||||
}
|
||||
|
||||
pub async fn store_db_state(&self, state: &DbSnapshotState) -> Result<()> {
|
||||
let path = self.db_state_path();
|
||||
if let Some(parent) = path.parent() {
|
||||
fs::create_dir_all(parent).await.with_context(|| {
|
||||
format!("creating namespace state directory {}", parent.display())
|
||||
})?;
|
||||
}
|
||||
let blob =
|
||||
serde_json::to_vec_pretty(state).context("serialising namespace state payload")?;
|
||||
fs::write(&path, blob)
|
||||
.await
|
||||
.with_context(|| format!("writing namespace state {}", path.display()))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn db_dir(&self) -> PathBuf {
|
||||
self.dir.join("db")
|
||||
}
|
||||
|
||||
fn db_state_path(&self) -> PathBuf {
|
||||
self.db_dir().join("state.json")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn from_parts(metadata: SnapshotMetadata, dir: PathBuf) -> Self {
|
||||
let metadata_hash = compute_hash(&metadata);
|
||||
Self {
|
||||
metadata,
|
||||
dir,
|
||||
metadata_hash,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::expect_used)]
|
||||
fn compute_hash(metadata: &SnapshotMetadata) -> String {
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(
|
||||
serde_json::to_vec(metadata).expect("snapshot metadata serialisation should succeed"),
|
||||
);
|
||||
format!("{:x}", hasher.finalize())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// State written through a descriptor must read back with the same values.
    #[tokio::test]
    #[allow(clippy::unwrap_used, clippy::expect_used)]
    async fn state_round_trip() {
        let scratch = tempfile::tempdir().unwrap();
        let snapshot_dir = scratch
            .path()
            .join("snapshots")
            .join("dataset")
            .join("slice");
        let descriptor = Descriptor::from_parts(
            SnapshotMetadata {
                dataset_id: "dataset".into(),
                slice_id: "slice".into(),
                embedding_backend: "hashed".into(),
                embedding_model: None,
                embedding_dimension: 128,
                rerank_enabled: true,
            },
            snapshot_dir,
        );

        let written = DbSnapshotState {
            dataset_id: "dataset".into(),
            slice_id: "slice".into(),
            ingestion_fingerprint: "fingerprint".into(),
            snapshot_hash: descriptor.metadata_hash().to_string(),
            updated_at: Utc::now(),
            namespace: Some("ns".into()),
            database: Some("db".into()),
            slice_case_count: 42,
        };
        descriptor.store_db_state(&written).await.unwrap();

        let restored = descriptor.load_db_state().await.unwrap().unwrap();
        assert_eq!(restored.dataset_id, written.dataset_id);
        assert_eq!(restored.slice_id, written.slice_id);
        assert_eq!(restored.ingestion_fingerprint, written.ingestion_fingerprint);
        assert_eq!(restored.snapshot_hash, written.snapshot_hash);
        assert_eq!(restored.namespace, written.namespace);
        assert_eq!(restored.database, written.database);
        assert_eq!(restored.slice_case_count, written.slice_case_count);
    }
}
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use chrono::{DateTime, SecondsFormat, Utc};
|
||||
use common::storage::types::StoredObject;
|
||||
use retrieval_pipeline::{
|
||||
Diagnostics, RetrievalOutput, RetrievedChunk, RetrievedEntity, StageKind, StageTimings,
|
||||
@@ -8,6 +8,8 @@ use retrieval_pipeline::{
|
||||
use serde::{Deserialize, Serialize};
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
|
||||
pub use crate::context_stats::{RetrievalContextStats, RetrievedContextStats};
|
||||
|
||||
#[allow(clippy::struct_excessive_bools)]
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct EvaluationSummary {
|
||||
@@ -83,6 +85,7 @@ pub struct EvaluationSummary {
|
||||
pub chunk_vector_take: usize,
|
||||
pub chunk_fts_take: usize,
|
||||
pub max_chunks_per_entity: usize,
|
||||
pub retrieved_context: RetrievalContextStats,
|
||||
pub cases: Vec<CaseSummary>,
|
||||
}
|
||||
|
||||
@@ -108,6 +111,7 @@ pub struct CaseSummary {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub ndcg: Option<f64>,
|
||||
pub latency_ms: u128,
|
||||
pub retrieved_context: RetrievedContextStats,
|
||||
pub retrieved: Vec<RetrievedSummary>,
|
||||
}
|
||||
|
||||
@@ -483,3 +487,7 @@ pub fn build_case_diagnostics(
|
||||
pipeline: pipeline_stats,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn format_timestamp(timestamp: &DateTime<Utc>) -> String {
|
||||
timestamp.to_rfc3339_opts(SecondsFormat::Secs, true)
|
||||
}
|
||||
|
||||
@@ -44,7 +44,6 @@
|
||||
--leading-snug: 1.375;
|
||||
--leading-relaxed: 1.625;
|
||||
--ease-out: cubic-bezier(0, 0, 0.2, 1);
|
||||
--ease-in-out: cubic-bezier(0.4, 0, 0.2, 1);
|
||||
--animate-pulse: pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite;
|
||||
--default-transition-duration: 150ms;
|
||||
--default-transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);
|
||||
|
||||
Reference in New Issue
Block a user