evals: eval crate overhaul, simplification and performance improvements

This commit is contained in:
Per Stark
2026-06-17 19:23:11 +02:00
parent adc04d8c6d
commit fb51a8b55f
53 changed files with 2852 additions and 1831 deletions
+1 -1
View File
@@ -1,2 +1,2 @@
[alias]
eval = "run -p evaluations --"
eval = "run -p evaluations --release --"
+1
View File
@@ -1,5 +1,6 @@
# Changelog
## Unreleased
- Evaluations: simplified crate layout — linear pipeline, sharded-only converted store, in-memory ingestion, `db/` and `cli/` modules; namespace reuse state in corpus manifest (removed `cache/snapshots/`); no legacy JSON/history compatibility (re-run `--warm` after upgrade)
- Performance: ingestion skips per-task index rebuild; worker runs scheduled `REBUILD INDEX` (default every 24h via `index_rebuild_interval_secs`, `0` disables)
- Performance: ingestion persists all artifacts in a single SurrealDB transaction per task (atomic replace by task id)
- Performance: entity embeddings during ingestion use batched `embed_batch`, matching chunk embedding
Generated
-94
View File
@@ -165,12 +165,6 @@ dependencies = [
"libc",
]
[[package]]
name = "anes"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
[[package]]
name = "anstream"
version = "0.6.21"
@@ -1071,12 +1065,6 @@ dependencies = [
"serde",
]
[[package]]
name = "cast"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "castaway"
version = "0.2.4"
@@ -1582,42 +1570,6 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "criterion"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
dependencies = [
"anes",
"cast",
"ciborium",
"clap",
"criterion-plot",
"is-terminal",
"itertools 0.10.5",
"num-traits",
"once_cell",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_derive",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
dependencies = [
"cast",
"itertools 0.10.5",
]
[[package]]
name = "critical-section"
version = "1.2.0"
@@ -2238,7 +2190,6 @@ dependencies = [
"chrono",
"clap",
"common",
"criterion",
"fastembed",
"futures",
"ingestion-pipeline",
@@ -2250,7 +2201,6 @@ dependencies = [
"serde_json",
"serde_yaml",
"sha2",
"state-machines",
"surrealdb",
"tempfile",
"text-splitter",
@@ -4438,12 +4388,6 @@ dependencies = [
"pkg-config",
]
[[package]]
name = "oorandom"
version = "11.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
[[package]]
name = "opaque-debug"
version = "0.3.1"
@@ -4836,34 +4780,6 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "plotters"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
dependencies = [
"num-traits",
"plotters-backend",
"plotters-svg",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "plotters-backend"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
[[package]]
name = "plotters-svg"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
dependencies = [
"plotters-backend",
]
[[package]]
name = "polling"
version = "3.11.0"
@@ -6940,16 +6856,6 @@ dependencies = [
"zerovec",
]
[[package]]
name = "tinytemplate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "tinyvec"
version = "1.10.0"
+1 -1
View File
@@ -9,7 +9,7 @@ members = [
"json-stream-parser",
"evaluations"
]
resolver = "2"
resolver = "3"
[workspace.dependencies]
anyhow = "1.0.94"
+2
View File
@@ -13,6 +13,8 @@ let
else
throw "pkgs.onnxruntime.version (${pkgs.onnxruntime.version}) must match ort-version (${ortVersion})";
in {
devenv.warnOnNewVersion = false;
cachix.enable = false;
packages = [
-2
View File
@@ -30,8 +30,6 @@ serde_json = { workspace = true }
async-trait = { workspace = true }
once_cell = "1.19"
serde_yaml = "0.9"
criterion = "0.5"
state-machines = { workspace = true }
clap = { version = "4.4", features = ["derive", "env"] }
[dev-dependencies]
+71 -181
View File
@@ -1,212 +1,102 @@
# Evaluations
The `evaluations` crate provides a retrieval evaluation framework for benchmarking Minne's information retrieval pipeline against standard datasets.
The `evaluations` crate benchmarks Minne's retrieval pipeline against standard datasets.
## Quick Start
```bash
# Run SQuAD v2.0 evaluation (vector-only, recommended)
cargo run --package evaluations -- --ingest-chunks-only
# One-time prep (convert, slice ledger, corpus cache, DB seed)
cargo eval --warm --dataset beir --slice beir-mix-600
# Run a specific dataset
cargo run --package evaluations -- --dataset fiqa --ingest-chunks-only
# Check readiness
cargo eval --status --dataset beir --slice beir-mix-600
# Convert dataset only (no evaluation)
cargo run --package evaluations -- --convert-only
# Run benchmark (steady state after warm)
cargo eval --dataset beir --slice beir-mix-600 --require-ready
```
Default dataset is `beir`. When `--slice` is omitted, the first catalog slice for the dataset is applied automatically (e.g. `beir-mix-600`).
Chunk-only ingestion is the default. Pass `--include-entities` to opt into entity extraction during ingestion (requires `OPENAI_API_KEY`).
### Custom slice sizes
`--slice` is a ledger id, not only a catalog name. You can use any id; `--limit` controls how many questions the ledger contains:
```bash
# 200-case BEIR mix (default --limit is 200)
cargo eval --warm --dataset beir --slice beir-mix-200
cargo eval --dataset beir --slice beir-mix-200 --require-ready
```
The catalog slice `beir-mix-600` in `manifest.yaml` is a preset with `limit: 600` and `negative_multiplier: 9.0`.
### BEIR mix layout
`beir` is a **virtual mix** across eight subset datasets (FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID, SciFact, NQ-BEIR). There is no monolithic `beir-minne/` store.
1. Build an in-memory qrels-world mix from raw subset data
2. Resolve the slice ledger (`cache/slices/beir/<slice-id>.json`)
3. Materialize only ledger paragraph ids into per-subset stores (`fever-minne/`, `fiqa-minne/`, …)
4. Ingest the slice corpus and seed SurrealDB
Conversion is **qrels-closed**: only documents that appear in qrels are exported, not the full BEIR corpus.
Chunk-only mode may evaluate fewer cases than the slice ledger size when some questions are impossible or lack verifiable answer chunks.
Reports include a **Retrieved Context Volume** section: total characters and estimated tokens across all chunks returned per query (`~chars/4`, comparable across `--chunk-result-cap` sweeps). Use this to compare the cost of raising `--chunk-result-cap`.
## Prerequisites
### 1. SurrealDB
Start a SurrealDB instance before running evaluations:
### SurrealDB
```bash
docker-compose up -d surrealdb
```
Or using the default endpoint configuration:
### Raw datasets
```bash
surreal start --user root_user --pass root_password
```
Place raw datasets under `evaluations/data/raw/`. See [manifest.yaml](./manifest.yaml) for paths.
### 2. Download Raw Datasets
BEIR subsets live in sibling directories (`data/raw/fever`, `data/raw/fiqa`, …). The `data/raw/beir` entry is a virtual catalog placeholder; warm uses the subset paths.
Raw datasets must be downloaded manually and placed in `evaluations/data/raw/`. See [Dataset Sources](#dataset-sources) below for links and formats.
## Directory Structure
## Directory structure
```
evaluations/
├── data/
│ ├── raw/ # Downloaded raw datasets (manual)
│ │ ├── squad/ # SQuAD v2.0
│ │ ├── nq-dev/ # Natural Questions
│ │ ── fiqa/ # BEIR: FiQA-2018
│ ├── fever/ # BEIR: FEVER
├── hotpotqa/ # BEIR: HotpotQA
── ... # Other BEIR subsets
└── converted/ # Auto-generated (Minne JSON format)
├── cache/ # Ingestion and embedding caches
├── reports/ # Evaluation output (JSON + Markdown)
├── manifest.yaml # Dataset and slice definitions
── src/ # Evaluation source code
│   ├── raw/                 # Downloaded datasets (manual)
│   │   ├── fever/           # BEIR subset raw dirs (corpus.jsonl, queries.jsonl, qrels/)
│   │   ├── fiqa/
│   │   └── …
│   └── converted/           # Sharded stores (auto-generated)
│       ├── fever-minne/     # per-BEIR-subset stores
│       ├── fiqa-minne/
│       └── …                # BEIR mix loads from subset stores (no monolithic beir-minne/)
├── cache/
│   ├── slices/              # Slice ledgers
│   └── ingested/            # Corpus ingestion caches (manifest includes namespace seed)
├── reports/                 # JSON + Markdown output from benchmark runs
├── manifest.yaml
└── src/
```
## Dataset Sources
**After upgrading:** delete old monolithic `*-minne.json` files, any legacy `beir-minne/` merged store, `cache/snapshots/` directories, and stale `reports/history/` artifacts, then re-run `--warm`.
### SQuAD v2.0
Download and place at `data/raw/squad/dev-v2.0.json`:
```bash
mkdir -p evaluations/data/raw/squad
curl -L https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json \
-o evaluations/data/raw/squad/dev-v2.0.json
```
### Natural Questions (NQ)
Download and place at `data/raw/nq-dev/dev-all.jsonl`:
```bash
mkdir -p evaluations/data/raw/nq-dev
# Download from Google's Natural Questions page or HuggingFace
# File: dev-all.jsonl (simplified JSONL format)
```
Source: [Google Natural Questions](https://ai.google.com/research/NaturalQuestions)
### BEIR Datasets
All BEIR datasets follow the same format structure:
```
data/raw/<dataset>/
├── corpus.jsonl # Document corpus
├── queries.jsonl # Query set
└── qrels/
└── test.tsv # Relevance judgments (or dev.tsv)
```
Download datasets from the [BEIR Benchmark repository](https://github.com/beir-cellar/beir). Each dataset zip extracts to the required directory structure.
| Dataset | Directory |
|------------|---------------|
| FEVER | `fever/` |
| FiQA-2018 | `fiqa/` |
| HotpotQA | `hotpotqa/` |
| NFCorpus | `nfcorpus/` |
| Quora | `quora/` |
| TREC-COVID | `trec-covid/` |
| SciFact | `scifact/` |
| NQ (BEIR) | `nq/` |
Example download:
```bash
cd evaluations/data/raw
curl -L https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip -o fiqa.zip
unzip fiqa.zip && rm fiqa.zip
```
## Dataset Conversion
Raw datasets are automatically converted to Minne's internal JSON format on first run. To force reconversion:
```bash
cargo run --package evaluations -- --force-convert
```
Converted files are saved to `data/converted/` and cached for subsequent runs.
## CLI Reference
### Common Options
## Common flags
| Flag | Description | Default |
|------|-------------|---------|
| `--dataset <NAME>` | Dataset to evaluate | `squad-v2` |
| `--limit <N>` | Max questions to evaluate (0 = all) | `200` |
| `--k <N>` | Precision@k cutoff | `5` |
| `--slice <ID>` | Use a predefined slice from manifest | — |
| `--rerank` | Enable FastEmbed reranking stage | disabled |
| `--embedding-backend <BE>` | `fastembed` or `hashed` | `fastembed` |
| `--ingest-chunks-only` | Skip entity extraction, ingest only text chunks | disabled |
| `--dataset` | Dataset to evaluate | `beir` |
| `--slice` | Slice ledger id (catalog or custom) | first catalog slice |
| `--limit` | Max questions in the slice ledger | `200` |
| `--warm` | Prepare without running queries | — |
| `--status` | Print readiness | — |
| `--require-ready` | Fail if not warmed | — |
| `--include-entities` | Entity extraction during ingestion | off |
| `--force-convert` | Rebuild converted store | — |
| `--chunk-result-cap` | Max chunks returned per query (raise with `--k`) | `5` |
| `--perf-log-console` | Print per-stage timings after a run | off |
| `--label` | Label stored in JSON/Markdown reports | — |
> [!TIP]
> Use `--ingest-chunks-only` when evaluating vector-only retrieval strategies. This skips the LLM-based entity extraction and graph generation, significantly speeding up ingestion while focusing on pure chunk-based vector search.
### Available Datasets
```
squad-v2, natural-questions, beir, fever, fiqa, hotpotqa,
nfcorpus, quora, trec-covid, scifact, nq-beir
```
### Database Configuration
| Flag | Environment | Default |
|------|-------------|---------|
| `--db-endpoint` | `EVAL_DB_ENDPOINT` | `ws://127.0.0.1:8000` |
| `--db-username` | `EVAL_DB_USERNAME` | `root_user` |
| `--db-password` | `EVAL_DB_PASSWORD` | `root_password` |
| `--db-namespace` | `EVAL_DB_NAMESPACE` | auto-generated |
| `--db-database` | `EVAL_DB_DATABASE` | auto-generated |
### Example Runs
```bash
# Vector-only evaluation (recommended for benchmarking)
cargo run --package evaluations -- \
--dataset fiqa \
--ingest-chunks-only \
--limit 200
# Full FiQA evaluation with reranking
cargo run --package evaluations -- \
--dataset fiqa \
--ingest-chunks-only \
--limit 500 \
--rerank \
--k 10
# Use a predefined slice for reproducibility
cargo run --package evaluations -- --slice fiqa-test-200 --ingest-chunks-only
# Run the mixed BEIR benchmark
cargo run --package evaluations -- --dataset beir --slice beir-mix-600 --ingest-chunks-only
```
## Slices
Slices are predefined, reproducible subsets defined in `manifest.yaml`. Each slice specifies:
- **limit**: Number of questions
- **corpus_limit**: Maximum corpus size
- **seed**: Fixed RNG seed for reproducibility
View available slices in [manifest.yaml](./manifest.yaml).
## Reports
Evaluations generate reports in `reports/`:
- **JSON**: Full structured results (`*-report.json`)
- **Markdown**: Human-readable summary with sample mismatches (`*-report.md`)
- **History**: Timestamped run history (`history/`)
## Performance Tuning
```bash
# Log per-stage performance timings
cargo run --package evaluations -- --perf-log-console
# Save telemetry to file
cargo run --package evaluations -- --perf-log-json ./perf.json
```
## License
See [../LICENSE](../LICENSE).
See [REFACTOR.md](./REFACTOR.md) for architecture notes.
+98
View File
@@ -0,0 +1,98 @@
# Evaluations crate refactor plan
This document records the architecture review and the simplification work applied to the
`evaluations` crate. **No backwards compatibility** is maintained for converted JSON layouts,
legacy report history, or old cache artifact formats.
## Goals
- Smaller, linear pipeline (no state machine ceremony)
- Sharded converted store for **all** datasets (memory-efficient partial loading)
- Slice-first loading when a catalog slice is selected
- In-memory SurrealDB for ingestion (no ephemeral server namespaces)
- Single DB lifecycle module (`db/`)
- CLI helpers under `cli/`
## Primary workflow
```bash
# One-time prep (converts raw data if needed, builds slice ledger, corpus cache, DB seed)
cargo eval --warm --dataset beir --slice beir-mix-600
# Check readiness
cargo eval --status --dataset beir --slice beir-mix-600
# Steady-state benchmark
cargo eval --dataset beir --slice beir-mix-600 --require-ready
```
Default dataset is `beir`. Chunk-only ingestion is the default; pass `--include-entities` to
opt into entity extraction (requires `OPENAI_API_KEY`). Slice tuning such as
`negative_multiplier` lives in `manifest.yaml` (e.g. `beir-mix-600` uses `9.0`).
## Cache layers (after refactor)
| Layer | Location | Purpose |
|-------|----------|---------|
| Converted store | `data/converted/<name>/` | Sharded paragraphs + question catalog |
| Slice ledger | `cache/slices/<dataset>/<slice-id>.json` | Deterministic questions + paragraph set |
| Corpus cache | `cache/ingested/<dataset>/<slice-id>/` | Ingestion paragraph shards, manifest, and namespace reuse seed |
Namespace reuse state lives in the corpus manifest (`metadata.namespace_seed`), not a separate
`snapshots/` tree. After upgrading, delete old `*-minne.json` monolithic files, any
`cache/snapshots/` directories, and re-run `--warm`.
## Phases applied
### Phase 0 — dead code
- Removed unused `criterion` dependency
- Removed unused `EmbeddingCache`
- Updated README for current CLI
### Phase 1 — structure
- Flattened pipeline to linear `async fn` stages
- Removed `eval.rs` hub; imports go to owning modules
- Merged `namespace.rs`, `db_helpers.rs` → `db/`; dropped standalone `snapshot.rs`
- Moved `status.rs` → `cli/status.rs`
- Fixed catalog slice bootstrap (build ledger when explicit slice manifest is missing)
### Phase 2 — no legacy paths
- All datasets use sharded converted store only
- Removed legacy JSON layout and migration
- Removed legacy report history format
- Auto-apply first catalog slice when `--slice` omitted
- Namespace seed folded into corpus manifest (removed `cache/snapshots/`)
### Phase 3 — performance
- Ingestion always uses in-memory SurrealDB
- Slice-first partial load when ledger is complete
- Default catalog slice for dataset when `--slice` not passed
- Split `slice/` into `mod.rs`, `build.rs`, and `beir.rs`
### Phase 4 — BEIR mix slice-first
- `beir` is a virtual mix: slice ledger references prefixed ids (`fever-…`, `fiqa-…`, …)
- Conversion is **qrels-closed** per subset (only documents appearing in qrels, not full corpus)
- Slice ledger is resolved for the requested `--slice` (catalog preset or custom id + `--limit`)
- Only ledger paragraph ids are materialized into per-subset stores (`fever-minne/`, `fiqa-minne/`, …)
- No monolithic `beir-minne/` merged store
- Raw BEIR data lives in per-subset dirs under `data/raw/`; `data/raw/beir` is a catalog placeholder
## Do not re-introduce
- Monolithic `*-minne.json` converted files
- Monolithic `beir-minne/` merged converted store (use per-subset stores + virtual mix loader)
- `state-machines` pipeline for this linear flow
- `eval.rs` re-export hub
- Legacy history migration in reports
- Ephemeral `ingest_eval_*` namespaces on the shared SurrealDB server
- Separate `cache/snapshots/` namespace state files
## Open follow-ups
- Generate `DatasetKind` from `manifest.yaml` at build time
- Split `report.rs` when touching reporting again
+2 -1
View File
@@ -1,4 +1,4 @@
default_dataset: squad-v2
default_dataset: beir
datasets:
- id: squad-v2
label: "SQuAD v2.0"
@@ -45,6 +45,7 @@ datasets:
description: "Balanced slice across FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID, SciFact, NQ-BEIR"
limit: 600
corpus_limit: 6000
negative_multiplier: 9.0
seed: 0x5eed2025
- id: fever
label: "FEVER (BEIR)"
+66 -18
View File
@@ -137,9 +137,9 @@ pub struct IngestConfig {
#[arg(long, default_value_t = 50)]
pub ingest_chunk_overlap_tokens: usize,
/// Run ingestion in chunk-only mode (skip analyzer/graph generation)
/// Include entity extraction and graph generation during ingestion (uses LLM tokens)
#[arg(long)]
pub ingest_chunks_only: bool,
pub include_entities: bool,
/// Number of paragraphs to ingest concurrently
#[arg(long, default_value_t = 10)]
@@ -159,6 +159,7 @@ pub struct IngestConfig {
}
#[derive(Debug, Clone, Args)]
#[allow(clippy::struct_field_names)]
pub struct DatabaseArgs {
/// `SurrealDB` server endpoint
#[arg(long, default_value = "ws://127.0.0.1:8000", env = "EVAL_DB_ENDPOINT")]
@@ -179,10 +180,6 @@ pub struct DatabaseArgs {
/// Override the database used on the `SurrealDB` server
#[arg(long, env = "EVAL_DB_DATABASE")]
pub db_database: Option<String>,
/// Path to inspect DB state
#[arg(long)]
pub inspect_db_state: Option<PathBuf>,
}
#[derive(Parser, Debug, Clone)]
@@ -233,10 +230,6 @@ pub struct Config {
#[arg(long, default_value_t = 5)]
pub sample: usize,
/// Disable context cropping when converting datasets (ingest entire documents)
#[arg(long)]
pub full_context: bool,
#[command(flatten)]
pub retrieval: RetrievalSettings,
@@ -322,6 +315,18 @@ pub struct Config {
#[command(flatten)]
pub database: DatabaseArgs,
/// Require warmed corpus/namespace before running queries
#[arg(long)]
pub require_ready: bool,
/// Prepare converted data, slice, corpus, and namespace without running queries
#[arg(long, conflicts_with = "status")]
pub warm: bool,
/// Print readiness of converted data, slice, corpus, and namespace
#[arg(long, conflicts_with = "warm")]
pub status: bool,
// Computed fields (not arguments)
#[arg(skip)]
pub raw_dataset_path: PathBuf,
@@ -334,11 +339,6 @@ pub struct Config {
}
impl Config {
#[allow(clippy::unused_self)]
pub fn context_token_limit(&self) -> Option<usize> {
None
}
#[allow(clippy::too_many_lines)]
pub fn finalize(&mut self) -> Result<()> {
// Handle dataset paths
@@ -367,9 +367,7 @@ impl Config {
// Handle retrieval settings
self.retrieval.require_verified_chunks = !self.llm_mode;
if self.dataset == DatasetKind::Beir {
self.negative_multiplier = 9.0;
}
self.apply_catalog_slice_defaults()?;
// Validations
if self.ingest.ingest_chunk_min_tokens == 0
@@ -477,6 +475,56 @@ impl Config {
Ok(())
}
/// Applies defaults from the catalog slice preset to this configuration.
///
/// When no slice was requested, the first slice listed for the dataset in the
/// catalog is selected. If the resulting slice id is not known to the catalog,
/// or the catalog slice belongs to another dataset, the configuration is left
/// untouched.
///
/// # Errors
/// Propagates failures from loading the catalog or resolving the dataset entry.
fn apply_catalog_slice_defaults(&mut self) -> Result<()> {
    let catalog = crate::datasets::catalog()?;
    let entry = catalog.dataset(self.dataset.id())?;

    // Fall back to the dataset's first catalog slice when none was requested.
    if self.slice.is_none() {
        self.slice = entry.slices.first().map(|preset| preset.id.clone());
    }

    let Some(slice_id) = self.slice.as_deref() else {
        return Ok(());
    };
    let preset = match catalog.slice(slice_id) {
        Ok((_, preset)) if preset.dataset_id == self.dataset.id() => preset,
        _ => return Ok(()),
    };

    // NOTE(review): 200 looks like the CLI default for the limit, so only an
    // untouched limit is replaced by the preset — confirm against the arg definition.
    match preset.limit {
        Some(limit) if self.limit_arg == 200 => {
            self.limit_arg = limit;
            self.limit = Some(limit);
        }
        _ => {}
    }

    if self.corpus_limit.is_none() {
        self.corpus_limit = preset.corpus_limit;
    }

    if let Some(seed) = preset.seed {
        self.slice_seed = seed;
    }

    if let Some(include_unanswerable) = preset.include_unanswerable {
        self.llm_mode = include_unanswerable;
        self.retrieval.require_verified_chunks = !include_unanswerable;
    }

    // The preset multiplier only replaces a multiplier still at its default value.
    match preset.negative_multiplier {
        Some(multiplier) if negative_multiplier_is_default(self.negative_multiplier) => {
            self.negative_multiplier = multiplier;
        }
        _ => {}
    }

    Ok(())
}
}
/// Returns `true` when `value` lies within `f32::EPSILON` of
/// `crate::slice::DEFAULT_NEGATIVE_MULTIPLIER`.
fn negative_multiplier_is_default(value: f32) -> bool {
    let distance = (value - crate::slice::DEFAULT_NEGATIVE_MULTIPLIER).abs();
    distance < f32::EPSILON
}
pub struct ParsedArgs {
-88
View File
@@ -1,88 +0,0 @@
use std::{
collections::HashMap,
path::Path,
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
};
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use tokio::sync::Mutex;
/// Serialized payload of the embedding cache: one id-to-vector map per item kind.
#[derive(Debug, Default, Serialize, Deserialize)]
struct EmbeddingCacheData {
// Embedding vectors stored through `insert_entity`, keyed by id.
entities: HashMap<String, Vec<f32>>,
// Embedding vectors stored through `insert_chunk`, keyed by id.
chunks: HashMap<String, Vec<f32>>,
}
/// Cloneable handle to a JSON-backed embedding cache; clones share the same
/// data and dirty flag through `Arc`.
#[derive(Clone)]
pub struct EmbeddingCache {
// File the cache is loaded from and persisted to.
path: Arc<Path>,
// Cached embeddings, guarded by an async mutex.
data: Arc<Mutex<EmbeddingCacheData>>,
// Set by inserts and cleared by a successful `persist`.
dirty: Arc<AtomicBool>,
}
#[allow(dead_code)]
impl EmbeddingCache {
    /// Opens the cache stored at `path`, starting empty when the file does not exist.
    ///
    /// # Errors
    /// Fails when an existing file cannot be read or is not valid cache JSON.
    pub async fn load(path: impl AsRef<Path>) -> Result<Self> {
        let location = path.as_ref().to_path_buf();

        let contents = if location.exists() {
            let bytes = tokio::fs::read(&location)
                .await
                .with_context(|| format!("reading embedding cache {}", location.display()))?;
            serde_json::from_slice(&bytes)
                .with_context(|| format!("parsing embedding cache {}", location.display()))?
        } else {
            EmbeddingCacheData::default()
        };

        let cache = Self {
            path: Arc::from(location.as_path()),
            data: Arc::new(Mutex::new(contents)),
            dirty: Arc::new(AtomicBool::new(false)),
        };
        Ok(cache)
    }

    /// Returns a copy of the entity embedding stored under `id`, if any.
    pub async fn get_entity(&self, id: &str) -> Option<Vec<f32>> {
        let cache = self.data.lock().await;
        cache.entities.get(id).cloned()
    }

    /// Stores (or replaces) the entity embedding for `id` and marks the cache dirty.
    pub async fn insert_entity(&self, id: String, embedding: Vec<f32>) {
        let mut cache = self.data.lock().await;
        cache.entities.insert(id, embedding);
        // The flag is raised while the lock is still held.
        self.dirty.store(true, Ordering::Relaxed);
    }

    /// Returns a copy of the chunk embedding stored under `id`, if any.
    pub async fn get_chunk(&self, id: &str) -> Option<Vec<f32>> {
        let cache = self.data.lock().await;
        cache.chunks.get(id).cloned()
    }

    /// Stores (or replaces) the chunk embedding for `id` and marks the cache dirty.
    pub async fn insert_chunk(&self, id: String, embedding: Vec<f32>) {
        let mut cache = self.data.lock().await;
        cache.chunks.insert(id, embedding);
        // The flag is raised while the lock is still held.
        self.dirty.store(true, Ordering::Relaxed);
    }

    /// Writes the cache to disk as pretty-printed JSON when it has unsaved changes.
    ///
    /// # Errors
    /// Fails when serialisation, directory creation, or the file write fails.
    pub async fn persist(&self) -> Result<()> {
        if self.dirty.load(Ordering::Relaxed) {
            // The lock is held until the dirty flag has been cleared.
            let cache = self.data.lock().await;
            let encoded =
                serde_json::to_vec_pretty(&*cache).context("serialising embedding cache")?;

            if let Some(parent) = self.path.parent() {
                tokio::fs::create_dir_all(parent)
                    .await
                    .with_context(|| format!("creating cache directory {}", parent.display()))?;
            }

            tokio::fs::write(&*self.path, encoded)
                .await
                .with_context(|| format!("writing embedding cache {}", self.path.display()))?;
            self.dirty.store(false, Ordering::Relaxed);
        }
        Ok(())
    }
}
+1
View File
@@ -156,6 +156,7 @@ mod tests {
chunk_min_tokens: 1,
chunk_max_tokens: 10,
chunk_only: false,
namespace_seed: None,
},
paragraphs,
questions,
+3
View File
@@ -0,0 +1,3 @@
pub mod status;
pub use status::{collect_status, ensure_query_ready, print_status, warm};
+316
View File
@@ -0,0 +1,316 @@
#![allow(clippy::module_name_repetitions)]
use std::path::Path;
use anyhow::{Context, Result};
use serde::Serialize;
use crate::{
args::Config,
corpus::{self, CorpusCacheConfig},
datasets::{
beir_subset_store_summary, beir_subset_stores_ready, content_checksum_for_layout,
detect_layout, mix_content_checksum, store_dir_for, ConvertedLayout, DatasetKind,
},
db::{connect_eval_db, default_database, default_namespace, namespace_has_corpus},
slice::{self, ledger_target},
};
/// Readiness report for one dataset/slice combination, as assembled by `collect_status`.
#[derive(Debug, Clone, Serialize)]
pub struct EvalStatus {
/// Dataset identifier taken from the configuration.
pub dataset: String,
/// Slice id from the configuration, if one is set.
pub slice: Option<String>,
/// State of the converted dataset store(s).
pub converted: ConvertedStatus,
/// State of the slice ledger manifest.
pub slice_ledger: SliceLedgerStatus,
/// State of the ingested corpus cache.
pub corpus_cache: CorpusCacheStatus,
/// State of the target SurrealDB namespace/database.
pub namespace: NamespaceStatus,
/// True only when converted data, slice ledger, corpus cache, and a seeded
/// namespace with a recorded seed are all ready.
pub query_ready: bool,
/// Human-readable hints collected while gathering the status.
pub notes: Vec<String>,
}
/// Status of the converted dataset store.
#[derive(Debug, Clone, Serialize)]
pub struct ConvertedStatus {
/// Layout label: `beir-mix-subset-stores`, `sharded-store`, or `missing`.
pub layout: String,
/// Store directory, or for the BEIR mix a summary of the per-subset stores.
pub path: String,
/// Whether the converted data is usable.
pub ready: bool,
/// True when the slice ledger is ready and a slice is selected.
pub partial_load_eligible: bool,
/// Content checksum, when it could be computed.
pub checksum: Option<String>,
}
/// Status of the slice ledger manifest.
#[derive(Debug, Clone, Serialize)]
pub struct SliceLedgerStatus {
/// Whether a manifest was read and is complete for the current slice configuration.
pub ready: bool,
/// Location of the cached manifest, when one is defined for the configuration.
pub path: Option<String>,
/// `case_count` from the manifest, when it could be read.
pub cases: Option<usize>,
/// `positive_paragraphs` from the manifest, when it could be read.
pub positives: Option<usize>,
/// `negative_paragraphs` from the manifest, when it could be read.
pub negatives: Option<usize>,
}
/// Status of the ingested corpus cache.
#[derive(Debug, Clone, Serialize)]
pub struct CorpusCacheStatus {
/// Whether the cache is usable; `collect_status` sets this equal to `manifest_present`.
pub ready: bool,
/// Cache directory, known only when a slice manifest was read.
pub path: Option<String>,
/// Whether a cached corpus manifest was found in the cache directory.
pub manifest_present: bool,
}
/// Status of the SurrealDB namespace/database targeted by the evaluation.
#[derive(Debug, Clone, Serialize)]
pub struct NamespaceStatus {
/// Namespace name: the configured override or the derived default.
pub namespace: String,
/// Database name: the configured override or the default.
pub database: String,
/// Whether the namespace was found to contain corpus data.
pub seeded: bool,
/// Whether the cached corpus manifest records a namespace seed
/// (reported as false when the database could not be reached).
pub namespace_seed_recorded: bool,
}
#[allow(clippy::too_many_lines)]
pub async fn collect_status(config: &Config) -> Result<EvalStatus> {
let mut notes = Vec::new();
let is_beir_mix = config.dataset == DatasetKind::Beir;
let converted_path = &config.converted_dataset_path;
let layout = if is_beir_mix {
ConvertedLayout::Missing
} else {
detect_layout(converted_path)
};
let layout_label = if is_beir_mix {
"beir-mix-subset-stores"
} else {
match layout {
ConvertedLayout::ShardedStore => "sharded-store",
ConvertedLayout::Missing => "missing",
}
};
let store_dir = store_dir_for(converted_path);
let display_path = if is_beir_mix {
beir_subset_store_summary()?
.into_iter()
.map(|(subset, paragraphs, questions)| {
format!("{subset}-minne ({paragraphs} paragraphs, {questions} questions)")
})
.collect::<Vec<_>>()
.join("; ")
} else {
store_dir.display().to_string()
};
let manifest_path = slice::cached_manifest_path(config);
let slice_config = slice::slice_config_with_limit(config, ledger_target(config));
let slice_manifest = manifest_path
.as_ref()
.and_then(|path| slice::read_manifest_if_exists(path).ok().flatten());
let slice_ledger = SliceLedgerStatus {
ready: slice_manifest
.as_ref()
.is_some_and(|manifest| slice::manifest_is_complete(manifest, &slice_config)),
path: manifest_path.as_ref().map(|path| path.display().to_string()),
cases: slice_manifest.as_ref().map(|manifest| manifest.case_count),
positives: slice_manifest.as_ref().map(|manifest| manifest.positive_paragraphs),
negatives: slice_manifest.as_ref().map(|manifest| manifest.negative_paragraphs),
};
let beir_paragraph_ids = slice_manifest.as_ref().map(|manifest| {
manifest
.paragraphs
.iter()
.map(|entry| entry.id.clone())
.collect::<std::collections::HashSet<_>>()
});
let converted_ready = if is_beir_mix {
slice_ledger.ready
&& beir_paragraph_ids
.as_ref()
.is_some_and(|ids| beir_subset_stores_ready(ids).unwrap_or(false))
} else {
layout == ConvertedLayout::ShardedStore
};
let checksum = if is_beir_mix {
beir_paragraph_ids
.as_ref()
.and_then(|ids| mix_content_checksum(ids).ok())
} else if layout == ConvertedLayout::ShardedStore {
content_checksum_for_layout(converted_path).ok()
} else {
None
};
let partial_load_eligible = slice_ledger.ready && config.slice.is_some();
let corpus_cache = if let Some(manifest) = slice_manifest.as_ref() {
let cache_settings = CorpusCacheConfig::from(config);
let base_dir = corpus::cached_corpus_dir(
&cache_settings,
config.dataset.id(),
manifest.slice_id.as_str(),
);
let manifest_present = corpus::load_cached_manifest(&base_dir)?.is_some();
CorpusCacheStatus {
ready: manifest_present,
path: Some(base_dir.display().to_string()),
manifest_present,
}
} else {
CorpusCacheStatus {
ready: false,
path: None,
manifest_present: false,
}
};
let namespace = config
.database
.db_namespace
.clone()
.unwrap_or_else(|| {
default_namespace(
config.dataset.id(),
config.limit,
config.slice.as_deref(),
)
});
let database = config
.database
.db_database
.clone()
.unwrap_or_else(default_database);
let namespace_seed = corpus_cache.path.as_ref().and_then(|path| {
corpus::load_cached_manifest(Path::new(path))
.ok()
.flatten()
.and_then(|manifest| manifest.metadata.namespace_seed)
});
let (seeded, namespace_seed_recorded) = match connect_eval_db(config, &namespace, &database).await {
Ok(db) => {
let has_corpus = namespace_has_corpus(&db).await.unwrap_or(false);
(has_corpus, namespace_seed.is_some())
}
Err(err) => {
notes.push(format!("SurrealDB unavailable: {err}"));
(false, false)
}
};
let query_ready = converted_ready
&& slice_ledger.ready
&& corpus_cache.ready
&& seeded
&& namespace_seed_recorded;
if !query_ready {
notes.push("Run `cargo eval --warm --slice <id>` to prepare corpus and namespace.".into());
}
Ok(EvalStatus {
dataset: config.dataset.id().to_string(),
slice: config.slice.clone(),
converted: ConvertedStatus {
layout: layout_label.to_string(),
path: display_path,
ready: converted_ready,
partial_load_eligible,
checksum,
},
slice_ledger,
corpus_cache,
namespace: NamespaceStatus {
namespace,
database,
seeded,
namespace_seed_recorded,
},
query_ready,
notes,
})
}
/// Prints a human-readable summary of `status` to standard output, one line
/// per artifact, followed by any notes.
pub fn print_status(status: &EvalStatus) {
    let converted = &status.converted;
    let ledger = &status.slice_ledger;
    let cache = &status.corpus_cache;
    let target = &status.namespace;

    println!("Evaluation status for dataset `{}`", status.dataset);
    if let Some(slice) = &status.slice {
        println!("Slice: {slice}");
    }

    let converted_state = if converted.ready { "ready" } else { "missing" };
    println!("Converted: {} ({})", converted_state, converted.layout);
    println!("Converted path: {}", converted.path);
    if converted.partial_load_eligible {
        println!("Slice-first loading: eligible");
    }

    let ledger_state = if ledger.ready {
        format!(
            "ready ({} cases, {} positives, {} negatives)",
            ledger.cases.unwrap_or(0),
            ledger.positives.unwrap_or(0),
            ledger.negatives.unwrap_or(0)
        )
    } else {
        "missing or incomplete".to_string()
    };
    println!("Slice ledger: {}", ledger_state);
    if let Some(path) = &ledger.path {
        println!("Slice ledger path: {path}");
    }

    let cache_state = if cache.ready { "ready" } else { "missing" };
    println!("Corpus cache: {}", cache_state);
    if let Some(path) = &cache.path {
        println!("Corpus cache path: {path}");
    }

    println!(
        "Namespace `{}` / `{}`: seeded={}, namespace_seed_recorded={}",
        target.namespace, target.database, target.seeded, target.namespace_seed_recorded
    );

    let verdict = if status.query_ready { "yes" } else { "no" };
    println!("Query-ready: {}", verdict);

    for note in &status.notes {
        println!("Note: {note}");
    }
}
/// Prepares the dataset, warms the evaluation corpus and namespace, then
/// prints the resulting status.
///
/// # Errors
/// Fails when dataset preparation, warming, or status collection fails.
pub async fn warm(config: &Config) -> Result<()> {
    let prepared = crate::datasets::prepare_dataset(config.dataset, config)
        .context("preparing dataset")?;

    let warmed =
        crate::pipeline::warm_evaluation(&prepared.dataset, config, &prepared.content_checksum)
            .await;
    warmed.context("warming evaluation corpus and namespace")?;

    print_status(&collect_status(config).await?);
    Ok(())
}
/// Verifies that the evaluation can serve queries.
///
/// # Errors
/// When the collected status is not query-ready, the status is printed and an
/// error is returned that tells the user which warm command to run.
pub async fn ensure_query_ready(config: &Config) -> Result<()> {
    let status = collect_status(config).await?;
    if !status.query_ready {
        // Show the full picture before failing so the user sees what is missing.
        print_status(&status);
        let slice_hint = config.slice.as_deref().unwrap_or("<slice-id>");
        anyhow::bail!(
            "evaluation is not query-ready; run `cargo eval --warm --slice {}` first",
            slice_hint
        );
    }
    Ok(())
}
+177
View File
@@ -0,0 +1,177 @@
use serde::{Deserialize, Serialize};
use common::storage::types::StoredObject;
use crate::types::EvaluationCandidate;
/// Label stored in aggregate stats to describe how token counts were obtained:
/// they are estimates (about one token per four characters), not tokenizer output.
const TOKENIZER_LABEL: &str = "estimated (~chars/4; ingestion uses bert-base-cased)";
/// Size of the retrieved context for one set of candidates, counting each
/// distinct chunk once.
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct RetrievedContextStats {
/// Number of distinct chunks.
pub chunk_count: usize,
/// Total number of Unicode scalar values (`char`s) across those chunks.
pub char_count: usize,
/// Total estimated token count across those chunks.
pub token_count: usize,
}
/// Aggregate of per-query [`RetrievedContextStats`] over a whole run.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct RetrievalContextStats {
/// Description of how token counts were produced.
pub tokenizer: String,
/// Number of per-query entries that were aggregated.
pub queries: usize,
/// Sum of chunk counts over all queries.
pub total_chunks: usize,
/// Sum of character counts over all queries.
pub total_chars: usize,
/// Sum of token counts over all queries.
pub total_tokens: usize,
/// Mean chunk count per query (`0.0` when there are no queries).
pub avg_chunks_per_query: f64,
/// Mean character count per query (`0.0` when there are no queries).
pub avg_chars_per_query: f64,
/// Mean token count per query (`0.0` when there are no queries).
pub avg_tokens_per_query: f64,
/// Median token count per query.
pub p50_tokens_per_query: usize,
/// 95th-percentile token count per query.
pub p95_tokens_per_query: usize,
/// Largest token count seen for a single query.
pub max_tokens_per_query: usize,
}
/// Measures the context carried by `candidates`.
///
/// Chunks are identified by their id; a chunk shared by several candidates is
/// counted only the first time it is seen. Returns the number of distinct
/// chunks together with their total character and estimated token counts.
pub fn stats_for_candidates(candidates: &[EvaluationCandidate]) -> RetrievedContextStats {
    let mut counted_ids = std::collections::HashSet::new();

    candidates
        .iter()
        .flat_map(|candidate| candidate.chunks.iter())
        // `insert` returns false for an id that was already counted.
        .filter(|retrieved| counted_ids.insert(retrieved.chunk.id().to_string()))
        .fold(RetrievedContextStats::default(), |mut totals, retrieved| {
            let text = retrieved.chunk.chunk.as_str();
            totals.chunk_count += 1;
            totals.char_count += text.chars().count();
            totals.token_count += estimate_ingestion_tokens(text);
            totals
        })
}
/// Rolls per-query context stats up into totals, means and token percentiles.
///
/// An empty `per_query` slice yields a report whose numeric fields are all
/// zero; the `tokenizer` label is always filled in.
pub fn aggregate_context_stats(per_query: &[RetrievedContextStats]) -> RetrievalContextStats {
    let queries = per_query.len();

    let mut total_chunks = 0usize;
    let mut total_chars = 0usize;
    let mut total_tokens = 0usize;
    let mut sorted_tokens = Vec::with_capacity(queries);
    for stats in per_query {
        total_chunks += stats.chunk_count;
        total_chars += stats.char_count;
        total_tokens += stats.token_count;
        sorted_tokens.push(stats.token_count);
    }
    // Percentiles and the maximum are read from the ascending order.
    sorted_tokens.sort_unstable();

    // Guard the division so that "no queries" reports 0.0 rather than NaN.
    let mean = |total: usize| {
        if queries == 0 {
            0.0
        } else {
            total as f64 / queries as f64
        }
    };

    RetrievalContextStats {
        tokenizer: TOKENIZER_LABEL.to_string(),
        queries,
        total_chunks,
        total_chars,
        total_tokens,
        avg_chunks_per_query: mean(total_chunks),
        avg_chars_per_query: mean(total_chars),
        avg_tokens_per_query: mean(total_tokens),
        p50_tokens_per_query: percentile_usize(&sorted_tokens, 0.50),
        p95_tokens_per_query: percentile_usize(&sorted_tokens, 0.95),
        max_tokens_per_query: sorted_tokens.last().copied().unwrap_or(0),
    }
}
/// Estimates the token count of `text` as one token per four characters,
/// rounded up. Empty text yields zero.
fn estimate_ingestion_tokens(text: &str) -> usize {
    // Counts `char`s, not bytes, so multi-byte characters count once.
    text.chars().count().div_ceil(4)
}
/// Picks the value at `fraction` of the way through an ascending slice.
///
/// `fraction` is clamped to `[0.0, 1.0]` and mapped to the nearest index of
/// `sorted`. Returns `0` for an empty slice. The caller must pass the data
/// already sorted; no sorting happens here.
#[allow(clippy::cast_precision_loss)]
fn percentile_usize(sorted: &[usize], fraction: f64) -> usize {
    // `checked_sub` doubles as the emptiness check.
    let Some(last_index) = sorted.len().checked_sub(1) else {
        return 0;
    };
    let rank = (last_index as f64 * fraction.clamp(0.0, 1.0)).round() as usize;
    sorted.get(rank.min(last_index)).copied().unwrap_or(0)
}
// Unit tests for context-size accounting and aggregation.
#[cfg(test)]
mod tests {
use std::sync::Arc;
use super::*;
use common::storage::types::text_chunk::TextChunk;
use retrieval_pipeline::RetrievedChunk;
// Two candidates that reference the same chunk must contribute it only once.
#[test]
fn deduplicates_chunks_when_counting_context() {
let shared = Arc::new(TextChunk::new(
"src".into(),
"hello world".into(),
"user".into(),
));
let candidates = vec![
EvaluationCandidate {
entity_id: "a".into(),
source_id: "src".into(),
entity_name: "A".into(),
entity_description: None,
entity_category: None,
score: 1.0,
chunks: vec![RetrievedChunk {
chunk: Arc::clone(&shared),
score: 1.0,
}],
},
EvaluationCandidate {
entity_id: "b".into(),
source_id: "src".into(),
entity_name: "B".into(),
entity_description: None,
entity_category: None,
score: 0.9,
chunks: vec![RetrievedChunk {
chunk: shared,
score: 0.9,
}],
},
];
let stats = stats_for_candidates(&candidates);
assert_eq!(stats.chunk_count, 1);
assert_eq!(stats.char_count, "hello world".chars().count());
// 11 characters at four characters per token, rounded up.
assert_eq!(stats.token_count, 3);
}
// Totals, maximum and mean are computed across the per-query entries.
#[test]
fn aggregates_per_query_token_totals() {
let per_query = vec![
RetrievedContextStats {
chunk_count: 2,
char_count: 100,
token_count: 40,
},
RetrievedContextStats {
chunk_count: 5,
char_count: 250,
token_count: 100,
},
];
let aggregate = aggregate_context_stats(&per_query);
assert_eq!(aggregate.queries, 2);
assert_eq!(aggregate.total_chunks, 7);
assert_eq!(aggregate.total_tokens, 140);
assert_eq!(aggregate.max_tokens_per_query, 100);
assert!((aggregate.avg_tokens_per_query - 70.0).abs() < f64::EPSILON);
}
}
+7 -25
View File
@@ -11,32 +11,14 @@ pub struct CorpusCacheConfig {
pub ingestion_max_retries: usize,
}
impl CorpusCacheConfig {
pub fn new(
ingestion_cache_dir: impl Into<PathBuf>,
force_refresh: bool,
refresh_embeddings_only: bool,
ingestion_batch_size: usize,
ingestion_max_retries: usize,
) -> Self {
impl From<&Config> for CorpusCacheConfig {
fn from(config: &Config) -> Self {
Self {
ingestion_cache_dir: ingestion_cache_dir.into(),
force_refresh,
refresh_embeddings_only,
ingestion_batch_size,
ingestion_max_retries,
ingestion_cache_dir: config.ingest.ingestion_cache_dir.clone(),
force_refresh: config.force_convert || config.ingest.slice_reset_ingestion,
refresh_embeddings_only: config.ingest.refresh_embeddings_only,
ingestion_batch_size: config.ingest.ingestion_batch_size,
ingestion_max_retries: config.ingest.ingestion_max_retries,
}
}
}
impl From<&Config> for CorpusCacheConfig {
fn from(config: &Config) -> Self {
CorpusCacheConfig::new(
config.ingest.ingestion_cache_dir.clone(),
config.force_convert || config.ingest.slice_reset_ingestion,
config.ingest.refresh_embeddings_only,
config.ingest.ingestion_batch_size,
config.ingest.ingestion_max_retries,
)
}
}
+3 -3
View File
@@ -5,11 +5,11 @@ pub(crate) mod store;
pub use config::CorpusCacheConfig;
pub use orchestrator::{
cached_corpus_dir, compute_ingestion_fingerprint, corpus_handle_from_manifest, ensure_corpus,
load_cached_manifest,
load_cached_manifest, persist_corpus_manifest,
};
pub use store::{
seed_manifest_into_db, window_manifest, CorpusHandle, CorpusManifest, CorpusMetadata,
CorpusQuestion, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
CorpusQuestion, NamespaceSeedRecord, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
};
pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig {
@@ -20,6 +20,6 @@ pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline
chunk_overlap_tokens: config.ingest.ingest_chunk_overlap_tokens,
..Default::default()
},
chunk_only: config.ingest.ingest_chunks_only,
chunk_only: !config.ingest.include_entities,
}
}
+15 -43
View File
@@ -9,8 +9,6 @@ use std::{
use anyhow::{anyhow, Context, Result};
use async_openai::Client;
use chrono::Utc;
#[cfg(not(test))]
use common::utils::config::get_config;
use common::{
storage::{
db::SurrealDbClient,
@@ -125,10 +123,14 @@ pub async fn ensure_corpus(
openai: Arc<OpenAIClient>,
user_id: &str,
converted_path: &Path,
precomputed_checksum: Option<&str>,
ingestion_config: IngestionConfig,
) -> Result<CorpusHandle> {
let checksum = compute_file_checksum(converted_path)
.with_context(|| format!("computing checksum for {}", converted_path.display()))?;
let checksum = match precomputed_checksum {
Some(value) => value.to_string(),
None => crate::datasets::content_checksum_for_layout(converted_path)
.with_context(|| format!("computing checksum for {}", converted_path.display()))?,
};
let ingestion_fingerprint =
build_ingestion_fingerprint(dataset, slice, &checksum, &ingestion_config);
@@ -381,6 +383,7 @@ pub async fn ensure_corpus(
chunk_min_tokens: ingestion_config.tuning.chunk_min_tokens,
chunk_max_tokens: ingestion_config.tuning.chunk_max_tokens,
chunk_only: ingestion_config.chunk_only,
namespace_seed: None,
},
paragraphs: corpus_paragraphs,
questions: corpus_questions,
@@ -415,7 +418,7 @@ pub async fn ensure_corpus(
negative_ingested: stats.negative_ingested,
};
persist_manifest(&handle).context("persisting corpus manifest")?;
persist_corpus_manifest(&handle).context("persisting corpus manifest")?;
Ok(handle)
}
@@ -501,7 +504,6 @@ async fn ingest_paragraph_batch(
Ok(shards)
}
#[cfg(test)]
async fn create_ingest_db(namespace: &str) -> Result<Arc<SurrealDbClient>> {
let db = SurrealDbClient::memory(namespace, "corpus")
.await
@@ -509,21 +511,6 @@ async fn create_ingest_db(namespace: &str) -> Result<Arc<SurrealDbClient>> {
Ok(Arc::new(db))
}
#[cfg(not(test))]
async fn create_ingest_db(namespace: &str) -> Result<Arc<SurrealDbClient>> {
let config = get_config().context("loading app config for ingestion database")?;
let db = SurrealDbClient::new(
&config.surrealdb_address,
&config.surrealdb_username,
&config.surrealdb_password,
namespace,
"corpus",
)
.await
.context("creating surrealdb database for ingestion")?;
Ok(Arc::new(db))
}
#[allow(clippy::too_many_arguments)]
async fn ingest_single_paragraph(
pipeline: Arc<IngestionPipeline>,
@@ -631,8 +618,12 @@ pub fn compute_ingestion_fingerprint(
slice: &ResolvedSlice<'_>,
converted_path: &Path,
ingestion_config: &IngestionConfig,
precomputed_checksum: Option<&str>,
) -> Result<String> {
let checksum = compute_file_checksum(converted_path)?;
let checksum = match precomputed_checksum {
Some(value) => value.to_string(),
None => crate::datasets::content_checksum_for_layout(converted_path)?,
};
Ok(build_ingestion_fingerprint(
dataset,
slice,
@@ -641,7 +632,7 @@ pub fn compute_ingestion_fingerprint(
))
}
pub fn load_cached_manifest(base_dir: &Path) -> Result<Option<CorpusManifest>> {
pub fn load_cached_manifest(base_dir: &std::path::Path) -> Result<Option<CorpusManifest>> {
let path = base_dir.join("manifest.json");
if !path.exists() {
return Ok(None);
@@ -656,7 +647,7 @@ pub fn load_cached_manifest(base_dir: &Path) -> Result<Option<CorpusManifest>> {
Ok(Some(manifest))
}
fn persist_manifest(handle: &CorpusHandle) -> Result<()> {
pub fn persist_corpus_manifest(handle: &CorpusHandle) -> Result<()> {
let path = handle.path.join("manifest.json");
if let Some(parent) = path.parent() {
fs::create_dir_all(parent)
@@ -685,24 +676,6 @@ pub fn corpus_handle_from_manifest(manifest: CorpusManifest, base_dir: PathBuf)
}
}
#[allow(clippy::indexing_slicing)]
fn compute_file_checksum(path: &Path) -> Result<String> {
let mut file = fs::File::open(path)
.with_context(|| format!("opening file {} for checksum", path.display()))?;
let mut hasher = Sha256::new();
let mut buffer = [0u8; 8192];
loop {
let read = file
.read(&mut buffer)
.with_context(|| format!("reading {} for checksum", path.display()))?;
if read == 0 {
break;
}
hasher.update(&buffer[..read]);
}
Ok(format!("{:x}", hasher.finalize()))
}
#[cfg(test)]
mod tests {
use super::*;
@@ -731,7 +704,6 @@ mod tests {
metadata: crate::datasets::DatasetMetadata::for_kind(
DatasetKind::default(),
false,
None,
),
source: "src".to_string(),
paragraphs: vec![paragraph],
+12 -1
View File
@@ -42,7 +42,7 @@ fn default_chunk_max_tokens() -> usize {
}
fn default_chunk_only() -> bool {
false
true
}
// Reuse the pipeline's canonical embedded-artifact types so the on-disk corpus
@@ -122,6 +122,14 @@ pub struct CorpusManifest {
pub questions: Vec<CorpusQuestion>,
}
/// Record of a namespace seeding, serialisable so it can be stored alongside
/// corpus metadata.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct NamespaceSeedRecord {
/// Namespace name recorded for the seeding.
pub namespace: String,
/// Database name recorded for the seeding.
pub database: String,
/// Slice case count recorded at seeding time.
pub slice_case_count: usize,
/// UTC timestamp recorded for the seeding.
pub seeded_at: DateTime<Utc>,
}
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CorpusMetadata {
pub dataset_id: String,
@@ -144,6 +152,8 @@ pub struct CorpusMetadata {
pub chunk_max_tokens: usize,
#[serde(default = "default_chunk_only")]
pub chunk_only: bool,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub namespace_seed: Option<NamespaceSeedRecord>,
}
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
@@ -629,6 +639,7 @@ mod tests {
chunk_min_tokens: 1,
chunk_max_tokens: 10,
chunk_only: false,
namespace_seed: None,
},
paragraphs: vec![paragraph_one, paragraph_two],
questions: vec![question],
+106 -26
View File
@@ -1,5 +1,5 @@
use std::{
collections::{BTreeMap, HashMap},
collections::{BTreeMap, HashMap, HashSet},
fs::File,
io::{BufRead, BufReader},
path::{Path, PathBuf},
@@ -47,20 +47,71 @@ struct QrelEntry {
score: i32,
}
/// Convert only documents that appear in qrels (the BEIR evaluation closed world).
#[allow(clippy::arithmetic_side_effects, clippy::indexing_slicing)]
pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<ConvertedParagraph>> {
convert_beir_documents(raw_dir, dataset, None)
}
/// Convert a subset of qrels-world documents. `doc_ids` use corpus ids (unprefixed).
#[allow(
clippy::too_many_lines,
clippy::arithmetic_side_effects,
clippy::indexing_slicing
)]
pub fn convert_beir_documents(
raw_dir: &Path,
dataset: DatasetKind,
doc_ids: Option<&HashSet<String>>,
) -> Result<Vec<ConvertedParagraph>> {
let corpus_path = raw_dir.join("corpus.jsonl");
let queries_path = raw_dir.join("queries.jsonl");
let qrels_path = resolve_qrels_path(raw_dir)?;
let corpus = load_corpus(&corpus_path)?;
let queries = load_queries(&queries_path)?;
let qrels = load_qrels(&qrels_path)?;
let mut paragraphs = Vec::with_capacity(corpus.len());
let mut qrels_doc_ids = HashSet::new();
for entries in qrels.values() {
for entry in entries {
qrels_doc_ids.insert(entry.doc_id.clone());
}
}
let target_doc_ids: HashSet<String> = match doc_ids {
Some(ids) => ids
.iter()
.filter(|id| qrels_doc_ids.contains(*id))
.cloned()
.collect(),
None => qrels_doc_ids.clone(),
};
if target_doc_ids.is_empty() {
return Err(anyhow!(
"no qrels documents to convert for {} at {}",
dataset.id(),
raw_dir.display()
));
}
let corpus = load_corpus_filtered(&corpus_path, &target_doc_ids)?;
let mut doc_ids_sorted: Vec<String> = target_doc_ids.into_iter().collect();
doc_ids_sorted.sort();
let mut paragraphs = Vec::with_capacity(doc_ids_sorted.len());
let mut paragraph_index = HashMap::new();
for (doc_id, entry) in &corpus {
for doc_id in &doc_ids_sorted {
let Some(entry) = corpus.get(doc_id) else {
warn!(
doc_id = %doc_id,
dataset = %dataset.id(),
"Skipping qrels document missing from corpus"
);
continue;
};
let paragraph_id = format!("{}-{doc_id}", dataset.source_prefix());
let paragraph = ConvertedParagraph {
id: paragraph_id.clone(),
@@ -87,6 +138,12 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
continue;
};
if let Some(filter) = doc_ids {
if !filter.contains(&best.doc_id) {
continue;
}
}
let Some(&paragraph_slot) = paragraph_index.get(&best.doc_id) else {
missing_docs += 1;
warn!(
@@ -106,7 +163,6 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
);
continue;
};
let answers = vec![snippet];
let question_id = format!("{}-{query_id}", dataset.source_prefix());
paragraphs[paragraph_slot]
@@ -114,7 +170,7 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
.push(ConvertedQuestion {
id: question_id,
question: query.text.clone(),
answers,
answers: vec![snippet],
is_impossible: false,
});
}
@@ -122,13 +178,23 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<Converte
if missing_queries + missing_docs + skipped_answers > 0 {
warn!(
missing_queries,
missing_docs, skipped_answers, "Skipped some BEIR qrels entries during conversion"
missing_docs,
skipped_answers,
dataset = %dataset.id(),
"Skipped some BEIR qrels entries during conversion"
);
}
Ok(paragraphs)
}
/// Recovers the unprefixed corpus document id from a converted paragraph id.
///
/// Paragraph ids have the form `<source_prefix>-<doc_id>`; returns `None` when
/// `paragraph_id` does not start with this dataset's prefix and dash.
pub fn corpus_doc_id(paragraph_id: &str, dataset: DatasetKind) -> Option<String> {
    let expected_prefix = format!("{}-", dataset.source_prefix());
    let doc_id = paragraph_id.strip_prefix(expected_prefix.as_str())?;
    Some(doc_id.to_owned())
}
fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
let qrels_dir = raw_dir.join("qrels");
let candidates = ["test.tsv", "dev.tsv", "train.tsv"];
@@ -148,7 +214,10 @@ fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
}
#[allow(clippy::arithmetic_side_effects)]
fn load_corpus(path: &Path) -> Result<BTreeMap<String, BeirParagraph>> {
fn load_corpus_filtered(
path: &Path,
doc_ids: &HashSet<String>,
) -> Result<BTreeMap<String, BeirParagraph>> {
let file =
File::open(path).with_context(|| format!("opening BEIR corpus at {}", path.display()))?;
let reader = BufReader::new(file);
@@ -167,6 +236,9 @@ fn load_corpus(path: &Path) -> Result<BTreeMap<String, BeirParagraph>> {
path.display()
)
})?;
if !doc_ids.contains(&corpus_row.id) {
continue;
}
let title = corpus_row.title.unwrap_or_else(|| corpus_row.id.clone());
let text = corpus_row.text.unwrap_or_default();
let context = build_context(&title, &text);
@@ -296,10 +368,8 @@ mod tests {
use std::fs;
use tempfile::tempdir;
#[test]
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)]
fn converts_basic_beir_layout() {
let dir = tempdir().unwrap();
#[allow(clippy::unwrap_used)]
fn write_fixture(dir: &tempfile::TempDir) {
let corpus = r#"
{"_id":"d1","title":"Doc 1","text":"Doc one has some text for testing."}
{"_id":"d2","title":"Doc 2","text":"Second document content."}
@@ -313,24 +383,34 @@ mod tests {
fs::write(dir.path().join("queries.jsonl"), queries.trim()).unwrap();
fs::create_dir_all(dir.path().join("qrels")).unwrap();
fs::write(dir.path().join("qrels/test.tsv"), qrels).unwrap();
}
#[test]
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)]
fn converts_qrels_world_only() {
let dir = tempdir().unwrap();
write_fixture(&dir);
let paragraphs = convert_beir(dir.path(), DatasetKind::Fever).unwrap();
assert_eq!(paragraphs.len(), 2);
let doc_one = paragraphs
.iter()
.find(|p| p.id == "fever-d1")
.expect("missing paragraph for d1");
assert_eq!(paragraphs.len(), 1);
let doc_one = &paragraphs[0];
assert_eq!(doc_one.id, "fever-d1");
assert_eq!(doc_one.questions.len(), 1);
let question = &doc_one.questions[0];
assert_eq!(question.id, "fever-q1");
assert!(!question.answers.is_empty());
assert!(doc_one.context.contains(&question.answers[0]));
assert_eq!(doc_one.questions[0].id, "fever-q1");
}
let doc_two = paragraphs
.iter()
.find(|p| p.id == "fever-d2")
.expect("missing paragraph for d2");
assert!(doc_two.questions.is_empty());
#[test]
#[allow(clippy::unwrap_used, clippy::expect_used, clippy::indexing_slicing)]
fn converts_filtered_doc_ids() {
let dir = tempdir().unwrap();
write_fixture(&dir);
let mut ids = HashSet::new();
ids.insert("d1".to_string());
let paragraphs =
convert_beir_documents(dir.path(), DatasetKind::Fever, Some(&ids)).unwrap();
assert_eq!(paragraphs.len(), 1);
assert_eq!(paragraphs[0].id, "fever-d1");
}
}
+262
View File
@@ -0,0 +1,262 @@
use std::collections::{HashMap, HashSet};
use anyhow::{anyhow, Context, Result};
use sha2::{Digest, Sha256};
use tracing::info;
use super::{
beir,
checksum::hash_file,
store::{
self, build_dataset_from_catalog, paragraph_path, read_meta, store_dir_for,
upsert_sharded_paragraphs, write_sharded,
},
BEIR_DATASETS, ConvertedDataset, DatasetKind, DatasetMetadata,
};
use crate::{
args::Config,
slice,
};
/// Routes a prefixed paragraph id to the BEIR subset that produced it.
///
/// A subset matches when the id starts with `<source_prefix>-`. If several
/// subsets match, the one with the longest prefix wins; on equal length the
/// subset listed first in `BEIR_DATASETS` is kept. Returns `None` when no
/// subset matches.
pub fn subset_for_paragraph_id(paragraph_id: &str) -> Option<DatasetKind> {
    let mut best: Option<(usize, DatasetKind)> = None;
    for kind in BEIR_DATASETS {
        let source_prefix = kind.source_prefix();
        if !paragraph_id.starts_with(&format!("{source_prefix}-")) {
            continue;
        }
        let prefix_len = source_prefix.len();
        // Strictly longer only, so the earliest candidate survives a tie.
        let is_better = best
            .as_ref()
            .map_or(true, |(best_len, _)| prefix_len > *best_len);
        if is_better {
            best = Some((prefix_len, kind));
        }
    }
    best.map(|(_, kind)| kind)
}
/// Builds the in-memory BEIR mix by converting every subset in
/// `BEIR_DATASETS` from its raw directory and concatenating the paragraphs in
/// subset order.
///
/// `include_unanswerable` has no effect on the paragraphs; a warning is logged
/// when it is set. It is still recorded in the dataset metadata.
///
/// # Errors
/// Fails if a subset entry cannot be resolved or its conversion fails.
pub fn build_beir_mix_qrels_dataset(include_unanswerable: bool) -> Result<ConvertedDataset> {
    if include_unanswerable {
        tracing::warn!("BEIR mix ignores include_unanswerable flag; all questions are answerable");
    }

    let mut paragraphs = Vec::new();
    for subset in BEIR_DATASETS {
        let entry = super::dataset_entry_for_kind(subset)?;
        paragraphs.extend(beir::convert_beir(&entry.raw_path, subset)?);
    }

    let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, include_unanswerable);
    Ok(ConvertedDataset {
        generated_at: super::base_timestamp(),
        metadata,
        source: "beir-mix".to_string(),
        paragraphs,
    })
}
/// Prepares the BEIR mix dataset for the configured slice.
///
/// Steps, in order: build the full mix in memory, resolve the slice against
/// it, materialise the slice's paragraphs into the per-subset converted
/// stores, load exactly those paragraphs back, and compute their checksum.
/// The returned dataset is always marked `partial`.
///
/// # Errors
/// Fails if any of those steps fails.
pub fn prepare_beir_mix(config: &Config) -> Result<super::loader::LoadedDataset> {
    let qrels_world = build_beir_mix_qrels_dataset(config.llm_mode)?;
    let ledger_config = slice::slice_config_with_limit(config, slice::ledger_target(config));
    let resolved = slice::resolve_slice(&qrels_world, &ledger_config).context(
        "resolving BEIR mix slice ledger (check --slice and --limit match your intent)",
    )?;

    // Distinct paragraph ids addressed by the slice.
    let mut slice_paragraph_ids: HashSet<String> = HashSet::new();
    for entry in resolved.manifest.paragraphs.iter() {
        slice_paragraph_ids.insert(entry.id.clone());
    }

    materialize_subset_stores(&slice_paragraph_ids, config.force_convert)?;
    let dataset = load_beir_mix_from_subsets(&slice_paragraph_ids)?;
    let content_checksum = mix_content_checksum(&slice_paragraph_ids)?;

    info!(
        slice = resolved.manifest.slice_id.as_str(),
        paragraphs = slice_paragraph_ids.len(),
        checksum = %content_checksum,
        "Prepared BEIR mix from per-subset converted stores"
    );

    Ok(super::loader::LoadedDataset {
        dataset,
        content_checksum,
        partial: true,
    })
}
/// Ensures each paragraph in `paragraph_ids` is present in the converted store
/// of the BEIR subset it belongs to.
///
/// Ids are grouped by subset (via their prefix). For every subset, the ids not
/// yet in its store — or all of its ids when `force` is set — are converted
/// from the raw subset directory and written: upserted into an existing store,
/// or written as a new sharded store when none exists.
///
/// # Errors
/// Fails if an id cannot be routed to a subset, or if resolving the subset
/// entry, reading the store, converting, or writing fails.
pub fn materialize_subset_stores(
paragraph_ids: &HashSet<String>,
force: bool,
) -> Result<()> {
// Group the requested paragraph ids by owning subset.
let mut by_subset: HashMap<DatasetKind, Vec<String>> = HashMap::new();
for paragraph_id in paragraph_ids {
let kind = subset_for_paragraph_id(paragraph_id).with_context(|| {
format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store")
})?;
by_subset.entry(kind).or_default().push(paragraph_id.clone());
}
for (kind, ids) in by_subset {
let entry = super::dataset_entry_for_kind(kind)?;
let store_dir = store_dir_for(&entry.converted_path);
// A store is considered to exist once its `meta.json` is present.
let existing = if store_dir.join("meta.json").is_file() {
store::load_paragraph_ids_set(&store_dir)?
} else {
HashSet::new()
};
// With `force`, every requested id is reconverted; otherwise only ids
// absent from the existing set.
let missing: Vec<String> = if force {
ids
} else {
ids.into_iter()
.filter(|paragraph_id| !existing.contains(paragraph_id))
.collect()
};
if missing.is_empty() {
continue;
}
// The converter takes unprefixed corpus ids, so strip the subset prefix.
let corpus_ids: HashSet<String> = missing
.iter()
.filter_map(|paragraph_id| beir::corpus_doc_id(paragraph_id, kind))
.collect();
let paragraphs = beir::convert_beir_documents(
&entry.raw_path,
kind,
Some(&corpus_ids),
)?;
if store_dir.join("meta.json").is_file() {
upsert_sharded_paragraphs(&store_dir, &paragraphs)?;
} else {
// Count questions before `paragraphs` is moved into the dataset.
let question_count = paragraphs
.iter()
.map(|paragraph| paragraph.questions.len())
.sum::<usize>();
let dataset = ConvertedDataset {
generated_at: super::base_timestamp(),
metadata: DatasetMetadata::for_kind(kind, false),
source: entry.raw_path.display().to_string(),
paragraphs,
};
write_sharded(&dataset, &store_dir)?;
info!(
subset = kind.id(),
store = %store_dir.display(),
paragraphs = dataset.paragraphs.len(),
questions = question_count,
"Created subset converted store for BEIR mix"
);
}
}
Ok(())
}
/// Loads the given paragraphs from the per-subset converted stores and
/// assembles them into a single BEIR mix dataset, ordered by paragraph id.
///
/// # Errors
/// Fails if an id cannot be routed to a subset, a subset entry cannot be
/// resolved, or a store cannot be read.
pub fn load_beir_mix_from_subsets(paragraph_ids: &HashSet<String>) -> Result<ConvertedDataset> {
    // Bucket the ids per subset so each store is read once.
    let mut ids_by_subset: HashMap<DatasetKind, HashSet<String>> = HashMap::new();
    for paragraph_id in paragraph_ids {
        let kind = subset_for_paragraph_id(paragraph_id).with_context(|| {
            format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store")
        })?;
        ids_by_subset
            .entry(kind)
            .or_default()
            .insert(paragraph_id.clone());
    }

    let mut paragraphs = Vec::with_capacity(paragraph_ids.len());
    for (kind, subset_ids) in ids_by_subset {
        let entry = super::dataset_entry_for_kind(kind)?;
        let store_dir = store_dir_for(&entry.converted_path);
        let loaded = build_dataset_from_catalog(&store_dir, &subset_ids)?;
        paragraphs.extend(loaded.paragraphs);
    }
    // Map iteration order is arbitrary; sorting makes the result deterministic.
    paragraphs.sort_by(|left, right| left.id.cmp(&right.id));

    let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, false);
    Ok(ConvertedDataset {
        generated_at: super::base_timestamp(),
        metadata,
        source: "beir-mix".to_string(),
        paragraphs,
    })
}
pub fn mix_content_checksum(paragraph_ids: &HashSet<String>) -> Result<String> {
let mut ids: Vec<String> = paragraph_ids.iter().cloned().collect();
ids.sort();
let mut hasher = Sha256::new();
for paragraph_id in ids {
let kind = subset_for_paragraph_id(&paragraph_id)
.ok_or_else(|| anyhow!("unknown BEIR subset for paragraph '{paragraph_id}'"))?;
let entry = super::dataset_entry_for_kind(kind)?;
let store_dir = store_dir_for(&entry.converted_path);
let path = paragraph_path(&store_dir, &paragraph_id);
if !path.is_file() {
return Err(anyhow!(
"missing converted paragraph {} at {}",
paragraph_id,
path.display()
));
}
hasher.update(paragraph_id.as_bytes());
hasher.update([0]);
hasher.update(hash_file(&path)?.as_bytes());
}
Ok(format!("{:x}", hasher.finalize()))
}
/// Reports whether every paragraph in `paragraph_ids` is already present in
/// its subset's converted store.
///
/// Returns `Ok(false)` as soon as a store lacks `meta.json` or a paragraph
/// file is missing.
///
/// # Errors
/// Fails if an id cannot be routed to a subset or the subset entry cannot be
/// resolved.
pub fn beir_subset_stores_ready(paragraph_ids: &HashSet<String>) -> Result<bool> {
    for paragraph_id in paragraph_ids {
        let kind = subset_for_paragraph_id(paragraph_id).with_context(|| {
            format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store")
        })?;
        let entry = super::dataset_entry_for_kind(kind)?;
        let store_dir = store_dir_for(&entry.converted_path);

        let paragraph_present = store_dir.join("meta.json").is_file()
            && paragraph_path(&store_dir, paragraph_id).is_file();
        if !paragraph_present {
            return Ok(false);
        }
    }
    Ok(true)
}
/// Lists `(subset id, paragraph count, question count)` for every BEIR subset
/// whose converted store has a `meta.json`, in `BEIR_DATASETS` order.
///
/// # Errors
/// Fails if a subset entry cannot be resolved or a store's metadata cannot be
/// read.
pub fn beir_subset_store_summary() -> Result<Vec<(String, usize, usize)>> {
    let mut summary = Vec::new();
    for kind in BEIR_DATASETS {
        let entry = super::dataset_entry_for_kind(kind)?;
        let store_dir = store_dir_for(&entry.converted_path);
        // Subsets without a store are simply left out of the summary.
        if !store_dir.join("meta.json").is_file() {
            continue;
        }
        let meta = read_meta(&store_dir)?;
        let row = (kind.id().to_string(), meta.paragraph_count, meta.question_count);
        summary.push(row);
    }
    Ok(summary)
}
// Unit tests for routing paragraph ids to BEIR subsets.
#[cfg(test)]
mod tests {
use super::*;
// Ids are routed by prefix, including prefixes that themselves contain dashes;
// an id with no known prefix routes nowhere.
#[test]
fn routes_prefixed_paragraph_ids() {
assert_eq!(
subset_for_paragraph_id("fever-doc-1"),
Some(DatasetKind::Fever)
);
assert_eq!(
subset_for_paragraph_id("nq-beir-doc-1"),
Some(DatasetKind::NqBeir)
);
assert_eq!(
subset_for_paragraph_id("trec-covid-doc-1"),
Some(DatasetKind::TrecCovid)
);
assert!(subset_for_paragraph_id("unknown-doc").is_none());
}
}
+216
View File
@@ -0,0 +1,216 @@
use std::{
fs::{self, File},
io::Read,
path::Path,
};
#[cfg(test)]
use std::path::PathBuf;
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
/// Format version written into every checksum sidecar produced by this module.
const SIDECAR_VERSION: u32 = 1;
/// JSON sidecar that caches a SHA-256 digest next to the content it describes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChecksumSidecar {
/// Sidecar format version.
pub version: u32,
/// Lowercase hex SHA-256 digest.
pub sha256: String,
/// For a file sidecar, the content length in bytes. The store-level marker
/// reuses this field for the number of files that were hashed.
pub size_bytes: u64,
/// Seconds since the Unix epoch; `0` (also the default when the field is
/// absent) means "not recorded".
#[serde(default)]
pub modified_unix_secs: u64,
}
impl ChecksumSidecar {
    /// Path of the sidecar for `content_path`: the same path with its
    /// extension replaced by `sha256`.
    #[cfg(test)]
    pub fn sidecar_path(content_path: &Path) -> PathBuf {
        let mut sidecar = content_path.to_path_buf();
        sidecar.set_extension("sha256");
        sidecar
    }

    /// Tells whether this sidecar still describes `content_path`.
    ///
    /// Requires the current format version, readable file metadata and a
    /// matching size. When a modification time was recorded it must match as
    /// well; if the file's modification time cannot be determined, the sidecar
    /// is accepted.
    #[cfg(test)]
    pub fn is_valid_for(&self, content_path: &Path) -> bool {
        if self.version != SIDECAR_VERSION {
            return false;
        }
        let metadata = match fs::metadata(content_path) {
            Ok(metadata) => metadata,
            Err(_) => return false,
        };
        if metadata.len() != self.size_bytes {
            return false;
        }
        // Zero means no modification time was recorded, so size alone decides.
        if self.modified_unix_secs == 0 {
            return true;
        }
        metadata
            .modified()
            .ok()
            .and_then(|modified| modified.duration_since(std::time::UNIX_EPOCH).ok())
            .map_or(true, |since_epoch| {
                since_epoch.as_secs() == self.modified_unix_secs
            })
    }
}
#[allow(clippy::indexing_slicing)]
pub fn hash_file(path: &Path) -> Result<String> {
let mut file =
File::open(path).with_context(|| format!("opening file {} for checksum", path.display()))?;
let mut hasher = Sha256::new();
let mut buffer = vec![0u8; 65_536];
loop {
let read = file
.read(&mut buffer)
.with_context(|| format!("reading {} for checksum", path.display()))?;
if read == 0 {
break;
}
hasher.update(&buffer[..read]);
}
Ok(format!("{:x}", hasher.finalize()))
}
/// Reads and parses the checksum sidecar stored at `path`.
///
/// Returns `Ok(None)` when nothing exists at `path`.
///
/// # Errors
/// Fails if the file exists but cannot be read or is not valid sidecar JSON.
pub fn read_sidecar(path: &Path) -> Result<Option<ChecksumSidecar>> {
    if path.exists() {
        let contents = fs::read_to_string(path)
            .with_context(|| format!("reading checksum sidecar {}", path.display()))?;
        serde_json::from_str::<ChecksumSidecar>(&contents)
            .map(Some)
            .with_context(|| format!("parsing checksum sidecar {}", path.display()))
    } else {
        Ok(None)
    }
}
/// Writes the sidecar for `content_path`, recording `sha256` together with the
/// file's current size and modification time (`0` when the time is
/// unavailable).
///
/// # Errors
/// Fails if the content metadata cannot be read or the sidecar cannot be
/// serialised or written.
#[cfg(test)]
pub fn write_sidecar(content_path: &Path, sha256: &str) -> Result<()> {
    let metadata = fs::metadata(content_path)
        .with_context(|| format!("reading metadata for {}", content_path.display()))?;

    let modified_unix_secs = match metadata.modified() {
        Ok(modified) => modified
            .duration_since(std::time::UNIX_EPOCH)
            .map_or(0, |since_epoch| since_epoch.as_secs()),
        Err(_) => 0,
    };

    let sidecar = ChecksumSidecar {
        version: SIDECAR_VERSION,
        sha256: sha256.to_string(),
        size_bytes: metadata.len(),
        modified_unix_secs,
    };

    let sidecar_path = ChecksumSidecar::sidecar_path(content_path);
    if let Some(parent) = sidecar_path.parent() {
        fs::create_dir_all(parent)
            .with_context(|| format!("creating checksum sidecar directory {}", parent.display()))?;
    }

    let encoded = serde_json::to_vec_pretty(&sidecar).context("serialising checksum sidecar")?;
    fs::write(&sidecar_path, encoded)
        .with_context(|| format!("writing checksum sidecar {}", sidecar_path.display()))
}
/// Returns the SHA-256 of `content_path`, reusing its sidecar when that is
/// still valid and otherwise hashing the file and rewriting the sidecar.
///
/// # Errors
/// Fails if the sidecar cannot be read or parsed, or if hashing or writing the
/// sidecar fails.
#[cfg(test)]
pub fn content_checksum(content_path: &Path) -> Result<String> {
    let cached = read_sidecar(&ChecksumSidecar::sidecar_path(content_path))?;
    match cached {
        Some(sidecar) if sidecar.is_valid_for(content_path) => Ok(sidecar.sha256),
        _ => {
            let digest = hash_file(content_path)?;
            write_sidecar(content_path, &digest)?;
            Ok(digest)
        }
    }
}
pub fn store_aggregate_checksum(store_dir: &Path) -> Result<String> {
let marker = store_dir.join("checksum.sha256");
let meta = store_dir.join("meta.json");
if marker.is_file() && meta.is_file() {
if let (Ok(marker_meta), Ok(meta_meta)) = (marker.metadata(), meta.metadata()) {
if marker_meta
.modified()
.ok()
.zip(meta_meta.modified().ok())
.is_some_and(|(marker_modified, meta_modified)| marker_modified >= meta_modified)
{
if let Some(sidecar) = read_sidecar(&marker)? {
return Ok(sidecar.sha256);
}
}
}
}
let mut entries = Vec::new();
collect_store_files(store_dir, store_dir, &mut entries)?;
entries.sort();
let mut hasher = Sha256::new();
for relative in &entries {
let path = store_dir.join(relative);
if path == marker {
continue;
}
hasher.update(relative.as_bytes());
hasher.update([0]);
let file_hash = hash_file(&path)?;
hasher.update(file_hash.as_bytes());
}
let digest = format!("{:x}", hasher.finalize());
let sidecar = ChecksumSidecar {
version: SIDECAR_VERSION,
sha256: digest.clone(),
size_bytes: entries.len() as u64,
modified_unix_secs: std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map_or(0, |duration| duration.as_secs()),
};
if let Some(parent) = marker.parent() {
fs::create_dir_all(parent)?;
}
fs::write(&marker, serde_json::to_vec_pretty(&sidecar)?)?;
Ok(digest)
}
fn collect_store_files(base: &Path, current: &Path, entries: &mut Vec<String>) -> Result<()> {
for entry in fs::read_dir(current)? {
let entry = entry?;
let path = entry.path();
if path.file_name().is_some_and(|name| name == "checksum.sha256") {
continue;
}
if path.is_dir() {
collect_store_files(base, &path, entries)?;
} else if path.is_file() {
let relative = path
.strip_prefix(base)
.unwrap_or(&path)
.to_string_lossy()
.replace('\\', "/");
entries.push(relative);
}
}
Ok(())
}
// Unit tests for the file-level checksum sidecar.
#[cfg(test)]
mod tests {
use super::*;
use tempfile::tempdir;
// The checksum is stable for unchanged content and changes once the file is
// rewritten with different content.
#[test]
fn sidecar_round_trip() -> Result<()> {
let dir = tempdir()?;
let file = dir.path().join("sample.json");
fs::write(&file, br#"{"hello":"world"}"#)?;
let first = content_checksum(&file)?;
let second = content_checksum(&file)?;
assert_eq!(first, second);
// The new content differs in length, so the sidecar's size check rejects it.
fs::write(&file, br#"{"hello":"world!"}"#)?;
let third = content_checksum(&file)?;
assert_ne!(first, third);
Ok(())
}
}
+197
View File
@@ -0,0 +1,197 @@
use std::collections::HashSet;
use anyhow::{Context, Result};
use tracing::info;
use super::{
catalog,
store::{
self, build_dataset_from_catalog, detect_layout, read_meta, store_dir_for, write_sharded,
ConvertedLayout,
},
ConvertedDataset, DatasetKind,
};
use crate::{
args::Config,
slice::{self, SliceConfig},
};
/// Result of preparing a converted dataset for an evaluation run.
#[derive(Debug, Clone)]
pub struct LoadedDataset {
/// The converted dataset that was loaded or freshly converted.
pub dataset: ConvertedDataset,
/// Aggregate checksum of the sharded store the dataset came from.
pub content_checksum: String,
/// `true` when only the paragraphs named by a cached slice manifest were
/// loaded; `false` when the whole store (or a fresh conversion) is present.
pub partial: bool,
}
/// Produces the dataset for `dataset_kind`, converting it first when needed.
///
/// The BEIR mix is delegated to `beir_mix::prepare_beir_mix`. For every other
/// kind the raw dataset is converted when no sharded store exists for
/// `config.converted_dataset_path` or when `config.force_convert` is set;
/// otherwise the existing store is loaded, allowing the partial fast path.
pub fn prepare_dataset(dataset_kind: DatasetKind, config: &Config) -> Result<LoadedDataset> {
    if dataset_kind == DatasetKind::Beir {
        return super::beir_mix::prepare_beir_mix(config);
    }

    let converted_path = &config.converted_dataset_path;
    let store_missing = matches!(detect_layout(converted_path), ConvertedLayout::Missing);

    if store_missing || config.force_convert {
        convert_and_load(dataset_kind, config)
    } else {
        let existing_store = store_dir_for(converted_path);
        load_from_store(dataset_kind, config, &existing_store, true)
    }
}
/// Converts the raw dataset, writes it as a sharded store and returns it.
///
/// Steps: convert `config.raw_dataset_path`, replace the sharded store that
/// belongs to `config.converted_dataset_path`, then prebuild the catalog slice
/// ledgers for the dataset. The returned dataset is always complete
/// (`partial == false`).
///
/// # Errors
/// Fails when conversion, writing the store, or catalog lookup fails.
fn convert_and_load(dataset_kind: DatasetKind, config: &Config) -> Result<LoadedDataset> {
    let dataset = super::convert(
        config.raw_dataset_path.as_path(),
        dataset_kind,
        config.llm_mode,
    )
    .with_context(|| format!("converting {} dataset", dataset_kind.label()))?;

    let store_dir = store_dir_for(&config.converted_dataset_path);
    // `write_sharded` finishes by computing the aggregate store checksum and
    // returns it, so reuse that digest rather than hashing every store file a
    // second time.
    // NOTE(review): this relies on slice prebuilding not writing inside
    // `store_dir` (it is handed the in-memory dataset and `config.cache_dir`);
    // confirm against `slice::resolve_slice`.
    let content_checksum = write_sharded(&dataset, &store_dir)?;
    prebuild_catalog_slices(&dataset, config)?;

    Ok(LoadedDataset {
        dataset,
        content_checksum,
        partial: false,
    })
}
/// Loads a dataset from an existing sharded store.
///
/// The store checksum is computed and the stored metadata is validated against
/// `dataset_kind` and `config` first. When `allow_partial` is set and a
/// complete cached slice manifest is available, only the paragraphs that
/// manifest names are loaded (`partial == true`); otherwise the full store is
/// loaded.
fn load_from_store(
    dataset_kind: DatasetKind,
    config: &Config,
    store_dir: &std::path::Path,
    allow_partial: bool,
) -> Result<LoadedDataset> {
    let content_checksum = crate::datasets::store_aggregate_checksum(store_dir)?;
    let meta = read_meta(store_dir)?;
    validate_metadata_fields(&meta.metadata, dataset_kind, config)?;

    let fast_path_ids = if allow_partial {
        slice_paragraph_ids_for_fast_path(config)?
    } else {
        None
    };

    let (dataset, partial) = match fast_path_ids {
        Some(ids) => {
            let wanted: HashSet<String> = ids.into_iter().collect();
            info!(
                paragraphs = wanted.len(),
                store = %store_dir.display(),
                "Loading slice-addressed paragraphs from sharded converted store"
            );
            (build_dataset_from_catalog(store_dir, &wanted)?, true)
        }
        None => {
            info!(
                store = %store_dir.display(),
                paragraphs = meta.paragraph_count,
                "Loading full sharded converted store"
            );
            (store::load_sharded_full(store_dir)?, false)
        }
    };

    Ok(LoadedDataset {
        dataset,
        content_checksum,
        partial,
    })
}
/// Returns the paragraph ids of the cached slice manifest, if it can be used.
///
/// Yields `Ok(None)` when there is no cached manifest path, when no manifest
/// exists at that path, or when the manifest is not complete for the slice
/// configuration derived from `config`.
fn slice_paragraph_ids_for_fast_path(config: &Config) -> Result<Option<Vec<String>>> {
    let cached = match slice::cached_manifest_path(config) {
        Some(path) => slice::read_manifest_if_exists(&path)?,
        None => None,
    };
    let Some(manifest) = cached else {
        return Ok(None);
    };

    let slice_config = slice::slice_config_with_limit(config, slice::ledger_target(config));
    if slice::manifest_is_complete(&manifest, &slice_config) {
        let ids: Vec<String> = manifest
            .paragraphs
            .iter()
            .map(|paragraph| paragraph.id.clone())
            .collect();
        Ok(Some(ids))
    } else {
        Ok(None)
    }
}
/// Verifies that stored metadata matches what the current run expects.
///
/// # Errors
/// Fails when the stored dataset id differs from `dataset_kind.id()` or when
/// the stored `include_unanswerable` flag differs from `config.llm_mode`.
fn validate_metadata_fields(
    metadata: &super::DatasetMetadata,
    dataset_kind: DatasetKind,
    config: &Config,
) -> Result<()> {
    anyhow::ensure!(
        metadata.id == dataset_kind.id(),
        "converted dataset targets '{}', expected '{}'",
        metadata.id,
        dataset_kind.id()
    );
    anyhow::ensure!(
        metadata.include_unanswerable == config.llm_mode,
        "converted dataset include_unanswerable mismatch (expected {}, found {})",
        config.llm_mode,
        metadata.include_unanswerable
    );
    Ok(())
}
/// Resolves every slice the catalog declares for `dataset`.
///
/// A slice that fails to resolve is logged as a warning and skipped, so one
/// bad slice does not abort the others. Does nothing when the catalog entry
/// declares no slices.
///
/// # Errors
/// Fails only when the catalog cannot be loaded or does not know the dataset.
pub fn prebuild_catalog_slices(dataset: &ConvertedDataset, config: &Config) -> Result<()> {
    let dataset_catalog = catalog()?;
    let entry = dataset_catalog.dataset(dataset.metadata.id.as_str())?;
    if entry.slices.is_empty() {
        return Ok(());
    }

    info!(
        dataset = dataset.metadata.id.as_str(),
        slices = entry.slices.len(),
        "Prebuilding catalog slice ledgers"
    );

    for slice_entry in &entry.slices {
        let slice_config = slice_config_for_catalog_entry(config, slice_entry);
        let resolved = match slice::resolve_slice(dataset, &slice_config) {
            Ok(resolved) => resolved,
            Err(err) => {
                tracing::warn!(
                    slice = slice_entry.id.as_str(),
                    error = %err,
                    "Failed to prebuild catalog slice ledger"
                );
                continue;
            }
        };
        info!(
            slice = resolved.manifest.slice_id.as_str(),
            cases = resolved.manifest.case_count,
            positives = resolved.manifest.positive_paragraphs,
            negatives = resolved.manifest.negative_paragraphs,
            "Prebuilt catalog slice ledger"
        );
    }
    Ok(())
}
/// Builds the slice configuration for one catalog slice entry.
///
/// Seed, unanswerable-question mode and negative multiplier come from the
/// slice entry when it sets them and from `config` otherwise; limits always
/// come from the slice entry.
fn slice_config_for_catalog_entry<'a>(
    config: &'a Config,
    slice_entry: &'a super::SliceEntry,
) -> SliceConfig<'a> {
    let slice_seed = match slice_entry.seed {
        Some(seed) => seed,
        None => config.slice_seed,
    };
    let llm_mode = match slice_entry.include_unanswerable {
        Some(include) => include,
        None => config.llm_mode,
    };
    let negative_multiplier = match slice_entry.negative_multiplier {
        Some(multiplier) => multiplier,
        None => config.negative_multiplier,
    };

    SliceConfig {
        cache_dir: config.cache_dir.as_path(),
        force_convert: config.force_convert,
        explicit_slice: Some(slice_entry.id.as_str()),
        limit: slice_entry.limit,
        corpus_limit: slice_entry.corpus_limit,
        slice_seed,
        llm_mode,
        negative_multiplier,
        require_verified_chunks: config.retrieval.require_verified_chunks,
    }
}
+38 -143
View File
@@ -1,6 +1,10 @@
mod beir;
mod beir_mix;
mod checksum;
mod loader;
mod nq;
mod squad;
mod store;
use std::{
collections::{BTreeMap, HashMap},
@@ -20,38 +24,31 @@ const MANIFEST_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/manifest.yaml"
static DATASET_CATALOG: OnceCell<DatasetCatalog> = OnceCell::new();
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct DatasetCatalog {
datasets: BTreeMap<String, DatasetEntry>,
slices: HashMap<String, SliceLocation>,
default_dataset: String,
}
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct DatasetEntry {
pub metadata: DatasetMetadata,
pub raw_path: PathBuf,
pub converted_path: PathBuf,
pub include_unanswerable: bool,
pub slices: Vec<SliceEntry>,
}
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct SliceEntry {
pub id: String,
pub dataset_id: String,
pub label: String,
pub description: Option<String>,
pub limit: Option<usize>,
pub corpus_limit: Option<usize>,
pub include_unanswerable: Option<bool>,
pub seed: Option<u64>,
pub negative_multiplier: Option<f32>,
}
#[derive(Debug, Clone)]
#[allow(dead_code)]
struct SliceLocation {
dataset_id: String,
slice_index: usize,
@@ -59,7 +56,6 @@ struct SliceLocation {
#[derive(Debug, Deserialize)]
struct ManifestFile {
default_dataset: Option<String>,
datasets: Vec<ManifestDataset>,
}
@@ -81,6 +77,7 @@ struct ManifestDataset {
}
#[derive(Debug, Deserialize)]
#[allow(dead_code)]
struct ManifestSlice {
id: String,
label: String,
@@ -94,6 +91,8 @@ struct ManifestSlice {
include_unanswerable: Option<bool>,
#[serde(default)]
seed: Option<u64>,
#[serde(default)]
negative_multiplier: Option<f32>,
}
impl DatasetCatalog {
@@ -111,18 +110,19 @@ impl DatasetCatalog {
let raw_path = resolve_path(root, &dataset.raw);
let converted_path = resolve_path(root, &dataset.converted);
if !raw_path.exists() {
if !raw_path.exists() && dataset.id != "beir" {
bail!(
"dataset '{}' raw file missing at {}",
dataset.id,
raw_path.display()
);
}
if !converted_path.exists() {
let store_dir = store::store_dir_for(&converted_path);
if !converted_path.exists() && !store_dir.join("meta.json").is_file() {
warn!(
"dataset '{}' converted file missing at {}; the next conversion run will regenerate it",
"dataset '{}' converted store missing at {}; the next conversion run will regenerate it",
dataset.id,
converted_path.display()
store_dir.display()
);
}
@@ -139,7 +139,6 @@ impl DatasetCatalog {
.clone()
.unwrap_or_else(|| dataset.id.clone()),
include_unanswerable: dataset.include_unanswerable,
context_token_limit: None,
};
let mut entry_slices = Vec::with_capacity(dataset.slices.len());
@@ -154,12 +153,11 @@ impl DatasetCatalog {
entry_slices.push(SliceEntry {
id: manifest_slice.id.clone(),
dataset_id: dataset.id.clone(),
label: manifest_slice.label,
description: manifest_slice.description,
limit: manifest_slice.limit,
corpus_limit: manifest_slice.corpus_limit,
include_unanswerable: manifest_slice.include_unanswerable,
seed: manifest_slice.seed,
negative_multiplier: manifest_slice.negative_multiplier,
});
slices.insert(
manifest_slice.id,
@@ -176,22 +174,16 @@ impl DatasetCatalog {
metadata,
raw_path,
converted_path,
include_unanswerable: dataset.include_unanswerable,
slices: entry_slices,
},
);
}
let default_dataset = manifest
.default_dataset
.or_else(|| datasets.keys().next().cloned())
.ok_or_else(|| anyhow!("dataset manifest does not include any datasets"))?;
if datasets.is_empty() {
bail!("dataset manifest does not include any datasets");
}
Ok(Self {
datasets,
slices,
default_dataset,
})
Ok(Self { datasets, slices })
}
pub fn global() -> Result<&'static Self> {
@@ -204,12 +196,6 @@ impl DatasetCatalog {
.ok_or_else(|| anyhow!("unknown dataset '{id}' in manifest"))
}
#[allow(dead_code)]
pub fn default_dataset(&self) -> Result<&DatasetEntry> {
self.dataset(&self.default_dataset)
}
#[allow(dead_code)]
pub fn slice(&self, slice_id: &str) -> Result<(&DatasetEntry, &SliceEntry)> {
let location = self
.slices
@@ -236,20 +222,29 @@ fn resolve_path(root: &Path, value: &str) -> PathBuf {
}
}
pub use checksum::store_aggregate_checksum;
pub use beir_mix::{
beir_subset_store_summary, beir_subset_stores_ready, mix_content_checksum,
};
pub use loader::{prebuild_catalog_slices, prepare_dataset};
pub use store::{
content_checksum_for_layout, detect_layout, store_dir_for, write_sharded, ConvertedLayout,
};
pub fn catalog() -> Result<&'static DatasetCatalog> {
DatasetCatalog::global()
}
fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> {
pub(crate) fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> {
let catalog = catalog()?;
catalog.dataset(kind.id())
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Default)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, ValueEnum, Default)]
pub enum DatasetKind {
#[default]
SquadV2,
NaturalQuestions,
#[default]
Beir,
#[value(name = "fever")]
Fever,
@@ -416,16 +411,10 @@ pub struct DatasetMetadata {
pub source_prefix: String,
#[serde(default)]
pub include_unanswerable: bool,
#[serde(default)]
pub context_token_limit: Option<usize>,
}
impl DatasetMetadata {
pub fn for_kind(
kind: DatasetKind,
include_unanswerable: bool,
context_token_limit: Option<usize>,
) -> Self {
pub fn for_kind(kind: DatasetKind, include_unanswerable: bool) -> Self {
if let Ok(entry) = dataset_entry_for_kind(kind) {
return Self {
id: entry.metadata.id.clone(),
@@ -434,7 +423,6 @@ impl DatasetMetadata {
entity_suffix: entry.metadata.entity_suffix.clone(),
source_prefix: entry.metadata.source_prefix.clone(),
include_unanswerable,
context_token_limit,
};
}
@@ -445,13 +433,12 @@ impl DatasetMetadata {
entity_suffix: kind.entity_suffix().to_string(),
source_prefix: kind.source_prefix().to_string(),
include_unanswerable,
context_token_limit,
}
}
}
fn default_metadata() -> DatasetMetadata {
DatasetMetadata::for_kind(DatasetKind::default(), false, None)
DatasetMetadata::for_kind(DatasetKind::default(), false)
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -483,14 +470,15 @@ pub fn convert(
raw_path: &Path,
dataset: DatasetKind,
include_unanswerable: bool,
context_token_limit: Option<usize>,
) -> Result<ConvertedDataset> {
let paragraphs = match dataset {
DatasetKind::SquadV2 => squad::convert_squad(raw_path)?,
DatasetKind::NaturalQuestions => {
nq::convert_nq(raw_path, include_unanswerable, context_token_limit)?
DatasetKind::NaturalQuestions => nq::convert_nq(raw_path, include_unanswerable)?,
DatasetKind::Beir => {
bail!(
"BEIR mix is prepared via slice-first subset stores; use prepare_beir_mix instead of convert"
);
}
DatasetKind::Beir => convert_beir_mix(include_unanswerable, context_token_limit)?,
DatasetKind::Fever
| DatasetKind::Fiqa
| DatasetKind::HotpotQa
@@ -501,11 +489,6 @@ pub fn convert(
| DatasetKind::NqBeir => beir::convert_beir(raw_path, dataset)?,
};
let metadata_limit = match dataset {
DatasetKind::NaturalQuestions => None,
_ => context_token_limit,
};
let generated_at = match dataset {
DatasetKind::Beir
| DatasetKind::Fever
@@ -526,100 +509,12 @@ pub fn convert(
Ok(ConvertedDataset {
generated_at,
metadata: DatasetMetadata::for_kind(dataset, include_unanswerable, metadata_limit),
metadata: DatasetMetadata::for_kind(dataset, include_unanswerable),
source: source_label,
paragraphs,
})
}
fn convert_beir_mix(
include_unanswerable: bool,
_context_token_limit: Option<usize>,
) -> Result<Vec<ConvertedParagraph>> {
if include_unanswerable {
warn!("BEIR mix ignores include_unanswerable flag; all questions are answerable");
}
let mut paragraphs = Vec::new();
for subset in BEIR_DATASETS {
let entry = dataset_entry_for_kind(subset)?;
let subset_paragraphs = beir::convert_beir(&entry.raw_path, subset)?;
paragraphs.extend(subset_paragraphs);
}
Ok(paragraphs)
}
fn ensure_parent(path: &Path) -> Result<()> {
if let Some(parent) = path.parent() {
fs::create_dir_all(parent)
.with_context(|| format!("creating parent directory for {}", path.display()))?;
}
Ok(())
}
pub fn write_converted(dataset: &ConvertedDataset, converted_path: &Path) -> Result<()> {
ensure_parent(converted_path)?;
let json =
serde_json::to_string_pretty(dataset).context("serialising converted dataset to JSON")?;
fs::write(converted_path, json)
.with_context(|| format!("writing converted dataset to {}", converted_path.display()))
}
pub fn read_converted(converted_path: &Path) -> Result<ConvertedDataset> {
let raw = fs::read_to_string(converted_path)
.with_context(|| format!("reading converted dataset at {}", converted_path.display()))?;
let mut dataset: ConvertedDataset = serde_json::from_str(&raw)
.with_context(|| format!("parsing converted dataset at {}", converted_path.display()))?;
if dataset.metadata.id.trim().is_empty() {
dataset.metadata = default_metadata();
}
if dataset.source.is_empty() {
dataset.source = converted_path.display().to_string();
}
Ok(dataset)
}
pub fn ensure_converted(
dataset_kind: DatasetKind,
raw_path: &Path,
converted_path: &Path,
force: bool,
include_unanswerable: bool,
context_token_limit: Option<usize>,
) -> Result<ConvertedDataset> {
if force || !converted_path.exists() {
let dataset = convert(
raw_path,
dataset_kind,
include_unanswerable,
context_token_limit,
)?;
write_converted(&dataset, converted_path)?;
return Ok(dataset);
}
match read_converted(converted_path) {
Ok(dataset)
if dataset.metadata.id == dataset_kind.id()
&& dataset.metadata.include_unanswerable == include_unanswerable
&& dataset.metadata.context_token_limit == context_token_limit =>
{
Ok(dataset)
}
_ => {
let dataset = convert(
raw_path,
dataset_kind,
include_unanswerable,
context_token_limit,
)?;
write_converted(&dataset, converted_path)?;
Ok(dataset)
}
}
}
pub fn base_timestamp() -> DateTime<Utc> {
Utc.with_ymd_and_hms(2023, 1, 1, 0, 0, 0).unwrap()
}
+1 -5
View File
@@ -16,11 +16,7 @@ use super::{ConvertedParagraph, ConvertedQuestion};
clippy::arithmetic_side_effects,
clippy::cast_sign_loss
)]
pub fn convert_nq(
raw_path: &Path,
include_unanswerable: bool,
_context_token_limit: Option<usize>,
) -> Result<Vec<ConvertedParagraph>> {
pub fn convert_nq(raw_path: &Path, include_unanswerable: bool) -> Result<Vec<ConvertedParagraph>> {
#[allow(dead_code)]
#[derive(Debug, Deserialize)]
struct NqExample {
+410
View File
@@ -0,0 +1,410 @@
use std::{
collections::{HashMap, HashSet},
fs::{self, File, OpenOptions},
io::{BufRead, BufReader, Write},
path::{Path, PathBuf},
};
use anyhow::{anyhow, Context, Result};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use tracing::info;
use super::{
checksum::store_aggregate_checksum,
ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetMetadata,
};
use crate::slice;
/// Format version that `write_sharded` records in a store's `meta.json`.
pub const SHARDED_STORE_VERSION: u32 = 1;
/// Contents of a sharded store's `meta.json` file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardedMeta {
/// Store format version (`SHARDED_STORE_VERSION` when written by `write_sharded`).
pub version: u32,
/// Generation timestamp copied from the converted dataset.
pub generated_at: DateTime<Utc>,
/// Dataset metadata copied from the converted dataset.
pub metadata: DatasetMetadata,
/// Source label copied from the converted dataset.
pub source: String,
/// Number of paragraphs recorded for the store.
pub paragraph_count: usize,
/// Total number of questions across all recorded paragraphs.
pub question_count: usize,
}
/// One line of `questions.jsonl`: a question plus the id of its paragraph.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct QuestionRecord {
// Id of the paragraph this question belongs to.
paragraph_id: String,
// Flattened, so the question's own fields sit next to `paragraph_id` in the JSON object.
#[serde(flatten)]
question: ConvertedQuestion,
}
/// All question records read from a store's `questions.jsonl`, in file order.
#[derive(Debug, Clone)]
pub struct QuestionCatalog {
/// Parsed records, one per non-blank line of the file.
pub entries: Vec<QuestionRecord>,
}
/// On-disk state of a converted dataset, as reported by `detect_layout`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConvertedLayout {
/// The store directory contains a `meta.json` file.
ShardedStore,
/// No `meta.json` file was found in the store directory.
Missing,
}
/// Maps a converted-dataset path to the directory of its sharded store.
///
/// The store lives next to the converted file and is named after the file's
/// stem (`data/squad.json` -> `data/squad`). A path without a parent resolves
/// relative to `.`, and a path without a file stem uses the name `dataset`.
pub fn store_dir_for(converted_path: &Path) -> PathBuf {
    let store_name = match converted_path.file_stem() {
        Some(stem) => stem.to_string_lossy().into_owned(),
        None => "dataset".to_string(),
    };
    let containing_dir = match converted_path.parent() {
        Some(parent) => parent,
        None => Path::new("."),
    };
    containing_dir.join(store_name)
}
/// Reports whether a sharded store exists for `converted_path`.
///
/// A store counts as present exactly when its directory contains a
/// `meta.json` file.
pub fn detect_layout(converted_path: &Path) -> ConvertedLayout {
    let meta_file = store_dir_for(converted_path).join("meta.json");
    match meta_file.is_file() {
        true => ConvertedLayout::ShardedStore,
        false => ConvertedLayout::Missing,
    }
}
/// File name of a paragraph shard: the paragraph's storage key plus `.json`.
fn paragraph_file_name(paragraph_id: &str) -> String {
    let storage_key = slice::paragraph_storage_key(paragraph_id);
    format!("{storage_key}.json")
}
/// Full path of a paragraph's shard file inside `store_dir/paragraphs`.
pub fn paragraph_path(store_dir: &Path, paragraph_id: &str) -> PathBuf {
    let mut shard = store_dir.join("paragraphs");
    shard.push(paragraph_file_name(paragraph_id));
    shard
}
/// Writes `dataset` as a fresh sharded store under `store_dir`.
///
/// Any existing directory at `store_dir` is removed first. The store consists
/// of one JSON file per paragraph under `paragraphs/`, `paragraph_ids.jsonl`
/// (one id per line), `questions.jsonl` (one question record per line) and
/// `meta.json`.
///
/// Returns the aggregate checksum of the finished store.
///
/// # Errors
/// Fails when the directory cannot be cleared or created, when any file
/// cannot be serialised or written, or when the checksum cannot be computed.
pub fn write_sharded(dataset: &ConvertedDataset, store_dir: &Path) -> Result<String> {
    if store_dir.exists() {
        fs::remove_dir_all(store_dir)
            .with_context(|| format!("clearing sharded store {}", store_dir.display()))?;
    }
    fs::create_dir_all(store_dir.join("paragraphs"))
        .with_context(|| format!("creating sharded store {}", store_dir.display()))?;

    let question_count = dataset
        .paragraphs
        .iter()
        .map(|paragraph| paragraph.questions.len())
        .sum::<usize>();

    // Both index files receive many tiny writes (one per id / per JSON token),
    // so buffer them instead of issuing a syscall for each.
    let mut questions_file = std::io::BufWriter::new(
        File::create(store_dir.join("questions.jsonl"))
            .context("creating questions.jsonl for sharded store")?,
    );
    let mut paragraph_ids_file = std::io::BufWriter::new(
        File::create(store_dir.join("paragraph_ids.jsonl"))
            .context("creating paragraph_ids.jsonl for sharded store")?,
    );

    for paragraph in &dataset.paragraphs {
        writeln!(paragraph_ids_file, "{}", paragraph.id)
            .context("writing paragraph id to paragraph_ids.jsonl")?;

        for question in &paragraph.questions {
            let record = QuestionRecord {
                paragraph_id: paragraph.id.clone(),
                question: question.clone(),
            };
            serde_json::to_writer(&mut questions_file, &record)
                .context("writing question record to questions.jsonl")?;
            questions_file.write_all(b"\n")?;
        }

        let path = paragraph_path(store_dir, &paragraph.id);
        if let Some(parent) = path.parent() {
            fs::create_dir_all(parent)?;
        }
        fs::write(
            &path,
            serde_json::to_vec(paragraph).context("serialising sharded paragraph")?,
        )
        .with_context(|| format!("writing sharded paragraph {}", path.display()))?;
    }

    // Flush explicitly: dropping a BufWriter swallows write errors, and the
    // checksum below must see the complete file contents.
    questions_file
        .flush()
        .context("flushing questions.jsonl for sharded store")?;
    paragraph_ids_file
        .flush()
        .context("flushing paragraph_ids.jsonl for sharded store")?;
    drop(questions_file);
    drop(paragraph_ids_file);

    // `meta.json` is what `detect_layout` uses to decide that a store exists,
    // so it is written last: an interrupted write leaves no `meta.json` and
    // the store is treated as missing instead of silently incomplete.
    let meta = ShardedMeta {
        version: SHARDED_STORE_VERSION,
        generated_at: dataset.generated_at,
        metadata: dataset.metadata.clone(),
        source: dataset.source.clone(),
        paragraph_count: dataset.paragraphs.len(),
        question_count,
    };
    let meta_path = store_dir.join("meta.json");
    fs::write(
        &meta_path,
        serde_json::to_vec_pretty(&meta).context("serialising sharded store metadata")?,
    )
    .with_context(|| format!("writing sharded metadata {}", meta_path.display()))?;

    let digest = store_aggregate_checksum(store_dir)?;
    info!(
        store = %store_dir.display(),
        paragraphs = dataset.paragraphs.len(),
        questions = question_count,
        checksum = %digest,
        "Wrote sharded converted dataset"
    );
    Ok(digest)
}
/// Reads and parses `meta.json` from `store_dir`.
///
/// # Errors
/// Fails when the file cannot be read as text or is not valid metadata JSON.
pub fn read_meta(store_dir: &Path) -> Result<ShardedMeta> {
    let meta_path = store_dir.join("meta.json");
    let contents = fs::read_to_string(&meta_path)
        .with_context(|| format!("reading sharded metadata {}", meta_path.display()))?;
    let parsed = serde_json::from_str(&contents)
        .with_context(|| format!("parsing sharded metadata {}", meta_path.display()))?;
    Ok(parsed)
}
/// Aggregate checksum of the sharded store belonging to `converted_path`.
///
/// # Errors
/// Fails when no store exists for the path or the checksum cannot be computed.
pub fn content_checksum_for_layout(converted_path: &Path) -> Result<String> {
    match detect_layout(converted_path) {
        ConvertedLayout::Missing => Err(anyhow!(
            "converted dataset missing at {}",
            converted_path.display()
        )),
        ConvertedLayout::ShardedStore => {
            let store_dir = store_dir_for(converted_path);
            crate::datasets::store_aggregate_checksum(&store_dir)
        }
    }
}
/// Reads and parses the shard file of a single paragraph.
fn load_paragraph(store_dir: &Path, paragraph_id: &str) -> Result<ConvertedParagraph> {
    let shard = paragraph_path(store_dir, paragraph_id);
    let bytes = fs::read(&shard)
        .with_context(|| format!("reading sharded paragraph {}", shard.display()))?;
    let paragraph = serde_json::from_slice(&bytes)
        .with_context(|| format!("parsing sharded paragraph {}", shard.display()))?;
    Ok(paragraph)
}
/// Loads the listed paragraphs in the given order, stopping at the first failure.
fn load_paragraphs(store_dir: &Path, paragraph_ids: &[String]) -> Result<Vec<ConvertedParagraph>> {
    let mut loaded = Vec::with_capacity(paragraph_ids.len());
    for id in paragraph_ids {
        loaded.push(load_paragraph(store_dir, id)?);
    }
    Ok(loaded)
}
pub fn load_sharded_partial(store_dir: &Path, paragraph_ids: &[String]) -> Result<ConvertedDataset> {
let meta = read_meta(store_dir)?;
let mut paragraphs = load_paragraphs(store_dir, paragraph_ids)?;
paragraphs.sort_by(|left, right| left.id.cmp(&right.id));
Ok(ConvertedDataset {
generated_at: meta.generated_at,
metadata: meta.metadata,
source: meta.source,
paragraphs,
})
}
pub fn load_sharded_full(store_dir: &Path) -> Result<ConvertedDataset> {
let meta = read_meta(store_dir)?;
let ids = load_paragraph_ids(store_dir)?;
let paragraphs = load_paragraphs(store_dir, &ids)?;
Ok(ConvertedDataset {
generated_at: meta.generated_at,
metadata: meta.metadata,
source: meta.source,
paragraphs,
})
}
/// Same ids as `load_paragraph_ids`, collected into a set for membership tests.
pub fn load_paragraph_ids_set(store_dir: &Path) -> Result<HashSet<String>> {
    let mut unique_ids = HashSet::new();
    unique_ids.extend(load_paragraph_ids(store_dir)?);
    Ok(unique_ids)
}
/// Adds or overwrites paragraphs in an existing sharded store.
///
/// Every paragraph in `paragraphs` has its shard file (re)written. A paragraph
/// whose id is not yet in `paragraph_ids.jsonl` is also appended to the id
/// index and its questions to `questions.jsonl`. When anything new was
/// indexed, the counts in `meta.json` are increased and the aggregate checksum
/// is recomputed. Questions of already-indexed paragraphs are not re-appended.
///
/// An id that occurs more than once in `paragraphs` is indexed and counted
/// only once (on its first occurrence); the shard file ends up holding the
/// last occurrence.
///
/// # Errors
/// Fails when `store_dir` has no `meta.json`, or when reading the index,
/// writing any file, or recomputing the checksum fails.
// Counters are bounded by the number of paragraphs/questions being written.
#[allow(clippy::arithmetic_side_effects)]
pub fn upsert_sharded_paragraphs(
    store_dir: &Path,
    paragraphs: &[ConvertedParagraph],
) -> Result<()> {
    if paragraphs.is_empty() {
        return Ok(());
    }
    if !store_dir.join("meta.json").is_file() {
        return Err(anyhow!(
            "cannot upsert into missing sharded store at {}",
            store_dir.display()
        ));
    }
    fs::create_dir_all(store_dir.join("paragraphs"))
        .with_context(|| format!("creating paragraphs directory in {}", store_dir.display()))?;

    // Ids already indexed in the store; extended while the batch is processed
    // so that a repeated id within the batch is not indexed twice.
    let mut known_ids = load_paragraph_ids_set(store_dir)?;

    let questions_path = store_dir.join("questions.jsonl");
    // Buffered: each record is serialised as many small writes.
    let mut questions_file = std::io::BufWriter::new(
        OpenOptions::new()
            .create(true)
            .append(true)
            .open(&questions_path)
            .with_context(|| format!("opening question catalog {}", questions_path.display()))?,
    );
    // Opened lazily so the id index is untouched when nothing is new.
    let mut ids_file = None;
    let mut new_paragraphs = 0usize;
    let mut new_questions = 0usize;

    for paragraph in paragraphs {
        let path = paragraph_path(store_dir, &paragraph.id);
        if let Some(parent) = path.parent() {
            fs::create_dir_all(parent)?;
        }
        fs::write(
            &path,
            serde_json::to_vec(paragraph).context("serialising sharded paragraph")?,
        )
        .with_context(|| format!("writing sharded paragraph {}", path.display()))?;

        // `insert` returns false when the id was already stored or was seen
        // earlier in this batch; such paragraphs only get their shard rewritten.
        if !known_ids.insert(paragraph.id.clone()) {
            continue;
        }

        if ids_file.is_none() {
            ids_file = Some(std::io::BufWriter::new(
                OpenOptions::new()
                    .create(true)
                    .append(true)
                    .open(store_dir.join("paragraph_ids.jsonl"))
                    .context("opening paragraph_ids.jsonl for append")?,
            ));
        }
        if let Some(file) = ids_file.as_mut() {
            writeln!(file, "{}", paragraph.id).context("appending paragraph id")?;
        }
        new_paragraphs += 1;

        for question in &paragraph.questions {
            let record = QuestionRecord {
                paragraph_id: paragraph.id.clone(),
                question: question.clone(),
            };
            serde_json::to_writer(&mut questions_file, &record)
                .context("writing question record to questions.jsonl")?;
            questions_file.write_all(b"\n")?;
            new_questions += 1;
        }
    }

    // Flush before the checksum below re-reads these files; dropping a
    // BufWriter would silently discard a failed final write.
    questions_file
        .flush()
        .context("flushing questions.jsonl after upsert")?;
    if let Some(file) = ids_file.as_mut() {
        file.flush()
            .context("flushing paragraph_ids.jsonl after upsert")?;
    }
    drop(questions_file);
    drop(ids_file);

    if new_paragraphs > 0 || new_questions > 0 {
        let meta = read_meta(store_dir)?;
        let updated = ShardedMeta {
            paragraph_count: meta.paragraph_count + new_paragraphs,
            question_count: meta.question_count + new_questions,
            ..meta
        };
        fs::write(
            store_dir.join("meta.json"),
            serde_json::to_vec_pretty(&updated).context("serialising updated sharded metadata")?,
        )?;
        // Called for its effect on the store; the returned digest is not needed here.
        store_aggregate_checksum(store_dir)?;
        info!(
            store = %store_dir.display(),
            new_paragraphs,
            new_questions,
            "Upserted paragraphs into sharded converted store"
        );
    }
    Ok(())
}
/// Reads `paragraph_ids.jsonl`: one paragraph id per line, in file order.
///
/// Surrounding whitespace is trimmed from each id.
///
/// # Errors
/// Fails when the file cannot be opened or read, or when a line is blank.
pub fn load_paragraph_ids(store_dir: &Path) -> Result<Vec<String>> {
    let index_path = store_dir.join("paragraph_ids.jsonl");
    let index = File::open(&index_path)
        .with_context(|| format!("opening paragraph id index {}", index_path.display()))?;

    let mut ids = Vec::new();
    for line in BufReader::new(index).lines() {
        let line = line.context("reading paragraph id index line")?;
        let id = line.trim();
        if id.is_empty() {
            return Err(anyhow!("empty paragraph id in index"));
        }
        ids.push(id.to_string());
    }
    Ok(ids)
}
/// Reads every question record from `questions.jsonl`, skipping blank lines.
///
/// # Errors
/// Fails when the file cannot be opened or read, or when a non-blank line is
/// not a valid question record.
pub fn load_question_catalog(store_dir: &Path) -> Result<QuestionCatalog> {
    let catalog_path = store_dir.join("questions.jsonl");
    let catalog_file = File::open(&catalog_path)
        .with_context(|| format!("opening question catalog {}", catalog_path.display()))?;

    let entries = BufReader::new(catalog_file)
        .lines()
        .filter_map(|line| match line {
            Ok(text) if text.trim().is_empty() => None,
            Ok(text) => Some(
                serde_json::from_str::<QuestionRecord>(&text)
                    .context("parsing question catalog record"),
            ),
            Err(err) => {
                Some(Err::<QuestionRecord, _>(err).context("reading question catalog line"))
            }
        })
        .collect::<Result<Vec<_>>>()?;

    Ok(QuestionCatalog { entries })
}
pub fn build_dataset_from_catalog(
store_dir: &Path,
paragraph_ids: &HashSet<String>,
) -> Result<ConvertedDataset> {
let catalog = load_question_catalog(store_dir)?;
let mut questions_by_paragraph: HashMap<String, Vec<ConvertedQuestion>> = HashMap::new();
for entry in catalog.entries {
if paragraph_ids.contains(&entry.paragraph_id) {
questions_by_paragraph
.entry(entry.paragraph_id.clone())
.or_default()
.push(entry.question);
}
}
let mut dataset = load_sharded_partial(
store_dir,
&paragraph_ids.iter().cloned().collect::<Vec<_>>(),
)?;
for paragraph in &mut dataset.paragraphs {
if let Some(questions) = questions_by_paragraph.remove(&paragraph.id) {
paragraph.questions = questions;
} else {
paragraph.questions.clear();
}
}
Ok(dataset)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::datasets::{DatasetKind, DatasetMetadata};
// Minimal dataset: one paragraph ("p1") carrying one answerable question ("q1").
fn sample_dataset() -> ConvertedDataset {
ConvertedDataset {
generated_at: Utc::now(),
metadata: DatasetMetadata::for_kind(DatasetKind::SquadV2, false),
source: "test".to_string(),
paragraphs: vec![ConvertedParagraph {
id: "p1".to_string(),
title: "Title".to_string(),
context: "Body".to_string(),
questions: vec![ConvertedQuestion {
id: "q1".to_string(),
question: "Question?".to_string(),
answers: vec!["Answer".to_string()],
is_impossible: false,
}],
}],
}
}
// Writes the sample dataset as a sharded store and checks that a full load
// returns the same single paragraph and question.
#[test]
#[allow(clippy::indexing_slicing)]
fn sharded_round_trip() -> Result<()> {
let dir = tempfile::tempdir()?;
let store_dir = dir.path().join("sample");
let dataset = sample_dataset();
write_sharded(&dataset, &store_dir)?;
let loaded = load_sharded_full(&store_dir)?;
assert_eq!(loaded.paragraphs.len(), 1);
assert_eq!(loaded.paragraphs[0].questions[0].id, "q1");
Ok(())
}
}
@@ -1,22 +1,22 @@
//! Database namespace management utilities.
use anyhow::{anyhow, Context, Result};
use chrono::Utc;
use common::storage::{
db::SurrealDbClient,
types::user::{Theme, User},
types::StoredObject,
use common::{
storage::{
db::SurrealDbClient,
types::user::{Theme, User},
types::StoredObject,
},
utils::embedding::EmbeddingProvider,
};
use serde::Deserialize;
use tracing::{info, warn};
use crate::{
args::Config,
corpus::{self, CorpusHandle, CorpusManifest, NamespaceSeedRecord},
datasets,
snapshot::{self, DbSnapshotState},
};
/// Connect to the evaluation database with fallback auth strategies.
pub(crate) async fn connect_eval_db(
config: &Config,
namespace: &str,
@@ -73,7 +73,6 @@ pub(crate) async fn connect_eval_db(
}
}
/// Check if the namespace contains any corpus data.
pub(crate) async fn namespace_has_corpus(db: &SurrealDbClient) -> Result<bool> {
#[derive(Deserialize)]
struct CountRow {
@@ -89,41 +88,52 @@ pub(crate) async fn namespace_has_corpus(db: &SurrealDbClient) -> Result<bool> {
Ok(rows.first().map_or(0, |row| row.count) > 0)
}
/// Determine if we can reuse an existing namespace based on cached state.
/// Tells whether a corpus manifest was produced with the current runtime setup.
///
/// The manifest matches when its recorded ingestion fingerprint equals
/// `ingestion_fingerprint` and its recorded embedding backend, model and
/// dimension equal what `embedding_provider` reports.
fn manifest_matches_runtime(
    manifest: &CorpusManifest,
    embedding_provider: &EmbeddingProvider,
    ingestion_fingerprint: &str,
) -> bool {
    let recorded = &manifest.metadata;

    if recorded.ingestion_fingerprint != ingestion_fingerprint {
        return false;
    }
    if recorded.embedding_backend != embedding_provider.backend_label() {
        return false;
    }
    if recorded.embedding_model != embedding_provider.model_code() {
        return false;
    }
    recorded.embedding_dimension == embedding_provider.dimension()
}
#[allow(clippy::too_many_arguments)]
pub(crate) async fn can_reuse_namespace(
db: &SurrealDbClient,
descriptor: &snapshot::Descriptor,
manifest: &CorpusManifest,
embedding_provider: &EmbeddingProvider,
namespace: &str,
database: &str,
dataset_id: &str,
slice_id: &str,
ingestion_fingerprint: &str,
slice_case_count: usize,
) -> Result<bool> {
let Some(state) = descriptor.load_db_state().await? else {
info!("No namespace state recorded; reseeding corpus from cached shards");
if !manifest_matches_runtime(manifest, embedding_provider, ingestion_fingerprint) {
info!("Corpus manifest metadata mismatch; rebuilding namespace from cached shards");
return Ok(false);
}
let Some(seed) = manifest.metadata.namespace_seed.as_ref() else {
info!("No namespace seed recorded in corpus manifest; reseeding");
return Ok(false);
};
if state.slice_case_count != slice_case_count {
if seed.slice_case_count != slice_case_count {
info!(
requested_cases = slice_case_count,
stored_cases = state.slice_case_count,
"Skipping live namespace reuse; cached state does not match requested window"
stored_cases = seed.slice_case_count,
"Skipping namespace reuse; case window mismatch"
);
return Ok(false);
}
if state.dataset_id != dataset_id
|| state.slice_id != slice_id
|| state.ingestion_fingerprint != ingestion_fingerprint
|| state.namespace.as_deref() != Some(namespace)
|| state.database.as_deref() != Some(database)
{
if seed.namespace != namespace || seed.database != database {
info!(
namespace,
database, "Cached namespace metadata mismatch; rebuilding corpus from ingestion cache"
database,
"Corpus manifest namespace metadata mismatch; reseeding"
);
return Ok(false);
}
@@ -140,28 +150,20 @@ pub(crate) async fn can_reuse_namespace(
}
}
/// Record the current namespace state to allow future reuse checks.
pub(crate) async fn record_namespace_state(
descriptor: &snapshot::Descriptor,
dataset_id: &str,
slice_id: &str,
ingestion_fingerprint: &str,
pub(crate) async fn record_namespace_seed(
handle: &mut CorpusHandle,
namespace: &str,
database: &str,
slice_case_count: usize,
) {
let state = DbSnapshotState {
dataset_id: dataset_id.to_string(),
slice_id: slice_id.to_string(),
ingestion_fingerprint: ingestion_fingerprint.to_string(),
snapshot_hash: descriptor.metadata_hash().to_string(),
updated_at: Utc::now(),
namespace: Some(namespace.to_string()),
database: Some(database.to_string()),
handle.manifest.metadata.namespace_seed = Some(NamespaceSeedRecord {
namespace: namespace.to_string(),
database: database.to_string(),
slice_case_count,
};
if let Err(err) = descriptor.store_db_state(&state).await {
warn!(error = %err, "Failed to record namespace state");
seeded_at: Utc::now(),
});
if let Err(err) = corpus::persist_corpus_manifest(handle) {
warn!(error = %err, "Failed to record namespace seed in corpus manifest");
}
}
@@ -185,8 +187,17 @@ fn sanitize_identifier(input: &str) -> String {
cleaned
}
/// Generate a default namespace name based on dataset and limit.
pub(crate) fn default_namespace(dataset_id: &str, limit: Option<usize>) -> String {
pub(crate) fn default_namespace(
dataset_id: &str,
limit: Option<usize>,
slice_id: Option<&str>,
) -> String {
if let Some(slice_id) = slice_id {
let sanitized = sanitize_identifier(slice_id);
if !sanitized.is_empty() {
return format!("eval_{sanitized}");
}
}
let dataset_component = sanitize_identifier(dataset_id);
let limit_component = match limit {
Some(value) if value > 0 => format!("limit{value}"),
@@ -195,12 +206,10 @@ pub(crate) fn default_namespace(dataset_id: &str, limit: Option<usize>) -> Strin
format!("eval_{dataset_component}_{limit_component}")
}
/// Generate the default database name for evaluations.
pub(crate) fn default_database() -> String {
"retrieval_eval".to_string()
}
/// Ensure the evaluation user exists in the database.
pub(crate) async fn ensure_eval_user(db: &SurrealDbClient) -> Result<User> {
let timestamp = datasets::base_timestamp();
let user = User {
@@ -225,3 +234,7 @@ pub(crate) async fn ensure_eval_user(db: &SurrealDbClient) -> Result<User> {
.context("storing evaluation user")?;
Ok(user)
}
pub(crate) fn sanitize_model_code(code: &str) -> String {
sanitize_identifier(code)
}
@@ -2,13 +2,6 @@ use anyhow::{Context, Result};
use common::storage::{db::SurrealDbClient, indexes::ensure_runtime};
use tracing::info;
// Helper functions for index management during namespace reseed
/// Placeholder for dropping indexes before a namespace reseed.
///
/// Currently a no-op: the `db` handle is ignored, one log line is emitted, and `Ok(())` is
/// always returned.
pub async fn remove_all_indexes(db: &SurrealDbClient) -> Result<()> {
// Explicitly discard the handle; nothing is executed against the database.
let _ = db;
info!("Removing ALL indexes before namespace reseed (no-op placeholder)");
Ok(())
}
pub async fn recreate_indexes(db: &SurrealDbClient, dimension: usize) -> Result<()> {
info!("Recreating ALL indexes after namespace reseed via shared runtime helper");
ensure_runtime(db, dimension)
@@ -34,14 +27,39 @@ pub async fn reset_namespace(db: &SurrealDbClient, namespace: &str, database: &s
Ok(())
}
// // Test helper to force index dimension change
// #[allow(dead_code)]
// pub async fn change_embedding_length_in_hnsw_indexes(
// db: &SurrealDbClient,
// dimension: usize,
// ) -> Result<()> {
// recreate_indexes(db, dimension).await
// }
/// Run one nearest-neighbour probe against each embedding table to warm the HNSW caches.
///
/// A deterministic probe vector of length `dimension` (element `i` is `sin(i)`) is bound as
/// `$embedding` for both queries. Query results are discarded; a failure to execute either
/// query is returned with context naming the table being warmed.
#[allow(clippy::cast_precision_loss)]
pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> Result<()> {
    // The query text is kept byte-for-byte, including its line breaks.
    const CHUNK_PROBE_QUERY: &str = r#"SELECT chunk_id
FROM text_chunk_embedding
WHERE embedding <|1,1|> $embedding
LIMIT 5"#;
    const ENTITY_PROBE_QUERY: &str = r#"SELECT entity_id
FROM knowledge_entity_embedding
WHERE embedding <|1,1|> $embedding
LIMIT 5"#;

    let probe: Vec<f32> = (0..dimension).map(|idx| (idx as f32).sin()).collect();
    info!("Warming HNSW caches with sample queries");

    // Chunk embeddings first; the probe is cloned because `bind` takes ownership.
    let chunk_probe = db
        .client
        .query(CHUNK_PROBE_QUERY)
        .bind(("embedding", probe.clone()));
    let _ = chunk_probe
        .await
        .context("warming text chunk HNSW cache")?;

    // Entity embeddings second; the probe vector is moved into this final bind.
    let entity_probe = db
        .client
        .query(ENTITY_PROBE_QUERY)
        .bind(("embedding", probe));
    let _ = entity_probe
        .await
        .context("warming knowledge entity HNSW cache")?;

    info!("HNSW cache warming completed");
    Ok(())
}
#[cfg(test)]
mod tests {
+9
View File
@@ -0,0 +1,9 @@
mod connect;
mod lifecycle;
pub(crate) use connect::{
can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user,
namespace_has_corpus, record_namespace_seed, sanitize_model_code,
};
pub use lifecycle::{recreate_indexes, reset_namespace};
pub(crate) use lifecycle::warm_hnsw_cache;
-128
View File
@@ -1,128 +0,0 @@
//! Evaluation utilities module - re-exports from focused submodules.
// Re-export types from the root types module
pub use crate::types::*;
// Re-export from focused modules at crate root (crate-internal only)
pub(crate) use crate::cases::{cases_from_manifest, SeededCase};
pub(crate) use crate::namespace::{
can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user,
record_namespace_state,
};
pub(crate) use crate::settings::{enforce_system_settings, load_or_init_system_settings};
use std::path::Path;
use anyhow::{Context, Result};
use common::storage::db::SurrealDbClient;
use tokio::io::AsyncWriteExt;
use tracing::info;
use crate::{
args::{self, Config},
datasets::ConvertedDataset,
slice::{self},
};
/// Resolve the dataset slice at the ledger target size and report what it contains.
///
/// The target comes from `ledger_target(config)`. After the slice is resolved, its manifest
/// counts are logged and also printed to stdout.
///
/// # Errors
/// Returns an error (with context `resolving dataset slice`) when the slice cannot be resolved.
pub fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> {
    let target = ledger_target(config);
    let settings = slice::slice_config_with_limit(config, target);
    let resolved =
        slice::resolve_slice(dataset, &settings).context("resolving dataset slice")?;
    let manifest = &resolved.manifest;

    info!(
        slice = manifest.slice_id.as_str(),
        cases = manifest.case_count,
        positives = manifest.positive_paragraphs,
        negatives = manifest.negative_paragraphs,
        total_paragraphs = manifest.total_paragraphs,
        "Slice ledger ready"
    );
    println!(
        "Slice `{}` now contains {} questions ({} positives, {} negatives)",
        manifest.slice_id,
        manifest.case_count,
        manifest.positive_paragraphs,
        manifest.negative_paragraphs
    );

    Ok(())
}
/// Compute how many cases the slice ledger should hold.
///
/// With `slice_grow` set, the result is the larger of `slice_grow` and `limit` (or just
/// `slice_grow` when no limit is given). Without `slice_grow`, the limit is passed through
/// unchanged, which may be `None`.
pub(crate) fn ledger_target(config: &Config) -> Option<usize> {
    let Some(grow) = config.slice_grow else {
        return config.limit;
    };
    Some(config.limit.map_or(grow, |limit| limit.max(grow)))
}
/// Write per-case chunk diagnostics to `path` as JSON Lines (one serialized case per line).
///
/// `args::ensure_parent` is invoked on the path first (presumably creating the parent
/// directory — confirm against its definition), then the file is created, replacing any
/// existing content.
///
/// # Errors
/// Returns an error if the parent check fails, the file cannot be created, a case cannot be
/// serialized, or writing/flushing fails.
pub(crate) async fn write_chunk_diagnostics(path: &Path, cases: &[CaseDiagnostics]) -> Result<()> {
    args::ensure_parent(path)?;
    let file = tokio::fs::File::create(path)
        .await
        .with_context(|| format!("creating diagnostics file {}", path.display()))?;

    // `tokio::fs::File` dispatches every write to the blocking pool; buffering turns two
    // small writes per case into a few large ones.
    let mut writer = tokio::io::BufWriter::new(file);
    for case in cases {
        let mut line = serde_json::to_vec(case).context("serialising chunk diagnostics entry")?;
        line.push(b'\n');
        writer
            .write_all(&line)
            .await
            .with_context(|| format!("writing diagnostics file {}", path.display()))?;
    }

    // The explicit flush is required: dropping a `BufWriter` does not write buffered bytes.
    writer
        .flush()
        .await
        .with_context(|| format!("flushing diagnostics file {}", path.display()))?;
    Ok(())
}
/// Run one nearest-neighbour probe against each embedding table to warm the HNSW caches.
///
/// A deterministic dummy vector of length `dimension` (element `i` is `sin(i)`) is bound as
/// `$embedding` for both queries. Query results are discarded; a failure to execute either
/// query is returned with context naming the table being warmed.
// The `usize -> f32` cast below may lose precision; that is acceptable for a dummy probe vector.
#[allow(clippy::cast_precision_loss)]
pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> Result<()> {
let dummy_embedding: Vec<f32> = (0..dimension).map(|i| (i as f32).sin()).collect();
info!("Warming HNSW caches with sample queries");
// Probe the chunk embedding table; the query is issued only for its side effect of touching
// the index (presumably forcing the HNSW structure to load — confirm against SurrealDB
// behaviour). The vector is cloned because it is needed again for the second query.
let _ = db
.client
.query(
r#"SELECT chunk_id
FROM text_chunk_embedding
WHERE embedding <|1,1|> $embedding
LIMIT 5"#,
)
.bind(("embedding", dummy_embedding.clone()))
.await
.context("warming text chunk HNSW cache")?;
// Probe the entity embedding table the same way; the dummy vector is moved into this bind.
let _ = db
.client
.query(
r#"SELECT entity_id
FROM knowledge_entity_embedding
WHERE embedding <|1,1|> $embedding
LIMIT 5"#,
)
.bind(("embedding", dummy_embedding))
.await
.context("warming knowledge entity HNSW cache")?;
info!("HNSW cache warming completed");
Ok(())
}
use chrono::{DateTime, SecondsFormat, Utc};
/// Format a UTC timestamp as an RFC 3339 string with whole-second precision.
///
/// The `true` argument asks chrono to render the UTC offset as `Z`.
pub fn format_timestamp(timestamp: &DateTime<Utc>) -> String {
timestamp.to_rfc3339_opts(SecondsFormat::Secs, true)
}
/// Normalize a model code into lowercase ASCII letters, digits and underscores.
///
/// ASCII letters are lowercased and ASCII digits are kept. Every other character, including
/// any non-ASCII character, is replaced by a single `_`.
pub(crate) fn sanitize_model_code(code: &str) -> String {
    let mut sanitized = String::with_capacity(code.len());
    for ch in code.chars() {
        let mapped = match ch {
            'a'..='z' | '0'..='9' => ch,
            'A'..='Z' => ch.to_ascii_lowercase(),
            _ => '_',
        };
        sanitized.push(mapped);
    }
    sanitized
}
// Re-export run_evaluation from the pipeline module at crate root
pub use crate::pipeline::run_evaluation;
+19 -51
View File
@@ -1,13 +1,13 @@
use std::{
collections::HashMap,
fs,
path::{Path, PathBuf},
path::Path,
};
use anyhow::{anyhow, Context, Result};
use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk};
use crate::{args::Config, corpus, eval::connect_eval_db, snapshot::DbSnapshotState};
use crate::{args::Config, corpus, db::connect_eval_db};
pub async fn inspect_question(config: &Config) -> Result<()> {
let question_id = config
@@ -64,39 +64,26 @@ pub async fn inspect_question(config: &Config) -> Result<()> {
);
}
let db_state_path = config
.database
.inspect_db_state
.clone()
.unwrap_or_else(|| default_state_path(config, &manifest));
if let Some(state) = load_db_state(&db_state_path)? {
if let (Some(ns), Some(db_name)) = (state.namespace.as_deref(), state.database.as_deref()) {
match connect_eval_db(config, ns, db_name).await {
Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? {
MissingChunks::None => println!(
"All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'"
),
MissingChunks::Missing(list) => println!(
"Missing chunks in namespace '{ns}', database '{db_name}': {list:?}"
),
},
Err(err) => {
println!(
"Failed to connect to SurrealDB namespace '{ns}' / database '{db_name}': {err}"
);
}
if let Some(seed) = manifest.metadata.namespace_seed.as_ref() {
let ns = seed.namespace.as_str();
let db_name = seed.database.as_str();
match connect_eval_db(config, ns, db_name).await {
Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? {
MissingChunks::None => println!(
"All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'"
),
MissingChunks::Missing(list) => println!(
"Missing chunks in namespace '{ns}', database '{db_name}': {list:?}"
),
},
Err(err) => {
println!(
"Failed to connect to SurrealDB namespace '{ns}' / database '{db_name}': {err}"
);
}
} else {
println!(
"State file {} is missing namespace/database fields; skipping live DB validation",
db_state_path.display()
);
}
} else {
println!(
"State file {} not found; skipping live DB validation",
db_state_path.display()
);
println!("Corpus manifest has no namespace seed; skipping live DB validation");
}
Ok(())
@@ -137,25 +124,6 @@ fn build_chunk_lookup(manifest: &corpus::CorpusManifest) -> HashMap<String, Chun
lookup
}
/// Build the default location of the DB state file for a corpus manifest:
/// `<cache_dir>/snapshots/<dataset_id>/<slice_id>/db/state.json`.
fn default_state_path(config: &Config, manifest: &corpus::CorpusManifest) -> PathBuf {
    let metadata = &manifest.metadata;
    let mut state_path = config.cache_dir.join("snapshots");
    state_path.push(&metadata.dataset_id);
    state_path.push(&metadata.slice_id);
    state_path.push("db/state.json");
    state_path
}
/// Load the persisted DB snapshot state from `path`.
///
/// Returns `Ok(None)` when `path.exists()` reports false, and `Ok(Some(state))` when the file
/// is read and parsed as JSON.
///
/// # Errors
/// Returns an error naming the path when the file cannot be read or its contents do not parse.
fn load_db_state(path: &Path) -> Result<Option<DbSnapshotState>> {
    if path.exists() {
        let raw = fs::read(path).with_context(|| format!("reading db state {}", path.display()))?;
        serde_json::from_slice(&raw)
            .with_context(|| format!("parsing db state {}", path.display()))
            .map(Some)
    } else {
        Ok(None)
    }
}
enum MissingChunks {
None,
Missing(Vec<String>),
+51 -44
View File
@@ -1,19 +1,17 @@
mod args;
mod cache;
mod context_stats;
mod cases;
mod cli;
mod corpus;
mod datasets;
mod db_helpers;
mod eval;
mod db;
mod inspection;
mod namespace;
mod openai;
mod perf;
mod pipeline;
mod report;
mod settings;
mod slice;
mod snapshot;
mod types;
use anyhow::Context;
@@ -24,7 +22,6 @@ use tracing_subscriber::{fmt, EnvFilter};
/// Configure `SurrealDB` environment variables for optimal performance
#[allow(clippy::arithmetic_side_effects, clippy::unwrap_used)]
fn configure_surrealdb_performance(cpu_count: usize) {
// Set environment variables only if they're not already set
let indexing_batch_size = std::env::var("SURREAL_INDEXING_BATCH_SIZE")
.unwrap_or_else(|_| (cpu_count * 2).to_string());
std::env::set_var("SURREAL_INDEXING_BATCH_SIZE", indexing_batch_size);
@@ -62,12 +59,11 @@ fn configure_surrealdb_performance(cpu_count: usize) {
}
fn main() -> anyhow::Result<()> {
// Create an explicit multi-threaded runtime with optimized configuration
let runtime = Builder::new_multi_thread()
.enable_all()
.worker_threads(std::thread::available_parallelism()?.get())
.max_blocking_threads(std::thread::available_parallelism()?.get())
.thread_stack_size(10 * 1024 * 1024) // 10MiB stack size
.thread_stack_size(10 * 1024 * 1024)
.thread_name("eval-retrieval-worker")
.build()
.context("failed to create tokio runtime")?;
@@ -77,7 +73,6 @@ fn main() -> anyhow::Result<()> {
#[allow(clippy::too_many_lines)]
async fn async_main() -> anyhow::Result<()> {
// Log runtime configuration
let cpu_count = std::thread::available_parallelism()?.get();
info!(
cpu_cores = cpu_count,
@@ -87,7 +82,6 @@ async fn async_main() -> anyhow::Result<()> {
"Started multi-threaded tokio runtime"
);
// Configure SurrealDB environment variables for better performance
configure_surrealdb_performance(cpu_count);
let filter = std::env::var("RUST_LOG").unwrap_or_else(|_| "info".to_string());
@@ -97,13 +91,22 @@ async fn async_main() -> anyhow::Result<()> {
let parsed = args::parse()?;
// Clap handles help automatically, so we don't need to check for it manually
if parsed.config.inspect_question.is_some() {
inspection::inspect_question(&parsed.config).await?;
return Ok(());
}
if parsed.config.status {
let status = cli::collect_status(&parsed.config).await?;
cli::print_status(&status);
return Ok(());
}
if parsed.config.warm {
cli::warm(&parsed.config).await?;
return Ok(());
}
let dataset_kind = parsed.config.dataset;
if parsed.config.convert_only {
@@ -115,7 +118,6 @@ async fn async_main() -> anyhow::Result<()> {
parsed.config.raw_dataset_path.as_path(),
dataset_kind,
parsed.config.llm_mode,
parsed.config.context_token_limit(),
)
.with_context(|| {
format!(
@@ -124,56 +126,56 @@ async fn async_main() -> anyhow::Result<()> {
parsed.config.raw_dataset_path.display()
)
})?;
crate::datasets::write_converted(&dataset, parsed.config.converted_dataset_path.as_path())
.with_context(|| {
format!(
"writing converted dataset to {}",
parsed.config.converted_dataset_path.display()
)
})?;
let store_dir = datasets::store_dir_for(&parsed.config.converted_dataset_path);
datasets::write_sharded(&dataset, &store_dir)?;
datasets::prebuild_catalog_slices(&dataset, &parsed.config)?;
println!(
"Converted dataset written to {}",
parsed.config.converted_dataset_path.display()
"Converted dataset written under {}",
store_dir.display()
);
return Ok(());
}
if parsed.config.require_ready {
cli::ensure_query_ready(&parsed.config).await?;
}
info!(dataset = dataset_kind.id(), "Preparing converted dataset");
let dataset = crate::datasets::ensure_converted(
dataset_kind,
parsed.config.raw_dataset_path.as_path(),
parsed.config.converted_dataset_path.as_path(),
parsed.config.force_convert,
parsed.config.llm_mode,
parsed.config.context_token_limit(),
)
.with_context(|| {
format!(
"preparing converted dataset at {}",
parsed.config.converted_dataset_path.display()
)
})?;
let loaded = crate::datasets::prepare_dataset(dataset_kind, &parsed.config).with_context(
|| {
format!(
"preparing converted dataset at {}",
parsed.config.converted_dataset_path.display()
)
},
)?;
info!(
questions = dataset
questions = loaded
.dataset
.paragraphs
.iter()
.map(|p| p.questions.len())
.sum::<usize>(),
paragraphs = dataset.paragraphs.len(),
dataset = dataset.metadata.id.as_str(),
paragraphs = loaded.dataset.paragraphs.len(),
partial = loaded.partial,
dataset = loaded.dataset.metadata.id.as_str(),
"Dataset ready"
);
if parsed.config.slice_grow.is_some() {
eval::grow_slice(&dataset, &parsed.config).context("growing slice ledger")?;
slice::grow_slice(&loaded.dataset, &parsed.config).context("growing slice ledger")?;
return Ok(());
}
info!("Running retrieval evaluation");
let summary = eval::run_evaluation(&dataset, &parsed.config)
.await
.context("running retrieval evaluation")?;
let summary = pipeline::run_evaluation(
&loaded.dataset,
&parsed.config,
Some(loaded.content_checksum.as_str()),
)
.await
.context("running retrieval evaluation")?;
let report = report::write_reports(
&summary,
@@ -226,12 +228,17 @@ async fn async_main() -> anyhow::Result<()> {
);
} else {
println!(
"[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) → JSON: {json} | Markdown: {md} | History: {history}{perf_note}",
"[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) | Retrieved context: {chunks} chunks, {tokens} tokens ({tokenizer}, avg {avg_tokens:.0}/query, p95 {p95}) → JSON: {json} | Markdown: {md} | History: {history}{perf_note}",
summary.dataset_label,
k = summary.k,
precision = summary.precision,
correct = summary.correct,
retrieval_total = summary.retrieval_cases,
chunks = summary.retrieved_context.total_chunks,
tokens = summary.retrieved_context.total_tokens,
tokenizer = summary.retrieved_context.tokenizer,
avg_tokens = summary.retrieved_context.avg_tokens_per_query,
p95 = summary.retrieved_context.p95_tokens_per_query,
json = report.paths.json.display(),
md = report.paths.markdown.display(),
history = report.history_path.display(),
+19 -1
View File
@@ -1,9 +1,27 @@
use std::sync::Arc;
use anyhow::{Context, Result};
use async_openai::{config::OpenAIConfig, Client};
const DEFAULT_BASE_URL: &str = "https://api.openai.com/v1";
pub fn build_client_from_env() -> Result<(Client<OpenAIConfig>, String)> {
/// Build the OpenAI client handed to ingestion, together with its base URL when one applies.
///
/// * `include_entities == true`: the client comes from `build_client_from_env`, and the base
///   URL it reports is returned as `Some(..)`.
/// * `include_entities == false`: a client with `OpenAIConfig::default()` is returned and the
///   base URL is `None`; the environment is not consulted.
///
/// # Errors
/// Only the `include_entities` path can fail, when `build_client_from_env` returns an error.
pub fn ingestion_openai_client(
    include_entities: bool,
) -> Result<(Arc<Client<OpenAIConfig>>, Option<String>)> {
    if !include_entities {
        let placeholder = Client::with_config(OpenAIConfig::default());
        return Ok((Arc::new(placeholder), None));
    }

    let (client, base_url) = build_client_from_env().context(
        "OPENAI_API_KEY must be set when --include-entities is enabled (entity extraction uses OpenAI)",
    )?;
    Ok((Arc::new(client), Some(base_url)))
}
fn build_client_from_env() -> Result<(Client<OpenAIConfig>, String)> {
let api_key = std::env::var("OPENAI_API_KEY")
.context("OPENAI_API_KEY must be set to run retrieval evaluations")?;
let base_url =
+8 -7
View File
@@ -7,8 +7,8 @@ use anyhow::{Context, Result};
use crate::{
args,
eval::EvaluationSummary,
report::{self, EvaluationReport},
types::EvaluationSummary,
};
pub fn mirror_perf_outputs(
@@ -91,23 +91,23 @@ fn format_duration(value: Option<u128>) -> String {
#[cfg(test)]
mod tests {
use super::*;
use crate::eval::{EvaluationStageTimings, PerformanceTimings};
use crate::types::{EvaluationStageTimings, PerformanceTimings, LatencyStats, StageLatency, StageLatencyBreakdown};
use chrono::Utc;
use tempfile::tempdir;
fn sample_latency() -> crate::eval::LatencyStats {
crate::eval::LatencyStats {
fn sample_latency() -> LatencyStats {
LatencyStats {
avg: 10.0,
p50: 8,
p95: 15,
}
}
fn sample_stage_latency() -> crate::eval::StageLatencyBreakdown {
crate::eval::StageLatencyBreakdown {
fn sample_stage_latency() -> StageLatencyBreakdown {
StageLatencyBreakdown {
stages: ["embed", "search", "rerank", "resolve_entities", "assemble"]
.into_iter()
.map(|stage| crate::eval::StageLatency {
.map(|stage| StageLatency {
stage: stage.to_string(),
stats: sample_latency(),
})
@@ -206,6 +206,7 @@ mod tests {
chunk_vector_take: 20,
chunk_fts_take: 20,
max_chunks_per_entity: 4,
retrieved_context: crate::context_stats::aggregate_context_stats(&[]),
cases: Vec::new(),
}
}
+14 -14
View File
@@ -20,11 +20,11 @@ use retrieval_pipeline::{
use crate::{
args::Config,
cache::EmbeddingCache,
cases::SeededCase,
corpus,
datasets::ConvertedDataset,
eval::{CaseDiagnostics, CaseSummary, EvaluationStageTimings, EvaluationSummary, SeededCase},
slice, snapshot,
slice,
types::{CaseDiagnostics, CaseSummary, EvaluationStageTimings, EvaluationSummary},
};
#[allow(clippy::struct_excessive_bools)]
@@ -41,12 +41,10 @@ pub(super) struct EvaluationContext<'a> {
pub namespace: String,
pub database: String,
pub db: Option<SurrealDbClient>,
pub descriptor: Option<snapshot::Descriptor>,
pub settings: Option<SystemSettings>,
pub settings_missing: bool,
pub must_reapply_settings: bool,
pub embedding_provider: Option<EmbeddingProvider>,
pub embedding_cache: Option<EmbeddingCache>,
pub openai_client: Option<Arc<Client<async_openai::config::OpenAIConfig>>>,
pub openai_base_url: Option<String>,
pub expected_fingerprint: Option<String>,
@@ -67,13 +65,19 @@ pub(super) struct EvaluationContext<'a> {
pub summary: Option<EvaluationSummary>,
pub diagnostics_path: Option<PathBuf>,
pub diagnostics_enabled: bool,
pub content_checksum: Option<String>,
}
impl<'a> EvaluationContext<'a> {
pub fn new(dataset: &'a ConvertedDataset, config: &'a Config) -> Self {
pub fn new(
dataset: &'a ConvertedDataset,
config: &'a Config,
content_checksum: Option<String>,
) -> Self {
Self {
dataset,
config,
content_checksum,
stage_timings: EvaluationStageTimings::default(),
ledger_limit: None,
slice_settings: None,
@@ -84,12 +88,10 @@ impl<'a> EvaluationContext<'a> {
namespace: String::new(),
database: String::new(),
db: None,
descriptor: None,
settings: None,
settings_missing: false,
must_reapply_settings: false,
embedding_provider: None,
embedding_cache: None,
openai_client: None,
openai_base_url: None,
expected_fingerprint: None,
@@ -133,12 +135,6 @@ impl<'a> EvaluationContext<'a> {
.ok_or_else(|| anyhow!("database connection missing"))
}
/// Borrow the snapshot descriptor, or fail if it has not been set on the context.
pub fn descriptor(&self) -> Result<&snapshot::Descriptor> {
    match self.descriptor.as_ref() {
        Some(descriptor) => Ok(descriptor),
        None => Err(anyhow!("snapshot descriptor unavailable")),
    }
}
pub fn embedding_provider(&self) -> Result<&EmbeddingProvider> {
self.embedding_provider
.as_ref()
@@ -159,6 +155,10 @@ impl<'a> EvaluationContext<'a> {
.ok_or_else(|| anyhow!("corpus handle missing"))
}
/// Borrow the content checksum stored on this context, if one was provided.
pub fn content_checksum(&self) -> Option<&str> {
self.content_checksum.as_deref()
}
pub fn evaluation_user(&self) -> Result<&User> {
self.eval_user
.as_ref()
+20
View File
@@ -0,0 +1,20 @@
use std::path::Path;
use anyhow::{Context, Result};
use tokio::io::AsyncWriteExt;
use crate::{args, types::CaseDiagnostics};
/// Write per-case chunk diagnostics to `path` as JSON Lines (one serialized case per line).
///
/// `args::ensure_parent` is invoked on the path first (presumably creating the parent
/// directory — confirm against its definition), then the file is created, replacing any
/// existing content.
///
/// # Errors
/// Returns an error if the parent check fails, the file cannot be created, a case cannot be
/// serialized, or writing/flushing fails.
pub(crate) async fn write_chunk_diagnostics(path: &Path, cases: &[CaseDiagnostics]) -> Result<()> {
    args::ensure_parent(path)?;
    let file = tokio::fs::File::create(path)
        .await
        .with_context(|| format!("creating diagnostics file {}", path.display()))?;

    // `tokio::fs::File` dispatches every write to the blocking pool; buffering turns two
    // small writes per case into a few large ones.
    let mut writer = tokio::io::BufWriter::new(file);
    for case in cases {
        let mut line = serde_json::to_vec(case).context("serialising chunk diagnostics entry")?;
        line.push(b'\n');
        writer
            .write_all(&line)
            .await
            .with_context(|| format!("writing diagnostics file {}", path.display()))?;
    }

    // The explicit flush is required: dropping a `BufWriter` does not write buffered bytes.
    writer
        .flush()
        .await
        .with_context(|| format!("flushing diagnostics file {}", path.display()))?;
    Ok(())
}
+41 -12
View File
@@ -1,6 +1,6 @@
mod context;
mod diagnostics;
mod stages;
mod state;
use anyhow::Result;
@@ -8,20 +8,49 @@ use crate::{args::Config, datasets::ConvertedDataset, types::EvaluationSummary};
use context::EvaluationContext;
/// Build an `EvaluationContext` and drive it through the preparation stages.
///
/// Runs `prepare_slice`, `prepare_db`, `prepare_corpus` and `prepare_namespace` in that order
/// and returns the resulting context. The first stage that fails aborts the sequence and its
/// error is returned.
async fn run_through_namespace<'a>(
dataset: &'a ConvertedDataset,
config: &'a Config,
content_checksum: Option<String>,
) -> Result<EvaluationContext<'a>> {
let mut ctx = EvaluationContext::new(dataset, config, content_checksum);
// Stage order matters: each stage reads state that earlier stages stored on `ctx`.
stages::prepare_slice(&mut ctx).await?;
stages::prepare_db(&mut ctx).await?;
stages::prepare_corpus(&mut ctx).await?;
stages::prepare_namespace(&mut ctx).await?;
Ok(ctx)
}
/// Run the pipeline up to and including namespace preparation, then discard the context.
///
/// Delegates to `run_through_namespace` with an owned copy of `content_checksum`. No queries
/// are executed and no summary is produced.
///
/// # Errors
/// Propagates the error returned by `run_through_namespace`.
pub async fn warm_evaluation(
    dataset: &ConvertedDataset,
    config: &Config,
    content_checksum: &str,
) -> Result<()> {
    let checksum = content_checksum.to_owned();
    run_through_namespace(dataset, config, Some(checksum))
        .await
        .map(drop)
}
pub async fn run_evaluation(
dataset: &ConvertedDataset,
config: &Config,
content_checksum: Option<&str>,
) -> Result<EvaluationSummary> {
let mut ctx = EvaluationContext::new(dataset, config);
let machine = state::ready();
let machine = stages::prepare_slice(machine, &mut ctx).await?;
let machine = stages::prepare_db(machine, &mut ctx).await?;
let machine = stages::prepare_corpus(machine, &mut ctx).await?;
let machine = stages::prepare_namespace(machine, &mut ctx).await?;
let machine = stages::run_queries(machine, &mut ctx).await?;
let machine = stages::summarize(machine, &mut ctx).await?;
let _ = stages::finalize(machine, &mut ctx).await?;
let mut ctx = EvaluationContext::new(
dataset,
config,
content_checksum.map(str::to_string),
);
stages::prepare_slice(&mut ctx).await?;
stages::prepare_db(&mut ctx).await?;
stages::prepare_corpus(&mut ctx).await?;
stages::prepare_namespace(&mut ctx).await?;
stages::run_queries(&mut ctx).await?;
stages::summarize(&mut ctx).await?;
stages::finalize(&mut ctx).await?;
ctx.into_summary()
}
+3 -18
View File
@@ -3,18 +3,12 @@ use std::time::Instant;
use anyhow::Context;
use tracing::info;
use crate::eval::write_chunk_diagnostics;
use super::super::{
context::{EvalStage, EvaluationContext},
state::{Completed, EvaluationMachine, Summarized},
diagnostics::write_chunk_diagnostics,
};
use super::{map_guard_error, StageResult};
pub(crate) async fn finalize(
machine: EvaluationMachine<(), Summarized>,
ctx: &mut EvaluationContext<'_>,
) -> StageResult<Completed> {
pub(crate) async fn finalize(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
let stage = EvalStage::Finalize;
info!(
evaluation_stage = stage.label(),
@@ -22,13 +16,6 @@ pub(crate) async fn finalize(
);
let started = Instant::now();
if let Some(cache) = ctx.embedding_cache.as_ref() {
cache
.persist()
.await
.context("persisting embedding cache")?;
}
if let Some(path) = ctx.diagnostics_path.as_ref() {
if ctx.diagnostics_enabled {
write_chunk_diagnostics(path.as_path(), &ctx.diagnostics_output)
@@ -53,7 +40,5 @@ pub(crate) async fn finalize(
"completed evaluation stage"
);
machine
.finalize()
.map_err(|(_, guard)| map_guard_error("finalize", &guard))
Ok(())
}
-11
View File
@@ -13,14 +13,3 @@ pub(crate) use prepare_namespace::prepare_namespace;
pub(crate) use prepare_slice::prepare_slice;
pub(crate) use run_queries::run_queries;
pub(crate) use summarize::summarize;
use anyhow::Result;
use state_machines::core::GuardError;
use super::state::EvaluationMachine;
/// Convert a state-machine `GuardError` into an `anyhow::Error` that names the `event` whose
/// transition was rejected and includes the guard's `Debug` output.
fn map_guard_error(event: &str, guard: &GuardError) -> anyhow::Error {
anyhow::anyhow!("invalid evaluation pipeline transition during {event}: {guard:?}")
}
type StageResult<S> = Result<EvaluationMachine<(), S>>;
@@ -3,19 +3,12 @@ use std::time::Instant;
use anyhow::Context;
use tracing::info;
use crate::{corpus, eval::can_reuse_namespace, slice, snapshot};
use crate::{corpus, db::can_reuse_namespace, slice};
use super::super::{
context::{EvalStage, EvaluationContext},
state::{CorpusReady, DbReady, EvaluationMachine},
};
use super::{map_guard_error, StageResult};
use super::super::context::{EvalStage, EvaluationContext};
#[allow(clippy::too_many_lines)]
pub(crate) async fn prepare_corpus(
machine: EvaluationMachine<(), DbReady>,
ctx: &mut EvaluationContext<'_>,
) -> StageResult<CorpusReady> {
pub(crate) async fn prepare_corpus(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
let stage = EvalStage::PrepareCorpus;
info!(
evaluation_stage = stage.label(),
@@ -31,13 +24,13 @@ pub(crate) async fn prepare_corpus(
let window = slice::select_window(slice, ctx.config().slice_offset, ctx.config().limit)
.context("selecting slice window for corpus preparation")?;
let descriptor = snapshot::Descriptor::new(config, slice, ctx.embedding_provider()?);
let ingestion_config = corpus::make_ingestion_config(config);
let expected_fingerprint = corpus::compute_ingestion_fingerprint(
ctx.dataset(),
slice,
config.converted_dataset_path.as_path(),
&ingestion_config,
ctx.content_checksum(),
)?;
let base_dir = corpus::cached_corpus_dir(
&cache_settings,
@@ -47,19 +40,18 @@ pub(crate) async fn prepare_corpus(
if !config.reseed_slice {
let requested_cases = window.cases.len();
if can_reuse_namespace(
ctx.db()?,
&descriptor,
&ctx.namespace,
&ctx.database,
ctx.dataset().metadata.id.as_str(),
slice.manifest.slice_id.as_str(),
expected_fingerprint.as_str(),
requested_cases,
)
.await?
{
if let Some(manifest) = corpus::load_cached_manifest(&base_dir)? {
if let Some(manifest) = corpus::load_cached_manifest(&base_dir)? {
if can_reuse_namespace(
ctx.db()?,
&manifest,
&embedding_provider,
&ctx.namespace,
&ctx.database,
expected_fingerprint.as_str(),
requested_cases,
)
.await?
{
info!(
cache = %base_dir.display(),
namespace = ctx.namespace.as_str(),
@@ -70,7 +62,6 @@ pub(crate) async fn prepare_corpus(
ctx.corpus_handle = Some(corpus_handle);
ctx.expected_fingerprint = Some(expected_fingerprint);
ctx.ingestion_duration_ms = 0;
ctx.descriptor = Some(descriptor);
let elapsed = started.elapsed();
ctx.record_stage_duration(stage, elapsed);
@@ -80,14 +71,8 @@ pub(crate) async fn prepare_corpus(
"completed evaluation stage"
);
return machine
.prepare_corpus()
.map_err(|(_, guard)| map_guard_error("prepare_corpus", &guard));
return Ok(());
}
info!(
cache = %base_dir.display(),
"Namespace reusable but cached manifest missing; regenerating corpus"
);
}
}
@@ -103,6 +88,7 @@ pub(crate) async fn prepare_corpus(
openai_client,
&eval_user_id,
config.converted_dataset_path.as_path(),
ctx.content_checksum(),
ingestion_config.clone(),
)
.await
@@ -126,7 +112,6 @@ pub(crate) async fn prepare_corpus(
ctx.corpus_handle = Some(corpus_handle);
ctx.expected_fingerprint = Some(expected_fingerprint);
ctx.ingestion_duration_ms = ingestion_duration_ms;
ctx.descriptor = Some(descriptor);
let elapsed = started.elapsed();
ctx.record_stage_duration(stage, elapsed);
@@ -136,7 +121,5 @@ pub(crate) async fn prepare_corpus(
"completed evaluation stage"
);
machine
.prepare_corpus()
.map_err(|(_, guard)| map_guard_error("prepare_corpus", &guard))
Ok(())
}
+17 -35
View File
@@ -1,28 +1,19 @@
use std::{sync::Arc, time::Instant};
use std::time::Instant;
use anyhow::{anyhow, Context};
use tracing::info;
use crate::{
args::EmbeddingBackend,
cache::EmbeddingCache,
eval::{
connect_eval_db, enforce_system_settings, load_or_init_system_settings, sanitize_model_code,
},
db::{connect_eval_db, sanitize_model_code},
openai,
settings::{enforce_system_settings, load_or_init_system_settings},
};
use common::utils::embedding::{default_embedding_pool_size, EmbeddingProvider};
use super::super::{
context::{EvalStage, EvaluationContext},
state::{DbReady, EvaluationMachine, SlicePrepared},
};
use super::{map_guard_error, StageResult};
use super::super::context::{EvalStage, EvaluationContext};
pub(crate) async fn prepare_db(
machine: EvaluationMachine<(), SlicePrepared>,
ctx: &mut EvaluationContext<'_>,
) -> StageResult<DbReady> {
pub(crate) async fn prepare_db(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
let stage = EvalStage::PrepareDb;
info!(
evaluation_stage = stage.label(),
@@ -36,19 +27,18 @@ pub(crate) async fn prepare_db(
let db = connect_eval_db(config, &namespace, &database).await?;
let (raw_openai_client, openai_base_url) =
openai::build_client_from_env().context("building OpenAI client")?;
let openai_client = Arc::new(raw_openai_client);
let (openai_client, openai_base_url) =
openai::ingestion_openai_client(config.ingest.include_entities)
.context("building OpenAI client for ingestion")?;
// Create embedding provider directly from config (eval only supports FastEmbed and Hashed)
let embedding_provider = match config.embedding_backend {
crate::args::EmbeddingBackend::FastEmbed => EmbeddingProvider::new_fastembed(
EmbeddingBackend::FastEmbed => EmbeddingProvider::new_fastembed(
config.embedding_model.clone(),
default_embedding_pool_size(),
)
.await
.context("creating FastEmbed provider")?,
crate::args::EmbeddingBackend::Hashed => {
EmbeddingBackend::Hashed => {
EmbeddingProvider::new_hashed(1536).context("creating Hashed provider")?
}
};
@@ -68,12 +58,14 @@ pub(crate) async fn prepare_db(
dimension = provider_dimension,
"Embedding provider initialised"
);
info!(openai_base_url = %openai_base_url, "OpenAI client configured");
if let Some(base_url) = &openai_base_url {
info!(openai_base_url = %base_url, "OpenAI client configured for entity ingestion");
}
let (mut settings, settings_missing) =
load_or_init_system_settings(&db, provider_dimension).await?;
let embedding_cache = if config.embedding_backend == EmbeddingBackend::FastEmbed {
if config.embedding_backend == EmbeddingBackend::FastEmbed {
if let Some(model_code) = embedding_provider.model_code() {
let sanitized = sanitize_model_code(&model_code);
let path = config.cache_dir.join(format!("{sanitized}.json"));
@@ -83,15 +75,8 @@ pub(crate) async fn prepare_db(
.with_context(|| format!("removing stale cache {}", path.display()))
.ok();
}
let cache = EmbeddingCache::load(&path).await?;
info!(path = %path.display(), "Embedding cache ready");
Some(cache)
} else {
None
}
} else {
None
};
}
let must_reapply_settings = settings_missing;
let defer_initial_enforce = settings_missing && !config.reseed_slice;
@@ -104,9 +89,8 @@ pub(crate) async fn prepare_db(
ctx.must_reapply_settings = must_reapply_settings;
ctx.settings = Some(settings);
ctx.embedding_provider = Some(embedding_provider);
ctx.embedding_cache = embedding_cache;
ctx.openai_client = Some(openai_client);
ctx.openai_base_url = Some(openai_base_url);
ctx.openai_base_url = openai_base_url;
let elapsed = started.elapsed();
ctx.record_stage_duration(stage, elapsed);
@@ -116,7 +100,5 @@ pub(crate) async fn prepare_db(
"completed evaluation stage"
);
machine
.prepare_db()
.map_err(|(_, guard)| map_guard_error("prepare_db", &guard))
Ok(())
}
@@ -5,25 +5,19 @@ use common::storage::types::system_settings::SystemSettings;
use tracing::{info, warn};
use crate::{
cases::cases_from_manifest,
corpus,
db_helpers::{recreate_indexes, remove_all_indexes, reset_namespace},
eval::{
can_reuse_namespace, cases_from_manifest, enforce_system_settings, ensure_eval_user,
record_namespace_state, warm_hnsw_cache,
db::{
can_reuse_namespace, ensure_eval_user, record_namespace_seed, recreate_indexes,
reset_namespace, warm_hnsw_cache,
},
settings::enforce_system_settings,
};
use super::super::{
context::{EvalStage, EvaluationContext},
state::{CorpusReady, EvaluationMachine, NamespaceReady},
};
use super::{map_guard_error, StageResult};
use super::super::context::{EvalStage, EvaluationContext};
#[allow(clippy::too_many_lines)]
pub(crate) async fn prepare_namespace(
machine: EvaluationMachine<(), CorpusReady>,
ctx: &mut EvaluationContext<'_>,
) -> StageResult<NamespaceReady> {
pub(crate) async fn prepare_namespace(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
let stage = EvalStage::PrepareNamespace;
info!(
evaluation_stage = stage.label(),
@@ -32,7 +26,6 @@ pub(crate) async fn prepare_namespace(
let started = Instant::now();
let config = ctx.config();
let dataset = ctx.dataset();
let expected_fingerprint = ctx
.expected_fingerprint
.as_deref()
@@ -60,20 +53,16 @@ pub(crate) async fn prepare_namespace(
let mut namespace_reused = false;
if !config.reseed_slice {
namespace_reused = {
let slice = ctx.slice()?;
can_reuse_namespace(
ctx.db()?,
ctx.descriptor()?,
&namespace,
&database,
dataset.metadata.id.as_str(),
slice.manifest.slice_id.as_str(),
expected_fingerprint.as_str(),
requested_cases,
)
.await?
};
namespace_reused = can_reuse_namespace(
ctx.db()?,
base_manifest,
&embedding_provider,
&namespace,
&database,
expected_fingerprint.as_str(),
requested_cases,
)
.await?;
}
let mut namespace_seed_ms = None;
@@ -114,34 +103,20 @@ pub(crate) async fn prepare_namespace(
"Seeding ingestion corpus into SurrealDB"
);
}
let indexes_disabled = remove_all_indexes(ctx.db()?).await.is_ok();
let seed_start = Instant::now();
corpus::seed_manifest_into_db(ctx.db()?, &manifest_for_seed)
.await
.context("seeding ingestion corpus from manifest")?;
namespace_seed_ms = Some(seed_start.elapsed().as_millis());
// Recreate indexes AFTER data is loaded (correct bulk loading pattern)
if indexes_disabled {
info!("Recreating indexes after seeding data");
recreate_indexes(ctx.db()?, embedding_provider.dimension())
.await
.context("recreating indexes with correct dimension")?;
warm_hnsw_cache(ctx.db()?, embedding_provider.dimension()).await?;
}
{
let slice = ctx.slice()?;
record_namespace_state(
ctx.descriptor()?,
dataset.metadata.id.as_str(),
slice.manifest.slice_id.as_str(),
expected_fingerprint.as_str(),
&namespace,
&database,
requested_cases,
)
.await;
info!("Recreating indexes after seeding data");
recreate_indexes(ctx.db()?, embedding_provider.dimension())
.await
.context("recreating indexes with correct dimension")?;
warm_hnsw_cache(ctx.db()?, embedding_provider.dimension()).await?;
if let Some(handle) = ctx.corpus_handle.as_mut() {
record_namespace_seed(handle, &namespace, &database, requested_cases).await;
}
}
@@ -198,7 +173,5 @@ pub(crate) async fn prepare_namespace(
"completed evaluation stage"
);
machine
.prepare_namespace()
.map_err(|(_, guard)| map_guard_error("prepare_namespace", &guard))
Ok(())
}
@@ -3,21 +3,11 @@ use std::time::Instant;
use anyhow::Context;
use tracing::info;
use crate::{
eval::{default_database, default_namespace, ledger_target},
slice,
};
use crate::{db::{default_database, default_namespace}, slice};
use super::super::{
context::{EvalStage, EvaluationContext},
state::{EvaluationMachine, Ready, SlicePrepared},
};
use super::{map_guard_error, StageResult};
use super::super::context::{EvalStage, EvaluationContext};
pub(crate) async fn prepare_slice(
machine: EvaluationMachine<(), Ready>,
ctx: &mut EvaluationContext<'_>,
) -> StageResult<SlicePrepared> {
pub(crate) async fn prepare_slice(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
let stage = EvalStage::PrepareSlice;
info!(
evaluation_stage = stage.label(),
@@ -25,7 +15,7 @@ pub(crate) async fn prepare_slice(
);
let started = Instant::now();
let ledger_limit = ledger_target(ctx.config());
let ledger_limit = slice::ledger_target(ctx.config());
let slice_settings = slice::slice_config_with_limit(ctx.config(), ledger_limit);
let resolved_slice =
slice::resolve_slice(ctx.dataset(), &slice_settings).context("resolving dataset slice")?;
@@ -49,7 +39,11 @@ pub(crate) async fn prepare_slice(
.db_namespace
.clone()
.unwrap_or_else(|| {
default_namespace(ctx.dataset().metadata.id.as_str(), ctx.config().limit)
default_namespace(
ctx.dataset().metadata.id.as_str(),
ctx.config().limit,
ctx.config().slice.as_deref(),
)
});
ctx.database = ctx
.config()
@@ -66,7 +60,5 @@ pub(crate) async fn prepare_slice(
"completed evaluation stage"
);
machine
.prepare_slice()
.map_err(|(_, guard)| map_guard_error("prepare_slice", &guard))
Ok(())
}
+13 -16
View File
@@ -5,9 +5,13 @@ use common::storage::types::StoredObject;
use futures::stream::{self, StreamExt};
use tracing::{debug, info};
use crate::eval::{
adapt_retrieval_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics,
CaseSummary, RetrievedSummary,
use crate::{
cases::SeededCase,
context_stats,
types::{
adapt_retrieval_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics,
CaseSummary, RetrievedSummary,
},
};
use retrieval_pipeline::{
pipeline::{self, RetrievalConfig, StageTimings},
@@ -15,17 +19,10 @@ use retrieval_pipeline::{
};
use tokio::sync::Semaphore;
use super::super::{
context::{EvalStage, EvaluationContext},
state::{EvaluationMachine, NamespaceReady, QueriesFinished},
};
use super::{map_guard_error, StageResult};
use super::super::context::{EvalStage, EvaluationContext};
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)]
pub(crate) async fn run_queries(
machine: EvaluationMachine<(), NamespaceReady>,
ctx: &mut EvaluationContext<'_>,
) -> StageResult<QueriesFinished> {
pub(crate) async fn run_queries(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
let stage = EvalStage::RunQueries;
info!(
evaluation_stage = stage.label(),
@@ -153,7 +150,7 @@ pub(crate) async fn run_queries(
.await
.context("acquiring query semaphore permit")?;
let crate::eval::SeededCase {
let SeededCase {
question_id,
question,
expected_source,
@@ -197,6 +194,7 @@ pub(crate) async fn run_queries(
let query_latency = query_start.elapsed().as_millis();
let candidates = adapt_retrieval_output(result_output);
let retrieved_context = context_stats::stats_for_candidates(&candidates);
let mut retrieved = Vec::new();
let mut match_rank = None;
let answers_lower: Vec<String> =
@@ -288,6 +286,7 @@ pub(crate) async fn run_queries(
reciprocal_rank: Some(reciprocal_rank),
ndcg: Some(ndcg),
latency_ms: query_latency,
retrieved_context,
retrieved,
};
@@ -353,9 +352,7 @@ pub(crate) async fn run_queries(
"completed evaluation stage"
);
machine
.run_queries()
.map_err(|(_, guard)| map_guard_error("run_queries", &guard))
Ok(())
}
#[allow(clippy::arithmetic_side_effects, clippy::cast_precision_loss)]
+14 -15
View File
@@ -3,25 +3,19 @@ use std::time::Instant;
use chrono::Utc;
use tracing::info;
use crate::eval::{
use crate::types::{
build_stage_latency_breakdown, compute_latency_stats, EvaluationSummary, PerformanceTimings,
RetrievedContextStats,
};
use super::super::{
context::{EvalStage, EvaluationContext},
state::{EvaluationMachine, QueriesFinished, Summarized},
};
use super::{map_guard_error, StageResult};
use super::super::context::{EvalStage, EvaluationContext};
#[allow(
clippy::too_many_lines,
clippy::arithmetic_side_effects,
clippy::cast_precision_loss
)]
pub(crate) async fn summarize(
machine: EvaluationMachine<(), QueriesFinished>,
ctx: &mut EvaluationContext<'_>,
) -> StageResult<Summarized> {
pub(crate) async fn summarize(ctx: &mut EvaluationContext<'_>) -> anyhow::Result<()> {
let stage = EvalStage::Summarize;
info!(
evaluation_stage = stage.label(),
@@ -123,6 +117,12 @@ pub(crate) async fn summarize(
sum_ndcg / (retrieval_cases as f64)
};
let per_query_context: Vec<RetrievedContextStats> = summaries
.iter()
.map(|summary| summary.retrieved_context)
.collect();
let retrieved_context = crate::context_stats::aggregate_context_stats(&per_query_context);
let active_tuning = ctx
.retrieval_config
.as_ref()
@@ -133,7 +133,7 @@ pub(crate) async fn summarize(
openai_base_url: ctx
.openai_base_url
.clone()
.unwrap_or_else(|| "<unknown>".to_string()),
.unwrap_or_else(|| "n/a (chunk-only ingestion)".to_string()),
ingestion_ms: ctx.ingestion_duration_ms,
namespace_seed_ms: ctx.namespace_seed_ms,
evaluation_stage_ms: ctx.stage_timings.clone(),
@@ -217,11 +217,12 @@ pub(crate) async fn summarize(
chunk_rrf_use_fts: active_tuning.flags.chunk_rrf_use_fts.as_bool(),
ingest_chunk_min_tokens: config.ingest.ingest_chunk_min_tokens,
ingest_chunk_max_tokens: config.ingest.ingest_chunk_max_tokens,
ingest_chunks_only: config.ingest.ingest_chunks_only,
ingest_chunks_only: !config.ingest.include_entities,
ingest_chunk_overlap_tokens: config.ingest.ingest_chunk_overlap_tokens,
chunk_vector_take: active_tuning.chunk_vector_take,
chunk_fts_take: active_tuning.chunk_fts_take,
max_chunks_per_entity: active_tuning.max_chunks_per_entity,
retrieved_context,
cases: summaries,
});
@@ -233,7 +234,5 @@ pub(crate) async fn summarize(
"completed evaluation stage"
);
machine
.summarize()
.map_err(|(_, guard)| map_guard_error("summarize", &guard))
Ok(())
}
-31
View File
@@ -1,31 +0,0 @@
use state_machines::state_machine;
// Declares `EvaluationMachine` / `EvaluationState` through the `state_machine!` macro.
// The happy path is strictly linear, one event per pipeline stage:
// Ready -> SlicePrepared -> DbReady -> CorpusReady -> NamespaceReady
// -> QueriesFinished -> Summarized -> Completed.
// `abort` is the only event with several source states; it moves every state
// listed below (including `Completed`) to `Failed`.
state_machine! {
name: EvaluationMachine,
state: EvaluationState,
initial: Ready,
states: [Ready, SlicePrepared, DbReady, CorpusReady, NamespaceReady, QueriesFinished, Summarized, Completed, Failed],
events {
// Stage events: each one has exactly one legal source state.
prepare_slice { transition: { from: Ready, to: SlicePrepared } }
prepare_db { transition: { from: SlicePrepared, to: DbReady } }
prepare_corpus { transition: { from: DbReady, to: CorpusReady } }
prepare_namespace { transition: { from: CorpusReady, to: NamespaceReady } }
run_queries { transition: { from: NamespaceReady, to: QueriesFinished } }
summarize { transition: { from: QueriesFinished, to: Summarized } }
finalize { transition: { from: Summarized, to: Completed } }
// Failure event: reachable from every non-`Failed` state.
abort {
transition: { from: Ready, to: Failed }
transition: { from: SlicePrepared, to: Failed }
transition: { from: DbReady, to: Failed }
transition: { from: CorpusReady, to: Failed }
transition: { from: NamespaceReady, to: Failed }
transition: { from: QueriesFinished, to: Failed }
transition: { from: Summarized, to: Failed }
transition: { from: Completed, to: Failed }
}
}
}
/// Builds a fresh [`EvaluationMachine`] in the `Ready` state.
///
/// The machine is constructed from `()`, i.e. it carries no payload of its own.
pub fn ready() -> EvaluationMachine<(), Ready> {
EvaluationMachine::new(())
}
+81 -212
View File
@@ -7,12 +7,10 @@ use std::{
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use crate::eval::{
use crate::types::{
format_timestamp, CaseSummary, EvaluationStageTimings, EvaluationSummary, LatencyStats,
StageLatencyBreakdown,
RetrievalContextStats, StageLatencyBreakdown,
};
use chrono::Utc;
use tracing::warn;
#[derive(Debug)]
pub struct ReportPaths {
@@ -108,6 +106,7 @@ pub struct RetrievalSection {
pub ingest_chunk_max_tokens: usize,
pub ingest_chunk_overlap_tokens: usize,
pub ingest_chunks_only: bool,
pub retrieved_context: RetrievalContextStats,
}
const fn default_chunk_rrf_k() -> f32 {
@@ -242,6 +241,7 @@ impl EvaluationReport {
ingest_chunk_max_tokens: summary.ingest_chunk_max_tokens,
ingest_chunk_overlap_tokens: summary.ingest_chunk_overlap_tokens,
ingest_chunks_only: summary.ingest_chunks_only,
retrieved_context: summary.retrieved_context.clone(),
};
let llm = if summary.llm_cases > 0 {
@@ -345,7 +345,7 @@ impl LlmCaseEntry {
}
impl RetrievedSnippet {
fn from_summary(entry: &crate::eval::RetrievedSummary) -> Self {
fn from_summary(entry: &crate::types::RetrievedSummary) -> Self {
Self {
rank: entry.rank,
source_id: entry.source_id.clone(),
@@ -558,6 +558,65 @@ fn render_markdown(report: &EvaluationReport) -> String {
} else {
md.push_str("| Rerank | disabled |\\n");
}
write!(
md,
"| Chunk result cap | {} |\\n",
report.retrieval.chunk_result_cap
)
.unwrap();
md.push_str("\\n## Retrieved Context Volume\\n\\n");
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
write!(
md,
"| Tokenizer | {} |\\n",
report.retrieval.retrieved_context.tokenizer
)
.unwrap();
write!(
md,
"| Queries measured | {} |\\n",
report.retrieval.retrieved_context.queries
)
.unwrap();
write!(
md,
"| Total chunks returned | {} |\\n",
report.retrieval.retrieved_context.total_chunks
)
.unwrap();
write!(
md,
"| Total characters | {} |\\n",
report.retrieval.retrieved_context.total_chars
)
.unwrap();
write!(
md,
"| Total tokens | {} |\\n",
report.retrieval.retrieved_context.total_tokens
)
.unwrap();
write!(
md,
"| Avg chunks / query | {:.1} |\\n",
report.retrieval.retrieved_context.avg_chunks_per_query
)
.unwrap();
write!(
md,
"| Avg tokens / query | {:.1} |\\n",
report.retrieval.retrieved_context.avg_tokens_per_query
)
.unwrap();
write!(
md,
"| P50 / P95 / max tokens / query | {} / {} / {} |\\n",
report.retrieval.retrieved_context.p50_tokens_per_query,
report.retrieval.retrieved_context.p95_tokens_per_query,
report.retrieval.retrieved_context.max_tokens_per_query
)
.unwrap();
if let Some(llm) = &report.llm {
md.push_str("\\n## LLM Mode Metrics\\n\\n");
@@ -797,182 +856,6 @@ pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf {
report_dir.join(sanitize_component(dataset_id))
}
/// Flat, single-level history record.
///
/// `load_history` tries to parse the history file as a list of these when it
/// does not parse as `Vec<EvaluationReport>`, and maps each entry through
/// `convert_legacy_entry`. Fields marked `#[serde(default)]` may be absent
/// from the JSON and then deserialize to their type's default value.
#[derive(Debug, Serialize, Deserialize)]
struct LegacyHistoryEntry {
generated_at: String,
run_label: Option<String>,
dataset_id: String,
dataset_label: String,
slice_id: String,
slice_seed: u64,
slice_window_offset: usize,
slice_window_length: usize,
slice_cases: usize,
slice_total_cases: usize,
k: usize,
limit: Option<usize>,
// Overall precision; `convert_legacy_entry` uses it as the retrieval
// precision when `retrieval_precision` is not greater than zero.
precision: f64,
precision_at_1: f64,
precision_at_2: f64,
precision_at_3: f64,
#[serde(default)]
mrr: f64,
#[serde(default)]
average_ndcg: f64,
// When zero, `convert_legacy_entry` derives the count as
// `slice_cases - llm_cases` (saturating).
#[serde(default)]
retrieval_cases: usize,
#[serde(default)]
retrieval_precision: f64,
// A value above zero makes `convert_legacy_entry` emit an LLM section.
#[serde(default)]
llm_cases: usize,
#[serde(default)]
llm_precision: f64,
duration_ms: u128,
latency_ms: LatencyStats,
embedding_backend: String,
embedding_model: Option<String>,
ingestion_reused: bool,
ingestion_embeddings_reused: bool,
rerank_enabled: bool,
rerank_keep_top: usize,
rerank_pool_size: Option<usize>,
// The optional tuning values below are replaced by fixed fallbacks in
// `convert_legacy_entry` when absent (5, 256, 512, 50 and `false`).
#[serde(default)]
chunk_result_cap: Option<usize>,
#[serde(default)]
ingest_chunk_min_tokens: Option<usize>,
#[serde(default)]
ingest_chunk_max_tokens: Option<usize>,
#[serde(default)]
ingest_chunk_overlap_tokens: Option<usize>,
#[serde(default)]
ingest_chunks_only: Option<bool>,
// Not read by `convert_legacy_entry`.
#[serde(default)]
delta: Option<LegacyHistoryDelta>,
openai_base_url: String,
ingestion_ms: u128,
#[serde(default)]
namespace_seed_ms: Option<u128>,
}
/// Optional `delta` payload of a [`LegacyHistoryEntry`].
///
/// It is deserialized so legacy files parse, but `convert_legacy_entry` never
/// reads it. NOTE(review): presumably a difference against a previous run;
/// confirm against the code that used to write it.
#[derive(Debug, Serialize, Deserialize)]
struct LegacyHistoryDelta {
precision: f64,
precision_at_1: f64,
latency_avg_ms: f64,
}
/// Converts a flat [`LegacyHistoryEntry`] into the sectioned [`EvaluationReport`] layout.
///
/// Values the legacy record does not carry are filled with neutral
/// placeholders: zero counts, empty strings, empty lists, default timing
/// structs and the `default_chunk_rrf_*` helpers. Optional ingest/chunk
/// settings fall back to fixed values (5 / 256 / 512 / 50 / `false`).
// The function is one long struct literal by nature, hence the lint allowance.
#[allow(clippy::too_many_lines)]
fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport {
    // An LLM section only exists when the legacy run recorded LLM cases.
    let llm = (entry.llm_cases > 0).then_some(LlmSection {
        cases: entry.llm_cases,
        answered: 0,
        precision: entry.llm_precision,
    });

    // Entries without an explicit retrieval count derive it from the slice size.
    let retrieval_cases = match entry.retrieval_cases {
        0 => entry.slice_cases.saturating_sub(entry.llm_cases),
        recorded => recorded,
    };

    // Entries without a dedicated retrieval precision reuse the overall precision.
    let retrieval_precision = if entry.retrieval_precision > 0.0 {
        entry.retrieval_precision
    } else {
        entry.precision
    };

    EvaluationReport {
        overview: OverviewSection {
            generated_at: entry.generated_at,
            run_label: entry.run_label,
            total_cases: entry.slice_cases,
            filtered_questions: 0,
        },
        dataset: DatasetSection {
            id: entry.dataset_id,
            label: entry.dataset_label,
            source: String::new(),
            includes_unanswerable: entry.llm_cases > 0,
            require_verified_chunks: true,
            embedding_backend: entry.embedding_backend,
            embedding_model: entry.embedding_model,
            embedding_dimension: 0,
        },
        slice: SliceSection {
            id: entry.slice_id,
            seed: entry.slice_seed,
            window_offset: entry.slice_window_offset,
            window_length: entry.slice_window_length,
            slice_cases: entry.slice_cases,
            ledger_total_cases: entry.slice_total_cases,
            positives: 0,
            negatives: 0,
            total_paragraphs: 0,
            negative_multiplier: 0.0,
        },
        retrieval: RetrievalSection {
            k: entry.k,
            cases: retrieval_cases,
            correct: 0,
            precision: retrieval_precision,
            precision_at_1: entry.precision_at_1,
            precision_at_2: entry.precision_at_2,
            precision_at_3: entry.precision_at_3,
            mrr: entry.mrr,
            average_ndcg: entry.average_ndcg,
            latency: entry.latency_ms,
            concurrency: 0,
            resolve_entities: false,
            rerank_enabled: entry.rerank_enabled,
            rerank_pool_size: entry.rerank_pool_size,
            rerank_keep_top: entry.rerank_keep_top,
            chunk_result_cap: entry.chunk_result_cap.unwrap_or(5),
            chunk_rrf_k: default_chunk_rrf_k(),
            chunk_rrf_vector_weight: default_chunk_rrf_weight(),
            chunk_rrf_fts_weight: default_chunk_rrf_weight(),
            chunk_rrf_use_vector: default_chunk_rrf_use(),
            chunk_rrf_use_fts: default_chunk_rrf_use(),
            chunk_vector_take: 0,
            chunk_fts_take: 0,
            ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256),
            ingest_chunk_max_tokens: entry.ingest_chunk_max_tokens.unwrap_or(512),
            ingest_chunk_overlap_tokens: entry.ingest_chunk_overlap_tokens.unwrap_or(50),
            ingest_chunks_only: entry.ingest_chunks_only.unwrap_or(false),
        },
        llm,
        performance: PerformanceSection {
            openai_base_url: entry.openai_base_url,
            ingestion_ms: entry.ingestion_ms,
            namespace_seed_ms: entry.namespace_seed_ms,
            evaluation_stages_ms: EvaluationStageTimings::default(),
            stage_latency: StageLatencyBreakdown::default(),
            namespace_reused: false,
            ingestion_reused: entry.ingestion_reused,
            embeddings_reused: entry.ingestion_embeddings_reused,
            ingestion_cache_path: String::new(),
            corpus_paragraphs: 0,
            positive_paragraphs_reused: 0,
            negative_paragraphs_reused: 0,
        },
        misses: Vec::new(),
        llm_cases: Vec::new(),
        detailed_report: false,
    }
}
fn load_history(path: &Path) -> Result<Vec<EvaluationReport>> {
if !path.exists() {
return Ok(Vec::new());
@@ -981,34 +864,12 @@ fn load_history(path: &Path) -> Result<Vec<EvaluationReport>> {
let contents =
fs::read(path).with_context(|| format!("reading evaluation log {}", path.display()))?;
if let Ok(entries) = serde_json::from_slice::<Vec<EvaluationReport>>(&contents) {
return Ok(entries);
}
match serde_json::from_slice::<Vec<LegacyHistoryEntry>>(&contents) {
Ok(entries) => Ok(entries.into_iter().map(convert_legacy_entry).collect()),
Err(err) => {
let timestamp = Utc::now().format("%Y%m%dT%H%M%S");
let backup_path = path
.parent()
.unwrap_or_else(|| Path::new("."))
.join(format!("evaluations.json.corrupted.{timestamp}"));
warn!(
path = %path.display(),
backup = %backup_path.display(),
error = %err,
"Evaluation history file is corrupted; backing up and starting fresh"
);
if let Err(e) = fs::rename(path, &backup_path) {
warn!(
path = %path.display(),
error = %e,
"Failed to backup corrupted evaluation history"
);
}
Ok(Vec::new())
}
}
serde_json::from_slice(&contents).with_context(|| {
format!(
"parsing evaluation history at {}; delete the file and re-run if upgrading from an older format",
path.display()
)
})
}
fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result<PathBuf> {
@@ -1024,9 +885,9 @@ fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result<PathBu
#[cfg(test)]
mod tests {
use super::*;
use crate::eval::{
EvaluationStageTimings, PerformanceTimings, RetrievedSummary, StageLatency,
StageLatencyBreakdown,
use crate::types::{
EvaluationStageTimings, PerformanceTimings, RetrievedContextStats, RetrievedSummary,
StageLatency, StageLatencyBreakdown,
};
use chrono::Utc;
use tempfile::tempdir;
@@ -1101,6 +962,7 @@ mod tests {
has_verified_chunks: !is_impossible,
match_rank: if matched { Some(1) } else { None },
latency_ms: 42,
retrieved_context: RetrievedContextStats::default(),
retrieved: vec![RetrievedSummary {
rank: 1,
entity_id: "entity1".into(),
@@ -1199,6 +1061,13 @@ mod tests {
chunk_vector_take: 50,
chunk_fts_take: 50,
max_chunks_per_entity: 4,
retrieved_context: crate::context_stats::aggregate_context_stats(&[
RetrievedContextStats {
chunk_count: 1,
char_count: 10,
token_count: 3,
},
]),
cases,
}
}
+174
View File
@@ -0,0 +1,174 @@
use std::collections::{HashMap, VecDeque};
use anyhow::{anyhow, Result};
use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};
use tracing::warn;
use crate::datasets::{ConvertedDataset, BEIR_DATASETS};
use super::build::{mix_seed, BuildParams};
/// Picks up to `target_cases` `(paragraph index, question index)` pairs from a
/// BEIR mix dataset, spread as evenly as possible over the subsets listed in
/// `BEIR_DATASETS`.
///
/// Steps:
/// 1. Bucket every eligible question by the subset prefix of its id.
/// 2. Shuffle each bucket with its own RNG seeded from the dataset id, the
///    prefix and `params.base_seed`.
/// 3. Give each subset an equal quota (earlier subsets absorb the remainder),
///    then hand unfilled quota to subsets that still have spare questions.
/// 4. Interleave the per-subset selections round-robin in `BEIR_DATASETS` order.
///
/// # Errors
/// Fails when no eligible question is found or when the final selection is empty.
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)]
pub(super) fn ordered_question_refs_beir(
    dataset: &ConvertedDataset,
    params: &BuildParams,
    target_cases: usize,
) -> Result<Vec<(usize, usize)>> {
    let subset_prefixes: Vec<&str> = BEIR_DATASETS
        .iter()
        .map(|kind| kind.source_prefix())
        .collect();

    // Step 1: bucket eligible questions by subset prefix.
    let mut by_subset: HashMap<&str, Vec<(usize, usize)>> = HashMap::new();
    for (paragraph_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
        for (question_idx, question) in paragraph.questions.iter().enumerate() {
            // Unless impossible questions are requested, keep only answerable ones.
            if !params.include_impossible
                && (question.is_impossible || question.answers.is_empty())
            {
                continue;
            }
            match question_prefix(&question.id) {
                None => {
                    warn!(
                        question_id = %question.id,
                        "Skipping BEIR question without expected prefix"
                    );
                }
                Some(prefix) if !subset_prefixes.contains(&prefix) => {
                    warn!(
                        question_id = %question.id,
                        prefix = %prefix,
                        "Skipping BEIR question with unknown subset prefix"
                    );
                }
                Some(prefix) => {
                    by_subset
                        .entry(prefix)
                        .or_default()
                        .push((paragraph_idx, question_idx));
                }
            }
        }
    }
    if by_subset.values().all(Vec::is_empty) {
        return Err(anyhow!(
            "no eligible BEIR questions found; cannot build slice"
        ));
    }

    // Step 2: shuffle every bucket with a subset-specific seed.
    for prefix in &subset_prefixes {
        let Some(bucket) = by_subset.get_mut(prefix) else {
            continue;
        };
        let seed_label = format!("{}::{prefix}", dataset.metadata.id);
        let mut rng = StdRng::seed_from_u64(mix_seed(&seed_label, params.base_seed));
        bucket.shuffle(&mut rng);
    }

    // Step 3a: equal quotas; the first `leftover` subsets take one extra case.
    let subset_count = subset_prefixes.len().max(1);
    let base_quota = target_cases / subset_count;
    let leftover = target_cases % subset_count;
    let quotas: HashMap<&str, usize> = subset_prefixes
        .iter()
        .enumerate()
        .map(|(position, prefix)| (*prefix, base_quota + usize::from(position < leftover)))
        .collect();

    // Step 3b: cap each quota by availability and record what is left over.
    let mut take_counts: HashMap<&str, usize> = HashMap::new();
    let mut spare_slots: HashMap<&str, usize> = HashMap::new();
    let mut shortfall = 0usize;
    for prefix in &subset_prefixes {
        let available = by_subset.get(prefix).map_or(0, Vec::len);
        let quota = quotas.get(prefix).copied().unwrap_or(0);
        let take = quota.min(available);
        shortfall += quota.saturating_sub(take);
        take_counts.insert(*prefix, take);
        spare_slots.insert(*prefix, available.saturating_sub(take));
    }

    // Step 3c: hand the shortfall out one case at a time, cycling over the
    // subsets until it is covered or nobody has spare questions left.
    while shortfall > 0 {
        let shortfall_before_pass = shortfall;
        for prefix in &subset_prefixes {
            if shortfall == 0 {
                break;
            }
            match spare_slots.get_mut(prefix) {
                Some(spare) if *spare > 0 => *spare -= 1,
                _ => continue,
            }
            if let Some(count) = take_counts.get_mut(prefix) {
                *count += 1;
            }
            shortfall -= 1;
        }
        if shortfall == shortfall_before_pass {
            break;
        }
    }

    // Step 4: one queue per subset holding its selected (shuffled) questions.
    let mut queues: Vec<VecDeque<(usize, usize)>> = subset_prefixes
        .iter()
        .map(|prefix| {
            let take = take_counts.get(prefix).copied().unwrap_or(0);
            by_subset
                .get(prefix)
                .map(|bucket| bucket.iter().take(take).copied().collect())
                .unwrap_or_default()
        })
        .collect();
    let total_selected: usize = queues.iter().map(VecDeque::len).sum();

    if total_selected < target_cases {
        warn!(
            requested = target_cases,
            available = total_selected,
            "BEIR mix requested more questions than available after balancing; continuing with capped set"
        );
    }

    // Round-robin drain: each pass takes at most one question per subset.
    let mut output = Vec::with_capacity(total_selected);
    while output.len() < total_selected {
        for queue in &mut queues {
            if let Some(item) = queue.pop_front() {
                output.push(item);
            }
        }
    }

    if output.is_empty() {
        return Err(anyhow!(
            "no eligible BEIR questions found; cannot build slice"
        ));
    }
    Ok(output)
}
/// Returns the first `BEIR_DATASETS` source prefix that `question_id` starts
/// with, provided the prefix is immediately followed by `-`.
///
/// Yields `None` when no listed prefix matches in that form.
pub(super) fn question_prefix(question_id: &str) -> Option<&'static str> {
    BEIR_DATASETS
        .iter()
        .map(|kind| kind.source_prefix())
        .find(|prefix| {
            question_id
                .strip_prefix(*prefix)
                .is_some_and(|rest| rest.starts_with('-'))
        })
}
+19
View File
@@ -0,0 +1,19 @@
use sha2::{Digest, Sha256};
/// Parameters shared by the slice-building helpers.
#[derive(Debug)]
pub(super) struct BuildParams {
/// When `true`, question selection keeps every question; otherwise only
/// questions that are not marked impossible and have at least one answer.
pub include_impossible: bool,
/// Seed mixed with a dataset/subset label via `mix_seed` to drive shuffling.
pub base_seed: u64,
// NOTE(review): not read by the code visible here; presumably a derived
// RNG seed for slice construction. Confirm against the slice module.
pub rng_seed: u64,
}
/// Derives a 64-bit seed from `dataset_id` and `seed`.
///
/// Hashes the id bytes followed by the little-endian bytes of `seed` with
/// SHA-256 and interprets the first eight digest bytes as a little-endian `u64`.
pub(super) fn mix_seed(dataset_id: &str, seed: u64) -> u64 {
    let mut hasher = Sha256::new();
    hasher.update(dataset_id.as_bytes());
    hasher.update(seed.to_le_bytes());
    let digest = hasher.finalize();

    // Copy the leading eight digest bytes; zipping avoids slice indexing.
    let mut leading = [0u8; 8];
    for (slot, byte) in leading.iter_mut().zip(digest.iter()) {
        *slot = *byte;
    }
    u64::from_le_bytes(leading)
}
@@ -1,5 +1,5 @@
use std::{
collections::{HashMap, HashSet, VecDeque},
collections::{HashMap, HashSet},
fmt::Write,
fs,
path::{Path, PathBuf},
@@ -12,10 +12,18 @@ use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use tracing::{info, warn};
use crate::datasets::{
ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind, BEIR_DATASETS,
use crate::{
args::Config,
datasets::{
ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind,
},
};
mod beir;
mod build;
use build::{mix_seed, BuildParams};
const SLICE_VERSION: u32 = 2;
pub const DEFAULT_NEGATIVE_MULTIPLIER: f32 = 4.0;
@@ -80,8 +88,12 @@ pub enum SliceParagraphKind {
Negative,
}
/// Returns the storage key for a paragraph: `paragraph_id` passed through
/// `sanitize_identifier`.
pub fn paragraph_storage_key(paragraph_id: &str) -> String {
sanitize_identifier(paragraph_id)
}
pub(crate) fn default_shard_path(paragraph_id: &str) -> String {
let sanitized = sanitize_identifier(paragraph_id);
let sanitized = paragraph_storage_key(paragraph_id);
format!("paragraphs/{sanitized}.json")
}
@@ -210,13 +222,6 @@ struct SliceKey<'a> {
seed: u64,
}
#[derive(Debug)]
struct BuildParams {
include_impossible: bool,
base_seed: u64,
rng_seed: u64,
}
#[allow(clippy::too_many_lines)]
pub fn resolve_slice<'a>(
dataset: &'a ConvertedDataset,
@@ -225,15 +230,29 @@ pub fn resolve_slice<'a>(
let index = DatasetIndex::build(dataset);
if let Some(slice_arg) = config.explicit_slice {
let (path, manifest) = load_explicit_slice(dataset, &index, config, slice_arg)?;
let resolved = manifest_to_resolved(dataset, &index, manifest, path)?;
let path = explicit_slice_path(dataset, config, slice_arg);
if path.exists() {
let (path, manifest) = load_explicit_slice(dataset, &index, config, slice_arg)?;
let resolved = manifest_to_resolved(dataset, &index, manifest, path)?;
info!(
slice = %resolved.manifest.slice_id,
path = %resolved.path.display(),
cases = resolved.manifest.case_count,
positives = resolved.manifest.positive_paragraphs,
negatives = resolved.manifest.negative_paragraphs,
"Using explicitly selected slice"
);
return Ok(resolved);
}
let resolved =
materialize_slice_ledger(dataset, config, &index, slice_arg, path)?;
info!(
slice = %resolved.manifest.slice_id,
path = %resolved.path.display(),
cases = resolved.manifest.case_count,
positives = resolved.manifest.positive_paragraphs,
negatives = resolved.manifest.negative_paragraphs,
"Using explicitly selected slice"
"Built catalog slice ledger"
);
return Ok(resolved);
}
@@ -256,6 +275,82 @@ pub fn resolve_slice<'a>(
.join("slices")
.join(dataset.metadata.id.as_str());
let path = base.join(format!("{slice_id}.json"));
materialize_slice_ledger(dataset, config, &index, &slice_id, path)
}
/// Selects a contiguous window of cases from a resolved slice.
///
/// * `offset` is the index of the first case in the window.
/// * `limit` is the maximum window length; `None` means "everything from
///   `offset` onward". A limit of zero is raised to one, and the window never
///   extends past the last case.
///
/// The returned window also lists the ids of the paragraphs its cases refer
/// to, de-duplicated in first-seen order.
///
/// # Errors
/// Returns an error when the slice has no cases, when `offset` is not below
/// the manifest's case count, or when the manifest's case count claims more
/// cases than `resolved.cases` actually holds.
// `total - offset` and `offset + length` cannot wrap: `offset < total` is
// checked first and `length <= total - offset`.
#[allow(clippy::arithmetic_side_effects)]
pub fn select_window<'a>(
    resolved: &'a ResolvedSlice<'a>,
    offset: usize,
    limit: Option<usize>,
) -> Result<SliceWindow<'a>> {
    let total = resolved.manifest.case_count;
    if total == 0 {
        return Err(anyhow!(
            "slice '{}' contains no cases",
            resolved.manifest.slice_id
        ));
    }
    if offset >= total {
        return Err(anyhow!(
            "slice offset {offset} exceeds available cases ({total})",
        ));
    }

    let available = total - offset;
    let requested = limit.unwrap_or(available).max(1);
    let length = requested.min(available);

    // The bounds above come from the manifest, while the data comes from
    // `resolved.cases`; use a checked range so a mismatch between the two is
    // reported as an error instead of panicking.
    let cases = resolved
        .cases
        .get(offset..offset + length)
        .ok_or_else(|| {
            anyhow!(
                "slice '{}' manifest reports {total} cases but only {} are resolved",
                resolved.manifest.slice_id,
                resolved.cases.len()
            )
        })?
        .to_vec();

    // Collect each referenced paragraph id once, in first-seen order.
    let mut seen = HashSet::new();
    let mut positive_ids = Vec::new();
    for case in &cases {
        if seen.insert(case.paragraph.id.as_str()) {
            positive_ids.push(case.paragraph.id.clone());
        }
    }

    Ok(SliceWindow {
        offset,
        length,
        total_cases: total,
        cases,
        positive_paragraph_ids: positive_ids,
    })
}
/// Returns the window covering the whole slice: `select_window` with offset 0
/// and no limit. Fails under the same conditions as `select_window`.
#[allow(dead_code)]
pub fn full_window<'a>(resolved: &'a ResolvedSlice<'a>) -> Result<SliceWindow<'a>> {
select_window(resolved, 0, None)
}
/// Resolves the manifest path for an explicitly requested slice.
///
/// If `slice_arg` names an existing filesystem path it is used as-is.
/// Otherwise it is treated as a slice id and mapped to
/// `<cache_dir>/slices/<dataset id>/<slice_arg>.json`.
fn explicit_slice_path(
    dataset: &ConvertedDataset,
    config: &SliceConfig<'_>,
    slice_arg: &str,
) -> PathBuf {
    let literal = Path::new(slice_arg);
    if literal.exists() {
        return literal.to_path_buf();
    }

    let mut cached = config.cache_dir.join("slices");
    cached.push(dataset.metadata.id.as_str());
    cached.push(format!("{slice_arg}.json"));
    cached
}
#[allow(clippy::too_many_lines)]
fn materialize_slice_ledger<'a>(
dataset: &'a ConvertedDataset,
config: &SliceConfig<'_>,
index: &DatasetIndex,
slice_id: &str,
path: PathBuf,
) -> Result<ResolvedSlice<'a>> {
let requested_corpus = config
.corpus_limit
.unwrap_or(dataset.paragraphs.len())
.min(dataset.paragraphs.len())
.max(1);
let total_questions = dataset
.paragraphs
@@ -339,7 +434,7 @@ pub fn resolve_slice<'a>(
let mut manifest = manifest.unwrap_or_else(|| {
empty_manifest(
dataset,
slice_id.clone(),
slice_id.to_string(),
&params,
requested_corpus,
config.negative_multiplier,
@@ -396,52 +491,7 @@ pub fn resolve_slice<'a>(
);
}
let resolved = manifest_to_resolved(dataset, &index, manifest.clone(), path)?;
Ok(resolved)
}
#[allow(clippy::indexing_slicing, clippy::arithmetic_side_effects)]
pub fn select_window<'a>(
resolved: &'a ResolvedSlice<'a>,
offset: usize,
limit: Option<usize>,
) -> Result<SliceWindow<'a>> {
let total = resolved.manifest.case_count;
if total == 0 {
return Err(anyhow!(
"slice '{}' contains no cases",
resolved.manifest.slice_id
));
}
if offset >= total {
return Err(anyhow!(
"slice offset {offset} exceeds available cases ({total})",
));
}
let available = total - offset;
let requested = limit.unwrap_or(available).max(1);
let length = requested.min(available);
let cases = resolved.cases[offset..offset + length].to_vec();
let mut seen = HashSet::new();
let mut positive_ids = Vec::new();
for case in &cases {
if seen.insert(case.paragraph.id.as_str()) {
positive_ids.push(case.paragraph.id.clone());
}
}
Ok(SliceWindow {
offset,
length,
total_cases: total,
cases,
positive_paragraph_ids: positive_ids,
})
}
#[allow(dead_code)]
pub fn full_window<'a>(resolved: &'a ResolvedSlice<'a>) -> Result<SliceWindow<'a>> {
select_window(resolved, 0, None)
manifest_to_resolved(dataset, index, manifest, path)
}
fn load_explicit_slice(
@@ -450,16 +500,7 @@ fn load_explicit_slice(
config: &SliceConfig<'_>,
slice_arg: &str,
) -> Result<(PathBuf, SliceManifest)> {
let explicit_path = Path::new(slice_arg);
let candidate_path = if explicit_path.exists() {
explicit_path.to_path_buf()
} else {
config
.cache_dir
.join("slices")
.join(dataset.metadata.id.as_str())
.join(format!("{slice_arg}.json"))
};
let candidate_path = explicit_slice_path(dataset, config, slice_arg);
let manifest = read_manifest(&candidate_path)
.with_context(|| format!("reading slice manifest at {}", candidate_path.display()))?;
@@ -613,7 +654,7 @@ fn ordered_question_refs(
target_cases: usize,
) -> Result<Vec<(usize, usize)>> {
if dataset.metadata.id == DatasetKind::Beir.id() {
return ordered_question_refs_beir(dataset, params, target_cases);
return beir::ordered_question_refs_beir(dataset, params, target_cases);
}
let mut question_refs = Vec::new();
@@ -642,171 +683,6 @@ fn ordered_question_refs(
Ok(question_refs)
}
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)]
fn ordered_question_refs_beir(
dataset: &ConvertedDataset,
params: &BuildParams,
target_cases: usize,
) -> Result<Vec<(usize, usize)>> {
let prefixes: Vec<&str> = BEIR_DATASETS
.iter()
.map(|kind| kind.source_prefix())
.collect();
let mut grouped: HashMap<&str, Vec<(usize, usize)>> = HashMap::new();
for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
for (q_idx, question) in paragraph.questions.iter().enumerate() {
let include = if params.include_impossible {
true
} else {
!question.is_impossible && !question.answers.is_empty()
};
if !include {
continue;
}
let Some(prefix) = question_prefix(&question.id) else {
warn!(
question_id = %question.id,
"Skipping BEIR question without expected prefix"
);
continue;
};
if !prefixes.contains(&prefix) {
warn!(
question_id = %question.id,
prefix = %prefix,
"Skipping BEIR question with unknown subset prefix"
);
continue;
}
grouped.entry(prefix).or_default().push((p_idx, q_idx));
}
}
if grouped.values().all(std::vec::Vec::is_empty) {
return Err(anyhow!(
"no eligible BEIR questions found; cannot build slice"
));
}
for prefix in &prefixes {
if let Some(entries) = grouped.get_mut(prefix) {
let seed = mix_seed(
&format!("{}::{prefix}", dataset.metadata.id),
params.base_seed,
);
let mut rng = StdRng::seed_from_u64(seed);
entries.shuffle(&mut rng);
}
}
let dataset_count = prefixes.len().max(1);
let base_quota = target_cases / dataset_count;
let mut remainder = target_cases % dataset_count;
let mut quotas: HashMap<&str, usize> = HashMap::new();
for prefix in &prefixes {
let mut quota = base_quota;
if remainder > 0 {
quota += 1;
remainder -= 1;
}
quotas.insert(*prefix, quota);
}
let mut take_counts: HashMap<&str, usize> = HashMap::new();
let mut spare_slots: HashMap<&str, usize> = HashMap::new();
let mut shortfall = 0usize;
for prefix in &prefixes {
let available = grouped.get(prefix).map_or(0, std::vec::Vec::len);
let quota = *quotas.get(prefix).unwrap_or(&0);
let take = quota.min(available);
let missing = quota.saturating_sub(take);
shortfall += missing;
take_counts.insert(*prefix, take);
spare_slots.insert(*prefix, available.saturating_sub(take));
}
while shortfall > 0 {
let mut allocated = false;
for prefix in &prefixes {
if shortfall == 0 {
break;
}
let spare = spare_slots.get(prefix).copied().unwrap_or(0);
if spare == 0 {
continue;
}
if let Some(count) = take_counts.get_mut(prefix) {
*count += 1;
}
spare_slots.insert(*prefix, spare - 1);
shortfall = shortfall.saturating_sub(1);
allocated = true;
}
if !allocated {
break;
}
}
let mut queues: Vec<VecDeque<(usize, usize)>> = Vec::new();
let mut total_selected = 0usize;
for prefix in &prefixes {
let take = *take_counts.get(prefix).unwrap_or(&0);
let mut deque = VecDeque::new();
if let Some(entries) = grouped.get(prefix) {
for item in entries.iter().take(take) {
deque.push_back(*item);
total_selected += 1;
}
}
queues.push(deque);
}
if total_selected < target_cases {
warn!(
requested = target_cases,
available = total_selected,
"BEIR mix requested more questions than available after balancing; continuing with capped set"
);
}
let mut output = Vec::with_capacity(total_selected);
loop {
let mut progressed = false;
for queue in &mut queues {
if let Some(item) = queue.pop_front() {
output.push(item);
progressed = true;
}
}
if !progressed {
break;
}
}
if output.is_empty() {
return Err(anyhow!(
"no eligible BEIR questions found; cannot build slice"
));
}
Ok(output)
}
/// Returns the BEIR subset prefix that `question_id` starts with, if any.
///
/// A prefix taken from `BEIR_DATASETS` only counts as a match when it is
/// immediately followed by a `-` separator. Candidates are tried in
/// `BEIR_DATASETS` order and the first match is returned.
fn question_prefix(question_id: &str) -> Option<&'static str> {
    BEIR_DATASETS
        .iter()
        .map(|kind| kind.source_prefix())
        .find(|&prefix| {
            question_id
                .strip_prefix(prefix)
                .is_some_and(|rest| rest.starts_with('-'))
        })
}
#[allow(clippy::indexing_slicing)]
fn ensure_negative_pool(
dataset: &ConvertedDataset,
@@ -1028,15 +904,48 @@ fn compute_slice_id(key: &SliceKey<'_>) -> Result<String> {
}))
}
#[allow(clippy::indexing_slicing)]
fn mix_seed(dataset_id: &str, seed: u64) -> u64 {
let mut hasher = Sha256::new();
hasher.update(dataset_id.as_bytes());
hasher.update(seed.to_le_bytes());
let digest = hasher.finalize();
let mut bytes = [0u8; 8];
bytes.copy_from_slice(&digest[..8]);
u64::from_le_bytes(bytes)
/// Loads the slice manifest stored at `path`, or yields `Ok(None)` when nothing
/// exists at that location.
///
/// # Errors
/// Propagates whatever `read_manifest` reports when the path exists.
pub fn read_manifest_if_exists(path: &Path) -> Result<Option<SliceManifest>> {
    path.exists().then(|| read_manifest(path)).transpose()
}
/// Resolves where the manifest for the configured slice is expected to live.
///
/// Returns `None` when no slice is configured. When the slice argument names
/// an existing filesystem path, that path is returned as-is; otherwise the
/// argument is treated as a slice id and mapped to
/// `<cache_dir>/slices/<dataset id>/<slice>.json`.
pub fn cached_manifest_path(config: &crate::args::Config) -> Option<PathBuf> {
    let name = config.slice.as_deref()?;
    let candidate = Path::new(name);
    let resolved = if candidate.exists() {
        candidate.to_path_buf()
    } else {
        let mut location = config.cache_dir.join("slices");
        location.push(config.dataset.id());
        location.push(format!("{name}.json"));
        location
    };
    Some(resolved)
}
/// Reports whether `manifest` already satisfies what `config` asks for.
///
/// The manifest is complete when it holds at least the requested number of
/// cases and at least as many negative paragraphs as
/// `desired_negative_target` computes for the requested corpus size. Missing
/// limits fall back to the manifest's own counts; every limit is clamped to a
/// minimum of one.
pub fn manifest_is_complete(manifest: &SliceManifest, config: &SliceConfig<'_>) -> bool {
    let wanted_cases = config.limit.unwrap_or(manifest.case_count).max(1);
    if manifest.case_count < wanted_cases {
        return false;
    }

    let wanted_corpus = config
        .corpus_limit
        .unwrap_or(manifest.total_paragraphs)
        .max(1);
    // Upper bound handed to the negative-target computation: never below the
    // number of positives and never zero.
    let corpus_ceiling = manifest
        .total_paragraphs
        .max(manifest.positive_paragraphs)
        .max(1);
    let needed_negatives = desired_negative_target(
        manifest.positive_paragraphs,
        wanted_corpus,
        corpus_ceiling,
        config.negative_multiplier,
    );

    needed_negatives <= manifest.negative_paragraphs
}
fn read_manifest(path: &Path) -> Result<SliceManifest> {
@@ -1057,14 +966,38 @@ fn write_manifest(path: &Path, manifest: &SliceManifest) -> Result<()> {
Ok(())
}
use crate::args::Config;
impl<'a> From<&'a Config> for SliceConfig<'a> {
fn from(config: &'a Config) -> Self {
slice_config_with_limit(config, None)
/// Computes how many cases the slice ledger should hold.
///
/// Without `slice_grow` the configured `limit` is returned unchanged (possibly
/// `None`). With `slice_grow` the result is the larger of `slice_grow` and
/// `limit`, or `slice_grow` alone when no limit is set.
pub fn ledger_target(config: &Config) -> Option<usize> {
    let Some(grow) = config.slice_grow else {
        return config.limit;
    };
    Some(config.limit.map_or(grow, |limit| limit.max(grow)))
}
/// Resolves the slice at the size given by [`ledger_target`] and reports the
/// resulting ledger.
///
/// The summary (slice id, case count, positive and negative paragraph counts)
/// is emitted both as an `info!` event and as a line on stdout.
///
/// # Errors
/// Returns an error carrying the context "resolving dataset slice" when
/// `resolve_slice` fails.
pub fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> {
    let settings = slice_config_with_limit(config, ledger_target(config));
    let resolved = resolve_slice(dataset, &settings).context("resolving dataset slice")?;
    let manifest = &resolved.manifest;

    info!(
        slice = manifest.slice_id.as_str(),
        cases = manifest.case_count,
        positives = manifest.positive_paragraphs,
        negatives = manifest.negative_paragraphs,
        total_paragraphs = manifest.total_paragraphs,
        "Slice ledger ready"
    );
    println!(
        "Slice `{}` now contains {} questions ({} positives, {} negatives)",
        manifest.slice_id,
        manifest.case_count,
        manifest.positive_paragraphs,
        manifest.negative_paragraphs
    );

    Ok(())
}
pub fn slice_config_with_limit(config: &Config, limit_override: Option<usize>) -> SliceConfig<'_> {
SliceConfig {
cache_dir: config.cache_dir.as_path(),
@@ -1088,7 +1021,7 @@ mod tests {
use tempfile::tempdir;
fn sample_dataset() -> ConvertedDataset {
let metadata = DatasetMetadata::for_kind(DatasetKind::SquadV2, false, None);
let metadata = DatasetMetadata::for_kind(DatasetKind::SquadV2, false);
ConvertedDataset {
generated_at: Utc::now(),
metadata,
@@ -1226,7 +1159,7 @@ mod tests {
}
}
let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, false, None);
let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, false);
let dataset = ConvertedDataset {
generated_at: Utc::now(),
metadata,
@@ -1240,11 +1173,11 @@ mod tests {
rng_seed: 0xBB,
};
let refs = ordered_question_refs_beir(&dataset, &params, 8)?;
let refs = beir::ordered_question_refs_beir(&dataset, &params, 8)?;
let mut per_prefix: HashMap<String, usize> = HashMap::new();
for (p_idx, q_idx) in refs {
let question = &dataset.paragraphs[p_idx].questions[q_idx];
let prefix = question_prefix(&question.id).unwrap_or("unknown");
let prefix = beir::question_prefix(&question.id).unwrap_or("unknown");
*per_prefix.entry(prefix.to_string()).or_default() += 1;
}
-179
View File
@@ -1,179 +0,0 @@
use std::path::PathBuf;
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use tokio::fs;
use crate::{args::Config, slice};
use common::utils::embedding::EmbeddingProvider;
/// Describes which dataset slice and which embedding/rerank settings a snapshot belongs to.
///
/// `Descriptor::new` fills this in from the resolved slice manifest, the embedding
/// provider and the retrieval config. `compute_hash` hashes the JSON serialisation of
/// this struct, so changing or reordering fields presumably changes the resulting hash.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SnapshotMetadata {
/// Dataset id, copied from the slice manifest.
pub dataset_id: String,
/// Slice id, copied from the slice manifest.
pub slice_id: String,
/// Label reported by the embedding provider's `backend_label()`.
pub embedding_backend: String,
/// Model code reported by the embedding provider, if it has one.
pub embedding_model: Option<String>,
/// Embedding dimension reported by the embedding provider.
pub embedding_dimension: usize,
/// Mirrors `config.retrieval.rerank`.
pub rerank_enabled: bool,
}
/// Namespace state record persisted as JSON by `Descriptor::store_db_state` and read
/// back by `Descriptor::load_db_state`.
///
/// Fields marked `#[serde(default)]` may be absent from the stored JSON; they then
/// deserialise to their default (`None` / `0`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DbSnapshotState {
pub dataset_id: String,
pub slice_id: String,
// NOTE(review): presumably identifies the ingestion settings the data was built
// with — confirm against the code that produces it.
pub ingestion_fingerprint: String,
// NOTE(review): the round-trip test stores `Descriptor::metadata_hash()` here —
// confirm that production callers do the same.
pub snapshot_hash: String,
pub updated_at: DateTime<Utc>,
#[serde(default)]
pub namespace: Option<String>,
#[serde(default)]
pub database: Option<String>,
#[serde(default)]
pub slice_case_count: usize,
}
/// Locates the on-disk namespace state for one dataset slice and carries the hash of
/// its `SnapshotMetadata`.
pub struct Descriptor {
// Kept alongside its hash; nothing reads it directly, hence the lint allowance.
#[allow(dead_code)]
metadata: SnapshotMetadata,
// Base directory for this slice's snapshot files; the state file lives below it
// at `db/state.json`.
dir: PathBuf,
// Hex digest produced by `compute_hash` for `metadata`.
metadata_hash: String,
}
impl Descriptor {
pub fn new(
config: &Config,
slice: &slice::ResolvedSlice<'_>,
embedding_provider: &EmbeddingProvider,
) -> Self {
let metadata = SnapshotMetadata {
dataset_id: slice.manifest.dataset_id.clone(),
slice_id: slice.manifest.slice_id.clone(),
embedding_backend: embedding_provider.backend_label().to_string(),
embedding_model: embedding_provider.model_code(),
embedding_dimension: embedding_provider.dimension(),
rerank_enabled: config.retrieval.rerank,
};
let dir = config
.cache_dir
.join("snapshots")
.join(&metadata.dataset_id)
.join(&metadata.slice_id);
let metadata_hash = compute_hash(&metadata);
Self {
metadata,
dir,
metadata_hash,
}
}
pub fn metadata_hash(&self) -> &str {
&self.metadata_hash
}
pub async fn load_db_state(&self) -> Result<Option<DbSnapshotState>> {
let path = self.db_state_path();
if !path.exists() {
return Ok(None);
}
let bytes = fs::read(&path)
.await
.with_context(|| format!("reading namespace state {}", path.display()))?;
let state = serde_json::from_slice(&bytes)
.with_context(|| format!("deserialising namespace state {}", path.display()))?;
Ok(Some(state))
}
pub async fn store_db_state(&self, state: &DbSnapshotState) -> Result<()> {
let path = self.db_state_path();
if let Some(parent) = path.parent() {
fs::create_dir_all(parent).await.with_context(|| {
format!("creating namespace state directory {}", parent.display())
})?;
}
let blob =
serde_json::to_vec_pretty(state).context("serialising namespace state payload")?;
fs::write(&path, blob)
.await
.with_context(|| format!("writing namespace state {}", path.display()))?;
Ok(())
}
fn db_dir(&self) -> PathBuf {
self.dir.join("db")
}
fn db_state_path(&self) -> PathBuf {
self.db_dir().join("state.json")
}
#[cfg(test)]
pub fn from_parts(metadata: SnapshotMetadata, dir: PathBuf) -> Self {
let metadata_hash = compute_hash(&metadata);
Self {
metadata,
dir,
metadata_hash,
}
}
}
/// Returns the lowercase hex SHA-256 digest of the JSON serialisation of `metadata`.
///
/// # Panics
/// Panics if `metadata` cannot be serialised to JSON.
#[allow(clippy::expect_used)]
fn compute_hash(metadata: &SnapshotMetadata) -> String {
    let payload =
        serde_json::to_vec(metadata).expect("snapshot metadata serialisation should succeed");
    let digest = Sha256::digest(&payload);
    format!("{digest:x}")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Stores a state record through a descriptor rooted in a temporary directory and
    /// checks that loading it back yields the same field values.
    #[tokio::test]
    #[allow(clippy::unwrap_used, clippy::expect_used)]
    async fn state_round_trip() {
        let workspace = tempfile::tempdir().unwrap();
        let snapshot_dir = workspace
            .path()
            .join("snapshots")
            .join("dataset")
            .join("slice");
        let descriptor = Descriptor::from_parts(
            SnapshotMetadata {
                dataset_id: "dataset".into(),
                slice_id: "slice".into(),
                embedding_backend: "hashed".into(),
                embedding_model: None,
                embedding_dimension: 128,
                rerank_enabled: true,
            },
            snapshot_dir,
        );

        let written = DbSnapshotState {
            dataset_id: "dataset".into(),
            slice_id: "slice".into(),
            ingestion_fingerprint: "fingerprint".into(),
            snapshot_hash: descriptor.metadata_hash().to_string(),
            updated_at: Utc::now(),
            namespace: Some("ns".into()),
            database: Some("db".into()),
            slice_case_count: 42,
        };
        descriptor.store_db_state(&written).await.unwrap();

        let read_back = descriptor.load_db_state().await.unwrap().unwrap();
        assert_eq!(read_back.dataset_id, written.dataset_id);
        assert_eq!(read_back.slice_id, written.slice_id);
        assert_eq!(
            read_back.ingestion_fingerprint,
            written.ingestion_fingerprint
        );
        assert_eq!(read_back.snapshot_hash, written.snapshot_hash);
        assert_eq!(read_back.namespace, written.namespace);
        assert_eq!(read_back.database, written.database);
        assert_eq!(read_back.slice_case_count, written.slice_case_count);
    }
}
+9 -1
View File
@@ -1,6 +1,6 @@
use std::collections::HashSet;
use chrono::{DateTime, Utc};
use chrono::{DateTime, SecondsFormat, Utc};
use common::storage::types::StoredObject;
use retrieval_pipeline::{
Diagnostics, RetrievalOutput, RetrievedChunk, RetrievedEntity, StageKind, StageTimings,
@@ -8,6 +8,8 @@ use retrieval_pipeline::{
use serde::{Deserialize, Serialize};
use unicode_normalization::UnicodeNormalization;
pub use crate::context_stats::{RetrievalContextStats, RetrievedContextStats};
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Serialize)]
pub struct EvaluationSummary {
@@ -83,6 +85,7 @@ pub struct EvaluationSummary {
pub chunk_vector_take: usize,
pub chunk_fts_take: usize,
pub max_chunks_per_entity: usize,
pub retrieved_context: RetrievalContextStats,
pub cases: Vec<CaseSummary>,
}
@@ -108,6 +111,7 @@ pub struct CaseSummary {
#[serde(skip_serializing_if = "Option::is_none")]
pub ndcg: Option<f64>,
pub latency_ms: u128,
pub retrieved_context: RetrievedContextStats,
pub retrieved: Vec<RetrievedSummary>,
}
@@ -483,3 +487,7 @@ pub fn build_case_diagnostics(
pipeline: pipeline_stats,
}
}
/// Formats `timestamp` as an RFC 3339 string with whole-second precision, writing the
/// UTC offset as `Z`.
pub fn format_timestamp(timestamp: &DateTime<Utc>) -> String {
    let precision = SecondsFormat::Secs;
    let use_z_suffix = true;
    timestamp.to_rfc3339_opts(precision, use_z_suffix)
}
-1
View File
@@ -44,7 +44,6 @@
--leading-snug: 1.375;
--leading-relaxed: 1.625;
--ease-out: cubic-bezier(0, 0, 0.2, 1);
--ease-in-out: cubic-bezier(0.4, 0, 0.2, 1);
--animate-pulse: pulse 2s cubic-bezier(0.4, 0, 0.6, 1) infinite;
--default-transition-duration: 150ms;
--default-transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);