diff --git a/eval/manifest.yaml b/eval/manifest.yaml index 3edf556..c73edba 100644 --- a/eval/manifest.yaml +++ b/eval/manifest.yaml @@ -31,3 +31,108 @@ datasets: corpus_limit: 2000 include_unanswerable: false seed: 0x5eed2025 + - id: beir + label: "BEIR mix" + category: "BEIR" + entity_suffix: "BEIR" + source_prefix: "beir" + raw: "data/raw/beir" + converted: "data/converted/beir-minne.json" + include_unanswerable: false + slices: + - id: beir-mix-600 + label: "BEIR mix (600)" + description: "Balanced slice across FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID" + limit: 600 + corpus_limit: 6000 + seed: 0x5eed2025 + - id: fever + label: "FEVER (BEIR)" + category: "FEVER" + entity_suffix: "FEVER" + source_prefix: "fever" + raw: "data/raw/fever" + converted: "data/converted/fever-minne.json" + include_unanswerable: false + slices: + - id: fever-test-200 + label: "FEVER test (200)" + description: "200-case slice from BEIR test qrels" + limit: 200 + corpus_limit: 5000 + seed: 0x5eed2025 + - id: fiqa + label: "FiQA-2018 (BEIR)" + category: "FiQA-2018" + entity_suffix: "FiQA" + source_prefix: "fiqa" + raw: "data/raw/fiqa" + converted: "data/converted/fiqa-minne.json" + include_unanswerable: false + slices: + - id: fiqa-test-200 + label: "FiQA test (200)" + description: "200-case slice from BEIR test qrels" + limit: 200 + corpus_limit: 5000 + seed: 0x5eed2025 + - id: hotpotqa + label: "HotpotQA (BEIR)" + category: "HotpotQA" + entity_suffix: "HotpotQA" + source_prefix: "hotpotqa" + raw: "data/raw/hotpotqa" + converted: "data/converted/hotpotqa-minne.json" + include_unanswerable: false + slices: + - id: hotpotqa-test-200 + label: "HotpotQA test (200)" + description: "200-case slice from BEIR test qrels" + limit: 200 + corpus_limit: 5000 + seed: 0x5eed2025 + - id: nfcorpus + label: "NFCorpus (BEIR)" + category: "NFCorpus" + entity_suffix: "NFCorpus" + source_prefix: "nfcorpus" + raw: "data/raw/nfcorpus" + converted: "data/converted/nfcorpus-minne.json" + 
include_unanswerable: false + slices: + - id: nfcorpus-test-200 + label: "NFCorpus test (200)" + description: "200-case slice from BEIR test qrels" + limit: 200 + corpus_limit: 5000 + seed: 0x5eed2025 + - id: quora + label: "Quora (IR)" + category: "Quora" + entity_suffix: "Quora" + source_prefix: "quora" + raw: "data/raw/quora" + converted: "data/converted/quora-minne.json" + include_unanswerable: false + slices: + - id: quora-test-200 + label: "Quora test (200)" + description: "200-case slice from BEIR test qrels" + limit: 200 + corpus_limit: 5000 + seed: 0x5eed2025 + - id: trec-covid + label: "TREC-COVID (BEIR)" + category: "TREC-COVID" + entity_suffix: "TREC-COVID" + source_prefix: "trec-covid" + raw: "data/raw/trec-covid" + converted: "data/converted/trec-covid-minne.json" + include_unanswerable: false + slices: + - id: trec-covid-test-200 + label: "TREC-COVID test (200)" + description: "200-case slice from BEIR test qrels" + limit: 200 + corpus_limit: 5000 + seed: 0x5eed2025 diff --git a/eval/src/args.rs b/eval/src/args.rs index 73f4d64..f9b0b5b 100644 --- a/eval/src/args.rs +++ b/eval/src/args.rs @@ -347,6 +347,10 @@ impl Config { self.retrieval.require_verified_chunks = true; } + if self.dataset == DatasetKind::Beir { + self.negative_multiplier = 9.0; + } + // Validations if self.ingest_chunk_min_tokens == 0 || self.ingest_chunk_min_tokens >= self.ingest_chunk_max_tokens diff --git a/eval/src/datasets/beir.rs b/eval/src/datasets/beir.rs new file mode 100644 index 0000000..d150b65 --- /dev/null +++ b/eval/src/datasets/beir.rs @@ -0,0 +1,341 @@ +use std::{ + collections::{BTreeMap, HashMap}, + fs::File, + io::{BufRead, BufReader}, + path::{Path, PathBuf}, +}; + +use anyhow::{anyhow, Context, Result}; +use serde::Deserialize; +use tracing::warn; + +use super::{ConvertedParagraph, ConvertedQuestion, DatasetKind}; + +const ANSWER_SNIPPET_CHARS: usize = 240; + +#[derive(Debug, Deserialize)] +struct BeirCorpusRow { + #[serde(rename = "_id")] + id: String, + 
+    #[serde(default)]
+    title: Option<String>,
+    #[serde(default)]
+    text: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+struct BeirQueryRow {
+    #[serde(rename = "_id")]
+    id: String,
+    text: String,
+}
+
+#[derive(Debug, Clone)]
+struct BeirParagraph {
+    title: String,
+    context: String,
+}
+
+#[derive(Debug, Clone)]
+struct BeirQuery {
+    text: String,
+}
+
+#[derive(Debug, Clone)]
+struct QrelEntry {
+    doc_id: String,
+    score: i32,
+}
+
+pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<ConvertedParagraph>> {
+    let corpus_path = raw_dir.join("corpus.jsonl");
+    let queries_path = raw_dir.join("queries.jsonl");
+    let qrels_path = resolve_qrels_path(raw_dir)?;
+
+    let corpus = load_corpus(&corpus_path)?;
+    let queries = load_queries(&queries_path)?;
+    let qrels = load_qrels(&qrels_path)?;
+
+    let mut paragraphs = Vec::with_capacity(corpus.len());
+    let mut paragraph_index = HashMap::new();
+
+    for (doc_id, entry) in corpus.iter() {
+        let paragraph_id = format!("{}-{doc_id}", dataset.source_prefix());
+        let paragraph = ConvertedParagraph {
+            id: paragraph_id.clone(),
+            title: entry.title.clone(),
+            context: entry.context.clone(),
+            questions: Vec::new(),
+        };
+        paragraph_index.insert(doc_id.clone(), paragraphs.len());
+        paragraphs.push(paragraph);
+    }
+
+    let mut missing_queries = 0usize;
+    let mut missing_docs = 0usize;
+    let mut skipped_answers = 0usize;
+
+    for (query_id, entries) in qrels {
+        let query = match queries.get(&query_id) {
+            Some(query) => query,
+            None => {
+                missing_queries += 1;
+                warn!(query_id = %query_id, "Skipping qrels entry for missing query");
+                continue;
+            }
+        };
+
+        let best = match select_best_doc(&entries) {
+            Some(entry) => entry,
+            None => continue,
+        };
+
+        let paragraph_slot = match paragraph_index.get(&best.doc_id) {
+            Some(slot) => *slot,
+            None => {
+                missing_docs += 1;
+                warn!(
+                    query_id = %query_id,
+                    doc_id = %best.doc_id,
+                    "Skipping qrels entry referencing missing corpus document"
+                );
+                continue;
+            }
+        };
+
+        let answer = answer_snippet(&paragraphs[paragraph_slot].context);
+        let answers = match answer {
+            Some(snippet) => vec![snippet],
+            None => {
+                skipped_answers += 1;
+                warn!(
+                    query_id = %query_id,
+                    doc_id = %best.doc_id,
+                    "Skipping query because no non-empty answer snippet could be derived"
+                );
+                continue;
+            }
+        };
+
+        let question_id = format!("{}-{query_id}", dataset.source_prefix());
+        paragraphs[paragraph_slot]
+            .questions
+            .push(ConvertedQuestion {
+                id: question_id,
+                question: query.text.clone(),
+                answers,
+                is_impossible: false,
+            });
+    }
+
+    if missing_queries + missing_docs + skipped_answers > 0 {
+        warn!(
+            missing_queries,
+            missing_docs, skipped_answers, "Skipped some BEIR qrels entries during conversion"
+        );
+    }
+
+    Ok(paragraphs)
+}
+
+fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
+    let qrels_dir = raw_dir.join("qrels");
+    let candidates = ["test.tsv", "dev.tsv", "train.tsv"];
+
+    for name in candidates {
+        let candidate = qrels_dir.join(name);
+        if candidate.exists() {
+            return Ok(candidate);
+        }
+    }
+
+    Err(anyhow!(
+        "No qrels file found under {}; expected one of {:?}",
+        qrels_dir.display(),
+        candidates
+    ))
+}
+
+fn load_corpus(path: &Path) -> Result<BTreeMap<String, BeirParagraph>> {
+    let file =
+        File::open(path).with_context(|| format!("opening BEIR corpus at {}", path.display()))?;
+    let reader = BufReader::new(file);
+    let mut corpus = BTreeMap::new();
+
+    for (idx, line) in reader.lines().enumerate() {
+        let raw = line
+            .with_context(|| format!("reading corpus line {} from {}", idx + 1, path.display()))?;
+        if raw.trim().is_empty() {
+            continue;
+        }
+        let row: BeirCorpusRow = serde_json::from_str(&raw).with_context(|| {
+            format!(
+                "parsing corpus JSON on line {} from {}",
+                idx + 1,
+                path.display()
+            )
+        })?;
+        let title = row.title.unwrap_or_else(|| row.id.clone());
+        let text = row.text.unwrap_or_default();
+        let context = build_context(&title, &text);
+
+        if context.is_empty() {
+            warn!(doc_id = %row.id, "Skipping empty corpus document");
+            continue;
+        }
+
+        corpus.insert(row.id, BeirParagraph { title, context });
+    }
+
+    Ok(corpus)
+}
+
+fn load_queries(path: &Path) -> Result<BTreeMap<String, BeirQuery>> {
+    let file = File::open(path)
+        .with_context(|| format!("opening BEIR queries file at {}", path.display()))?;
+    let reader = BufReader::new(file);
+    let mut queries = BTreeMap::new();
+
+    for (idx, line) in reader.lines().enumerate() {
+        let raw = line
+            .with_context(|| format!("reading query line {} from {}", idx + 1, path.display()))?;
+        if raw.trim().is_empty() {
+            continue;
+        }
+        let row: BeirQueryRow = serde_json::from_str(&raw).with_context(|| {
+            format!(
+                "parsing query JSON on line {} from {}",
+                idx + 1,
+                path.display()
+            )
+        })?;
+        queries.insert(
+            row.id,
+            BeirQuery {
+                text: row.text.trim().to_string(),
+            },
+        );
+    }
+
+    Ok(queries)
+}
+
+fn load_qrels(path: &Path) -> Result<BTreeMap<String, Vec<QrelEntry>>> {
+    let file =
+        File::open(path).with_context(|| format!("opening BEIR qrels at {}", path.display()))?;
+    let reader = BufReader::new(file);
+    let mut qrels: BTreeMap<String, Vec<QrelEntry>> = BTreeMap::new();
+
+    for (idx, line) in reader.lines().enumerate() {
+        let raw = line
+            .with_context(|| format!("reading qrels line {} from {}", idx + 1, path.display()))?;
+        let trimmed = raw.trim();
+        if trimmed.is_empty() || trimmed.starts_with("query-id") {
+            continue;
+        }
+        let mut parts = trimmed.split_whitespace();
+        let query_id = parts
+            .next()
+            .ok_or_else(|| anyhow!("missing query id on line {}", idx + 1))?;
+        let doc_id = parts
+            .next()
+            .ok_or_else(|| anyhow!("missing document id on line {}", idx + 1))?;
+        let score_raw = parts
+            .next()
+            .ok_or_else(|| anyhow!("missing score on line {}", idx + 1))?;
+        let score: i32 = score_raw.parse().with_context(|| {
+            format!(
+                "parsing qrels score '{}' on line {} from {}",
+                score_raw,
+                idx + 1,
+                path.display()
+            )
+        })?;
+
+        qrels
+            .entry(query_id.to_string())
+            .or_default()
+            .push(QrelEntry {
+                doc_id: doc_id.to_string(),
+                score,
+            });
+    }
+
+    Ok(qrels)
+}
+
+fn select_best_doc(entries: &[QrelEntry]) -> Option<&QrelEntry> {
+    entries
+        .iter()
+        .max_by(|a, b| a.score.cmp(&b.score).then_with(|| b.doc_id.cmp(&a.doc_id)))
+}
+
+fn answer_snippet(text: &str) -> Option<String> {
+    let trimmed = text.trim();
+    if trimmed.is_empty() {
+        return None;
+    }
+    let snippet: String = trimmed.chars().take(ANSWER_SNIPPET_CHARS).collect();
+    let snippet = snippet.trim();
+    if snippet.is_empty() {
+        None
+    } else {
+        Some(snippet.to_string())
+    }
+}
+
+fn build_context(title: &str, text: &str) -> String {
+    let title = title.trim();
+    let text = text.trim();
+
+    match (title.is_empty(), text.is_empty()) {
+        (true, true) => String::new(),
+        (true, false) => text.to_string(),
+        (false, true) => title.to_string(),
+        (false, false) => format!("{title}\n\n{text}"),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fs;
+    use tempfile::tempdir;
+
+    #[test]
+    fn converts_basic_beir_layout() {
+        let dir = tempdir().unwrap();
+        let corpus = r#"
+{"_id":"d1","title":"Doc 1","text":"Doc one has some text for testing."}
+{"_id":"d2","title":"Doc 2","text":"Second document content."}
+"#;
+        let queries = r#"
+{"_id":"q1","text":"What is in doc one?"}
+"#;
+        let qrels = "query-id\tcorpus-id\tscore\nq1\td1\t2\n";
+
+        fs::write(dir.path().join("corpus.jsonl"), corpus.trim()).unwrap();
+        fs::write(dir.path().join("queries.jsonl"), queries.trim()).unwrap();
+        fs::create_dir_all(dir.path().join("qrels")).unwrap();
+        fs::write(dir.path().join("qrels/test.tsv"), qrels).unwrap();
+
+        let paragraphs = convert_beir(dir.path(), DatasetKind::Fever).unwrap();
+
+        assert_eq!(paragraphs.len(), 2);
+        let doc_one = paragraphs
+            .iter()
+            .find(|p| p.id == "fever-d1")
+            .expect("missing paragraph for d1");
+        assert_eq!(doc_one.questions.len(), 1);
+        let question = &doc_one.questions[0];
+        assert_eq!(question.id, "fever-q1");
+        assert!(!question.answers.is_empty());
+        assert!(doc_one.context.contains(&question.answers[0]));
+
+        let doc_two = paragraphs
+            .iter()
+            .find(|p| p.id == "fever-d2")
.expect("missing paragraph for d2"); + assert!(doc_two.questions.is_empty()); + } +} diff --git a/eval/src/datasets/mod.rs b/eval/src/datasets/mod.rs index 84991e3..bef38a7 100644 --- a/eval/src/datasets/mod.rs +++ b/eval/src/datasets/mod.rs @@ -1,3 +1,4 @@ +mod beir; mod nq; mod squad; @@ -10,10 +11,10 @@ use std::{ use anyhow::{anyhow, bail, Context, Result}; use chrono::{DateTime, TimeZone, Utc}; +use clap::ValueEnum; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; use tracing::warn; -use clap::ValueEnum; const MANIFEST_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/manifest.yaml"); static DATASET_CATALOG: OnceCell = OnceCell::new(); @@ -248,6 +249,19 @@ fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> { pub enum DatasetKind { SquadV2, NaturalQuestions, + Beir, + #[value(name = "fever")] + Fever, + #[value(name = "fiqa")] + Fiqa, + #[value(name = "hotpotqa", alias = "hotpot-qa")] + HotpotQa, + #[value(name = "nfcorpus", alias = "nf-corpus")] + Nfcorpus, + #[value(name = "quora")] + Quora, + #[value(name = "trec-covid", alias = "treccovid", alias = "trec_covid")] + TrecCovid, } impl DatasetKind { @@ -255,6 +269,13 @@ impl DatasetKind { match self { Self::SquadV2 => "squad-v2", Self::NaturalQuestions => "natural-questions-dev", + Self::Beir => "beir", + Self::Fever => "fever", + Self::Fiqa => "fiqa", + Self::HotpotQa => "hotpotqa", + Self::Nfcorpus => "nfcorpus", + Self::Quora => "quora", + Self::TrecCovid => "trec-covid", } } @@ -262,6 +283,13 @@ impl DatasetKind { match self { Self::SquadV2 => "SQuAD v2.0", Self::NaturalQuestions => "Natural Questions (dev)", + Self::Beir => "BEIR mix", + Self::Fever => "FEVER (BEIR)", + Self::Fiqa => "FiQA-2018 (BEIR)", + Self::HotpotQa => "HotpotQA (BEIR)", + Self::Nfcorpus => "NFCorpus (BEIR)", + Self::Quora => "Quora (IR)", + Self::TrecCovid => "TREC-COVID (BEIR)", } } @@ -269,6 +297,13 @@ impl DatasetKind { match self { Self::SquadV2 => "SQuAD v2.0", 
Self::NaturalQuestions => "Natural Questions", + Self::Beir => "BEIR", + Self::Fever => "FEVER", + Self::Fiqa => "FiQA-2018", + Self::HotpotQa => "HotpotQA", + Self::Nfcorpus => "NFCorpus", + Self::Quora => "Quora", + Self::TrecCovid => "TREC-COVID", } } @@ -276,6 +311,13 @@ impl DatasetKind { match self { Self::SquadV2 => "SQuAD", Self::NaturalQuestions => "Natural Questions", + Self::Beir => "BEIR", + Self::Fever => "FEVER", + Self::Fiqa => "FiQA", + Self::HotpotQa => "HotpotQA", + Self::Nfcorpus => "NFCorpus", + Self::Quora => "Quora", + Self::TrecCovid => "TREC-COVID", } } @@ -283,6 +325,13 @@ impl DatasetKind { match self { Self::SquadV2 => "squad", Self::NaturalQuestions => "nq", + Self::Beir => "beir", + Self::Fever => "fever", + Self::Fiqa => "fiqa", + Self::HotpotQa => "hotpotqa", + Self::Nfcorpus => "nfcorpus", + Self::Quora => "quora", + Self::TrecCovid => "trec-covid", } } @@ -320,13 +369,29 @@ impl FromStr for DatasetKind { "nq" | "natural-questions" | "natural_questions" | "natural-questions-dev" => { Ok(Self::NaturalQuestions) } + "beir" => Ok(Self::Beir), + "fever" => Ok(Self::Fever), + "fiqa" | "fiqa-2018" => Ok(Self::Fiqa), + "hotpotqa" | "hotpot-qa" => Ok(Self::HotpotQa), + "nfcorpus" | "nf-corpus" => Ok(Self::Nfcorpus), + "quora" => Ok(Self::Quora), + "trec-covid" | "treccovid" | "trec_covid" => Ok(Self::TrecCovid), other => { - anyhow::bail!("unknown dataset '{other}'. Expected 'squad' or 'natural-questions'.") + anyhow::bail!("unknown dataset '{other}'. 
Expected one of: squad, natural-questions, beir, fever, fiqa, hotpotqa, nfcorpus, quora, trec-covid.")
             }
         }
     }
 }
 
+pub const BEIR_DATASETS: [DatasetKind; 6] = [
+    DatasetKind::Fever,
+    DatasetKind::Fiqa,
+    DatasetKind::HotpotQa,
+    DatasetKind::Nfcorpus,
+    DatasetKind::Quora,
+    DatasetKind::TrecCovid,
+];
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct DatasetMetadata {
     pub id: String,
@@ -410,6 +475,13 @@ pub fn convert(
         DatasetKind::NaturalQuestions => {
             nq::convert_nq(raw_path, include_unanswerable, context_token_limit)?
         }
+        DatasetKind::Beir => convert_beir_mix(include_unanswerable, context_token_limit)?,
+        DatasetKind::Fever
+        | DatasetKind::Fiqa
+        | DatasetKind::HotpotQa
+        | DatasetKind::Nfcorpus
+        | DatasetKind::Quora
+        | DatasetKind::TrecCovid => beir::convert_beir(raw_path, dataset)?,
     };
 
     let metadata_limit = match dataset {
@@ -417,14 +489,37 @@
         _ => context_token_limit,
     };
 
+    let source_label = match dataset {
+        DatasetKind::Beir => "beir-mix".to_string(),
+        _ => raw_path.display().to_string(),
+    };
+
     Ok(ConvertedDataset {
         generated_at: Utc::now(),
         metadata: DatasetMetadata::for_kind(dataset, include_unanswerable, metadata_limit),
-        source: raw_path.display().to_string(),
+        source: source_label,
         paragraphs,
     })
 }
 
+fn convert_beir_mix(
+    include_unanswerable: bool,
+    _context_token_limit: Option<usize>,
+) -> Result<Vec<ConvertedParagraph>> {
+    if include_unanswerable {
+        warn!("BEIR mix ignores include_unanswerable flag; all questions are answerable");
+    }
+
+    let mut paragraphs = Vec::new();
+    for subset in BEIR_DATASETS {
+        let entry = dataset_entry_for_kind(subset)?;
+        let subset_paragraphs = beir::convert_beir(&entry.raw_path, subset)?;
+        paragraphs.extend(subset_paragraphs);
+    }
+
+    Ok(paragraphs)
+}
+
 fn ensure_parent(path: &Path) -> Result<()> {
     if let Some(parent) = path.parent() {
         fs::create_dir_all(parent)
diff --git a/eval/src/eval/pipeline/stages/run_queries.rs b/eval/src/eval/pipeline/stages/run_queries.rs
index 34e7a41..b2a6f08 100644
---
a/eval/src/eval/pipeline/stages/run_queries.rs +++ b/eval/src/eval/pipeline/stages/run_queries.rs @@ -6,8 +6,8 @@ use futures::stream::{self, StreamExt}; use tracing::{debug, info}; use crate::eval::{ - adapt_strategy_output, build_case_diagnostics, - text_contains_answer, CaseDiagnostics, CaseSummary, RetrievedSummary, + adapt_strategy_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics, + CaseSummary, RetrievedSummary, }; use retrieval_pipeline::{ pipeline::{self, PipelineStageTimings, RetrievalConfig}, diff --git a/eval/src/ingest/orchestrator.rs b/eval/src/ingest/orchestrator.rs index af3671a..5f26a96 100644 --- a/eval/src/ingest/orchestrator.rs +++ b/eval/src/ingest/orchestrator.rs @@ -26,7 +26,6 @@ use uuid::Uuid; use crate::{ datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion}, - db_helpers::change_embedding_length_in_hnsw_indexes, slices::{self, ResolvedSlice, SliceParagraphKind}, }; @@ -417,10 +416,6 @@ async fn ingest_paragraph_batch( .await .context("applying migrations for ingestion")?; - change_embedding_length_in_hnsw_indexes(&db, embedding_dimension) - .await - .context("failed setting new hnsw length")?; - let mut app_config = AppConfig::default(); app_config.storage = StorageKind::Memory; let backend: DynStore = Arc::new(InMemory::new()); diff --git a/eval/src/main.rs b/eval/src/main.rs index c03c9ab..aefe6f9 100644 --- a/eval/src/main.rs +++ b/eval/src/main.rs @@ -93,7 +93,6 @@ async fn async_main() -> anyhow::Result<()> { // Clap handles help automatically, so we don't need to check for it manually - if parsed.config.inspect_question.is_some() { inspection::inspect_question(&parsed.config).await?; return Ok(()); diff --git a/eval/src/perf.rs b/eval/src/perf.rs index f44b047..e4e61b5 100644 --- a/eval/src/perf.rs +++ b/eval/src/perf.rs @@ -145,6 +145,8 @@ mod tests { precision_at_1: 0.5, precision_at_2: 0.5, precision_at_3: 0.5, + mrr: 0.0, + average_ndcg: 0.0, duration_ms: 1234, dataset_id: "squad-v2".into(), 
dataset_label: "SQuAD v2".into(), @@ -192,18 +194,17 @@ mod tests { rerank_pool_size: Some(4), rerank_keep_top: 10, concurrency: 2, - retrieval_strategy: "initial".into(), detailed_report: false, + retrieval_strategy: "initial".into(), + chunk_result_cap: 5, ingest_chunk_min_tokens: 256, ingest_chunk_max_tokens: 512, - ingest_chunk_overlap_tokens: 50, ingest_chunks_only: false, + ingest_chunk_overlap_tokens: 50, chunk_vector_take: 20, chunk_fts_take: 20, chunk_avg_chars_per_token: 4, max_chunks_per_entity: 4, - average_ndcg: 0.0, - mrr: 0.0, cases: Vec::new(), } } diff --git a/eval/src/report.rs b/eval/src/report.rs index b47c7e4..8084c3a 100644 --- a/eval/src/report.rs +++ b/eval/src/report.rs @@ -88,6 +88,10 @@ pub struct RetrievalSection { pub rerank_pool_size: Option, pub rerank_keep_top: usize, pub chunk_result_cap: usize, + #[serde(default)] + pub chunk_vector_take: usize, + #[serde(default)] + pub chunk_fts_take: usize, pub ingest_chunk_min_tokens: usize, pub ingest_chunk_max_tokens: usize, pub ingest_chunk_overlap_tokens: usize, @@ -202,6 +206,8 @@ impl EvaluationReport { rerank_pool_size: summary.rerank_pool_size, rerank_keep_top: summary.rerank_keep_top, chunk_result_cap: summary.chunk_result_cap, + chunk_vector_take: summary.chunk_vector_take, + chunk_fts_take: summary.chunk_fts_take, ingest_chunk_min_tokens: summary.ingest_chunk_min_tokens, ingest_chunk_max_tokens: summary.ingest_chunk_max_tokens, ingest_chunk_overlap_tokens: summary.ingest_chunk_overlap_tokens, @@ -467,10 +473,7 @@ fn render_markdown(report: &EvaluationReport) -> String { report.retrieval.precision_at_2, report.retrieval.precision_at_3 )); - md.push_str(&format!( - "| MRR | {:.3} |\\n", - report.retrieval.mrr - )); + md.push_str(&format!("| MRR | {:.3} |\\n", report.retrieval.mrr)); md.push_str(&format!( "| NDCG | {:.3} |\\n", report.retrieval.average_ndcg @@ -632,7 +635,9 @@ fn render_markdown(report: &EvaluationReport) -> String { if report.detailed_report { md.push_str("All LLM-only 
cases matched within the evaluation window.\\n"); } else { - md.push_str("LLM-only cases omitted. Re-run with `--detailed-report` to see samples.\\n"); + md.push_str( + "LLM-only cases omitted. Re-run with `--detailed-report` to see samples.\\n", + ); } } else { md.push_str("| Question ID | Answered | Match Rank | Top Retrieved |\\n"); @@ -851,6 +856,8 @@ fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport { rerank_pool_size: entry.rerank_pool_size, rerank_keep_top: entry.rerank_keep_top, chunk_result_cap: entry.chunk_result_cap.unwrap_or(5), + chunk_vector_take: 0, + chunk_fts_take: 0, ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256), ingest_chunk_max_tokens: entry.ingest_chunk_max_tokens.unwrap_or(512), ingest_chunk_overlap_tokens: entry.ingest_chunk_overlap_tokens.unwrap_or(50), @@ -1126,8 +1133,7 @@ mod tests { let tmp = tempdir().unwrap(); let summary = sample_summary(false); - let outcome = - write_reports(&summary, tmp.path(), 5).expect("writing consolidated reports"); + let outcome = write_reports(&summary, tmp.path(), 5).expect("writing consolidated reports"); let contents = std::fs::read_to_string(&outcome.history_path).expect("reading evaluations history"); let entries: Vec = diff --git a/eval/src/slices.rs b/eval/src/slices.rs index 79523eb..014f82a 100644 --- a/eval/src/slices.rs +++ b/eval/src/slices.rs @@ -1,5 +1,5 @@ use std::{ - collections::{HashMap, HashSet}, + collections::{HashMap, HashSet, VecDeque}, fs, path::{Path, PathBuf}, }; @@ -11,7 +11,9 @@ use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use tracing::{info, warn}; -use crate::datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion}; +use crate::datasets::{ + ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind, BEIR_DATASETS, +}; const SLICE_VERSION: u32 = 2; pub const DEFAULT_NEGATIVE_MULTIPLIER: f32 = 4.0; @@ -526,7 +528,7 @@ fn ensure_case_capacity( return Ok(false); } - let question_refs = 
ordered_question_refs(dataset, params)?;
+    let question_refs = ordered_question_refs(dataset, params, target_cases)?;
     let mut existing_questions: HashSet<String> = manifest
         .cases
         .iter()
@@ -599,7 +601,12 @@
 fn ordered_question_refs(
     dataset: &ConvertedDataset,
     params: &BuildParams,
+    target_cases: usize,
 ) -> Result<Vec<(usize, usize)>> {
+    if dataset.metadata.id == DatasetKind::Beir.id() {
+        return ordered_question_refs_beir(dataset, params, target_cases);
+    }
+
     let mut question_refs = Vec::new();
     for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
         for (q_idx, question) in paragraph.questions.iter().enumerate() {
@@ -626,6 +633,170 @@
     Ok(question_refs)
 }
 
+fn ordered_question_refs_beir(
+    dataset: &ConvertedDataset,
+    params: &BuildParams,
+    target_cases: usize,
+) -> Result<Vec<(usize, usize)>> {
+    let prefixes: Vec<&str> = BEIR_DATASETS
+        .iter()
+        .map(|kind| kind.source_prefix())
+        .collect();
+
+    let mut grouped: HashMap<&str, Vec<(usize, usize)>> = HashMap::new();
+    for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
+        for (q_idx, question) in paragraph.questions.iter().enumerate() {
+            let include = if params.include_impossible {
+                true
+            } else {
+                !question.is_impossible && !question.answers.is_empty()
+            };
+            if !include {
+                continue;
+            }
+
+            let Some(prefix) = question_prefix(&question.id) else {
+                warn!(
+                    question_id = %question.id,
+                    "Skipping BEIR question without expected prefix"
+                );
+                continue;
+            };
+            if !prefixes.contains(&prefix) {
+                warn!(
+                    question_id = %question.id,
+                    prefix = %prefix,
+                    "Skipping BEIR question with unknown subset prefix"
+                );
+                continue;
+            }
+            grouped.entry(prefix).or_default().push((p_idx, q_idx));
+        }
+    }
+
+    if grouped.values().all(|entries| entries.is_empty()) {
+        return Err(anyhow!(
+            "no eligible BEIR questions found; cannot build slice"
+        ));
+    }
+
+    for prefix in &prefixes {
+        if let Some(entries) = grouped.get_mut(prefix) {
+            let seed = mix_seed(
+                &format!("{}::{prefix}", dataset.metadata.id),
+                params.base_seed,
+            );
+            let mut rng = StdRng::seed_from_u64(seed);
+            entries.shuffle(&mut rng);
+        }
+    }
+
+    let dataset_count = prefixes.len().max(1);
+    let base_quota = target_cases / dataset_count;
+    let mut remainder = target_cases % dataset_count;
+
+    let mut quotas: HashMap<&str, usize> = HashMap::new();
+    for prefix in &prefixes {
+        let mut quota = base_quota;
+        if remainder > 0 {
+            quota += 1;
+            remainder -= 1;
+        }
+        quotas.insert(*prefix, quota);
+    }
+
+    let mut take_counts: HashMap<&str, usize> = HashMap::new();
+    let mut spare_slots: HashMap<&str, usize> = HashMap::new();
+    let mut shortfall = 0usize;
+
+    for prefix in &prefixes {
+        let available = grouped.get(prefix).map(|v| v.len()).unwrap_or(0);
+        let quota = *quotas.get(prefix).unwrap_or(&0);
+        let take = quota.min(available);
+        let missing = quota.saturating_sub(take);
+        shortfall += missing;
+        take_counts.insert(*prefix, take);
+        spare_slots.insert(*prefix, available.saturating_sub(take));
+    }
+
+    while shortfall > 0 {
+        let mut allocated = false;
+        for prefix in &prefixes {
+            if shortfall == 0 {
+                break;
+            }
+            let spare = spare_slots.get(prefix).copied().unwrap_or(0);
+            if spare == 0 {
+                continue;
+            }
+            if let Some(count) = take_counts.get_mut(prefix) {
+                *count += 1;
+            }
+            spare_slots.insert(*prefix, spare - 1);
+            shortfall = shortfall.saturating_sub(1);
+            allocated = true;
+        }
+        if !allocated {
+            break;
+        }
+    }
+
+    let mut queues: Vec<VecDeque<(usize, usize)>> = Vec::new();
+    let mut total_selected = 0usize;
+    for prefix in &prefixes {
+        let take = *take_counts.get(prefix).unwrap_or(&0);
+        let mut deque = VecDeque::new();
+        if let Some(entries) = grouped.get(prefix) {
+            for item in entries.iter().take(take) {
+                deque.push_back(*item);
+                total_selected += 1;
+            }
+        }
+        queues.push(deque);
+    }
+
+    if total_selected < target_cases {
+        warn!(
+            requested = target_cases,
+            available = total_selected,
+            "BEIR mix requested more questions than available after balancing; continuing with capped set"
+        );
+    }
+
+    let mut output = Vec::with_capacity(total_selected);
+    loop {
+        let mut progressed = false;
+        for queue in queues.iter_mut() {
+            if let Some(item) = queue.pop_front() {
+                output.push(item);
+                progressed = true;
+            }
+        }
+        if !progressed {
+            break;
+        }
+    }
+
+    if output.is_empty() {
+        return Err(anyhow!(
+            "no eligible BEIR questions found; cannot build slice"
+        ));
+    }
+
+    Ok(output)
+}
+
+fn question_prefix(question_id: &str) -> Option<&'static str> {
+    for prefix in BEIR_DATASETS.iter().map(|kind| kind.source_prefix()) {
+        if let Some(rest) = question_id.strip_prefix(prefix) {
+            if rest.starts_with('-') {
+                return Some(prefix);
+            }
+        }
+    }
+    None
+}
+
 fn ensure_negative_pool(
     dataset: &ConvertedDataset,
     manifest: &mut SliceManifest,
@@ -981,4 +1152,65 @@ mod tests {
             .any(|entry| entry.id == positive_ids[0]));
         Ok(())
     }
+
+    #[test]
+    fn beir_mix_balances_and_rebalances() -> Result<()> {
+        let mut paragraphs = Vec::new();
+        let counts = [
+            ("fever", 1usize),
+            ("fiqa", 2usize),
+            ("hotpotqa", 1usize),
+            ("nfcorpus", 0usize),
+            ("quora", 3usize),
+            ("trec-covid", 2usize),
+        ];
+
+        for (prefix, count) in counts {
+            for idx in 0..count {
+                let q_id = format!("{prefix}-q{idx}");
+                paragraphs.push(ConvertedParagraph {
+                    id: format!("{prefix}-p{idx}"),
+                    title: format!("{prefix} title"),
+                    context: format!("{prefix} context {idx}"),
+                    questions: vec![ConvertedQuestion {
+                        id: q_id,
+                        question: format!("{prefix} question {idx}"),
+                        answers: vec!["answer".to_string()],
+                        is_impossible: false,
+                    }],
+                });
+            }
+        }
+
+        let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, false, None);
+        let dataset = ConvertedDataset {
+            generated_at: Utc::now(),
+            metadata,
+            source: "beir-mix".to_string(),
+            paragraphs,
+        };
+
+        let params = BuildParams {
+            include_impossible: false,
+            base_seed: 0xAA,
+            rng_seed: 0xBB,
+        };
+
+        let refs = ordered_question_refs_beir(&dataset, &params, 8)?;
+        let mut per_prefix: HashMap<String, usize> = HashMap::new();
+        for (p_idx, q_idx) in refs {
+            let question
= &dataset.paragraphs[p_idx].questions[q_idx]; + let prefix = question_prefix(&question.id).unwrap_or("unknown"); + *per_prefix.entry(prefix.to_string()).or_default() += 1; + } + + assert_eq!(per_prefix.get("fever").copied().unwrap_or(0), 1); + assert_eq!(per_prefix.get("fiqa").copied().unwrap_or(0), 2); + assert_eq!(per_prefix.get("hotpotqa").copied().unwrap_or(0), 1); + assert_eq!(per_prefix.get("nfcorpus").copied().unwrap_or(0), 0); + assert_eq!(per_prefix.get("quora").copied().unwrap_or(0), 2); + assert_eq!(per_prefix.get("trec-covid").copied().unwrap_or(0), 2); + + Ok(()) + } }