dataset: beir

2026-07-10 14:52:45 +02:00 · 2025-12-04 17:50:35 +01:00
parent d3fa3be3e5
commit d1a6d9abdf
10 changed files with 803 additions and 25 deletions
@@ -31,3 +31,108 @@ datasets:
        corpus_limit: 2000
        include_unanswerable: false
        seed: 0x5eed2025
  - id: beir
    label: "BEIR mix"
    category: "BEIR"
    entity_suffix: "BEIR"
    source_prefix: "beir"
    raw: "data/raw/beir"
    converted: "data/converted/beir-minne.json"
    include_unanswerable: false
    slices:
      - id: beir-mix-600
        label: "BEIR mix (600)"
        description: "Balanced slice across FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID"
        limit: 600
        corpus_limit: 6000
        seed: 0x5eed2025
  - id: fever
    label: "FEVER (BEIR)"
    category: "FEVER"
    entity_suffix: "FEVER"
    source_prefix: "fever"
    raw: "data/raw/fever"
    converted: "data/converted/fever-minne.json"
    include_unanswerable: false
    slices:
      - id: fever-test-200
        label: "FEVER test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
  - id: fiqa
    label: "FiQA-2018 (BEIR)"
    category: "FiQA-2018"
    entity_suffix: "FiQA"
    source_prefix: "fiqa"
    raw: "data/raw/fiqa"
    converted: "data/converted/fiqa-minne.json"
    include_unanswerable: false
    slices:
      - id: fiqa-test-200
        label: "FiQA test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
  - id: hotpotqa
    label: "HotpotQA (BEIR)"
    category: "HotpotQA"
    entity_suffix: "HotpotQA"
    source_prefix: "hotpotqa"
    raw: "data/raw/hotpotqa"
    converted: "data/converted/hotpotqa-minne.json"
    include_unanswerable: false
    slices:
      - id: hotpotqa-test-200
        label: "HotpotQA test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
  - id: nfcorpus
    label: "NFCorpus (BEIR)"
    category: "NFCorpus"
    entity_suffix: "NFCorpus"
    source_prefix: "nfcorpus"
    raw: "data/raw/nfcorpus"
    converted: "data/converted/nfcorpus-minne.json"
    include_unanswerable: false
    slices:
      - id: nfcorpus-test-200
        label: "NFCorpus test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
  - id: quora
    label: "Quora (IR)"
    category: "Quora"
    entity_suffix: "Quora"
    source_prefix: "quora"
    raw: "data/raw/quora"
    converted: "data/converted/quora-minne.json"
    include_unanswerable: false
    slices:
      - id: quora-test-200
        label: "Quora test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
  - id: trec-covid
    label: "TREC-COVID (BEIR)"
    category: "TREC-COVID"
    entity_suffix: "TREC-COVID"
    source_prefix: "trec-covid"
    raw: "data/raw/trec-covid"
    converted: "data/converted/trec-covid-minne.json"
    include_unanswerable: false
    slices:
      - id: trec-covid-test-200
        label: "TREC-COVID test (200)"
        description: "200-case slice from BEIR test qrels"
        limit: 200
        corpus_limit: 5000
        seed: 0x5eed2025
@@ -347,6 +347,10 @@ impl Config {
            self.retrieval.require_verified_chunks = true;
        }
        if self.dataset == DatasetKind::Beir {
            self.negative_multiplier = 9.0;
        }
        // Validations
        if self.ingest_chunk_min_tokens == 0
            || self.ingest_chunk_min_tokens >= self.ingest_chunk_max_tokens
@@ -0,0 +1,341 @@
 use std::{
    collections::{BTreeMap, HashMap},
    fs::File,
    io::{BufRead, BufReader},
    path::{Path, PathBuf},
 };
 use anyhow::{anyhow, Context, Result};
 use serde::Deserialize;
 use tracing::warn;
 use super::{ConvertedParagraph, ConvertedQuestion, DatasetKind};
 /// Maximum number of characters `answer_snippet` keeps from the start of a paragraph context.
 const ANSWER_SNIPPET_CHARS: usize = 240;
 /// One JSON line of the corpus file, as deserialized by `load_corpus`.
 #[derive(Debug, Deserialize)]
 struct BeirCorpusRow {
    /// Document identifier, read from the JSON `_id` key.
    #[serde(rename = "_id")]
    id: String,
    /// Document title; `None` when the key is absent.
    #[serde(default)]
    title: Option<String>,
    /// Document body; `None` when the key is absent.
    #[serde(default)]
    text: Option<String>,
 }
 /// One JSON line of the queries file, as deserialized by `load_queries`.
 #[derive(Debug, Deserialize)]
 struct BeirQueryRow {
    /// Query identifier, read from the JSON `_id` key.
    #[serde(rename = "_id")]
    id: String,
    /// Raw query text (required; trimmed by `load_queries` before use).
    text: String,
 }
 /// A corpus document after normalization in `load_corpus`.
 #[derive(Debug, Clone)]
 struct BeirParagraph {
    /// Title from the corpus row, or the document id when the row had no title.
    title: String,
    /// Text produced by `build_context` from the title and body; never empty for stored entries.
    context: String,
 }
 /// A query loaded from the queries file.
 #[derive(Debug, Clone)]
 struct BeirQuery {
    /// Query text with surrounding whitespace removed.
    text: String,
 }
 /// One relevance judgement parsed from a qrels line (the query id is the map key in `load_qrels`).
 #[derive(Debug, Clone)]
 struct QrelEntry {
    /// Identifier of the judged corpus document (second column of the qrels line).
    doc_id: String,
    /// Integer relevance score (third column of the qrels line).
    score: i32,
 }
 /// Converts a BEIR-style dataset directory into paragraphs with attached questions.
 ///
 /// Reads `corpus.jsonl` and `queries.jsonl` from `raw_dir`, plus the qrels file located by
 /// `resolve_qrels_path`. Every corpus document becomes one [`ConvertedParagraph`] whose id is
 /// `<source_prefix>-<doc id>`. Each judged query is attached, as a single answerable question,
 /// to the one document picked by `select_best_doc`; its answer is the leading snippet of that
 /// document's context (see `answer_snippet`).
 ///
 /// Qrels rows that reference an unknown query or document, or whose document yields no
 /// snippet, are skipped with a warning; a summary warning is logged when anything was skipped.
 ///
 /// # Errors
 /// Fails when no qrels file exists or when any of the three input files cannot be opened,
 /// read, or parsed.
 pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<ConvertedParagraph>> {
    let corpus_path = raw_dir.join("corpus.jsonl");
    let queries_path = raw_dir.join("queries.jsonl");
    let qrels_path = resolve_qrels_path(raw_dir)?;
    let corpus = load_corpus(&corpus_path)?;
    let queries = load_queries(&queries_path)?;
    let qrels = load_qrels(&qrels_path)?;
    let source_prefix = dataset.source_prefix();
    let mut paragraphs = Vec::with_capacity(corpus.len());
    let mut paragraph_index = HashMap::with_capacity(corpus.len());
    // The corpus map is not needed afterwards, so consume it: titles and contexts are moved
    // into the paragraphs instead of being cloned (BEIR corpora can hold millions of documents).
    for (doc_id, entry) in corpus {
        let id = format!("{source_prefix}-{doc_id}");
        paragraph_index.insert(doc_id, paragraphs.len());
        paragraphs.push(ConvertedParagraph {
            id,
            title: entry.title,
            context: entry.context,
            questions: Vec::new(),
        });
    }
    let mut missing_queries = 0usize;
    let mut missing_docs = 0usize;
    let mut skipped_answers = 0usize;
    for (query_id, entries) in qrels {
        let Some(query) = queries.get(&query_id) else {
            missing_queries += 1;
            warn!(query_id = %query_id, "Skipping qrels entry for missing query");
            continue;
        };
        // A query without any usable judgement is dropped silently.
        let Some(best) = select_best_doc(&entries) else {
            continue;
        };
        let Some(&paragraph_slot) = paragraph_index.get(&best.doc_id) else {
            missing_docs += 1;
            warn!(
                query_id = %query_id,
                doc_id = %best.doc_id,
                "Skipping qrels entry referencing missing corpus document"
            );
            continue;
        };
        let Some(snippet) = answer_snippet(&paragraphs[paragraph_slot].context) else {
            skipped_answers += 1;
            warn!(
                query_id = %query_id,
                doc_id = %best.doc_id,
                "Skipping query because no non-empty answer snippet could be derived"
            );
            continue;
        };
        let question_id = format!("{source_prefix}-{query_id}");
        paragraphs[paragraph_slot]
            .questions
            .push(ConvertedQuestion {
                id: question_id,
                question: query.text.clone(),
                answers: vec![snippet],
                is_impossible: false,
            });
    }
    if missing_queries + missing_docs + skipped_answers > 0 {
        warn!(
            missing_queries,
            missing_docs, skipped_answers, "Skipped some BEIR qrels entries during conversion"
        );
    }
    Ok(paragraphs)
 }
 /// Locates the qrels file to use inside `<raw_dir>/qrels`.
 ///
 /// Candidates are probed in the order `test.tsv`, `dev.tsv`, `train.tsv`; the first path that
 /// exists wins. Returns an error naming the directory and the candidates when none exists.
 fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
    let qrels_dir = raw_dir.join("qrels");
    let candidates = ["test.tsv", "dev.tsv", "train.tsv"];
    candidates
        .iter()
        .map(|name| qrels_dir.join(name))
        .find(|path| path.exists())
        .ok_or_else(|| {
            anyhow!(
                "No qrels file found under {}; expected one of {:?}",
                qrels_dir.display(),
                candidates
            )
        })
 }
 fn load_corpus(path: &Path) -> Result<BTreeMap<String, BeirParagraph>> {
    let file =
        File::open(path).with_context(|| format!("opening BEIR corpus at {}", path.display()))?;
    let reader = BufReader::new(file);
    let mut corpus = BTreeMap::new();
    for (idx, line) in reader.lines().enumerate() {
        let raw = line
            .with_context(|| format!("reading corpus line {} from {}", idx + 1, path.display()))?;
        if raw.trim().is_empty() {
            continue;
        }
        let row: BeirCorpusRow = serde_json::from_str(&raw).with_context(|| {
            format!(
                "parsing corpus JSON on line {} from {}",
                idx + 1,
                path.display()
            )
        })?;
        let title = row.title.unwrap_or_else(|| row.id.clone());
        let text = row.text.unwrap_or_default();
        let context = build_context(&title, &text);
        if context.is_empty() {
            warn!(doc_id = %row.id, "Skipping empty corpus document");
            continue;
        }
        corpus.insert(row.id, BeirParagraph { title, context });
    }
    Ok(corpus)
 }
 fn load_queries(path: &Path) -> Result<BTreeMap<String, BeirQuery>> {
    let file = File::open(path)
        .with_context(|| format!("opening BEIR queries file at {}", path.display()))?;
    let reader = BufReader::new(file);
    let mut queries = BTreeMap::new();
    for (idx, line) in reader.lines().enumerate() {
        let raw = line
            .with_context(|| format!("reading query line {} from {}", idx + 1, path.display()))?;
        if raw.trim().is_empty() {
            continue;
        }
        let row: BeirQueryRow = serde_json::from_str(&raw).with_context(|| {
            format!(
                "parsing query JSON on line {} from {}",
                idx + 1,
                path.display()
            )
        })?;
        queries.insert(
            row.id,
            BeirQuery {
                text: row.text.trim().to_string(),
            },
        );
    }
    Ok(queries)
 }
 fn load_qrels(path: &Path) -> Result<BTreeMap<String, Vec<QrelEntry>>> {
    let file =
        File::open(path).with_context(|| format!("opening BEIR qrels at {}", path.display()))?;
    let reader = BufReader::new(file);
    let mut qrels: BTreeMap<String, Vec<QrelEntry>> = BTreeMap::new();
    for (idx, line) in reader.lines().enumerate() {
        let raw = line
            .with_context(|| format!("reading qrels line {} from {}", idx + 1, path.display()))?;
        let trimmed = raw.trim();
        if trimmed.is_empty() || trimmed.starts_with("query-id") {
            continue;
        }
        let mut parts = trimmed.split_whitespace();
        let query_id = parts
            .next()
            .ok_or_else(|| anyhow!("missing query id on line {}", idx + 1))?;
        let doc_id = parts
            .next()
            .ok_or_else(|| anyhow!("missing document id on line {}", idx + 1))?;
        let score_raw = parts
            .next()
            .ok_or_else(|| anyhow!("missing score on line {}", idx + 1))?;
        let score: i32 = score_raw.parse().with_context(|| {
            format!(
                "parsing qrels score '{}' on line {} from {}",
                score_raw,
                idx + 1,
                path.display()
            )
        })?;
        qrels
            .entry(query_id.to_string())
            .or_default()
            .push(QrelEntry {
                doc_id: doc_id.to_string(),
                score,
            });
    }
    Ok(qrels)
 }
 /// Picks the judgement that should act as the positive document for a query.
 ///
 /// Only judgements with a strictly positive score are considered: qrels files may list
 /// documents that were judged and found not relevant (score `0` or below), and those must
 /// never be turned into a positive answer. Among the remaining entries the highest score
 /// wins; ties are broken in favour of the lexicographically smallest `doc_id` so the choice
 /// is deterministic.
 ///
 /// Returns `None` when `entries` is empty or contains no positively judged document.
 fn select_best_doc(entries: &[QrelEntry]) -> Option<&QrelEntry> {
    entries
        .iter()
        .filter(|entry| entry.score > 0)
        // The doc_id comparison is reversed so that the smaller id ranks as the maximum.
        .max_by(|a, b| a.score.cmp(&b.score).then_with(|| b.doc_id.cmp(&a.doc_id)))
 }
 /// Derives an answer snippet from `text`.
 ///
 /// The text is trimmed, cut to its first `ANSWER_SNIPPET_CHARS` characters (counted as
 /// `char`s, so multi-byte characters are never split) and trimmed again. Returns `None`
 /// when nothing but whitespace remains.
 fn answer_snippet(text: &str) -> Option<String> {
    let head: String = text.trim().chars().take(ANSWER_SNIPPET_CHARS).collect();
    match head.trim() {
        "" => None,
        snippet => Some(snippet.to_owned()),
    }
 }
 /// Combines a document title and body into one context string.
 ///
 /// Both parts are trimmed first. When both are present they are joined with a blank line;
 /// when only one is present it is returned alone; when neither is, the result is empty.
 fn build_context(title: &str, text: &str) -> String {
    let (title, text) = (title.trim(), text.trim());
    if title.is_empty() {
        // Also covers the "both empty" case, where `text` is the empty string.
        return text.to_string();
    }
    if text.is_empty() {
        return title.to_string();
    }
    format!("{title}\n\n{text}")
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Builds a minimal BEIR directory (two documents, one query, one judgement) in a temp
    /// dir and checks that the query lands on the judged document only.
    #[test]
    fn converts_basic_beir_layout() {
        let corpus = r#"
 {"_id":"d1","title":"Doc 1","text":"Doc one has some text for testing."}
 {"_id":"d2","title":"Doc 2","text":"Second document content."}
 "#;
        let queries = r#"
 {"_id":"q1","text":"What is in doc one?"}
 "#;
        let qrels = "query-id\tcorpus-id\tscore\nq1\td1\t2\n";

        // Lay the fixture files out the way `convert_beir` expects them.
        let root = tempdir().unwrap();
        let base = root.path();
        fs::write(base.join("corpus.jsonl"), corpus.trim()).unwrap();
        fs::write(base.join("queries.jsonl"), queries.trim()).unwrap();
        fs::create_dir_all(base.join("qrels")).unwrap();
        fs::write(base.join("qrels/test.tsv"), qrels).unwrap();

        let converted = convert_beir(base, DatasetKind::Fever).unwrap();
        assert_eq!(converted.len(), 2);

        // The judged document carries the question, with an answer taken from its context.
        let judged = converted
            .iter()
            .find(|p| p.id == "fever-d1")
            .expect("missing paragraph for d1");
        assert_eq!(judged.questions.len(), 1);
        let attached = &judged.questions[0];
        assert_eq!(attached.id, "fever-q1");
        assert!(!attached.answers.is_empty());
        assert!(judged.context.contains(&attached.answers[0]));

        // The unjudged document stays question-free.
        let unjudged = converted
            .iter()
            .find(|p| p.id == "fever-d2")
            .expect("missing paragraph for d2");
        assert!(unjudged.questions.is_empty());
    }
 }
@@ -1,3 +1,4 @@
 mod beir;
 mod nq;
 mod squad;
@@ -10,10 +11,10 @@ use std::{
 use anyhow::{anyhow, bail, Context, Result};
 use chrono::{DateTime, TimeZone, Utc};
 use clap::ValueEnum;
 use once_cell::sync::OnceCell;
 use serde::{Deserialize, Serialize};
 use tracing::warn;
 use clap::ValueEnum;
 const MANIFEST_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/manifest.yaml");
 static DATASET_CATALOG: OnceCell<DatasetCatalog> = OnceCell::new();
@@ -248,6 +249,19 @@ fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> {
 // Dataset kinds known to the tool. `Beir` is the combined mix; the variants that follow it
 // are the individual subsets also listed in `BEIR_DATASETS`.
 pub enum DatasetKind {
    SquadV2,
    NaturalQuestions,
    // Mix assembled from the subsets below (converted via `convert_beir_mix`).
    Beir,
    // The `#[value(...)]` attributes on the subsets spell out their value name and aliases.
    #[value(name = "fever")]
    Fever,
    #[value(name = "fiqa")]
    Fiqa,
    #[value(name = "hotpotqa", alias = "hotpot-qa")]
    HotpotQa,
    #[value(name = "nfcorpus", alias = "nf-corpus")]
    Nfcorpus,
    #[value(name = "quora")]
    Quora,
    #[value(name = "trec-covid", alias = "treccovid", alias = "trec_covid")]
    TrecCovid,
 }
 impl DatasetKind {
@@ -255,6 +269,13 @@ impl DatasetKind {
        match self {
            Self::SquadV2 => "squad-v2",
            Self::NaturalQuestions => "natural-questions-dev",
            Self::Beir => "beir",
            Self::Fever => "fever",
            Self::Fiqa => "fiqa",
            Self::HotpotQa => "hotpotqa",
            Self::Nfcorpus => "nfcorpus",
            Self::Quora => "quora",
            Self::TrecCovid => "trec-covid",
        }
    }
@@ -262,6 +283,13 @@ impl DatasetKind {
        match self {
            Self::SquadV2 => "SQuAD v2.0",
            Self::NaturalQuestions => "Natural Questions (dev)",
            Self::Beir => "BEIR mix",
            Self::Fever => "FEVER (BEIR)",
            Self::Fiqa => "FiQA-2018 (BEIR)",
            Self::HotpotQa => "HotpotQA (BEIR)",
            Self::Nfcorpus => "NFCorpus (BEIR)",
            Self::Quora => "Quora (IR)",
            Self::TrecCovid => "TREC-COVID (BEIR)",
        }
    }
@@ -269,6 +297,13 @@ impl DatasetKind {
        match self {
            Self::SquadV2 => "SQuAD v2.0",
            Self::NaturalQuestions => "Natural Questions",
            Self::Beir => "BEIR",
            Self::Fever => "FEVER",
            Self::Fiqa => "FiQA-2018",
            Self::HotpotQa => "HotpotQA",
            Self::Nfcorpus => "NFCorpus",
            Self::Quora => "Quora",
            Self::TrecCovid => "TREC-COVID",
        }
    }
@@ -276,6 +311,13 @@ impl DatasetKind {
        match self {
            Self::SquadV2 => "SQuAD",
            Self::NaturalQuestions => "Natural Questions",
            Self::Beir => "BEIR",
            Self::Fever => "FEVER",
            Self::Fiqa => "FiQA",
            Self::HotpotQa => "HotpotQA",
            Self::Nfcorpus => "NFCorpus",
            Self::Quora => "Quora",
            Self::TrecCovid => "TREC-COVID",
        }
    }
@@ -283,6 +325,13 @@ impl DatasetKind {
        match self {
            Self::SquadV2 => "squad",
            Self::NaturalQuestions => "nq",
            Self::Beir => "beir",
            Self::Fever => "fever",
            Self::Fiqa => "fiqa",
            Self::HotpotQa => "hotpotqa",
            Self::Nfcorpus => "nfcorpus",
            Self::Quora => "quora",
            Self::TrecCovid => "trec-covid",
        }
    }
@@ -320,13 +369,29 @@ impl FromStr for DatasetKind {
            "nq" | "natural-questions" | "natural_questions" | "natural-questions-dev" => {
                Ok(Self::NaturalQuestions)
            }
            "beir" => Ok(Self::Beir),
            "fever" => Ok(Self::Fever),
            "fiqa" | "fiqa-2018" => Ok(Self::Fiqa),
            "hotpotqa" | "hotpot-qa" => Ok(Self::HotpotQa),
            "nfcorpus" | "nf-corpus" => Ok(Self::Nfcorpus),
            "quora" => Ok(Self::Quora),
            "trec-covid" | "treccovid" | "trec_covid" => Ok(Self::TrecCovid),
            other => {
-                anyhow::bail!("unknown dataset '{other}'. Expected 'squad' or 'natural-questions'.")
+                anyhow::bail!("unknown dataset '{other}'. Expected one of: squad, natural-questions, beir, fever, fiqa, hotpotqa, nfcorpus, quora, trec-covid.")
            }
        }
    }
 }
 /// The individual BEIR subsets, in the fixed order used when building and balancing the
 /// BEIR mix (`convert_beir_mix` converts them in this order).
 pub const BEIR_DATASETS: [DatasetKind; 6] = [
    DatasetKind::Fever,
    DatasetKind::Fiqa,
    DatasetKind::HotpotQa,
    DatasetKind::Nfcorpus,
    DatasetKind::Quora,
    DatasetKind::TrecCovid,
 ];
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct DatasetMetadata {
    pub id: String,
@@ -410,6 +475,13 @@ pub fn convert(
        DatasetKind::NaturalQuestions => {
            nq::convert_nq(raw_path, include_unanswerable, context_token_limit)?
        }
        DatasetKind::Beir => convert_beir_mix(include_unanswerable, context_token_limit)?,
        DatasetKind::Fever
        | DatasetKind::Fiqa
        | DatasetKind::HotpotQa
        | DatasetKind::Nfcorpus
        | DatasetKind::Quora
        | DatasetKind::TrecCovid => beir::convert_beir(raw_path, dataset)?,
    };
    let metadata_limit = match dataset {
@@ -417,14 +489,37 @@ pub fn convert(
        _ => context_token_limit,
    };
    let source_label = match dataset {
        DatasetKind::Beir => "beir-mix".to_string(),
        _ => raw_path.display().to_string(),
    };
    Ok(ConvertedDataset {
        generated_at: Utc::now(),
        metadata: DatasetMetadata::for_kind(dataset, include_unanswerable, metadata_limit),
-        source: raw_path.display().to_string(),
+        source: source_label,
        paragraphs,
    })
 }
 /// Builds the BEIR mix by converting every subset in `BEIR_DATASETS` from its own raw
 /// directory (looked up through the dataset catalog) and concatenating the paragraphs in
 /// subset order.
 ///
 /// `include_unanswerable` only triggers a warning when set, and the context token limit is
 /// unused here.
 ///
 /// # Errors
 /// Propagates the first catalog lookup or subset conversion failure.
 fn convert_beir_mix(
    include_unanswerable: bool,
    _context_token_limit: Option<usize>,
 ) -> Result<Vec<ConvertedParagraph>> {
    if include_unanswerable {
        warn!("BEIR mix ignores include_unanswerable flag; all questions are answerable");
    }
    let mut merged = Vec::new();
    for kind in BEIR_DATASETS {
        let catalog_entry = dataset_entry_for_kind(kind)?;
        merged.extend(beir::convert_beir(&catalog_entry.raw_path, kind)?);
    }
    Ok(merged)
 }
 fn ensure_parent(path: &Path) -> Result<()> {
    if let Some(parent) = path.parent() {
        fs::create_dir_all(parent)
@@ -6,8 +6,8 @@ use futures::stream::{self, StreamExt};
 use tracing::{debug, info};
 use crate::eval::{
-    adapt_strategy_output, build_case_diagnostics,
+    adapt_strategy_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics,
-    text_contains_answer, CaseDiagnostics, CaseSummary, RetrievedSummary,
+    CaseSummary, RetrievedSummary,
 };
 use retrieval_pipeline::{
    pipeline::{self, PipelineStageTimings, RetrievalConfig},
@@ -26,7 +26,6 @@ use uuid::Uuid;
 use crate::{
    datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion},
    db_helpers::change_embedding_length_in_hnsw_indexes,
    slices::{self, ResolvedSlice, SliceParagraphKind},
 };
@@ -417,10 +416,6 @@ async fn ingest_paragraph_batch(
        .await
        .context("applying migrations for ingestion")?;
    change_embedding_length_in_hnsw_indexes(&db, embedding_dimension)
        .await
        .context("failed setting new hnsw length")?;
    let mut app_config = AppConfig::default();
    app_config.storage = StorageKind::Memory;
    let backend: DynStore = Arc::new(InMemory::new());
@@ -93,7 +93,6 @@ async fn async_main() -> anyhow::Result<()> {
    // Clap handles help automatically, so we don't need to check for it manually
    if parsed.config.inspect_question.is_some() {
        inspection::inspect_question(&parsed.config).await?;
        return Ok(());
@@ -145,6 +145,8 @@ mod tests {
            precision_at_1: 0.5,
            precision_at_2: 0.5,
            precision_at_3: 0.5,
            mrr: 0.0,
            average_ndcg: 0.0,
            duration_ms: 1234,
            dataset_id: "squad-v2".into(),
            dataset_label: "SQuAD v2".into(),
@@ -192,18 +194,17 @@ mod tests {
            rerank_pool_size: Some(4),
            rerank_keep_top: 10,
            concurrency: 2,
            retrieval_strategy: "initial".into(),
            detailed_report: false,
            retrieval_strategy: "initial".into(),
            chunk_result_cap: 5,
            ingest_chunk_min_tokens: 256,
            ingest_chunk_max_tokens: 512,
            ingest_chunk_overlap_tokens: 50,
            ingest_chunks_only: false,
            ingest_chunk_overlap_tokens: 50,
            chunk_vector_take: 20,
            chunk_fts_take: 20,
            chunk_avg_chars_per_token: 4,
            max_chunks_per_entity: 4,
            average_ndcg: 0.0,
            mrr: 0.0,
            cases: Vec::new(),
        }
    }
@@ -88,6 +88,10 @@ pub struct RetrievalSection {
    pub rerank_pool_size: Option<usize>,
    pub rerank_keep_top: usize,
    pub chunk_result_cap: usize,
    #[serde(default)]
    pub chunk_vector_take: usize,
    #[serde(default)]
    pub chunk_fts_take: usize,
    pub ingest_chunk_min_tokens: usize,
    pub ingest_chunk_max_tokens: usize,
    pub ingest_chunk_overlap_tokens: usize,
@@ -202,6 +206,8 @@ impl EvaluationReport {
            rerank_pool_size: summary.rerank_pool_size,
            rerank_keep_top: summary.rerank_keep_top,
            chunk_result_cap: summary.chunk_result_cap,
            chunk_vector_take: summary.chunk_vector_take,
            chunk_fts_take: summary.chunk_fts_take,
            ingest_chunk_min_tokens: summary.ingest_chunk_min_tokens,
            ingest_chunk_max_tokens: summary.ingest_chunk_max_tokens,
            ingest_chunk_overlap_tokens: summary.ingest_chunk_overlap_tokens,
@@ -467,10 +473,7 @@ fn render_markdown(report: &EvaluationReport) -> String {
        report.retrieval.precision_at_2,
        report.retrieval.precision_at_3
    ));
-    md.push_str(&format!(
+    md.push_str(&format!("| MRR | {:.3} |\\n", report.retrieval.mrr));
        "| MRR | {:.3} |\\n",
        report.retrieval.mrr
    ));
    md.push_str(&format!(
        "| NDCG | {:.3} |\\n",
        report.retrieval.average_ndcg
@@ -632,7 +635,9 @@ fn render_markdown(report: &EvaluationReport) -> String {
            if report.detailed_report {
                md.push_str("All LLM-only cases matched within the evaluation window.\\n");
            } else {
-                md.push_str("LLM-only cases omitted. Re-run with `--detailed-report` to see samples.\\n");
+                md.push_str(
                    "LLM-only cases omitted. Re-run with `--detailed-report` to see samples.\\n",
                );
            }
        } else {
            md.push_str("| Question ID | Answered | Match Rank | Top Retrieved |\\n");
@@ -851,6 +856,8 @@ fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport {
        rerank_pool_size: entry.rerank_pool_size,
        rerank_keep_top: entry.rerank_keep_top,
        chunk_result_cap: entry.chunk_result_cap.unwrap_or(5),
        chunk_vector_take: 0,
        chunk_fts_take: 0,
        ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256),
        ingest_chunk_max_tokens: entry.ingest_chunk_max_tokens.unwrap_or(512),
        ingest_chunk_overlap_tokens: entry.ingest_chunk_overlap_tokens.unwrap_or(50),
@@ -1126,8 +1133,7 @@ mod tests {
        let tmp = tempdir().unwrap();
        let summary = sample_summary(false);
-        let outcome =
+        let outcome = write_reports(&summary, tmp.path(), 5).expect("writing consolidated reports");
            write_reports(&summary, tmp.path(), 5).expect("writing consolidated reports");
        let contents =
            std::fs::read_to_string(&outcome.history_path).expect("reading evaluations history");
        let entries: Vec<EvaluationReport> =
@@ -1,5 +1,5 @@
 use std::{
-    collections::{HashMap, HashSet},
+    collections::{HashMap, HashSet, VecDeque},
    fs,
    path::{Path, PathBuf},
 };
@@ -11,7 +11,9 @@ use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
 use tracing::{info, warn};
-use crate::datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion};
+use crate::datasets::{
    ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind, BEIR_DATASETS,
 };
 const SLICE_VERSION: u32 = 2;
 pub const DEFAULT_NEGATIVE_MULTIPLIER: f32 = 4.0;
@@ -526,7 +528,7 @@ fn ensure_case_capacity(
        return Ok(false);
    }
-    let question_refs = ordered_question_refs(dataset, params)?;
+    let question_refs = ordered_question_refs(dataset, params, target_cases)?;
    let mut existing_questions: HashSet<String> = manifest
        .cases
        .iter()
@@ -599,7 +601,12 @@ fn ensure_case_capacity(
 fn ordered_question_refs(
    dataset: &ConvertedDataset,
    params: &BuildParams,
    target_cases: usize,
 ) -> Result<Vec<(usize, usize)>> {
    if dataset.metadata.id == DatasetKind::Beir.id() {
        return ordered_question_refs_beir(dataset, params, target_cases);
    }
    let mut question_refs = Vec::new();
    for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
        for (q_idx, question) in paragraph.questions.iter().enumerate() {
@@ -626,6 +633,170 @@ fn ordered_question_refs(
    Ok(question_refs)
 }
 /// Orders question references for the BEIR mix so that subsets are balanced.
 ///
 /// Steps, all driven by the subset order of `BEIR_DATASETS`:
 /// 1. Bucket eligible questions by the subset prefix of their id (questions without a
 ///    recognised prefix are skipped with a warning).
 /// 2. Shuffle each bucket with a seed derived from the dataset id, the prefix and
 ///    `params.base_seed`.
 /// 3. Split `target_cases` evenly across subsets, giving the remainder to the first ones.
 /// 4. Hand quota that a subset cannot fill to subsets with spare questions, one at a time in
 ///    round-robin order.
 /// 5. Interleave the selected questions round-robin across subsets.
 ///
 /// Returns `(paragraph index, question index)` pairs. Errors when no eligible question
 /// exists or nothing was selected.
 fn ordered_question_refs_beir(
    dataset: &ConvertedDataset,
    params: &BuildParams,
    target_cases: usize,
 ) -> Result<Vec<(usize, usize)>> {
    let prefixes: Vec<&str> = BEIR_DATASETS
        .iter()
        .map(|kind| kind.source_prefix())
        .collect();
    let subset_count = prefixes.len();

    // buckets[i] holds the eligible questions of the subset named prefixes[i].
    let mut buckets: Vec<Vec<(usize, usize)>> = vec![Vec::new(); subset_count];
    for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
        for (q_idx, question) in paragraph.questions.iter().enumerate() {
            let answerable = !question.is_impossible && !question.answers.is_empty();
            if !(params.include_impossible || answerable) {
                continue;
            }
            let Some(prefix) = question_prefix(&question.id) else {
                warn!(
                    question_id = %question.id,
                    "Skipping BEIR question without expected prefix"
                );
                continue;
            };
            match prefixes.iter().position(|known| *known == prefix) {
                Some(slot) => buckets[slot].push((p_idx, q_idx)),
                None => {
                    warn!(
                        question_id = %question.id,
                        prefix = %prefix,
                        "Skipping BEIR question with unknown subset prefix"
                    );
                }
            }
        }
    }
    if buckets.iter().all(|bucket| bucket.is_empty()) {
        return Err(anyhow!(
            "no eligible BEIR questions found; cannot build slice"
        ));
    }

    // Deterministic per-subset shuffle; subsets without questions are left untouched.
    for (prefix, bucket) in prefixes.iter().zip(buckets.iter_mut()) {
        if bucket.is_empty() {
            continue;
        }
        let seed = mix_seed(
            &format!("{}::{prefix}", dataset.metadata.id),
            params.base_seed,
        );
        let mut rng = StdRng::seed_from_u64(seed);
        bucket.shuffle(&mut rng);
    }

    // Even split, with the first `extra` subsets receiving one additional slot.
    let divisor = subset_count.max(1);
    let base_quota = target_cases / divisor;
    let extra = target_cases % divisor;
    let mut takes: Vec<usize> = Vec::with_capacity(subset_count);
    let mut spares: Vec<usize> = Vec::with_capacity(subset_count);
    let mut shortfall = 0usize;
    for (slot, bucket) in buckets.iter().enumerate() {
        let quota = base_quota + usize::from(slot < extra);
        let take = quota.min(bucket.len());
        shortfall += quota - take;
        takes.push(take);
        spares.push(bucket.len() - take);
    }

    // Redistribute unfilled quota round-robin to subsets that still have spare questions.
    while shortfall > 0 {
        let mut allocated = false;
        for (take, spare) in takes.iter_mut().zip(spares.iter_mut()) {
            if shortfall == 0 {
                break;
            }
            if *spare == 0 {
                continue;
            }
            *take += 1;
            *spare -= 1;
            shortfall -= 1;
            allocated = true;
        }
        if !allocated {
            break;
        }
    }

    let mut queues: Vec<VecDeque<(usize, usize)>> = buckets
        .iter()
        .zip(&takes)
        .map(|(bucket, &take)| bucket.iter().take(take).copied().collect())
        .collect();
    let total_selected: usize = queues.iter().map(|queue| queue.len()).sum();
    if total_selected < target_cases {
        warn!(
            requested = target_cases,
            available = total_selected,
            "BEIR mix requested more questions than available after balancing; continuing with capped set"
        );
    }

    // Interleave: one question per subset per pass until every queue is drained.
    let mut output = Vec::with_capacity(total_selected);
    while output.len() < total_selected {
        for queue in queues.iter_mut() {
            if let Some(item) = queue.pop_front() {
                output.push(item);
            }
        }
    }
    if output.is_empty() {
        return Err(anyhow!(
            "no eligible BEIR questions found; cannot build slice"
        ));
    }
    Ok(output)
 }
 /// Returns the BEIR subset prefix that `question_id` starts with, if any.
 ///
 /// A prefix only counts when it is immediately followed by `-`; subsets are tried in
 /// `BEIR_DATASETS` order and the first match is returned.
 fn question_prefix(question_id: &str) -> Option<&'static str> {
    BEIR_DATASETS
        .iter()
        .map(|kind| kind.source_prefix())
        .find(|prefix| {
            question_id
                .strip_prefix(*prefix)
                .map_or(false, |rest| rest.starts_with('-'))
        })
 }
 fn ensure_negative_pool(
    dataset: &ConvertedDataset,
    manifest: &mut SliceManifest,
@@ -981,4 +1152,65 @@ mod tests {
            .any(|entry| entry.id == positive_ids[0]));
        Ok(())
    }
    /// With 8 requested cases over six subsets, subsets that cannot fill their quota
    /// (fever, nfcorpus) must have the missing slots handed to subsets with spare questions.
    #[test]
    fn beir_mix_balances_and_rebalances() -> Result<()> {
        // (subset prefix, number of single-question paragraphs available)
        let available = [
            ("fever", 1usize),
            ("fiqa", 2usize),
            ("hotpotqa", 1usize),
            ("nfcorpus", 0usize),
            ("quora", 3usize),
            ("trec-covid", 2usize),
        ];
        let paragraphs: Vec<ConvertedParagraph> = available
            .iter()
            .flat_map(|&(prefix, count)| {
                (0..count).map(move |idx| ConvertedParagraph {
                    id: format!("{prefix}-p{idx}"),
                    title: format!("{prefix} title"),
                    context: format!("{prefix} context {idx}"),
                    questions: vec![ConvertedQuestion {
                        id: format!("{prefix}-q{idx}"),
                        question: format!("{prefix} question {idx}"),
                        answers: vec!["answer".to_string()],
                        is_impossible: false,
                    }],
                })
            })
            .collect();

        let dataset = ConvertedDataset {
            generated_at: Utc::now(),
            metadata: DatasetMetadata::for_kind(DatasetKind::Beir, false, None),
            source: "beir-mix".to_string(),
            paragraphs,
        };
        let params = BuildParams {
            include_impossible: false,
            base_seed: 0xAA,
            rng_seed: 0xBB,
        };

        let refs = ordered_question_refs_beir(&dataset, &params, 8)?;

        // Tally the selected questions per subset.
        let mut per_prefix: HashMap<String, usize> = HashMap::new();
        for (p_idx, q_idx) in refs {
            let question = &dataset.paragraphs[p_idx].questions[q_idx];
            let prefix = question_prefix(&question.id).unwrap_or("unknown");
            *per_prefix.entry(prefix.to_string()).or_default() += 1;
        }

        let expected = [
            ("fever", 1usize),
            ("fiqa", 2),
            ("hotpotqa", 1),
            ("nfcorpus", 0),
            ("quora", 2),
            ("trec-covid", 2),
        ];
        for (prefix, want) in expected {
            assert_eq!(per_prefix.get(prefix).copied().unwrap_or(0), want);
        }
        Ok(())
    }
 }