dataset: beir

This commit is contained in:
Per Stark
2025-12-04 17:50:35 +01:00
parent d3fa3be3e5
commit d1a6d9abdf
10 changed files with 803 additions and 25 deletions

View File

@@ -31,3 +31,108 @@ datasets:
corpus_limit: 2000
include_unanswerable: false
seed: 0x5eed2025
- id: beir
label: "BEIR mix"
category: "BEIR"
entity_suffix: "BEIR"
source_prefix: "beir"
raw: "data/raw/beir"
converted: "data/converted/beir-minne.json"
include_unanswerable: false
slices:
- id: beir-mix-600
label: "BEIR mix (600)"
description: "Balanced slice across FEVER, FiQA, HotpotQA, NFCorpus, Quora, TREC-COVID"
limit: 600
corpus_limit: 6000
seed: 0x5eed2025
- id: fever
label: "FEVER (BEIR)"
category: "FEVER"
entity_suffix: "FEVER"
source_prefix: "fever"
raw: "data/raw/fever"
converted: "data/converted/fever-minne.json"
include_unanswerable: false
slices:
- id: fever-test-200
label: "FEVER test (200)"
description: "200-case slice from BEIR test qrels"
limit: 200
corpus_limit: 5000
seed: 0x5eed2025
- id: fiqa
label: "FiQA-2018 (BEIR)"
category: "FiQA-2018"
entity_suffix: "FiQA"
source_prefix: "fiqa"
raw: "data/raw/fiqa"
converted: "data/converted/fiqa-minne.json"
include_unanswerable: false
slices:
- id: fiqa-test-200
label: "FiQA test (200)"
description: "200-case slice from BEIR test qrels"
limit: 200
corpus_limit: 5000
seed: 0x5eed2025
- id: hotpotqa
label: "HotpotQA (BEIR)"
category: "HotpotQA"
entity_suffix: "HotpotQA"
source_prefix: "hotpotqa"
raw: "data/raw/hotpotqa"
converted: "data/converted/hotpotqa-minne.json"
include_unanswerable: false
slices:
- id: hotpotqa-test-200
label: "HotpotQA test (200)"
description: "200-case slice from BEIR test qrels"
limit: 200
corpus_limit: 5000
seed: 0x5eed2025
- id: nfcorpus
label: "NFCorpus (BEIR)"
category: "NFCorpus"
entity_suffix: "NFCorpus"
source_prefix: "nfcorpus"
raw: "data/raw/nfcorpus"
converted: "data/converted/nfcorpus-minne.json"
include_unanswerable: false
slices:
- id: nfcorpus-test-200
label: "NFCorpus test (200)"
description: "200-case slice from BEIR test qrels"
limit: 200
corpus_limit: 5000
seed: 0x5eed2025
- id: quora
label: "Quora (IR)"
category: "Quora"
entity_suffix: "Quora"
source_prefix: "quora"
raw: "data/raw/quora"
converted: "data/converted/quora-minne.json"
include_unanswerable: false
slices:
- id: quora-test-200
label: "Quora test (200)"
description: "200-case slice from BEIR test qrels"
limit: 200
corpus_limit: 5000
seed: 0x5eed2025
- id: trec-covid
label: "TREC-COVID (BEIR)"
category: "TREC-COVID"
entity_suffix: "TREC-COVID"
source_prefix: "trec-covid"
raw: "data/raw/trec-covid"
converted: "data/converted/trec-covid-minne.json"
include_unanswerable: false
slices:
- id: trec-covid-test-200
label: "TREC-COVID test (200)"
description: "200-case slice from BEIR test qrels"
limit: 200
corpus_limit: 5000
seed: 0x5eed2025

View File

@@ -347,6 +347,10 @@ impl Config {
self.retrieval.require_verified_chunks = true;
}
if self.dataset == DatasetKind::Beir {
self.negative_multiplier = 9.0;
}
// Validations
if self.ingest_chunk_min_tokens == 0
|| self.ingest_chunk_min_tokens >= self.ingest_chunk_max_tokens

341
eval/src/datasets/beir.rs Normal file
View File

@@ -0,0 +1,341 @@
use std::{
collections::{BTreeMap, HashMap},
fs::File,
io::{BufRead, BufReader},
path::{Path, PathBuf},
};
use anyhow::{anyhow, Context, Result};
use serde::Deserialize;
use tracing::warn;
use super::{ConvertedParagraph, ConvertedQuestion, DatasetKind};
/// Maximum number of characters taken from the start of a document when
/// deriving a pseudo-answer snippet for a BEIR query.
const ANSWER_SNIPPET_CHARS: usize = 240;
/// One line of a BEIR `corpus.jsonl` file (`_id` plus optional title/text).
#[derive(Debug, Deserialize)]
struct BeirCorpusRow {
#[serde(rename = "_id")]
id: String,
#[serde(default)]
title: Option<String>,
#[serde(default)]
text: Option<String>,
}
/// One line of a BEIR `queries.jsonl` file.
#[derive(Debug, Deserialize)]
struct BeirQueryRow {
#[serde(rename = "_id")]
id: String,
text: String,
}
/// A corpus document after normalisation: kept title plus the combined
/// title/text context produced by `build_context`.
#[derive(Debug, Clone)]
struct BeirParagraph {
title: String,
context: String,
}
/// Trimmed query text, stored keyed by query id.
#[derive(Debug, Clone)]
struct BeirQuery {
text: String,
}
/// A single qrels judgement: the judged document and its relevance score.
#[derive(Debug, Clone)]
struct QrelEntry {
doc_id: String,
score: i32,
}
/// Convert a BEIR-format dataset (`corpus.jsonl`, `queries.jsonl`,
/// `qrels/*.tsv`) into the internal `ConvertedParagraph` representation.
///
/// Every corpus document becomes one paragraph with id
/// `<source_prefix>-<doc_id>`. For each qrels query, the single best-scored
/// document (see `select_best_doc`) receives a `ConvertedQuestion` whose
/// answer is a snippet taken from the start of that document's context.
/// Qrels entries referencing a missing query or document, and queries whose
/// derived snippet is empty, are skipped with individual warnings plus one
/// summary warning at the end.
pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result<Vec<ConvertedParagraph>> {
let corpus_path = raw_dir.join("corpus.jsonl");
let queries_path = raw_dir.join("queries.jsonl");
let qrels_path = resolve_qrels_path(raw_dir)?;
let corpus = load_corpus(&corpus_path)?;
let queries = load_queries(&queries_path)?;
let qrels = load_qrels(&qrels_path)?;
// Materialise one paragraph per corpus document and remember each
// document's slot so qrels entries can attach questions below.
let mut paragraphs = Vec::with_capacity(corpus.len());
let mut paragraph_index = HashMap::new();
for (doc_id, entry) in corpus.iter() {
let paragraph_id = format!("{}-{doc_id}", dataset.source_prefix());
let paragraph = ConvertedParagraph {
id: paragraph_id.clone(),
title: entry.title.clone(),
context: entry.context.clone(),
questions: Vec::new(),
};
paragraph_index.insert(doc_id.clone(), paragraphs.len());
paragraphs.push(paragraph);
}
// Counters for the end-of-run summary warning.
let mut missing_queries = 0usize;
let mut missing_docs = 0usize;
let mut skipped_answers = 0usize;
for (query_id, entries) in qrels {
let query = match queries.get(&query_id) {
Some(query) => query,
None => {
missing_queries += 1;
warn!(query_id = %query_id, "Skipping qrels entry for missing query");
continue;
}
};
// Only the single highest-scored judged document gets the question.
let best = match select_best_doc(&entries) {
Some(entry) => entry,
None => continue,
};
let paragraph_slot = match paragraph_index.get(&best.doc_id) {
Some(slot) => *slot,
None => {
missing_docs += 1;
warn!(
query_id = %query_id,
doc_id = %best.doc_id,
"Skipping qrels entry referencing missing corpus document"
);
continue;
}
};
// The "answer" is a prefix of the target document's context; a query
// with no usable snippet cannot be evaluated, so it is dropped.
let answer = answer_snippet(&paragraphs[paragraph_slot].context);
let answers = match answer {
Some(snippet) => vec![snippet],
None => {
skipped_answers += 1;
warn!(
query_id = %query_id,
doc_id = %best.doc_id,
"Skipping query because no non-empty answer snippet could be derived"
);
continue;
}
};
let question_id = format!("{}-{query_id}", dataset.source_prefix());
paragraphs[paragraph_slot]
.questions
.push(ConvertedQuestion {
id: question_id,
question: query.text.clone(),
answers,
is_impossible: false,
});
}
if missing_queries + missing_docs + skipped_answers > 0 {
warn!(
missing_queries,
missing_docs, skipped_answers, "Skipped some BEIR qrels entries during conversion"
);
}
Ok(paragraphs)
}
/// Locate a qrels TSV under `raw_dir/qrels`, preferring `test.tsv`, then
/// `dev.tsv`, then `train.tsv`.
///
/// Returns an error naming the directory and the expected file names when
/// none of the candidates exist.
fn resolve_qrels_path(raw_dir: &Path) -> Result<PathBuf> {
    let qrels_dir = raw_dir.join("qrels");
    let candidates = ["test.tsv", "dev.tsv", "train.tsv"];
    candidates
        .iter()
        .map(|name| qrels_dir.join(name))
        .find(|candidate| candidate.exists())
        .ok_or_else(|| {
            anyhow!(
                "No qrels file found under {}; expected one of {:?}",
                qrels_dir.display(),
                candidates
            )
        })
}
/// Read a BEIR `corpus.jsonl` file into a map keyed by document id.
///
/// Blank lines are ignored, documents whose combined title/text is empty are
/// skipped with a warning, and I/O or JSON failures carry line and path
/// context. A missing title falls back to the document id.
fn load_corpus(path: &Path) -> Result<BTreeMap<String, BeirParagraph>> {
    let handle =
        File::open(path).with_context(|| format!("opening BEIR corpus at {}", path.display()))?;
    let mut docs: BTreeMap<String, BeirParagraph> = BTreeMap::new();
    for (line_idx, read) in BufReader::new(handle).lines().enumerate() {
        let line_no = line_idx + 1;
        let raw = read
            .with_context(|| format!("reading corpus line {} from {}", line_no, path.display()))?;
        if raw.trim().is_empty() {
            continue;
        }
        let row: BeirCorpusRow = serde_json::from_str(&raw).with_context(|| {
            format!(
                "parsing corpus JSON on line {} from {}",
                line_no,
                path.display()
            )
        })?;
        // Fall back to the id when no title is present, then fold title and
        // body into the single context string used downstream.
        let title = row.title.unwrap_or_else(|| row.id.clone());
        let body = row.text.unwrap_or_default();
        let context = build_context(&title, &body);
        if context.is_empty() {
            warn!(doc_id = %row.id, "Skipping empty corpus document");
            continue;
        }
        docs.insert(row.id, BeirParagraph { title, context });
    }
    Ok(docs)
}
/// Read a BEIR `queries.jsonl` file into a map keyed by query id.
///
/// Blank lines are skipped; query text is trimmed on load. I/O or JSON
/// failures carry line and path context.
fn load_queries(path: &Path) -> Result<BTreeMap<String, BeirQuery>> {
    let handle = File::open(path)
        .with_context(|| format!("opening BEIR queries file at {}", path.display()))?;
    let mut queries: BTreeMap<String, BeirQuery> = BTreeMap::new();
    for (line_idx, read) in BufReader::new(handle).lines().enumerate() {
        let line_no = line_idx + 1;
        let raw = read
            .with_context(|| format!("reading query line {} from {}", line_no, path.display()))?;
        if raw.trim().is_empty() {
            continue;
        }
        let row: BeirQueryRow = serde_json::from_str(&raw).with_context(|| {
            format!(
                "parsing query JSON on line {} from {}",
                line_no,
                path.display()
            )
        })?;
        let query = BeirQuery {
            text: row.text.trim().to_string(),
        };
        queries.insert(row.id, query);
    }
    Ok(queries)
}
/// Parse a BEIR qrels TSV into `query_id -> judged documents`.
///
/// The header row (starting with "query-id") and blank lines are skipped.
/// Each remaining row must provide at least query id, document id, and an
/// integer relevance score, in that order; extra columns are ignored.
fn load_qrels(path: &Path) -> Result<BTreeMap<String, Vec<QrelEntry>>> {
    let handle =
        File::open(path).with_context(|| format!("opening BEIR qrels at {}", path.display()))?;
    let mut qrels: BTreeMap<String, Vec<QrelEntry>> = BTreeMap::new();
    for (line_idx, read) in BufReader::new(handle).lines().enumerate() {
        let line_no = line_idx + 1;
        let raw = read
            .with_context(|| format!("reading qrels line {} from {}", line_no, path.display()))?;
        let row = raw.trim();
        // Skip the column-header row and any blank lines.
        if row.is_empty() || row.starts_with("query-id") {
            continue;
        }
        let mut fields = row.split_whitespace();
        let query_id = fields
            .next()
            .ok_or_else(|| anyhow!("missing query id on line {}", line_no))?;
        let doc_id = fields
            .next()
            .ok_or_else(|| anyhow!("missing document id on line {}", line_no))?;
        let score_raw = fields
            .next()
            .ok_or_else(|| anyhow!("missing score on line {}", line_no))?;
        let score: i32 = score_raw.parse().with_context(|| {
            format!(
                "parsing qrels score '{}' on line {} from {}",
                score_raw,
                line_no,
                path.display()
            )
        })?;
        let entry = QrelEntry {
            doc_id: doc_id.to_string(),
            score,
        };
        qrels.entry(query_id.to_string()).or_default().push(entry);
    }
    Ok(qrels)
}
/// Pick the highest-scored qrels entry for a query.
///
/// Ties on score are broken deterministically: the doc-id comparison is
/// reversed, so `max_by` (which keeps the last maximal element) ends up
/// selecting the lexicographically smallest doc id among the top-scored
/// entries. Returns `None` for an empty slice.
fn select_best_doc(entries: &[QrelEntry]) -> Option<&QrelEntry> {
entries
.iter()
.max_by(|a, b| a.score.cmp(&b.score).then_with(|| b.doc_id.cmp(&a.doc_id)))
}
/// Derive a short pseudo-answer from a document body.
///
/// Takes up to `ANSWER_SNIPPET_CHARS` characters from the trimmed text
/// (char-wise, so multi-byte text is never split mid-character) and returns
/// `None` when nothing non-empty remains after trimming.
fn answer_snippet(text: &str) -> Option<String> {
    let body = text.trim();
    if body.is_empty() {
        return None;
    }
    let head: String = body.chars().take(ANSWER_SNIPPET_CHARS).collect();
    let trimmed = head.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
/// Combine a document title and body into one context string.
///
/// Both inputs are trimmed first; whichever parts are non-empty are kept,
/// joined by a blank line when both are present. Returns an empty string
/// when neither survives trimming.
fn build_context(title: &str, text: &str) -> String {
    let title = title.trim();
    let text = text.trim();
    if title.is_empty() {
        // Covers the all-empty case too: an empty body yields "".
        text.to_string()
    } else if text.is_empty() {
        title.to_string()
    } else {
        format!("{title}\n\n{text}")
    }
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use tempfile::tempdir;
// End-to-end check over the minimal BEIR directory layout: two corpus
// documents, one query, and a single qrels row pointing the query at d1.
#[test]
fn converts_basic_beir_layout() {
let dir = tempdir().unwrap();
let corpus = r#"
{"_id":"d1","title":"Doc 1","text":"Doc one has some text for testing."}
{"_id":"d2","title":"Doc 2","text":"Second document content."}
"#;
let queries = r#"
{"_id":"q1","text":"What is in doc one?"}
"#;
// Header row plus one judgement; load_qrels must skip the header.
let qrels = "query-id\tcorpus-id\tscore\nq1\td1\t2\n";
fs::write(dir.path().join("corpus.jsonl"), corpus.trim()).unwrap();
fs::write(dir.path().join("queries.jsonl"), queries.trim()).unwrap();
fs::create_dir_all(dir.path().join("qrels")).unwrap();
fs::write(dir.path().join("qrels/test.tsv"), qrels).unwrap();
let paragraphs = convert_beir(dir.path(), DatasetKind::Fever).unwrap();
assert_eq!(paragraphs.len(), 2);
// The judged document carries the question; ids are prefixed with the
// dataset's source prefix ("fever").
let doc_one = paragraphs
.iter()
.find(|p| p.id == "fever-d1")
.expect("missing paragraph for d1");
assert_eq!(doc_one.questions.len(), 1);
let question = &doc_one.questions[0];
assert_eq!(question.id, "fever-q1");
assert!(!question.answers.is_empty());
// The derived answer is a snippet of the paragraph's own context.
assert!(doc_one.context.contains(&question.answers[0]));
// The unjudged document is kept as a distractor with no questions.
let doc_two = paragraphs
.iter()
.find(|p| p.id == "fever-d2")
.expect("missing paragraph for d2");
assert!(doc_two.questions.is_empty());
}
}

View File

@@ -1,3 +1,4 @@
mod beir;
mod nq;
mod squad;
@@ -10,10 +11,10 @@ use std::{
use anyhow::{anyhow, bail, Context, Result};
use chrono::{DateTime, TimeZone, Utc};
use clap::ValueEnum;
use once_cell::sync::OnceCell;
use serde::{Deserialize, Serialize};
use tracing::warn;
use clap::ValueEnum;
const MANIFEST_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/manifest.yaml");
static DATASET_CATALOG: OnceCell<DatasetCatalog> = OnceCell::new();
@@ -248,6 +249,19 @@ fn dataset_entry_for_kind(kind: DatasetKind) -> Result<&'static DatasetEntry> {
pub enum DatasetKind {
SquadV2,
NaturalQuestions,
/// The combined BEIR mix built from every subset in `BEIR_DATASETS`.
Beir,
// Individual BEIR subsets; `#[value]` names/aliases control the CLI
// spellings accepted by clap's ValueEnum parsing.
#[value(name = "fever")]
Fever,
#[value(name = "fiqa")]
Fiqa,
#[value(name = "hotpotqa", alias = "hotpot-qa")]
HotpotQa,
#[value(name = "nfcorpus", alias = "nf-corpus")]
Nfcorpus,
#[value(name = "quora")]
Quora,
#[value(name = "trec-covid", alias = "treccovid", alias = "trec_covid")]
TrecCovid,
}
impl DatasetKind {
@@ -255,6 +269,13 @@ impl DatasetKind {
match self {
Self::SquadV2 => "squad-v2",
Self::NaturalQuestions => "natural-questions-dev",
Self::Beir => "beir",
Self::Fever => "fever",
Self::Fiqa => "fiqa",
Self::HotpotQa => "hotpotqa",
Self::Nfcorpus => "nfcorpus",
Self::Quora => "quora",
Self::TrecCovid => "trec-covid",
}
}
@@ -262,6 +283,13 @@ impl DatasetKind {
match self {
Self::SquadV2 => "SQuAD v2.0",
Self::NaturalQuestions => "Natural Questions (dev)",
Self::Beir => "BEIR mix",
Self::Fever => "FEVER (BEIR)",
Self::Fiqa => "FiQA-2018 (BEIR)",
Self::HotpotQa => "HotpotQA (BEIR)",
Self::Nfcorpus => "NFCorpus (BEIR)",
Self::Quora => "Quora (IR)",
Self::TrecCovid => "TREC-COVID (BEIR)",
}
}
@@ -269,6 +297,13 @@ impl DatasetKind {
match self {
Self::SquadV2 => "SQuAD v2.0",
Self::NaturalQuestions => "Natural Questions",
Self::Beir => "BEIR",
Self::Fever => "FEVER",
Self::Fiqa => "FiQA-2018",
Self::HotpotQa => "HotpotQA",
Self::Nfcorpus => "NFCorpus",
Self::Quora => "Quora",
Self::TrecCovid => "TREC-COVID",
}
}
@@ -276,6 +311,13 @@ impl DatasetKind {
match self {
Self::SquadV2 => "SQuAD",
Self::NaturalQuestions => "Natural Questions",
Self::Beir => "BEIR",
Self::Fever => "FEVER",
Self::Fiqa => "FiQA",
Self::HotpotQa => "HotpotQA",
Self::Nfcorpus => "NFCorpus",
Self::Quora => "Quora",
Self::TrecCovid => "TREC-COVID",
}
}
@@ -283,6 +325,13 @@ impl DatasetKind {
match self {
Self::SquadV2 => "squad",
Self::NaturalQuestions => "nq",
Self::Beir => "beir",
Self::Fever => "fever",
Self::Fiqa => "fiqa",
Self::HotpotQa => "hotpotqa",
Self::Nfcorpus => "nfcorpus",
Self::Quora => "quora",
Self::TrecCovid => "trec-covid",
}
}
@@ -320,13 +369,29 @@ impl FromStr for DatasetKind {
"nq" | "natural-questions" | "natural_questions" | "natural-questions-dev" => {
Ok(Self::NaturalQuestions)
}
"beir" => Ok(Self::Beir),
"fever" => Ok(Self::Fever),
"fiqa" | "fiqa-2018" => Ok(Self::Fiqa),
"hotpotqa" | "hotpot-qa" => Ok(Self::HotpotQa),
"nfcorpus" | "nf-corpus" => Ok(Self::Nfcorpus),
"quora" => Ok(Self::Quora),
"trec-covid" | "treccovid" | "trec_covid" => Ok(Self::TrecCovid),
other => {
anyhow::bail!("unknown dataset '{other}'. Expected 'squad' or 'natural-questions'.")
anyhow::bail!("unknown dataset '{other}'. Expected one of: squad, natural-questions, beir, fever, fiqa, hotpotqa, nfcorpus, quora, trec-covid.")
}
}
}
}
/// The individual BEIR subsets that make up the combined `beir` mix, in the
/// stable order used for conversion, quota assignment, and interleaving.
pub const BEIR_DATASETS: [DatasetKind; 6] = [
DatasetKind::Fever,
DatasetKind::Fiqa,
DatasetKind::HotpotQa,
DatasetKind::Nfcorpus,
DatasetKind::Quora,
DatasetKind::TrecCovid,
];
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetMetadata {
pub id: String,
@@ -410,6 +475,13 @@ pub fn convert(
DatasetKind::NaturalQuestions => {
nq::convert_nq(raw_path, include_unanswerable, context_token_limit)?
}
DatasetKind::Beir => convert_beir_mix(include_unanswerable, context_token_limit)?,
DatasetKind::Fever
| DatasetKind::Fiqa
| DatasetKind::HotpotQa
| DatasetKind::Nfcorpus
| DatasetKind::Quora
| DatasetKind::TrecCovid => beir::convert_beir(raw_path, dataset)?,
};
let metadata_limit = match dataset {
@@ -417,14 +489,37 @@ pub fn convert(
_ => context_token_limit,
};
let source_label = match dataset {
DatasetKind::Beir => "beir-mix".to_string(),
_ => raw_path.display().to_string(),
};
Ok(ConvertedDataset {
generated_at: Utc::now(),
metadata: DatasetMetadata::for_kind(dataset, include_unanswerable, metadata_limit),
source: raw_path.display().to_string(),
source: source_label,
paragraphs,
})
}
/// Build the combined BEIR mix by converting every subset in `BEIR_DATASETS`
/// and concatenating the resulting paragraphs in subset order.
///
/// `include_unanswerable` is accepted for signature parity with the other
/// converters but has no effect: every derived BEIR question is answerable
/// by construction, so setting it only triggers a warning.
fn convert_beir_mix(
    include_unanswerable: bool,
    _context_token_limit: Option<usize>,
) -> Result<Vec<ConvertedParagraph>> {
    if include_unanswerable {
        warn!("BEIR mix ignores include_unanswerable flag; all questions are answerable");
    }
    let mut merged = Vec::new();
    for subset in BEIR_DATASETS {
        let entry = dataset_entry_for_kind(subset)?;
        merged.extend(beir::convert_beir(&entry.raw_path, subset)?);
    }
    Ok(merged)
}
fn ensure_parent(path: &Path) -> Result<()> {
if let Some(parent) = path.parent() {
fs::create_dir_all(parent)

View File

@@ -6,8 +6,8 @@ use futures::stream::{self, StreamExt};
use tracing::{debug, info};
use crate::eval::{
adapt_strategy_output, build_case_diagnostics,
text_contains_answer, CaseDiagnostics, CaseSummary, RetrievedSummary,
adapt_strategy_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics,
CaseSummary, RetrievedSummary,
};
use retrieval_pipeline::{
pipeline::{self, PipelineStageTimings, RetrievalConfig},

View File

@@ -26,7 +26,6 @@ use uuid::Uuid;
use crate::{
datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion},
db_helpers::change_embedding_length_in_hnsw_indexes,
slices::{self, ResolvedSlice, SliceParagraphKind},
};
@@ -417,10 +416,6 @@ async fn ingest_paragraph_batch(
.await
.context("applying migrations for ingestion")?;
change_embedding_length_in_hnsw_indexes(&db, embedding_dimension)
.await
.context("failed setting new hnsw length")?;
let mut app_config = AppConfig::default();
app_config.storage = StorageKind::Memory;
let backend: DynStore = Arc::new(InMemory::new());

View File

@@ -93,7 +93,6 @@ async fn async_main() -> anyhow::Result<()> {
// Clap handles help automatically, so we don't need to check for it manually
if parsed.config.inspect_question.is_some() {
inspection::inspect_question(&parsed.config).await?;
return Ok(());

View File

@@ -145,6 +145,8 @@ mod tests {
precision_at_1: 0.5,
precision_at_2: 0.5,
precision_at_3: 0.5,
mrr: 0.0,
average_ndcg: 0.0,
duration_ms: 1234,
dataset_id: "squad-v2".into(),
dataset_label: "SQuAD v2".into(),
@@ -192,18 +194,17 @@ mod tests {
rerank_pool_size: Some(4),
rerank_keep_top: 10,
concurrency: 2,
retrieval_strategy: "initial".into(),
detailed_report: false,
retrieval_strategy: "initial".into(),
chunk_result_cap: 5,
ingest_chunk_min_tokens: 256,
ingest_chunk_max_tokens: 512,
ingest_chunk_overlap_tokens: 50,
ingest_chunks_only: false,
ingest_chunk_overlap_tokens: 50,
chunk_vector_take: 20,
chunk_fts_take: 20,
chunk_avg_chars_per_token: 4,
max_chunks_per_entity: 4,
average_ndcg: 0.0,
mrr: 0.0,
cases: Vec::new(),
}
}

View File

@@ -88,6 +88,10 @@ pub struct RetrievalSection {
pub rerank_pool_size: Option<usize>,
pub rerank_keep_top: usize,
pub chunk_result_cap: usize,
#[serde(default)]
pub chunk_vector_take: usize,
#[serde(default)]
pub chunk_fts_take: usize,
pub ingest_chunk_min_tokens: usize,
pub ingest_chunk_max_tokens: usize,
pub ingest_chunk_overlap_tokens: usize,
@@ -202,6 +206,8 @@ impl EvaluationReport {
rerank_pool_size: summary.rerank_pool_size,
rerank_keep_top: summary.rerank_keep_top,
chunk_result_cap: summary.chunk_result_cap,
chunk_vector_take: summary.chunk_vector_take,
chunk_fts_take: summary.chunk_fts_take,
ingest_chunk_min_tokens: summary.ingest_chunk_min_tokens,
ingest_chunk_max_tokens: summary.ingest_chunk_max_tokens,
ingest_chunk_overlap_tokens: summary.ingest_chunk_overlap_tokens,
@@ -467,10 +473,7 @@ fn render_markdown(report: &EvaluationReport) -> String {
report.retrieval.precision_at_2,
report.retrieval.precision_at_3
));
md.push_str(&format!(
"| MRR | {:.3} |\\n",
report.retrieval.mrr
));
md.push_str(&format!("| MRR | {:.3} |\\n", report.retrieval.mrr));
md.push_str(&format!(
"| NDCG | {:.3} |\\n",
report.retrieval.average_ndcg
@@ -632,7 +635,9 @@ fn render_markdown(report: &EvaluationReport) -> String {
if report.detailed_report {
md.push_str("All LLM-only cases matched within the evaluation window.\\n");
} else {
md.push_str("LLM-only cases omitted. Re-run with `--detailed-report` to see samples.\\n");
md.push_str(
"LLM-only cases omitted. Re-run with `--detailed-report` to see samples.\\n",
);
}
} else {
md.push_str("| Question ID | Answered | Match Rank | Top Retrieved |\\n");
@@ -851,6 +856,8 @@ fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport {
rerank_pool_size: entry.rerank_pool_size,
rerank_keep_top: entry.rerank_keep_top,
chunk_result_cap: entry.chunk_result_cap.unwrap_or(5),
chunk_vector_take: 0,
chunk_fts_take: 0,
ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256),
ingest_chunk_max_tokens: entry.ingest_chunk_max_tokens.unwrap_or(512),
ingest_chunk_overlap_tokens: entry.ingest_chunk_overlap_tokens.unwrap_or(50),
@@ -1126,8 +1133,7 @@ mod tests {
let tmp = tempdir().unwrap();
let summary = sample_summary(false);
let outcome =
write_reports(&summary, tmp.path(), 5).expect("writing consolidated reports");
let outcome = write_reports(&summary, tmp.path(), 5).expect("writing consolidated reports");
let contents =
std::fs::read_to_string(&outcome.history_path).expect("reading evaluations history");
let entries: Vec<EvaluationReport> =

View File

@@ -1,5 +1,5 @@
use std::{
collections::{HashMap, HashSet},
collections::{HashMap, HashSet, VecDeque},
fs,
path::{Path, PathBuf},
};
@@ -11,7 +11,9 @@ use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use tracing::{info, warn};
use crate::datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion};
use crate::datasets::{
ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind, BEIR_DATASETS,
};
const SLICE_VERSION: u32 = 2;
pub const DEFAULT_NEGATIVE_MULTIPLIER: f32 = 4.0;
@@ -526,7 +528,7 @@ fn ensure_case_capacity(
return Ok(false);
}
let question_refs = ordered_question_refs(dataset, params)?;
let question_refs = ordered_question_refs(dataset, params, target_cases)?;
let mut existing_questions: HashSet<String> = manifest
.cases
.iter()
@@ -599,7 +601,12 @@ fn ensure_case_capacity(
fn ordered_question_refs(
dataset: &ConvertedDataset,
params: &BuildParams,
target_cases: usize,
) -> Result<Vec<(usize, usize)>> {
if dataset.metadata.id == DatasetKind::Beir.id() {
return ordered_question_refs_beir(dataset, params, target_cases);
}
let mut question_refs = Vec::new();
for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
for (q_idx, question) in paragraph.questions.iter().enumerate() {
@@ -626,6 +633,170 @@ fn ordered_question_refs(
Ok(question_refs)
}
/// Select and order question references for the BEIR mix so that the
/// `target_cases` cases are balanced across the subsets in `BEIR_DATASETS`.
///
/// The pipeline: group eligible questions by subset prefix, shuffle each
/// group with a per-subset deterministic seed, assign each subset an equal
/// quota (distributing any remainder), rebalance unfilled quota onto subsets
/// with spare questions, then round-robin the selections so subsets
/// interleave in the final ordering. Errors when no eligible question exists.
fn ordered_question_refs_beir(
dataset: &ConvertedDataset,
params: &BuildParams,
target_cases: usize,
) -> Result<Vec<(usize, usize)>> {
let prefixes: Vec<&str> = BEIR_DATASETS
.iter()
.map(|kind| kind.source_prefix())
.collect();
// Bucket eligible (paragraph, question) index pairs by subset prefix.
let mut grouped: HashMap<&str, Vec<(usize, usize)>> = HashMap::new();
for (p_idx, paragraph) in dataset.paragraphs.iter().enumerate() {
for (q_idx, question) in paragraph.questions.iter().enumerate() {
let include = if params.include_impossible {
true
} else {
!question.is_impossible && !question.answers.is_empty()
};
if !include {
continue;
}
let Some(prefix) = question_prefix(&question.id) else {
warn!(
question_id = %question.id,
"Skipping BEIR question without expected prefix"
);
continue;
};
if !prefixes.contains(&prefix) {
warn!(
question_id = %question.id,
prefix = %prefix,
"Skipping BEIR question with unknown subset prefix"
);
continue;
}
grouped.entry(prefix).or_default().push((p_idx, q_idx));
}
}
if grouped.values().all(|entries| entries.is_empty()) {
return Err(anyhow!(
"no eligible BEIR questions found; cannot build slice"
));
}
// Shuffle each subset independently with a seed derived from the dataset
// id, the prefix, and the base seed, so selection is reproducible.
for prefix in &prefixes {
if let Some(entries) = grouped.get_mut(prefix) {
let seed = mix_seed(
&format!("{}::{prefix}", dataset.metadata.id),
params.base_seed,
);
let mut rng = StdRng::seed_from_u64(seed);
entries.shuffle(&mut rng);
}
}
// Even quota per subset; the first `remainder` subsets get one extra.
let dataset_count = prefixes.len().max(1);
let base_quota = target_cases / dataset_count;
let mut remainder = target_cases % dataset_count;
let mut quotas: HashMap<&str, usize> = HashMap::new();
for prefix in &prefixes {
let mut quota = base_quota;
if remainder > 0 {
quota += 1;
remainder -= 1;
}
quotas.insert(*prefix, quota);
}
// First pass: take up to the quota from each subset; record how much
// quota went unfilled (shortfall) and how many spares each subset holds.
let mut take_counts: HashMap<&str, usize> = HashMap::new();
let mut spare_slots: HashMap<&str, usize> = HashMap::new();
let mut shortfall = 0usize;
for prefix in &prefixes {
let available = grouped.get(prefix).map(|v| v.len()).unwrap_or(0);
let quota = *quotas.get(prefix).unwrap_or(&0);
let take = quota.min(available);
let missing = quota.saturating_sub(take);
shortfall += missing;
take_counts.insert(*prefix, take);
spare_slots.insert(*prefix, available.saturating_sub(take));
}
// Rebalance: hand unfilled quota to subsets with spare questions, one at
// a time round-robin, until the shortfall is gone or no subset can give.
while shortfall > 0 {
let mut allocated = false;
for prefix in &prefixes {
if shortfall == 0 {
break;
}
let spare = spare_slots.get(prefix).copied().unwrap_or(0);
if spare == 0 {
continue;
}
if let Some(count) = take_counts.get_mut(prefix) {
*count += 1;
}
spare_slots.insert(*prefix, spare - 1);
shortfall = shortfall.saturating_sub(1);
allocated = true;
}
if !allocated {
break;
}
}
// Materialise each subset's selection as a queue for interleaving.
let mut queues: Vec<VecDeque<(usize, usize)>> = Vec::new();
let mut total_selected = 0usize;
for prefix in &prefixes {
let take = *take_counts.get(prefix).unwrap_or(&0);
let mut deque = VecDeque::new();
if let Some(entries) = grouped.get(prefix) {
for item in entries.iter().take(take) {
deque.push_back(*item);
total_selected += 1;
}
}
queues.push(deque);
}
if total_selected < target_cases {
warn!(
requested = target_cases,
available = total_selected,
"BEIR mix requested more questions than available after balancing; continuing with capped set"
);
}
// Round-robin pop across the subset queues so the output interleaves
// subsets rather than concatenating them.
let mut output = Vec::with_capacity(total_selected);
loop {
let mut progressed = false;
for queue in queues.iter_mut() {
if let Some(item) = queue.pop_front() {
output.push(item);
progressed = true;
}
}
if !progressed {
break;
}
}
if output.is_empty() {
return Err(anyhow!(
"no eligible BEIR questions found; cannot build slice"
));
}
Ok(output)
}
/// Map a question id like "fever-q123" back to its BEIR subset prefix.
///
/// Only prefixes from `BEIR_DATASETS` count, and the prefix must be followed
/// by a '-' separator; anything else yields `None`.
fn question_prefix(question_id: &str) -> Option<&'static str> {
    BEIR_DATASETS
        .iter()
        .map(|kind| kind.source_prefix())
        .find(|prefix| {
            question_id
                .strip_prefix(prefix)
                .map_or(false, |rest| rest.starts_with('-'))
        })
}
fn ensure_negative_pool(
dataset: &ConvertedDataset,
manifest: &mut SliceManifest,
@@ -981,4 +1152,65 @@ mod tests {
.any(|entry| entry.id == positive_ids[0]));
Ok(())
}
#[test]
// With 8 requested cases over 6 subsets the base quota is 1 (remainder 2).
// fever/hotpotqa can only supply 1 each and nfcorpus supplies 0, so the
// rebalancing pass must push the unfilled quota onto fiqa/quora/trec-covid.
fn beir_mix_balances_and_rebalances() -> Result<()> {
let mut paragraphs = Vec::new();
// Per-subset question availability for the synthetic dataset.
let counts = [
("fever", 1usize),
("fiqa", 2usize),
("hotpotqa", 1usize),
("nfcorpus", 0usize),
("quora", 3usize),
("trec-covid", 2usize),
];
for (prefix, count) in counts {
for idx in 0..count {
let q_id = format!("{prefix}-q{idx}");
paragraphs.push(ConvertedParagraph {
id: format!("{prefix}-p{idx}"),
title: format!("{prefix} title"),
context: format!("{prefix} context {idx}"),
questions: vec![ConvertedQuestion {
id: q_id,
question: format!("{prefix} question {idx}"),
answers: vec!["answer".to_string()],
is_impossible: false,
}],
});
}
}
let metadata = DatasetMetadata::for_kind(DatasetKind::Beir, false, None);
let dataset = ConvertedDataset {
generated_at: Utc::now(),
metadata,
source: "beir-mix".to_string(),
paragraphs,
};
let params = BuildParams {
include_impossible: false,
base_seed: 0xAA,
rng_seed: 0xBB,
};
let refs = ordered_question_refs_beir(&dataset, &params, 8)?;
// Tally the selections per subset to verify the final distribution.
let mut per_prefix: HashMap<String, usize> = HashMap::new();
for (p_idx, q_idx) in refs {
let question = &dataset.paragraphs[p_idx].questions[q_idx];
let prefix = question_prefix(&question.id).unwrap_or("unknown");
*per_prefix.entry(prefix.to_string()).or_default() += 1;
}
assert_eq!(per_prefix.get("fever").copied().unwrap_or(0), 1);
assert_eq!(per_prefix.get("fiqa").copied().unwrap_or(0), 2);
assert_eq!(per_prefix.get("hotpotqa").copied().unwrap_or(0), 1);
assert_eq!(per_prefix.get("nfcorpus").copied().unwrap_or(0), 0);
assert_eq!(per_prefix.get("quora").copied().unwrap_or(0), 2);
assert_eq!(per_prefix.get("trec-covid").copied().unwrap_or(0), 2);
Ok(())
}
}
}