benchmarks: ready for hybrid revised

2026-06-28 12:56:26 +02:00 · 2025-12-03 11:38:07 +01:00
parent 2939e4c2a4
commit dd881efbf9
22 changed files with 760 additions and 476 deletions
@@ -18,6 +18,7 @@ tracing = { workspace = true }
 tracing-subscriber = { workspace = true }
 uuid = { workspace = true }
 text-splitter = { workspace = true }
+unicode-normalization = { workspace = true }
 rand = "0.8"
 sha2 = { workspace = true }
 object_store = { workspace = true }
@@ -52,14 +52,6 @@ impl std::fmt::Display for EmbeddingBackend {

 #[derive(Debug, Clone, Args)]
 pub struct RetrievalSettings {
-    /// Minimum characters per chunk for text splitting
-    #[arg(long, default_value_t = 500)]
-    pub chunk_min_chars: usize,
-
-    /// Maximum characters per chunk for text splitting
-    #[arg(long, default_value_t = 2000)]
-    pub chunk_max_chars: usize,
-
    /// Override chunk vector candidate cap
    #[arg(long)]
    pub chunk_vector_take: Option<usize>,
@@ -68,10 +60,6 @@ pub struct RetrievalSettings {
    #[arg(long)]
    pub chunk_fts_take: Option<usize>,

-    /// Override chunk token budget estimate for assembly
-    #[arg(long)]
-    pub chunk_token_budget: Option<usize>,
-
    /// Override average characters per token used for budgeting
    #[arg(long)]
    pub chunk_avg_chars_per_token: Option<usize>,
@@ -80,18 +68,22 @@ pub struct RetrievalSettings {
    #[arg(long)]
    pub max_chunks_per_entity: Option<usize>,

-    /// Disable the FastEmbed reranking stage
-    #[arg(long = "no-rerank", action = clap::ArgAction::SetFalse)]
+    /// Enable the FastEmbed reranking stage
+    #[arg(long = "rerank", action = clap::ArgAction::SetTrue, default_value_t = false)]
    pub rerank: bool,

    /// Reranking engine pool size / parallelism
-    #[arg(long, default_value_t = 16)]
+    #[arg(long, default_value_t = 4)]
    pub rerank_pool_size: usize,

    /// Keep top-N entities after reranking
    #[arg(long, default_value_t = 10)]
    pub rerank_keep_top: usize,

+    /// Cap the number of chunks returned by retrieval (revised strategy)
+    #[arg(long, default_value_t = 5)]
+    pub chunk_result_cap: usize,
+
    /// Require verified chunks (disable with --llm-mode)
    #[arg(skip = true)]
    pub require_verified_chunks: bool,
@@ -104,16 +96,14 @@ pub struct RetrievalSettings {
 impl Default for RetrievalSettings {
    fn default() -> Self {
        Self {
-            chunk_min_chars: 500,
-            chunk_max_chars: 2_000,
            chunk_vector_take: None,
            chunk_fts_take: None,
-            chunk_token_budget: None,
            chunk_avg_chars_per_token: None,
            max_chunks_per_entity: None,
-            rerank: true,
-            rerank_pool_size: 16,
+            rerank: false,
+            rerank_pool_size: 4,
            rerank_keep_top: 10,
+            chunk_result_cap: 5,
            require_verified_chunks: true,
            strategy: RetrievalStrategy::Initial,
        }
@@ -175,7 +165,7 @@ pub struct Config {
    pub retrieval: RetrievalSettings,

    /// Concurrency level
-    #[arg(long, default_value_t = 4)]
+    #[arg(long, default_value_t = 1)]
    pub concurrency: usize,

    /// Embedding backend
@@ -195,19 +185,23 @@ pub struct Config {
    pub ingestion_cache_dir: PathBuf,

    /// Minimum tokens per chunk for ingestion
-    #[arg(long, default_value_t = 500)]
+    #[arg(long, default_value_t = 256)]
    pub ingest_chunk_min_tokens: usize,

    /// Maximum tokens per chunk for ingestion
-    #[arg(long, default_value_t = 2_000)]
+    #[arg(long, default_value_t = 512)]
    pub ingest_chunk_max_tokens: usize,

+    /// Overlap between chunks during ingestion (tokens)
+    #[arg(long, default_value_t = 50)]
+    pub ingest_chunk_overlap_tokens: usize,
+
    /// Run ingestion in chunk-only mode (skip analyzer/graph generation)
    #[arg(long)]
    pub ingest_chunks_only: bool,

    /// Number of paragraphs to ingest concurrently
-    #[arg(long, default_value_t = 5)]
+    #[arg(long, default_value_t = 10)]
    pub ingestion_batch_size: usize,

    /// Maximum retries for ingestion failures per paragraph
@@ -354,15 +348,9 @@ impl Config {
        }

        // Validations
-        if self.retrieval.chunk_min_chars >= self.retrieval.chunk_max_chars {
-            return Err(anyhow!(
-                "--chunk-min must be less than --chunk-max (got {} >= {})",
-                self.retrieval.chunk_min_chars,
-                self.retrieval.chunk_max_chars
-            ));
-        }
-
-        if self.ingest_chunk_min_tokens == 0 || self.ingest_chunk_min_tokens >= self.ingest_chunk_max_tokens {
+        if self.ingest_chunk_min_tokens == 0
+            || self.ingest_chunk_min_tokens >= self.ingest_chunk_max_tokens
+        {
            return Err(anyhow!(
                "--ingest-chunk-min-tokens must be greater than zero and less than --ingest-chunk-max-tokens (got {} >= {})",
                self.ingest_chunk_min_tokens,
@@ -370,6 +358,14 @@ impl Config {
            ));
        }

+        if self.ingest_chunk_overlap_tokens >= self.ingest_chunk_min_tokens {
+            return Err(anyhow!(
+                "--ingest-chunk-overlap-tokens ({}) must be less than --ingest-chunk-min-tokens ({})",
+                self.ingest_chunk_overlap_tokens,
+                self.ingest_chunk_min_tokens
+            ));
+        }
+
        if self.retrieval.rerank && self.retrieval.rerank_pool_size == 0 {
            return Err(anyhow!(
                "--rerank-pool must be greater than zero when reranking is enabled"
@@ -444,9 +440,7 @@ pub struct ParsedArgs {
 pub fn parse() -> Result<ParsedArgs> {
+    // Build the config with `Config::parse()`, then run `finalize` on it; an error
+    // from `finalize` is returned to the caller via `?`. On success the config is
+    // handed back wrapped in `ParsedArgs`.
    let mut config = Config::parse();
    config.finalize()?;
-    Ok(ParsedArgs {
-        config,
-    })
+    Ok(ParsedArgs { config })
 }

 pub fn ensure_parent(path: &Path) -> Result<()> {
@@ -15,7 +15,6 @@ use common::{
        types::{system_settings::SystemSettings, user::User, StoredObject},
    },
 };
-use retrieval_pipeline::RetrievalTuning;
 use serde::Deserialize;
 use tokio::io::AsyncWriteExt;
 use tracing::{info, warn};
@@ -120,37 +119,6 @@ pub(crate) fn ledger_target(config: &Config) -> Option<usize> {
    }
 }

-pub(crate) fn apply_dataset_tuning_overrides(
-    dataset: &ConvertedDataset,
-    config: &Config,
-    tuning: &mut RetrievalTuning,
-) {
-    let is_long_form = dataset
-        .metadata
-        .id
-        .to_ascii_lowercase()
-        .contains("natural-questions");
-    if !is_long_form {
-        return;
-    }
-
-    if config.retrieval.chunk_vector_take.is_none() {
-        tuning.chunk_vector_take = tuning.chunk_vector_take.max(80);
-    }
-    if config.retrieval.chunk_fts_take.is_none() {
-        tuning.chunk_fts_take = tuning.chunk_fts_take.max(80);
-    }
-    if config.retrieval.chunk_token_budget.is_none() {
-        tuning.token_budget_estimate = tuning.token_budget_estimate.max(20_000);
-    }
-    if config.retrieval.max_chunks_per_entity.is_none() {
-        tuning.max_chunks_per_entity = tuning.max_chunks_per_entity.max(12);
-    }
-    if tuning.lexical_match_weight < 0.25 {
-        tuning.lexical_match_weight = 0.3;
-    }
-}
-
 pub(crate) async fn write_chunk_diagnostics(path: &Path, cases: &[CaseDiagnostics]) -> Result<()> {
    args::ensure_parent(path)?;
    let mut file = tokio::fs::File::create(path)
@@ -175,9 +143,10 @@ pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> R
    let _ = db
        .client
        .query(
-            "SELECT chunk_id \
-             FROM text_chunk_embedding \
-             WHERE embedding <|1,1|> $embedding LIMIT 5",
+            r#"SELECT chunk_id
+               FROM text_chunk_embedding
+               WHERE embedding <|1,1|> $embedding
+               LIMIT 5"#,
        )
        .bind(("embedding", dummy_embedding.clone()))
        .await
@@ -187,9 +156,10 @@ pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> R
    let _ = db
        .client
        .query(
-            "SELECT entity_id \
-             FROM knowledge_entity_embedding \
-             WHERE embedding <|1,1|> $embedding LIMIT 5",
+            r#"SELECT entity_id
+               FROM knowledge_entity_embedding
+               WHERE embedding <|1,1|> $embedding
+               LIMIT 5"#,
        )
        .bind(("embedding", dummy_embedding))
        .await
@@ -427,12 +397,10 @@ pub(crate) async fn enforce_system_settings(
 ) -> Result<SystemSettings> {
    let mut updated_settings = settings.clone();
    let mut needs_settings_update = false;
-    // let mut embedding_dimension_changed = false;

    if provider_dimension != settings.embedding_dimensions as usize {
        updated_settings.embedding_dimensions = provider_dimension as u32;
        needs_settings_update = true;
-        // embedding_dimension_changed = true;
    }
    if let Some(query_override) = config.query_model.as_deref() {
        if settings.query_model != query_override {
@@ -449,12 +417,6 @@ pub(crate) async fn enforce_system_settings(
            .await
            .context("updating system settings overrides")?;
    }
-    // We dont need to do this, we've changed from default settings already
-    // if embedding_dimension_changed {
-    //     change_embedding_length_in_hnsw_indexes(db, provider_dimension)
-    //         .await
-    //         .context("redefining HNSW indexes for new embedding dimension")?;
-    // }
    Ok(settings)
 }

@@ -6,7 +6,7 @@ use futures::stream::{self, StreamExt};
 use tracing::{debug, info};

 use crate::eval::{
-    adapt_strategy_output, apply_dataset_tuning_overrides, build_case_diagnostics,
+    adapt_strategy_output, build_case_diagnostics,
    text_contains_answer, CaseDiagnostics, CaseSummary, RetrievedSummary,
 };
 use retrieval_pipeline::{
@@ -56,15 +56,13 @@ pub(crate) async fn run_queries(
    if retrieval_config.tuning.fallback_min_results < config.retrieval.rerank_keep_top {
        retrieval_config.tuning.fallback_min_results = config.retrieval.rerank_keep_top;
    }
+    retrieval_config.tuning.chunk_result_cap = config.retrieval.chunk_result_cap.max(1);
    if let Some(value) = config.retrieval.chunk_vector_take {
        retrieval_config.tuning.chunk_vector_take = value;
    }
    if let Some(value) = config.retrieval.chunk_fts_take {
        retrieval_config.tuning.chunk_fts_take = value;
    }
-    if let Some(value) = config.retrieval.chunk_token_budget {
-        retrieval_config.tuning.token_budget_estimate = value;
-    }
    if let Some(value) = config.retrieval.chunk_avg_chars_per_token {
        retrieval_config.tuning.avg_chars_per_token = value;
    }
@@ -72,8 +70,6 @@ pub(crate) async fn run_queries(
        retrieval_config.tuning.max_chunks_per_entity = value;
    }

-    apply_dataset_tuning_overrides(dataset, config, &mut retrieval_config.tuning);
-
    let active_tuning = retrieval_config.tuning.clone();
    let effective_chunk_vector = config
        .retrieval
@@ -95,11 +91,8 @@ pub(crate) async fn run_queries(
        rerank_enabled = config.retrieval.rerank,
        rerank_pool_size = config.retrieval.rerank_pool_size,
        rerank_keep_top = config.retrieval.rerank_keep_top,
-        chunk_min = config.retrieval.chunk_min_chars,
-        chunk_max = config.retrieval.chunk_max_chars,
        chunk_vector_take = effective_chunk_vector,
        chunk_fts_take = effective_chunk_fts,
-        chunk_token_budget = active_tuning.token_budget_estimate,
        embedding_backend = ctx.embedding_provider().backend_label(),
        embedding_model = ctx
            .embedding_provider()
@@ -405,4 +398,3 @@ fn calculate_ndcg(retrieved: &[RetrievedSummary], k: usize) -> f64 {
        dcg / idcg
    }
 }
-
@@ -201,9 +201,13 @@ pub(crate) async fn summarize(
        concurrency: config.concurrency.max(1),
        detailed_report: config.detailed_report,
        retrieval_strategy: config.retrieval.strategy.to_string(),
+        chunk_result_cap: config.retrieval.chunk_result_cap,
+        ingest_chunk_min_tokens: config.ingest_chunk_min_tokens,
+        ingest_chunk_max_tokens: config.ingest_chunk_max_tokens,
+        ingest_chunks_only: config.ingest_chunks_only,
+        ingest_chunk_overlap_tokens: config.ingest_chunk_overlap_tokens,
        chunk_vector_take: active_tuning.chunk_vector_take,
        chunk_fts_take: active_tuning.chunk_fts_take,
-        chunk_token_budget: active_tuning.token_budget_estimate,
        chunk_avg_chars_per_token: active_tuning.avg_chars_per_token,
        max_chunks_per_entity: active_tuning.max_chunks_per_entity,
        cases: summaries,
@@ -6,6 +6,7 @@ use retrieval_pipeline::{
    PipelineDiagnostics, PipelineStageTimings, RetrievedChunk, RetrievedEntity, StrategyOutput,
 };
 use serde::{Deserialize, Serialize};
+use unicode_normalization::UnicodeNormalization;

 #[derive(Debug, Serialize)]
 pub struct EvaluationSummary {
@@ -68,9 +69,13 @@ pub struct EvaluationSummary {
    pub concurrency: usize,
    pub detailed_report: bool,
    pub retrieval_strategy: String,
+    pub chunk_result_cap: usize,
+    pub ingest_chunk_min_tokens: usize,
+    pub ingest_chunk_max_tokens: usize,
+    pub ingest_chunks_only: bool,
+    pub ingest_chunk_overlap_tokens: usize,
    pub chunk_vector_take: usize,
    pub chunk_fts_take: usize,
-    pub chunk_token_budget: usize,
    pub chunk_avg_chars_per_token: usize,
    pub max_chunks_per_entity: usize,
    pub cases: Vec<CaseSummary>,
@@ -107,7 +112,17 @@ pub struct LatencyStats {
    pub p95: u128,
 }

-#[derive(Debug, Clone, Serialize)]
+impl Default for LatencyStats {
+    /// Returns latency statistics with `avg`, `p50` and `p95` all set to zero.
+    fn default() -> Self {
+        let (avg, p50, p95) = (0.0, 0, 0);
+        Self { avg, p50, p95 }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct StageLatencyBreakdown {
    pub embed: LatencyStats,
    pub collect_candidates: LatencyStats,
@@ -117,7 +132,7 @@ pub struct StageLatencyBreakdown {
    pub assemble: LatencyStats,
 }

-#[derive(Debug, Default, Clone, Serialize)]
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
 pub struct EvaluationStageTimings {
    pub prepare_slice_ms: u128,
    pub prepare_db_ms: u128,
@@ -128,7 +143,7 @@ pub struct EvaluationStageTimings {
    pub finalize_ms: u128,
 }

-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct PerformanceTimings {
    pub openai_base_url: String,
    pub ingestion_ms: u128,
@@ -254,8 +269,44 @@ pub fn text_contains_answer(text: &str, answers: &[String]) -> bool {
    if answers.is_empty() {
        return true;
    }
-    let haystack = text.to_ascii_lowercase();
-    answers.iter().any(|needle| haystack.contains(needle))
+    let haystack = normalize_for_match(text);
+    answers
+        .iter()
+        .map(|needle| normalize_for_match(needle))
+        .any(|needle| !needle.is_empty() && haystack.contains(&needle))
+}
+
+/// Canonicalises `input` so answer matching ignores formatting differences.
+///
+/// The text is NFKC-normalised and lowercased; every whitespace or punctuation
+/// character is treated as a separator, runs of separators collapse to a single
+/// ASCII space, and separators at either end are dropped. Input consisting only
+/// of separators therefore yields an empty string.
+fn normalize_for_match(input: &str) -> String {
+    // Typographic marks treated as punctuation in addition to the ASCII set.
+    const EXTRA_PUNCT: [char; 11] = ['“', '”', '‘', '’', '«', '»', '–', '—', '…', '·', '•'];
+
+    let is_separator = |ch: char| {
+        ch.is_whitespace() || ch.is_ascii_punctuation() || EXTRA_PUNCT.contains(&ch)
+    };
+
+    let spaced: String = input
+        .nfkc()
+        .flat_map(char::to_lowercase)
+        .map(|ch| if is_separator(ch) { ' ' } else { ch })
+        .collect();
+
+    // Every separator is now a plain space, so splitting on whitespace and
+    // re-joining collapses runs and trims both ends in one step.
+    spaced.split_whitespace().collect::<Vec<_>>().join(" ")
 }

 fn chunk_snippet(text: &str) -> String {
@@ -17,6 +17,7 @@ pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline
    let mut tuning = ingestion_pipeline::IngestionTuning::default();
    tuning.chunk_min_tokens = config.ingest_chunk_min_tokens;
    tuning.chunk_max_tokens = config.ingest_chunk_max_tokens;
+    tuning.chunk_overlap_tokens = config.ingest_chunk_overlap_tokens;

    ingestion_pipeline::IngestionConfig {
        tuning,
@@ -172,13 +172,14 @@ async fn async_main() -> anyhow::Result<()> {
        .await
        .context("running retrieval evaluation")?;

-    let report_paths = report::write_reports(
+    let report = report::write_reports(
        &summary,
        parsed.config.report_dir.as_path(),
        parsed.config.summary_sample,
    )
    .with_context(|| format!("writing reports to {}", parsed.config.report_dir.display()))?;
-    let perf_log_path = perf::write_perf_logs(
+    let perf_mirrors = perf::mirror_perf_outputs(
+        &report.record,
        &summary,
        parsed.config.report_dir.as_path(),
        parsed.config.perf_log_json.as_deref(),
@@ -186,14 +187,27 @@ async fn async_main() -> anyhow::Result<()> {
    )
    .with_context(|| {
        format!(
-            "writing perf logs under {}",
+            "writing perf mirrors under {}",
            parsed.config.report_dir.display()
        )
    })?;

+    let perf_note = if perf_mirrors.is_empty() {
+        String::new()
+    } else {
+        format!(
+            " | Perf mirrors: {}",
+            perf_mirrors
+                .iter()
+                .map(|path| path.display().to_string())
+                .collect::<Vec<_>>()
+                .join(", ")
+        )
+    };
+
    if summary.llm_cases > 0 {
        println!(
-            "[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) + LLM: {llm_answered}/{llm_total} ({llm_precision:.3}) → JSON: {json} | Markdown: {md} | Perf: {perf}",
+            "[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) + LLM: {llm_answered}/{llm_total} ({llm_precision:.3}) → JSON: {json} | Markdown: {md} | History: {history}{perf_note}",
            summary.dataset_label,
            k = summary.k,
            precision = summary.precision,
@@ -202,26 +216,28 @@ async fn async_main() -> anyhow::Result<()> {
            llm_answered = summary.llm_answered,
            llm_total = summary.llm_cases,
            llm_precision = summary.llm_precision,
-            json = report_paths.json.display(),
-            md = report_paths.markdown.display(),
-            perf = perf_log_path.display()
+            json = report.paths.json.display(),
+            md = report.paths.markdown.display(),
+            history = report.history_path.display(),
+            perf_note = perf_note,
        );
    } else {
        println!(
-            "[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) → JSON: {json} | Markdown: {md} | Perf: {perf}",
+            "[{}] Retrieval Precision@{k}: {precision:.3} ({correct}/{retrieval_total}) → JSON: {json} | Markdown: {md} | History: {history}{perf_note}",
            summary.dataset_label,
            k = summary.k,
            precision = summary.precision,
            correct = summary.correct,
            retrieval_total = summary.retrieval_cases,
-            json = report_paths.json.display(),
-            md = report_paths.markdown.display(),
-            perf = perf_log_path.display()
+            json = report.paths.json.display(),
+            md = report.paths.markdown.display(),
+            history = report.history_path.display(),
+            perf_note = perf_note,
        );
    }

    if parsed.config.perf_log_console {
-        perf::print_console_summary(&summary);
+        perf::print_console_summary(&report.record);
    }

    Ok(())
@@ -1,160 +1,37 @@
 use std::{
-    fs::{self, OpenOptions},
-    io::Write,
+    fs,
    path::{Path, PathBuf},
 };

 use anyhow::{Context, Result};
-use serde::Serialize;

 use crate::{
    args,
-    eval::{format_timestamp, EvaluationStageTimings, EvaluationSummary},
-    report,
+    eval::EvaluationSummary,
+    report::{self, EvaluationReport},
 };

-#[derive(Debug, Serialize)]
-struct PerformanceLogEntry {
-    generated_at: String,
-    dataset_id: String,
-    dataset_label: String,
-    run_label: Option<String>,
-    retrieval_strategy: String,
-    slice_id: String,
-    slice_seed: u64,
-    slice_window_offset: usize,
-    slice_window_length: usize,
-    limit: Option<usize>,
-    total_cases: usize,
-    correct: usize,
-    precision: f64,
-    retrieval_cases: usize,
-    llm_cases: usize,
-    llm_answered: usize,
-    llm_precision: f64,
-    k: usize,
-    openai_base_url: String,
-    ingestion: IngestionPerf,
-    namespace: NamespacePerf,
-    retrieval: RetrievalPerf,
-    evaluation_stages: EvaluationStageTimings,
-}
-
-#[derive(Debug, Serialize)]
-struct IngestionPerf {
-    duration_ms: u128,
-    cache_path: String,
-    reused: bool,
-    embeddings_reused: bool,
-    fingerprint: String,
-    positives_total: usize,
-    negatives_total: usize,
-}
-
-#[derive(Debug, Serialize)]
-struct NamespacePerf {
-    reused: bool,
-    seed_ms: Option<u128>,
-}
-
-#[derive(Debug, Serialize)]
-struct RetrievalPerf {
-    latency_ms: crate::eval::LatencyStats,
-    stage_latency: crate::eval::StageLatencyBreakdown,
-    concurrency: usize,
-    rerank_enabled: bool,
-    rerank_pool_size: Option<usize>,
-    rerank_keep_top: usize,
-    evaluated_cases: usize,
-}
-
-impl PerformanceLogEntry {
-    fn from_summary(summary: &EvaluationSummary) -> Self {
-        let ingestion = IngestionPerf {
-            duration_ms: summary.perf.ingestion_ms,
-            cache_path: summary.ingestion_cache_path.clone(),
-            reused: summary.ingestion_reused,
-            embeddings_reused: summary.ingestion_embeddings_reused,
-            fingerprint: summary.ingestion_fingerprint.clone(),
-            positives_total: summary.slice_positive_paragraphs,
-            negatives_total: summary.slice_negative_paragraphs,
-        };
-
-        let namespace = NamespacePerf {
-            reused: summary.namespace_reused,
-            seed_ms: summary.perf.namespace_seed_ms,
-        };
-
-        let retrieval = RetrievalPerf {
-            latency_ms: summary.latency_ms.clone(),
-            stage_latency: summary.perf.stage_latency.clone(),
-            concurrency: summary.concurrency,
-            rerank_enabled: summary.rerank_enabled,
-            rerank_pool_size: summary.rerank_pool_size,
-            rerank_keep_top: summary.rerank_keep_top,
-            evaluated_cases: summary.retrieval_cases,
-        };
-
-        Self {
-            generated_at: format_timestamp(&summary.generated_at),
-            dataset_id: summary.dataset_id.clone(),
-            dataset_label: summary.dataset_label.clone(),
-            run_label: summary.run_label.clone(),
-            retrieval_strategy: summary.retrieval_strategy.clone(),
-            slice_id: summary.slice_id.clone(),
-            slice_seed: summary.slice_seed,
-            slice_window_offset: summary.slice_window_offset,
-            slice_window_length: summary.slice_window_length,
-            limit: summary.limit,
-            total_cases: summary.total_cases,
-            correct: summary.correct,
-            precision: summary.precision,
-            retrieval_cases: summary.retrieval_cases,
-            llm_cases: summary.llm_cases,
-            llm_answered: summary.llm_answered,
-            llm_precision: summary.llm_precision,
-            k: summary.k,
-            openai_base_url: summary.perf.openai_base_url.clone(),
-            ingestion,
-            namespace,
-            retrieval,
-            evaluation_stages: summary.perf.evaluation_stage_ms.clone(),
-        }
-    }
-}
-
-pub fn write_perf_logs(
+pub fn mirror_perf_outputs(
+    record: &EvaluationReport,
    summary: &EvaluationSummary,
    report_root: &Path,
    extra_json: Option<&Path>,
    extra_dir: Option<&Path>,
-) -> Result<PathBuf> {
-    let entry = PerformanceLogEntry::from_summary(summary);
-    let dataset_dir = report::dataset_report_dir(report_root, &summary.dataset_id);
-    fs::create_dir_all(&dataset_dir)
-        .with_context(|| format!("creating dataset perf directory {}", dataset_dir.display()))?;
-
-    let log_path = dataset_dir.join("perf-log.jsonl");
-    let mut file = OpenOptions::new()
-        .create(true)
-        .append(true)
-        .open(&log_path)
-        .with_context(|| format!("opening perf log {}", log_path.display()))?;
-    let line = serde_json::to_vec(&entry).context("serialising perf log entry")?;
-    file.write_all(&line)?;
-    file.write_all(b"\n")?;
-    file.flush()?;
+) -> Result<Vec<PathBuf>> {
+    let mut written = Vec::new();

    if let Some(path) = extra_json {
        args::ensure_parent(path)?;
-        let blob = serde_json::to_vec_pretty(&entry).context("serialising perf log JSON")?;
+        let blob = serde_json::to_vec_pretty(record).context("serialising perf log JSON")?;
        fs::write(path, blob)
            .with_context(|| format!("writing perf log copy to {}", path.display()))?;
+        written.push(path.to_path_buf());
    }

    if let Some(dir) = extra_dir {
        fs::create_dir_all(dir)
            .with_context(|| format!("creating perf log directory {}", dir.display()))?;
+        let dataset_dir = report::dataset_report_dir(report_root, &summary.dataset_id);
        let dataset_slug = dataset_dir
            .file_name()
            .and_then(|os| os.to_str())
@@ -162,22 +39,24 @@ pub fn write_perf_logs(
        let timestamp = summary.generated_at.format("%Y%m%dT%H%M%S").to_string();
        let filename = format!("perf-{}-{}.json", dataset_slug, timestamp);
        let path = dir.join(filename);
-        let blob = serde_json::to_vec_pretty(&entry).context("serialising perf log JSON")?;
+        let blob = serde_json::to_vec_pretty(record).context("serialising perf log JSON")?;
        fs::write(&path, blob)
            .with_context(|| format!("writing perf log mirror {}", path.display()))?;
+        written.push(path);
    }

-    Ok(log_path)
+    Ok(written)
 }

-pub fn print_console_summary(summary: &EvaluationSummary) {
-    let perf = &summary.perf;
+pub fn print_console_summary(record: &EvaluationReport) {
+    let perf = &record.performance;
    println!(
-        "[perf] retrieval strategy={} | rerank={} (pool {:?}, keep {})",
-        summary.retrieval_strategy,
-        summary.rerank_enabled,
-        summary.rerank_pool_size,
-        summary.rerank_keep_top
+        "[perf] retrieval strategy={} | concurrency={} | rerank={} (pool {:?}, keep {})",
+        record.retrieval.strategy,
+        record.retrieval.concurrency,
+        record.retrieval.rerank_enabled,
+        record.retrieval.rerank_pool_size,
+        record.retrieval.rerank_keep_top
    );
    println!(
        "[perf] ingestion={}ms | namespace_seed={}",
@@ -194,7 +73,7 @@ pub fn print_console_summary(summary: &EvaluationSummary) {
        stage.rerank.avg,
        stage.assemble.avg,
    );
-    let eval = &perf.evaluation_stage_ms;
+    let eval = &perf.evaluation_stages_ms;
    println!(
        "[perf] eval stage ms → slice {} | db {} | corpus {} | namespace {} | queries {} | summarize {} | finalize {}",
        eval.prepare_slice_ms,
@@ -315,9 +194,12 @@ mod tests {
            concurrency: 2,
            retrieval_strategy: "initial".into(),
            detailed_report: false,
+            ingest_chunk_min_tokens: 256,
+            ingest_chunk_max_tokens: 512,
+            ingest_chunk_overlap_tokens: 50,
+            ingest_chunks_only: false,
            chunk_vector_take: 20,
            chunk_fts_take: 20,
-            chunk_token_budget: 10000,
            chunk_avg_chars_per_token: 4,
            max_chunks_per_entity: 4,
            average_ndcg: 0.0,
@@ -327,18 +209,34 @@ mod tests {
    }

    #[test]
-    fn writes_perf_log_jsonl() {
+    fn writes_perf_mirrors_from_record() {
+        // Exercises `mirror_perf_outputs` with both optional targets set: an explicit
+        // JSON file and an extra directory that receives a generated file name.
        let tmp = tempdir().unwrap();
        let report_root = tmp.path().join("reports");
        let summary = sample_summary();
-        let log_path = write_perf_logs(&summary, &report_root, None, None).expect("perf log write");
-        assert!(log_path.exists());
-        let contents = std::fs::read_to_string(&log_path).expect("reading perf log jsonl");
+        let record = report::EvaluationReport::from_summary(&summary, 5);
+
+        let json_path = tmp.path().join("extra.json");
+        let dir_path = tmp.path().join("copies");
+        let outputs = mirror_perf_outputs(
+            &record,
+            &summary,
+            &report_root,
+            Some(json_path.as_path()),
+            Some(dir_path.as_path()),
+        )
+        .expect("perf mirrors");
+
+        assert!(json_path.exists());
+        let content = std::fs::read_to_string(&json_path).expect("reading mirror json");
        assert!(
-            contents.contains("\"openai_base_url\":\"https://example.com\""),
-            "serialized log should include base URL"
+            content.contains("\"evaluation_stages_ms\""),
+            "perf mirror should include evaluation stage timings"
        );
-        let dataset_dir = report::dataset_report_dir(&report_root, &summary.dataset_id);
-        assert!(dataset_dir.join("perf-log.jsonl").exists());
+        // One returned path per requested target: the explicit file and the directory copy.
+        assert_eq!(outputs.len(), 2);
+        let mirrored = outputs
+            .into_iter()
+            .filter(|path| path.starts_with(&dir_path))
+            .collect::<Vec<_>>();
+        assert_eq!(mirrored.len(), 1, "expected timestamped mirror in dir");
    }
 }
@@ -19,7 +19,7 @@ pub struct ReportPaths {
    pub markdown: PathBuf,
 }

-#[derive(Debug, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct EvaluationReport {
    pub overview: OverviewSection,
    pub dataset: DatasetSection,
@@ -28,14 +28,15 @@ pub struct EvaluationReport {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub llm: Option<LlmSection>,
    pub performance: PerformanceSection,
-    #[serde(skip_serializing_if = "Vec::is_empty")]
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub misses: Vec<MissEntry>,
-    #[serde(skip_serializing_if = "Vec::is_empty")]
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub llm_cases: Vec<LlmCaseEntry>,
+    #[serde(default)]
    pub detailed_report: bool,
 }

-#[derive(Debug, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct OverviewSection {
    pub generated_at: String,
    pub run_label: Option<String>,
@@ -43,7 +44,7 @@ pub struct OverviewSection {
    pub filtered_questions: usize,
 }

-#[derive(Debug, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct DatasetSection {
    pub id: String,
    pub label: String,
@@ -55,7 +56,7 @@ pub struct DatasetSection {
    pub embedding_dimension: usize,
 }

-#[derive(Debug, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct SliceSection {
    pub id: String,
    pub seed: u64,
@@ -69,7 +70,7 @@ pub struct SliceSection {
    pub negative_multiplier: f32,
 }

-#[derive(Debug, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct RetrievalSection {
    pub k: usize,
    pub cases: usize,
@@ -86,16 +87,21 @@ pub struct RetrievalSection {
    pub rerank_enabled: bool,
    pub rerank_pool_size: Option<usize>,
    pub rerank_keep_top: usize,
+    pub chunk_result_cap: usize,
+    pub ingest_chunk_min_tokens: usize,
+    pub ingest_chunk_max_tokens: usize,
+    pub ingest_chunk_overlap_tokens: usize,
+    pub ingest_chunks_only: bool,
 }

-#[derive(Debug, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LlmSection {
    pub cases: usize,
    pub answered: usize,
    pub precision: f64,
 }

-#[derive(Debug, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PerformanceSection {
    pub openai_base_url: String,
    pub ingestion_ms: u128,
@@ -111,7 +117,7 @@ pub struct PerformanceSection {
    pub negative_paragraphs_reused: usize,
 }

-#[derive(Debug, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct MissEntry {
    pub question_id: String,
    pub paragraph_title: String,
@@ -122,7 +128,7 @@ pub struct MissEntry {
    pub retrieved: Vec<RetrievedSnippet>,
 }

-#[derive(Debug, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LlmCaseEntry {
    pub question_id: String,
    pub answered: bool,
@@ -131,7 +137,7 @@ pub struct LlmCaseEntry {
    pub retrieved: Vec<RetrievedSnippet>,
 }

-#[derive(Debug, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct RetrievedSnippet {
    pub rank: usize,
    pub source_id: String,
@@ -139,6 +145,13 @@ pub struct RetrievedSnippet {
    pub matched: bool,
 }

+/// Everything `write_reports` hands back to its caller after a successful run.
+#[derive(Debug)]
+pub struct ReportOutcome {
+    /// The report that was built from the summary and appended to the history file.
+    pub record: EvaluationReport,
+    /// Paths of the JSON and Markdown report files returned by `write_reports`.
+    pub paths: ReportPaths,
+    /// Path of the history file as returned by `record_history` (`evaluations.json` inside
+    /// the dataset report directory).
+    pub history_path: PathBuf,
+}
+
 impl EvaluationReport {
    pub fn from_summary(summary: &EvaluationSummary, sample: usize) -> Self {
        let overview = OverviewSection {
@@ -188,6 +201,11 @@ impl EvaluationReport {
            rerank_enabled: summary.rerank_enabled,
            rerank_pool_size: summary.rerank_pool_size,
            rerank_keep_top: summary.rerank_keep_top,
+            chunk_result_cap: summary.chunk_result_cap,
+            ingest_chunk_min_tokens: summary.ingest_chunk_min_tokens,
+            ingest_chunk_max_tokens: summary.ingest_chunk_max_tokens,
+            ingest_chunk_overlap_tokens: summary.ingest_chunk_overlap_tokens,
+            ingest_chunks_only: summary.ingest_chunks_only,
        };

        let llm = if summary.llm_cases > 0 {
@@ -215,24 +233,30 @@ impl EvaluationReport {
            negative_paragraphs_reused: summary.negative_paragraphs_reused,
        };

-        let misses = summary
-            .cases
-            .iter()
-            .filter(|case| !case.matched && !case.is_impossible)
-            .take(sample)
-            .map(MissEntry::from_case)
-            .collect();
-
-        let llm_cases = if llm.is_some() {
-            summary
+        let (misses, llm_cases) = if summary.detailed_report {
+            let misses = summary
                .cases
                .iter()
-                .filter(|case| case.is_impossible)
+                .filter(|case| !case.matched && !case.is_impossible)
                .take(sample)
-                .map(LlmCaseEntry::from_case)
-                .collect()
+                .map(MissEntry::from_case)
+                .collect();
+
+            let llm_cases = if llm.is_some() {
+                summary
+                    .cases
+                    .iter()
+                    .filter(|case| case.is_impossible)
+                    .take(sample)
+                    .map(LlmCaseEntry::from_case)
+                    .collect()
+            } else {
+                Vec::new()
+            };
+
+            (misses, llm_cases)
        } else {
-            Vec::new()
+            (Vec::new(), Vec::new())
        };

        Self {
@@ -299,7 +323,7 @@ pub fn write_reports(
    summary: &EvaluationSummary,
    report_dir: &Path,
    sample: usize,
-) -> Result<ReportPaths> {
+) -> Result<ReportOutcome> {
    fs::create_dir_all(report_dir)
        .with_context(|| format!("creating report directory {}", report_dir.display()))?;
    let dataset_dir = dataset_report_dir(report_dir, &summary.dataset_id);
@@ -331,11 +355,15 @@ pub fn write_reports(
    fs::write(&latest_md, markdown)
        .with_context(|| format!("writing latest Markdown report to {}", latest_md.display()))?;

-    record_history(summary, &dataset_dir)?;
+    let history_path = record_history(&report, &dataset_dir)?;

-    Ok(ReportPaths {
-        json: json_path,
-        markdown: md_path,
+    Ok(ReportOutcome {
+        record: report,
+        paths: ReportPaths {
+            json: json_path,
+            markdown: md_path,
+        },
+        history_path,
    })
 }

@@ -555,10 +583,14 @@ fn render_markdown(report: &EvaluationReport) -> String {
    );

    if report.misses.is_empty() {
-        md.push_str("\\n_All evaluated retrieval queries matched within the top-k window._\\n");
        if report.detailed_report {
            md.push_str(
-                "\\nSuccess measures were captured for each query (entity, chunk text, chunk ID).\\n",
+                "\\n_All evaluated retrieval queries matched within the top-k window._\\n\
+                \\nSuccess measures were captured for each query (entity, chunk text, chunk ID).\\n",
+            );
+        } else {
+            md.push_str(
+                "\\n_Misses omitted. Re-run with `--detailed-report` to see sampled failures._\\n",
            );
        }
    } else {
@@ -597,7 +629,11 @@ fn render_markdown(report: &EvaluationReport) -> String {
    if report.llm.is_some() {
        md.push_str("\\n## LLM-Only Cases (sample)\\n\\n");
        if report.llm_cases.is_empty() {
-            md.push_str("All LLM-only cases matched within the evaluation window.\\n");
+            if report.detailed_report {
+                md.push_str("All LLM-only cases matched within the evaluation window.\\n");
+            } else {
+                md.push_str("LLM-only cases omitted. Re-run with `--detailed-report` to see samples.\\n");
+            }
        } else {
            md.push_str("| Question ID | Answered | Match Rank | Top Retrieved |\\n");
            md.push_str("| --- | --- | --- | --- |\\n");
@@ -681,7 +717,7 @@ pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf {
 }

 #[derive(Debug, Serialize, Deserialize)]
-struct HistoryEntry {
+struct LegacyHistoryEntry {
    generated_at: String,
    run_label: Option<String>,
    dataset_id: String,
@@ -719,7 +755,18 @@ struct HistoryEntry {
    rerank_enabled: bool,
    rerank_keep_top: usize,
    rerank_pool_size: Option<usize>,
-    delta: Option<HistoryDelta>,
+    #[serde(default)]
+    chunk_result_cap: Option<usize>,
+    #[serde(default)]
+    ingest_chunk_min_tokens: Option<usize>,
+    #[serde(default)]
+    ingest_chunk_max_tokens: Option<usize>,
+    #[serde(default)]
+    ingest_chunk_overlap_tokens: Option<usize>,
+    #[serde(default)]
+    ingest_chunks_only: Option<bool>,
+    #[serde(default)]
+    delta: Option<LegacyHistoryDelta>,
    openai_base_url: String,
    ingestion_ms: u128,
    #[serde(default)]
@@ -727,92 +774,173 @@ struct HistoryEntry {
 }

+/// Optional `delta` block of a `LegacyHistoryEntry`.
+///
+/// Older history files stored these values as the difference between a run and the entry
+/// recorded immediately before it. The struct is kept so such files still deserialize;
+/// `convert_legacy_entry` does not read it.
 #[derive(Debug, Serialize, Deserialize)]
-struct HistoryDelta {
+struct LegacyHistoryDelta {
+    /// Change in overall precision.
    precision: f64,
+    /// Change in precision@1.
    precision_at_1: f64,
+    /// Change in average latency, in milliseconds.
    latency_avg_ms: f64,
 }

-fn record_history(summary: &EvaluationSummary, report_dir: &Path) -> Result<()> {
-    let path = report_dir.join("evaluations.json");
-    let mut entries: Vec<HistoryEntry> = if path.exists() {
-        let contents = fs::read(&path)
-            .with_context(|| format!("reading evaluation log {}", path.display()))?;
-        match serde_json::from_slice(&contents) {
-            Ok(entries) => entries,
-            Err(err) => {
-                let timestamp = Utc::now().format("%Y%m%dT%H%M%S");
-                let backup_path =
-                    report_dir.join(format!("evaluations.json.corrupted.{}", timestamp));
+/// Builds a [`StageLatencyBreakdown`] in which every one of the six stages holds
+/// `LatencyStats::default()`.
+///
+/// `convert_legacy_entry` uses this as the `stage_latency` value of reports reconstructed
+/// from legacy history rows.
+fn default_stage_latency() -> StageLatencyBreakdown {
+    // A single constructor shared by all stages keeps the placeholder visibly uniform.
+    let blank = LatencyStats::default;
+
+    StageLatencyBreakdown {
+        embed: blank(),
+        collect_candidates: blank(),
+        graph_expansion: blank(),
+        chunk_attach: blank(),
+        rerank: blank(),
+        assemble: blank(),
+    }
+}
+
+/// Upgrades one legacy history row into the current [`EvaluationReport`] shape.
+///
+/// Values the legacy row recorded are carried over unchanged. Everything the legacy format
+/// did not store is filled with placeholders: zero counts, empty strings, an `"unknown"`
+/// strategy, default stage timings, `require_verified_chunks = true`, and no sampled misses
+/// or LLM cases. Chunking/ingest settings absent from the row fall back to
+/// `5 / 256 / 512 / 50 / false`.
+fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport {
+    // A retrieval case count of zero is treated as "not recorded" and derived from the
+    // slice size minus the LLM cases instead.
+    let retrieval_cases = match entry.retrieval_cases {
+        0 => entry.slice_cases.saturating_sub(entry.llm_cases),
+        recorded => recorded,
+    };
+
+    // Likewise, a retrieval precision that is not greater than zero falls back to the
+    // overall precision of the row.
+    let retrieval_precision = if entry.retrieval_precision > 0.0 {
+        entry.retrieval_precision
+    } else {
+        entry.precision
+    };
+
+    // The LLM section only exists for rows that counted at least one LLM case.
+    let llm = match entry.llm_cases {
+        0 => None,
+        cases => Some(LlmSection {
+            cases,
+            answered: 0,
+            precision: entry.llm_precision,
+        }),
+    };
+
+    EvaluationReport {
+        overview: OverviewSection {
+            generated_at: entry.generated_at,
+            run_label: entry.run_label,
+            total_cases: entry.slice_cases,
+            filtered_questions: 0,
+        },
+        dataset: DatasetSection {
+            id: entry.dataset_id,
+            label: entry.dataset_label,
+            source: String::new(),
+            includes_unanswerable: entry.llm_cases > 0,
+            require_verified_chunks: true,
+            embedding_backend: entry.embedding_backend,
+            embedding_model: entry.embedding_model,
+            embedding_dimension: 0,
+        },
+        slice: SliceSection {
+            id: entry.slice_id,
+            seed: entry.slice_seed,
+            window_offset: entry.slice_window_offset,
+            window_length: entry.slice_window_length,
+            slice_cases: entry.slice_cases,
+            ledger_total_cases: entry.slice_total_cases,
+            positives: 0,
+            negatives: 0,
+            total_paragraphs: 0,
+            negative_multiplier: 0.0,
+        },
+        retrieval: RetrievalSection {
+            k: entry.k,
+            cases: retrieval_cases,
+            correct: 0,
+            precision: retrieval_precision,
+            precision_at_1: entry.precision_at_1,
+            precision_at_2: entry.precision_at_2,
+            precision_at_3: entry.precision_at_3,
+            mrr: entry.mrr,
+            average_ndcg: entry.average_ndcg,
+            latency: entry.latency_ms,
+            concurrency: 0,
+            strategy: "unknown".into(),
+            rerank_enabled: entry.rerank_enabled,
+            rerank_pool_size: entry.rerank_pool_size,
+            rerank_keep_top: entry.rerank_keep_top,
+            chunk_result_cap: entry.chunk_result_cap.unwrap_or(5),
+            ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256),
+            ingest_chunk_max_tokens: entry.ingest_chunk_max_tokens.unwrap_or(512),
+            ingest_chunk_overlap_tokens: entry.ingest_chunk_overlap_tokens.unwrap_or(50),
+            ingest_chunks_only: entry.ingest_chunks_only.unwrap_or(false),
+        },
+        llm,
+        performance: PerformanceSection {
+            openai_base_url: entry.openai_base_url,
+            ingestion_ms: entry.ingestion_ms,
+            namespace_seed_ms: entry.namespace_seed_ms,
+            evaluation_stages_ms: EvaluationStageTimings::default(),
+            stage_latency: default_stage_latency(),
+            namespace_reused: false,
+            ingestion_reused: entry.ingestion_reused,
+            embeddings_reused: entry.ingestion_embeddings_reused,
+            ingestion_cache_path: String::new(),
+            corpus_paragraphs: 0,
+            positive_paragraphs_reused: 0,
+            negative_paragraphs_reused: 0,
+        },
+        misses: Vec::new(),
+        llm_cases: Vec::new(),
+        detailed_report: false,
+    }
+}
+
+/// Loads the evaluation history stored at `path`.
+///
+/// * A path that does not exist yields an empty history.
+/// * The contents are first parsed as a list of [`EvaluationReport`]s (the current format).
+/// * If that fails, they are parsed as a list of [`LegacyHistoryEntry`] rows and every row
+///   is upgraded with [`convert_legacy_entry`].
+/// * If neither format parses, the file is treated as corrupted: a warning is logged, a
+///   rename to `evaluations.json.corrupted.<UTC timestamp>` in the same directory is
+///   attempted, and an empty history is returned. A failed rename is only logged.
+///
+/// # Errors
+///
+/// Returns an error only when reading the file fails.
+fn load_history(path: &Path) -> Result<Vec<EvaluationReport>> {
+    if !path.exists() {
+        return Ok(Vec::new());
+    }
+
+    let contents =
+        fs::read(path).with_context(|| format!("reading evaluation log {}", path.display()))?;
+
+    // Current format: the file is already a list of full reports.
+    if let Ok(entries) = serde_json::from_slice::<Vec<EvaluationReport>>(&contents) {
+        return Ok(entries);
+    }
+
+    // NOTE(review): only the legacy-parse error reaches the log below; the error from the
+    // current-format attempt above is discarded.
+    match serde_json::from_slice::<Vec<LegacyHistoryEntry>>(&contents) {
+        Ok(entries) => Ok(entries.into_iter().map(convert_legacy_entry).collect()),
+        Err(err) => {
+            let timestamp = Utc::now().format("%Y%m%dT%H%M%S");
+            let backup_path = path
+                .parent()
+                .unwrap_or_else(|| Path::new("."))
+                .join(format!("evaluations.json.corrupted.{timestamp}"));
+            warn!(
+                path = %path.display(),
+                backup = %backup_path.display(),
+                error = %err,
+                "Evaluation history file is corrupted; backing up and starting fresh"
+            );
+            if let Err(e) = fs::rename(path, &backup_path) {
                warn!(
                    path = %path.display(),
-                    backup = %backup_path.display(),
-                    error = %err,
-                    "Evaluation history file is corrupted; backing up and starting fresh"
+                    error = %e,
+                    "Failed to backup corrupted evaluation history"
                );
-                if let Err(e) = fs::rename(&path, &backup_path) {
-                    warn!(
-                        path = %path.display(),
-                        error = %e,
-                        "Failed to backup corrupted evaluation history"
-                    );
-                }
-                Vec::new()
            }
+            Ok(Vec::new())
        }
-    } else {
-        Vec::new()
-    };
+    }
+}

-    let delta = entries.last().map(|prev| HistoryDelta {
-        precision: summary.precision - prev.precision,
-        precision_at_1: summary.precision_at_1 - prev.precision_at_1,
-        latency_avg_ms: summary.latency_ms.avg - prev.latency_ms.avg,
-    });
-
-    let entry = HistoryEntry {
-        generated_at: format_timestamp(&summary.generated_at),
-        run_label: summary.run_label.clone(),
-        dataset_id: summary.dataset_id.clone(),
-        dataset_label: summary.dataset_label.clone(),
-        slice_id: summary.slice_id.clone(),
-        slice_seed: summary.slice_seed,
-        slice_window_offset: summary.slice_window_offset,
-        slice_window_length: summary.slice_window_length,
-        slice_cases: summary.slice_cases,
-        slice_total_cases: summary.slice_total_cases,
-        k: summary.k,
-        limit: summary.limit,
-        precision: summary.precision,
-        precision_at_1: summary.precision_at_1,
-        precision_at_2: summary.precision_at_2,
-        precision_at_3: summary.precision_at_3,
-        mrr: summary.mrr,
-        average_ndcg: summary.average_ndcg,
-        retrieval_cases: summary.retrieval_cases,
-        retrieval_precision: summary.retrieval_precision,
-        llm_cases: summary.llm_cases,
-        llm_precision: summary.llm_precision,
-        duration_ms: summary.duration_ms,
-        latency_ms: summary.latency_ms.clone(),
-        embedding_backend: summary.embedding_backend.clone(),
-        embedding_model: summary.embedding_model.clone(),
-        ingestion_reused: summary.ingestion_reused,
-        ingestion_embeddings_reused: summary.ingestion_embeddings_reused,
-        rerank_enabled: summary.rerank_enabled,
-        rerank_keep_top: summary.rerank_keep_top,
-        rerank_pool_size: summary.rerank_pool_size,
-        delta,
-        openai_base_url: summary.perf.openai_base_url.clone(),
-        ingestion_ms: summary.perf.ingestion_ms,
-        namespace_seed_ms: summary.perf.namespace_seed_ms,
-    };
-
-    entries.push(entry);
+/// Appends `report` to `evaluations.json` inside `report_dir` and returns that file's path.
+///
+/// The existing history is loaded with `load_history`, a clone of `report` is pushed onto
+/// the end, and the whole list is written back as pretty-printed JSON.
+///
+/// # Errors
+///
+/// Propagates failures from loading the history, serialising the list, or writing the file.
+fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result<PathBuf> {
+    let path = report_dir.join("evaluations.json");
+    let mut entries = load_history(&path)?;
+    entries.push(report.clone());

    let blob = serde_json::to_vec_pretty(&entries).context("serialising evaluation log")?;
    fs::write(&path, blob).with_context(|| format!("writing evaluation log {}", path.display()))?;
-    Ok(())
+    Ok(path)
 }

 #[cfg(test)]
@@ -822,6 +950,7 @@ mod tests {
        EvaluationStageTimings, PerformanceTimings, RetrievedSummary, StageLatencyBreakdown,
    };
    use chrono::Utc;
+    use tempfile::tempdir;

    fn latency(ms: f64) -> LatencyStats {
        LatencyStats {
@@ -961,9 +1090,13 @@ mod tests {
            concurrency: 2,
            detailed_report: true,
            retrieval_strategy: "initial".into(),
+            chunk_result_cap: 5,
+            ingest_chunk_min_tokens: 256,
+            ingest_chunk_max_tokens: 512,
+            ingest_chunk_overlap_tokens: 50,
+            ingest_chunks_only: false,
            chunk_vector_take: 50,
            chunk_fts_take: 50,
-            chunk_token_budget: 10_000,
            chunk_avg_chars_per_token: 4,
            max_chunks_per_entity: 4,
            cases,
@@ -987,4 +1120,25 @@ mod tests {
        assert!(!md.contains("LLM Mode Metrics"));
        assert!(!md.contains("LLM-Only Cases"));
    }
+
+    /// Writes reports into a fresh temporary directory and checks that the single history
+    /// entry read back from disk keeps the concurrency, strategy and query-stage timing of
+    /// the summary it was built from.
+    #[test]
+    fn evaluations_history_captures_strategy_and_concurrency() {
+        let scratch = tempdir().unwrap();
+        let summary = sample_summary(false);
+
+        let written =
+            write_reports(&summary, scratch.path(), 5).expect("writing consolidated reports");
+        let raw = std::fs::read_to_string(&written.history_path)
+            .expect("reading evaluations history");
+        let history: Vec<EvaluationReport> =
+            serde_json::from_str(&raw).expect("parsing evaluations history");
+
+        // The directory started empty, so this run must be the only recorded entry.
+        assert_eq!(history.len(), 1);
+        let latest = &history[0];
+        let retrieval = &latest.retrieval;
+        assert_eq!(retrieval.concurrency, summary.concurrency);
+        assert_eq!(retrieval.strategy, summary.retrieval_strategy);
+        assert_eq!(
+            latest.performance.evaluation_stages_ms.run_queries_ms,
+            summary.perf.evaluation_stage_ms.run_queries_ms
+        );
+    }
 }
@@ -16,8 +16,6 @@ pub struct SnapshotMetadata {
    pub embedding_backend: String,
    pub embedding_model: Option<String>,
    pub embedding_dimension: usize,
-    pub chunk_min_chars: usize,
-    pub chunk_max_chars: usize,
    pub rerank_enabled: bool,
 }

@@ -55,8 +53,6 @@ impl Descriptor {
            embedding_backend: embedding_provider.backend_label().to_string(),
            embedding_model: embedding_provider.model_code(),
            embedding_dimension: embedding_provider.dimension(),
-            chunk_min_chars: config.retrieval.chunk_min_chars,
-            chunk_max_chars: config.retrieval.chunk_max_chars,
            rerank_enabled: config.retrieval.rerank,
        };

@@ -146,8 +142,6 @@ mod tests {
            embedding_backend: "hashed".into(),
            embedding_model: None,
            embedding_dimension: 128,
-            chunk_min_chars: 10,
-            chunk_max_chars: 100,
            rerank_enabled: true,
        };
        let descriptor = Descriptor::from_parts(