minne/eval/src/report.rs

use std::{
    fs,
    path::{Path, PathBuf},
};

use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};

use crate::eval::{
    format_timestamp, CaseSummary, EvaluationStageTimings, EvaluationSummary, LatencyStats,
    StageLatencyBreakdown,
};
use chrono::Utc;
use tracing::warn;

/// Filesystem locations of the two report files produced for a single run.
///
/// `write_reports` fills this with the per-run (stem-named) JSON and Markdown
/// paths, not the `latest.*` copies it also writes.
#[derive(Debug)]
pub struct ReportPaths {
    /// Path of the JSON rendering of the report.
    pub json: PathBuf,
    /// Path of the Markdown rendering of the report.
    pub markdown: PathBuf,
}

/// Serializable record of one evaluation run, grouped into sections.
///
/// The same value is serialised to the per-run JSON report, appended to the
/// `evaluations.json` history file, and rendered to Markdown by
/// `render_markdown`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationReport {
    /// Run-level metadata (timestamp, label, case counts).
    pub overview: OverviewSection,
    /// Dataset and embedding configuration.
    pub dataset: DatasetSection,
    /// Description of the evaluated slice.
    pub slice: SliceSection,
    /// Retrieval metrics and retrieval/ingestion configuration.
    pub retrieval: RetrievalSection,
    /// LLM metrics; the key is omitted from the serialised output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub llm: Option<LlmSection>,
    /// Timings and cache-reuse information.
    pub performance: PerformanceSection,
    /// Sampled retrieval misses; omitted when empty, defaults to empty when absent.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub misses: Vec<MissEntry>,
    /// Sampled LLM cases; omitted when empty, defaults to empty when absent.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub llm_cases: Vec<LlmCaseEntry>,
    /// Whether the run asked for a detailed report; defaults to `false` when absent.
    #[serde(default)]
    pub detailed_report: bool,
}

/// Run-level metadata shown in the "Overview" table of the Markdown report.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OverviewSection {
    /// Generation time, already formatted as a string (via `format_timestamp`
    /// when built from a summary).
    pub generated_at: String,
    /// Optional run label; rendered as `-` in Markdown when missing or empty.
    pub run_label: Option<String>,
    /// Copied from the summary's `total_cases`.
    pub total_cases: usize,
    /// Copied from the summary's `filtered_questions`.
    pub filtered_questions: usize,
}

/// Dataset identity and embedding configuration of a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetSection {
    /// Dataset identifier (rendered in backticks next to the label).
    pub id: String,
    /// Human-readable dataset label.
    pub label: String,
    /// Dataset source string; empty for converted legacy history entries.
    pub source: String,
    /// Populated from the summary's `includes_impossible_cases`.
    pub includes_unanswerable: bool,
    /// Copied from the summary's `require_verified_chunks`.
    pub require_verified_chunks: bool,
    /// Name of the embedding backend.
    pub embedding_backend: String,
    /// Embedding model, when one is recorded; shown in parentheses after the backend.
    pub embedding_model: Option<String>,
    /// Embedding dimension; `0` for converted legacy history entries.
    pub embedding_dimension: usize,
}

/// Description of the dataset slice a run was evaluated on.
///
/// All fields are copied from the corresponding `slice_*` fields of the
/// evaluation summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SliceSection {
    /// Slice identifier.
    pub id: String,
    /// Seed recorded for the slice.
    pub seed: u64,
    /// Window offset (rendered as `offset/length`).
    pub window_offset: usize,
    /// Window length (rendered as `offset/length`).
    pub window_length: usize,
    /// Number of cases in the window (rendered as `window/ledger`).
    pub slice_cases: usize,
    /// Populated from the summary's `slice_total_cases`.
    pub ledger_total_cases: usize,
    /// Populated from the summary's `slice_positive_paragraphs`.
    pub positives: usize,
    /// Populated from the summary's `slice_negative_paragraphs`.
    pub negatives: usize,
    /// Populated from the summary's `slice_total_paragraphs`.
    pub total_paragraphs: usize,
    /// Populated from the summary's `slice_negative_multiplier`.
    pub negative_multiplier: f32,
}

/// Retrieval metrics together with the retrieval and ingestion settings of a run.
///
/// The `chunk_rrf_*` and `chunk_*_take` fields carry serde defaults, so input
/// that lacks them still deserialises.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetrievalSection {
    /// The `k` used in the report title and in the `Correct@k` / `Precision@k` rows.
    pub k: usize,
    /// Number of retrieval cases.
    pub cases: usize,
    /// Number of correct cases (rendered as `correct/cases`).
    pub correct: usize,
    /// Precision at `k`.
    pub precision: f64,
    pub precision_at_1: f64,
    pub precision_at_2: f64,
    pub precision_at_3: f64,
    /// Rendered as the "MRR" row.
    pub mrr: f64,
    /// Rendered as the "NDCG" row.
    pub average_ndcg: f64,
    /// Latency statistics (avg / p50 / p95), rendered in milliseconds.
    pub latency: LatencyStats,
    pub concurrency: usize,
    /// Retrieval strategy name; `"unknown"` for converted legacy history entries.
    pub strategy: String,
    pub rerank_enabled: bool,
    /// Rerank pool size; rendered as `?` when `None` and reranking is enabled.
    pub rerank_pool_size: Option<usize>,
    pub rerank_keep_top: usize,
    pub chunk_result_cap: usize,
    /// Defaults to `60.0` when absent from the input.
    #[serde(default = "default_chunk_rrf_k")]
    pub chunk_rrf_k: f32,
    /// Defaults to `1.0` when absent from the input.
    #[serde(default = "default_chunk_rrf_weight")]
    pub chunk_rrf_vector_weight: f32,
    /// Defaults to `1.0` when absent from the input.
    #[serde(default = "default_chunk_rrf_weight")]
    pub chunk_rrf_fts_weight: f32,
    /// Defaults to `true` when absent from the input.
    #[serde(default = "default_chunk_rrf_use")]
    pub chunk_rrf_use_vector: bool,
    /// Defaults to `true` when absent from the input.
    #[serde(default = "default_chunk_rrf_use")]
    pub chunk_rrf_use_fts: bool,
    /// Defaults to `0` when absent from the input.
    #[serde(default)]
    pub chunk_vector_take: usize,
    /// Defaults to `0` when absent from the input.
    #[serde(default)]
    pub chunk_fts_take: usize,
    pub ingest_chunk_min_tokens: usize,
    pub ingest_chunk_max_tokens: usize,
    pub ingest_chunk_overlap_tokens: usize,
    pub ingest_chunks_only: bool,
}

/// Fallback for `chunk_rrf_k` when the field is missing from serialised input
/// or when a legacy history entry is converted.
const fn default_chunk_rrf_k() -> f32 {
    60.0_f32
}

/// Fallback used for both RRF weights (`chunk_rrf_vector_weight` and
/// `chunk_rrf_fts_weight`) when they are missing from serialised input.
const fn default_chunk_rrf_weight() -> f32 {
    1.0_f32
}

/// Fallback used for both RRF toggles (`chunk_rrf_use_vector` and
/// `chunk_rrf_use_fts`) when they are missing from serialised input.
const fn default_chunk_rrf_use() -> bool {
    let enabled = true;
    enabled
}

/// LLM metrics; only present on a report when the run had at least one LLM case.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmSection {
    /// Number of LLM cases.
    pub cases: usize,
    /// Number of answered cases; `0` for converted legacy history entries.
    pub answered: usize,
    /// LLM precision, rendered with three decimals.
    pub precision: f64,
}

/// Timings and cache-reuse information for a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceSection {
    pub openai_base_url: String,
    /// Ingestion duration, rendered in milliseconds.
    pub ingestion_ms: u128,
    /// Namespace seeding duration; the Markdown row is skipped when `None`.
    pub namespace_seed_ms: Option<u128>,
    /// Per-stage evaluation timings, copied from the summary's `perf.evaluation_stage_ms`.
    pub evaluation_stages_ms: EvaluationStageTimings,
    /// Latency per retrieval stage, rendered in the "Retrieval Stage Timings" table.
    pub stage_latency: StageLatencyBreakdown,
    /// Rendered as "reused" when `true` and "seeded" otherwise.
    pub namespace_reused: bool,
    pub ingestion_reused: bool,
    /// Populated from the summary's `ingestion_embeddings_reused`.
    pub embeddings_reused: bool,
    /// Ingestion cache path; only rendered in detailed Markdown reports.
    pub ingestion_cache_path: String,
    pub corpus_paragraphs: usize,
    pub positive_paragraphs_reused: usize,
    pub negative_paragraphs_reused: usize,
}

/// One sampled retrieval miss, i.e. a case that was neither matched nor
/// flagged as impossible when the report was built.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MissEntry {
    pub question_id: String,
    pub paragraph_title: String,
    pub expected_source: String,
    /// Per-criterion match flags copied from the case summary.
    pub entity_match: bool,
    pub chunk_text_match: bool,
    pub chunk_id_match: bool,
    /// At most the first three retrieved results of the case.
    pub retrieved: Vec<RetrievedSnippet>,
}

/// One sampled LLM case (a case flagged as impossible in the summary).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmCaseEntry {
    pub question_id: String,
    /// Copied from the case's `matched` flag.
    pub answered: bool,
    /// Rank of the match, if any; the key is omitted from serialised output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub match_rank: Option<usize>,
    /// At most the first three retrieved results of the case.
    pub retrieved: Vec<RetrievedSnippet>,
}

/// Compact view of one retrieved result, rendered as `source_id (rank N)`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetrievedSnippet {
    pub rank: usize,
    pub source_id: String,
    pub entity_name: String,
    /// Copied from the retrieved result's `matched` flag.
    pub matched: bool,
}

/// Result of `write_reports`: the report that was written and where it went.
#[derive(Debug)]
pub struct ReportOutcome {
    /// The report value that was serialised.
    pub record: EvaluationReport,
    /// Paths of the per-run JSON and Markdown files.
    pub paths: ReportPaths,
    /// Path of the `evaluations.json` history file the report was appended to.
    pub history_path: PathBuf,
}

impl EvaluationReport {
    pub fn from_summary(summary: &EvaluationSummary, sample: usize) -> Self {
        let overview = OverviewSection {
            generated_at: format_timestamp(&summary.generated_at),
            run_label: summary.run_label.clone(),
            total_cases: summary.total_cases,
            filtered_questions: summary.filtered_questions,
        };

        let dataset = DatasetSection {
            id: summary.dataset_id.clone(),
            label: summary.dataset_label.clone(),
            source: summary.dataset_source.clone(),
            includes_unanswerable: summary.includes_impossible_cases,
            require_verified_chunks: summary.require_verified_chunks,
            embedding_backend: summary.embedding_backend.clone(),
            embedding_model: summary.embedding_model.clone(),
            embedding_dimension: summary.embedding_dimension,
        };

        let slice = SliceSection {
            id: summary.slice_id.clone(),
            seed: summary.slice_seed,
            window_offset: summary.slice_window_offset,
            window_length: summary.slice_window_length,
            slice_cases: summary.slice_cases,
            ledger_total_cases: summary.slice_total_cases,
            positives: summary.slice_positive_paragraphs,
            negatives: summary.slice_negative_paragraphs,
            total_paragraphs: summary.slice_total_paragraphs,
            negative_multiplier: summary.slice_negative_multiplier,
        };

        let retrieval = RetrievalSection {
            k: summary.k,
            cases: summary.retrieval_cases,
            correct: summary.retrieval_correct,
            precision: summary.retrieval_precision,
            precision_at_1: summary.precision_at_1,
            precision_at_2: summary.precision_at_2,
            precision_at_3: summary.precision_at_3,
            mrr: summary.mrr,
            average_ndcg: summary.average_ndcg,
            latency: summary.latency_ms.clone(),
            concurrency: summary.concurrency,
            strategy: summary.retrieval_strategy.clone(),
            rerank_enabled: summary.rerank_enabled,
            rerank_pool_size: summary.rerank_pool_size,
            rerank_keep_top: summary.rerank_keep_top,
            chunk_result_cap: summary.chunk_result_cap,
            chunk_rrf_k: summary.chunk_rrf_k,
            chunk_rrf_vector_weight: summary.chunk_rrf_vector_weight,
            chunk_rrf_fts_weight: summary.chunk_rrf_fts_weight,
            chunk_rrf_use_vector: summary.chunk_rrf_use_vector,
            chunk_rrf_use_fts: summary.chunk_rrf_use_fts,
            chunk_vector_take: summary.chunk_vector_take,
            chunk_fts_take: summary.chunk_fts_take,
            ingest_chunk_min_tokens: summary.ingest_chunk_min_tokens,
            ingest_chunk_max_tokens: summary.ingest_chunk_max_tokens,
            ingest_chunk_overlap_tokens: summary.ingest_chunk_overlap_tokens,
            ingest_chunks_only: summary.ingest_chunks_only,
        };

        let llm = if summary.llm_cases > 0 {
            Some(LlmSection {
                cases: summary.llm_cases,
                answered: summary.llm_answered,
                precision: summary.llm_precision,
            })
        } else {
            None
        };

        let performance = PerformanceSection {
            openai_base_url: summary.perf.openai_base_url.clone(),
            ingestion_ms: summary.perf.ingestion_ms,
            namespace_seed_ms: summary.perf.namespace_seed_ms,
            evaluation_stages_ms: summary.perf.evaluation_stage_ms.clone(),
            stage_latency: summary.perf.stage_latency.clone(),
            namespace_reused: summary.namespace_reused,
            ingestion_reused: summary.ingestion_reused,
            embeddings_reused: summary.ingestion_embeddings_reused,
            ingestion_cache_path: summary.ingestion_cache_path.clone(),
            corpus_paragraphs: summary.corpus_paragraphs,
            positive_paragraphs_reused: summary.positive_paragraphs_reused,
            negative_paragraphs_reused: summary.negative_paragraphs_reused,
        };

        let (misses, llm_cases) = if summary.detailed_report {
            let misses = summary
                .cases
                .iter()
                .filter(|case| !case.matched && !case.is_impossible)
                .take(sample)
                .map(MissEntry::from_case)
                .collect();

            let llm_cases = if llm.is_some() {
                summary
                    .cases
                    .iter()
                    .filter(|case| case.is_impossible)
                    .take(sample)
                    .map(LlmCaseEntry::from_case)
                    .collect()
            } else {
                Vec::new()
            };

            (misses, llm_cases)
        } else {
            (Vec::new(), Vec::new())
        };

        Self {
            overview,
            dataset,
            slice,
            retrieval,
            llm,
            performance,
            misses,
            llm_cases,
            detailed_report: summary.detailed_report,
        }
    }
}

impl MissEntry {
    fn from_case(case: &CaseSummary) -> Self {
        Self {
            question_id: case.question_id.clone(),
            paragraph_title: case.paragraph_title.clone(),
            expected_source: case.expected_source.clone(),
            entity_match: case.entity_match,
            chunk_text_match: case.chunk_text_match,
            chunk_id_match: case.chunk_id_match,
            retrieved: case
                .retrieved
                .iter()
                .take(3)
                .map(RetrievedSnippet::from_summary)
                .collect(),
        }
    }
}

impl LlmCaseEntry {
    fn from_case(case: &CaseSummary) -> Self {
        Self {
            question_id: case.question_id.clone(),
            answered: case.matched,
            match_rank: case.match_rank,
            retrieved: case
                .retrieved
                .iter()
                .take(3)
                .map(RetrievedSnippet::from_summary)
                .collect(),
        }
    }
}

impl RetrievedSnippet {
    /// Copies the rank, source id, entity name and match flag out of a
    /// retrieved-result summary.
    fn from_summary(entry: &crate::eval::RetrievedSummary) -> Self {
        let rank = entry.rank;
        let matched = entry.matched;
        let source_id = entry.source_id.clone();
        let entity_name = entry.entity_name.clone();

        Self {
            rank,
            source_id,
            entity_name,
            matched,
        }
    }
}

pub fn write_reports(
    summary: &EvaluationSummary,
    report_dir: &Path,
    sample: usize,
) -> Result<ReportOutcome> {
    fs::create_dir_all(report_dir)
        .with_context(|| format!("creating report directory {}", report_dir.display()))?;
    let dataset_dir = dataset_report_dir(report_dir, &summary.dataset_id);
    fs::create_dir_all(&dataset_dir).with_context(|| {
        format!(
            "creating dataset report directory {}",
            dataset_dir.display()
        )
    })?;

    let stem = build_report_stem(summary);
    let report = EvaluationReport::from_summary(summary, sample);

    let json_path = dataset_dir.join(format!("{stem}.json"));
    let json_blob = serde_json::to_string_pretty(&report).context("serialising JSON report")?;
    fs::write(&json_path, &json_blob)
        .with_context(|| format!("writing JSON report to {}", json_path.display()))?;

    let md_path = dataset_dir.join(format!("{stem}.md"));
    let markdown = render_markdown(&report);
    fs::write(&md_path, &markdown)
        .with_context(|| format!("writing Markdown report to {}", md_path.display()))?;

    // Keep a latest.json pointer to simplify automation.
    let latest_json = dataset_dir.join("latest.json");
    fs::write(&latest_json, json_blob)
        .with_context(|| format!("writing latest JSON report to {}", latest_json.display()))?;
    let latest_md = dataset_dir.join("latest.md");
    fs::write(&latest_md, markdown)
        .with_context(|| format!("writing latest Markdown report to {}", latest_md.display()))?;

    let history_path = record_history(&report, &dataset_dir)?;

    Ok(ReportOutcome {
        record: report,
        paths: ReportPaths {
            json: json_path,
            markdown: md_path,
        },
        history_path,
    })
}

/// Renders an [`EvaluationReport`] as a Markdown document.
///
/// The document consists of an overview table, dataset/slice details,
/// retrieval metrics, optional LLM metrics, performance figures, per-stage
/// latency, and (depending on `detailed_report`) sampled misses and LLM cases.
///
/// Every line is terminated with a real newline character. The string
/// literals previously contained an escaped backslash followed by `n`, which
/// put the two characters `\` and `n` into the output instead of a line break
/// and left the tables unparseable as Markdown.
fn render_markdown(report: &EvaluationReport) -> String {
    /// Header shared by every two-column metric table.
    const METRIC_TABLE_HEADER: &str = "| Metric | Value |\n| --- | --- |\n";

    /// Appends one `| metric | value |` table row followed by a newline.
    fn push_row(md: &mut String, metric: &str, value: impl std::fmt::Display) {
        md.push_str(&format!("| {metric} | {value} |\n"));
    }

    let overview = &report.overview;
    let dataset = &report.dataset;
    let slice = &report.slice;
    let retrieval = &report.retrieval;
    let perf = &report.performance;

    let mut md = String::new();

    md.push_str(&format!("# Retrieval Evaluation (k={})\n\n", retrieval.k));

    md.push_str("## Overview\n\n");
    md.push_str(METRIC_TABLE_HEADER);
    push_row(&mut md, "Generated", &overview.generated_at);
    push_row(
        &mut md,
        "Run Label",
        overview
            .run_label
            .as_deref()
            .filter(|label| !label.is_empty())
            .unwrap_or("-"),
    );
    push_row(&mut md, "Total Cases", overview.total_cases);
    push_row(&mut md, "Filtered Questions", overview.filtered_questions);

    md.push_str("\n## Dataset & Slice\n\n");
    md.push_str(METRIC_TABLE_HEADER);
    push_row(
        &mut md,
        "Dataset",
        format!("{} (`{}`)", dataset.label, dataset.id),
    );
    push_row(&mut md, "Dataset Source", &dataset.source);
    push_row(
        &mut md,
        "Includes Unanswerable",
        bool_badge(dataset.includes_unanswerable),
    );
    push_row(
        &mut md,
        "Require Verified Chunks",
        bool_badge(dataset.require_verified_chunks),
    );
    let embedding_label = match dataset.embedding_model.as_ref() {
        Some(model) => format!("{} ({model})", dataset.embedding_backend),
        None => dataset.embedding_backend.clone(),
    };
    push_row(&mut md, "Embedding", embedding_label);
    push_row(&mut md, "Embedding Dim", dataset.embedding_dimension);
    push_row(&mut md, "Slice ID", format!("`{}`", slice.id));
    push_row(&mut md, "Slice Seed", slice.seed);
    push_row(
        &mut md,
        "Slice Window (offset/length)",
        format!("{}/{}", slice.window_offset, slice.window_length),
    );
    push_row(
        &mut md,
        "Slice Questions (window/ledger)",
        format!("{}/{}", slice.slice_cases, slice.ledger_total_cases),
    );
    push_row(
        &mut md,
        "Slice Positives / Negatives",
        format!("{}/{}", slice.positives, slice.negatives),
    );
    push_row(&mut md, "Slice Paragraphs", slice.total_paragraphs);
    push_row(
        &mut md,
        "Negative Multiplier",
        format!("{:.2}", slice.negative_multiplier),
    );

    md.push_str("\n## Retrieval Metrics\n\n");
    md.push_str(METRIC_TABLE_HEADER);
    push_row(&mut md, "Cases", retrieval.cases);
    push_row(
        &mut md,
        &format!("Correct@{}", retrieval.k),
        format!("{}/{}", retrieval.correct, retrieval.cases),
    );
    push_row(
        &mut md,
        &format!("Precision@{}", retrieval.k),
        format!("{:.3}", retrieval.precision),
    );
    push_row(
        &mut md,
        "Precision@1/2/3",
        format!(
            "{:.3} / {:.3} / {:.3}",
            retrieval.precision_at_1, retrieval.precision_at_2, retrieval.precision_at_3
        ),
    );
    push_row(&mut md, "MRR", format!("{:.3}", retrieval.mrr));
    push_row(&mut md, "NDCG", format!("{:.3}", retrieval.average_ndcg));
    push_row(
        &mut md,
        "Latency Avg / P50 / P95 (ms)",
        format!(
            "{:.1} / {} / {}",
            retrieval.latency.avg, retrieval.latency.p50, retrieval.latency.p95
        ),
    );
    push_row(&mut md, "Strategy", format!("`{}`", retrieval.strategy));
    push_row(&mut md, "Concurrency", retrieval.concurrency);
    if retrieval.rerank_enabled {
        // An unknown pool size is shown as `?` rather than dropping the row.
        let pool = retrieval
            .rerank_pool_size
            .map(|size| size.to_string())
            .unwrap_or_else(|| "?".into());
        push_row(
            &mut md,
            "Rerank",
            format!(
                "enabled (pool {pool}, keep top {})",
                retrieval.rerank_keep_top
            ),
        );
    } else {
        push_row(&mut md, "Rerank", "disabled");
    }

    if let Some(llm) = &report.llm {
        md.push_str("\n## LLM Mode Metrics\n\n");
        md.push_str(METRIC_TABLE_HEADER);
        push_row(&mut md, "Cases", llm.cases);
        push_row(&mut md, "Answered", llm.answered);
        push_row(&mut md, "Precision", format!("{:.3}", llm.precision));
    }

    md.push_str("\n## Performance\n\n");
    md.push_str(METRIC_TABLE_HEADER);
    push_row(&mut md, "OpenAI Base URL", &perf.openai_base_url);
    push_row(
        &mut md,
        "Ingestion Duration",
        format!("{} ms", perf.ingestion_ms),
    );
    if let Some(seed) = perf.namespace_seed_ms {
        push_row(&mut md, "Namespace Seed", format!("{seed} ms"));
    }
    push_row(
        &mut md,
        "Namespace State",
        if perf.namespace_reused {
            "reused"
        } else {
            "seeded"
        },
    );
    push_row(&mut md, "Corpus Paragraphs", perf.corpus_paragraphs);
    if report.detailed_report {
        push_row(
            &mut md,
            "Ingestion Cache",
            format!("`{}`", perf.ingestion_cache_path),
        );
        push_row(
            &mut md,
            "Ingestion Reused",
            bool_badge(perf.ingestion_reused),
        );
        push_row(
            &mut md,
            "Embeddings Reused",
            bool_badge(perf.embeddings_reused),
        );
    }
    push_row(
        &mut md,
        "Positives Cached",
        perf.positive_paragraphs_reused,
    );
    push_row(
        &mut md,
        "Negatives Cached",
        perf.negative_paragraphs_reused,
    );

    md.push_str("\n## Retrieval Stage Timings\n\n");
    md.push_str("| Stage | Avg (ms) | P50 (ms) | P95 (ms) |\n| --- | --- | --- | --- |\n");
    write_stage_row(&mut md, "Embed", &perf.stage_latency.embed);
    write_stage_row(
        &mut md,
        "Collect Candidates",
        &perf.stage_latency.collect_candidates,
    );
    write_stage_row(
        &mut md,
        "Graph Expansion",
        &perf.stage_latency.graph_expansion,
    );
    write_stage_row(&mut md, "Chunk Attach", &perf.stage_latency.chunk_attach);
    write_stage_row(&mut md, "Rerank", &perf.stage_latency.rerank);
    write_stage_row(&mut md, "Assemble", &perf.stage_latency.assemble);

    if report.misses.is_empty() {
        if report.detailed_report {
            md.push_str("\n_All evaluated retrieval queries matched within the top-k window._\n");
            md.push_str(
                "\nSuccess measures were captured for each query (entity, chunk text, chunk ID).\n",
            );
        } else {
            md.push_str(
                "\n_Misses omitted. Re-run with `--detailed-report` to see sampled failures._\n",
            );
        }
    } else {
        md.push_str("\n## Missed Retrieval Queries (sample)\n\n");
        if report.detailed_report {
            md.push_str(
                "| Question ID | Paragraph | Expected Source | Entity Match | Chunk Text | Chunk ID | Top Retrieved |\n",
            );
            md.push_str("| --- | --- | --- | --- | --- | --- | --- |\n");
        } else {
            md.push_str("| Question ID | Paragraph | Expected Source | Top Retrieved |\n");
            md.push_str("| --- | --- | --- | --- |\n");
        }
        for case in &report.misses {
            let retrieved = render_retrieved(&case.retrieved);
            if report.detailed_report {
                md.push_str(&format!(
                    "| `{}` | {} | `{}` | {} | {} | {} | {} |\n",
                    case.question_id,
                    case.paragraph_title,
                    case.expected_source,
                    bool_badge(case.entity_match),
                    bool_badge(case.chunk_text_match),
                    bool_badge(case.chunk_id_match),
                    retrieved
                ));
            } else {
                md.push_str(&format!(
                    "| `{}` | {} | `{}` | {} |\n",
                    case.question_id, case.paragraph_title, case.expected_source, retrieved
                ));
            }
        }
    }

    if report.llm.is_some() {
        md.push_str("\n## LLM-Only Cases (sample)\n\n");
        if report.llm_cases.is_empty() {
            if report.detailed_report {
                md.push_str("All LLM-only cases matched within the evaluation window.\n");
            } else {
                md.push_str(
                    "LLM-only cases omitted. Re-run with `--detailed-report` to see samples.\n",
                );
            }
        } else {
            md.push_str("| Question ID | Answered | Match Rank | Top Retrieved |\n");
            md.push_str("| --- | --- | --- | --- |\n");
            for case in &report.llm_cases {
                let retrieved = render_retrieved(&case.retrieved);
                let rank = case
                    .match_rank
                    .map(|rank| rank.to_string())
                    .unwrap_or_else(|| "-".into());
                md.push_str(&format!(
                    "| `{}` | {} | {} | {} |\n",
                    case.question_id,
                    bool_badge(case.answered),
                    rank,
                    retrieved
                ));
            }
        }
    }

    md
}
/// Appends one row of the stage-timing table to `buf`: the stage `label`,
/// the average with one decimal, then p50 and p95.
fn write_stage_row(buf: &mut String, label: &str, stats: &LatencyStats) {
    let (avg, p50, p95) = (stats.avg, stats.p50, stats.p95);
    let row = format!("| {label} | {avg:.1} | {p50} | {p95} |\n");
    buf.push_str(&row);
}

/// Maps a flag to the symbol shown in Markdown tables: a check mark for
/// `true`, a white circle for `false`.
fn bool_badge(value: bool) -> &'static str {
    match value {
        true => "✅",
        false => "⚪",
    }
}

/// Formats up to three retrieved snippets as `source_id (rank N)` joined by
/// `<br>`, or `-` when there are none.
fn render_retrieved(entries: &[RetrievedSnippet]) -> String {
    let rendered: Vec<String> = entries
        .iter()
        .take(3)
        .map(|snippet| format!("{} (rank {})", snippet.source_id, snippet.rank))
        .collect();

    if rendered.is_empty() {
        String::from("-")
    } else {
        rendered.join("<br>")
    }
}

/// Builds the file stem shared by the per-run JSON and Markdown reports:
/// `precision_at_<k>_<dataset>_<timestamp>_<backend>[_<model>]`.
///
/// Dataset id, backend and model are passed through `sanitize_component`; the
/// model suffix is only appended when an embedding model is recorded.
fn build_report_stem(summary: &EvaluationSummary) -> String {
    let mut stem = format!(
        "precision_at_{}_{}_{}_{}",
        summary.k,
        sanitize_component(&summary.dataset_id),
        summary.generated_at.format("%Y%m%dT%H%M%S"),
        sanitize_component(&summary.embedding_backend)
    );

    if let Some(model) = summary.embedding_model.as_deref() {
        stem.push('_');
        stem.push_str(&sanitize_component(model));
    }

    stem
}

/// Replaces every character that is not an ASCII letter or digit with `_`,
/// one underscore per character.
fn sanitize_component(input: &str) -> String {
    let mut cleaned = String::with_capacity(input.len());
    for ch in input.chars() {
        cleaned.push(match ch {
            'a'..='z' | 'A'..='Z' | '0'..='9' => ch,
            _ => '_',
        });
    }
    cleaned
}

/// Returns the per-dataset subdirectory of `report_dir`, named after the
/// sanitised `dataset_id`.
pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf {
    let folder_name = sanitize_component(dataset_id);
    report_dir.join(folder_name)
}

/// Flat history-entry layout that `load_history` falls back to when the file
/// does not parse as a list of [`EvaluationReport`] values.
///
/// Entries are turned into reports by `convert_legacy_entry`. Fields marked
/// `#[serde(default)]` may be missing from the input. `limit`, `duration_ms`
/// and `delta` are parsed but not carried over into the converted report.
#[derive(Debug, Serialize, Deserialize)]
struct LegacyHistoryEntry {
    generated_at: String,
    run_label: Option<String>,
    dataset_id: String,
    dataset_label: String,
    slice_id: String,
    slice_seed: u64,
    slice_window_offset: usize,
    slice_window_length: usize,
    slice_cases: usize,
    slice_total_cases: usize,
    k: usize,
    limit: Option<usize>,
    // Used as the retrieval precision when `retrieval_precision` is not positive.
    precision: f64,
    precision_at_1: f64,
    precision_at_2: f64,
    precision_at_3: f64,
    #[serde(default)]
    mrr: f64,
    #[serde(default)]
    average_ndcg: f64,
    // When zero, the converter derives the count from `slice_cases - llm_cases`.
    #[serde(default)]
    retrieval_cases: usize,
    #[serde(default)]
    retrieval_precision: f64,
    #[serde(default)]
    llm_cases: usize,
    #[serde(default)]
    llm_precision: f64,
    duration_ms: u128,
    latency_ms: LatencyStats,
    embedding_backend: String,
    embedding_model: Option<String>,
    ingestion_reused: bool,
    ingestion_embeddings_reused: bool,
    rerank_enabled: bool,
    rerank_keep_top: usize,
    rerank_pool_size: Option<usize>,
    // The optional chunking fields below fall back to fixed values in the converter.
    #[serde(default)]
    chunk_result_cap: Option<usize>,
    #[serde(default)]
    ingest_chunk_min_tokens: Option<usize>,
    #[serde(default)]
    ingest_chunk_max_tokens: Option<usize>,
    #[serde(default)]
    ingest_chunk_overlap_tokens: Option<usize>,
    #[serde(default)]
    ingest_chunks_only: Option<bool>,
    #[serde(default)]
    delta: Option<LegacyHistoryDelta>,
    openai_base_url: String,
    ingestion_ms: u128,
    #[serde(default)]
    namespace_seed_ms: Option<u128>,
}

/// Optional `delta` object of a legacy history entry.
///
/// It is deserialised so legacy files parse, but `convert_legacy_entry` does
/// not read it.
#[derive(Debug, Serialize, Deserialize)]
struct LegacyHistoryDelta {
    precision: f64,
    precision_at_1: f64,
    latency_avg_ms: f64,
}

/// Returns a stage-latency breakdown in which every stage holds
/// `LatencyStats::default()`; used for legacy entries that carry no per-stage
/// timings.
fn default_stage_latency() -> StageLatencyBreakdown {
    let blank = LatencyStats::default;

    StageLatencyBreakdown {
        embed: blank(),
        collect_candidates: blank(),
        graph_expansion: blank(),
        chunk_attach: blank(),
        rerank: blank(),
        assemble: blank(),
    }
}

/// Converts a legacy history entry into the sectioned [`EvaluationReport`].
///
/// Values the legacy layout does not carry are filled with placeholders
/// (zero counts, empty strings, `"unknown"` strategy, default RRF settings and
/// stage latencies). The retrieval case count falls back to
/// `slice_cases - llm_cases` (saturating) when the recorded value is zero, and
/// the retrieval precision falls back to the overall `precision` when the
/// recorded value is not greater than zero. Converted reports never contain
/// sampled cases and are marked as non-detailed.
fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport {
    // Copy the scalar inputs up front so the struct literal below can move
    // the owned fields out of `entry` freely.
    let slice_case_count = entry.slice_cases;
    let llm_case_count = entry.llm_cases;
    let llm_precision = entry.llm_precision;
    let has_llm_cases = llm_case_count > 0;

    let retrieval_cases = match entry.retrieval_cases {
        0 => slice_case_count.saturating_sub(llm_case_count),
        recorded => recorded,
    };
    let retrieval_precision = if entry.retrieval_precision > 0.0 {
        entry.retrieval_precision
    } else {
        entry.precision
    };

    let llm = has_llm_cases.then(|| LlmSection {
        cases: llm_case_count,
        answered: 0,
        precision: llm_precision,
    });

    EvaluationReport {
        overview: OverviewSection {
            generated_at: entry.generated_at,
            run_label: entry.run_label,
            total_cases: slice_case_count,
            filtered_questions: 0,
        },
        dataset: DatasetSection {
            id: entry.dataset_id,
            label: entry.dataset_label,
            source: String::new(),
            includes_unanswerable: has_llm_cases,
            require_verified_chunks: true,
            embedding_backend: entry.embedding_backend,
            embedding_model: entry.embedding_model,
            embedding_dimension: 0,
        },
        slice: SliceSection {
            id: entry.slice_id,
            seed: entry.slice_seed,
            window_offset: entry.slice_window_offset,
            window_length: entry.slice_window_length,
            slice_cases: slice_case_count,
            ledger_total_cases: entry.slice_total_cases,
            positives: 0,
            negatives: 0,
            total_paragraphs: 0,
            negative_multiplier: 0.0,
        },
        retrieval: RetrievalSection {
            k: entry.k,
            cases: retrieval_cases,
            correct: 0,
            precision: retrieval_precision,
            precision_at_1: entry.precision_at_1,
            precision_at_2: entry.precision_at_2,
            precision_at_3: entry.precision_at_3,
            mrr: entry.mrr,
            average_ndcg: entry.average_ndcg,
            latency: entry.latency_ms,
            concurrency: 0,
            strategy: "unknown".into(),
            rerank_enabled: entry.rerank_enabled,
            rerank_pool_size: entry.rerank_pool_size,
            rerank_keep_top: entry.rerank_keep_top,
            chunk_result_cap: entry.chunk_result_cap.unwrap_or(5),
            chunk_rrf_k: default_chunk_rrf_k(),
            chunk_rrf_vector_weight: default_chunk_rrf_weight(),
            chunk_rrf_fts_weight: default_chunk_rrf_weight(),
            chunk_rrf_use_vector: default_chunk_rrf_use(),
            chunk_rrf_use_fts: default_chunk_rrf_use(),
            chunk_vector_take: 0,
            chunk_fts_take: 0,
            ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256),
            ingest_chunk_max_tokens: entry.ingest_chunk_max_tokens.unwrap_or(512),
            ingest_chunk_overlap_tokens: entry.ingest_chunk_overlap_tokens.unwrap_or(50),
            ingest_chunks_only: entry.ingest_chunks_only.unwrap_or(false),
        },
        llm,
        performance: PerformanceSection {
            openai_base_url: entry.openai_base_url,
            ingestion_ms: entry.ingestion_ms,
            namespace_seed_ms: entry.namespace_seed_ms,
            evaluation_stages_ms: EvaluationStageTimings::default(),
            stage_latency: default_stage_latency(),
            namespace_reused: false,
            ingestion_reused: entry.ingestion_reused,
            embeddings_reused: entry.ingestion_embeddings_reused,
            ingestion_cache_path: String::new(),
            corpus_paragraphs: 0,
            positive_paragraphs_reused: 0,
            negative_paragraphs_reused: 0,
        },
        misses: Vec::new(),
        llm_cases: Vec::new(),
        detailed_report: false,
    }
}

/// Loads the evaluation history stored at `path`.
///
/// Returns an empty list when the file does not exist. The contents are first
/// parsed as a list of [`EvaluationReport`] values and, failing that, as a
/// list of legacy entries that are converted on the fly. When neither layout
/// parses, a warning is logged, the file is renamed (best effort) to
/// `evaluations.json.corrupted.<timestamp>` next to it, and an empty history
/// is returned.
///
/// # Errors
///
/// Only fails when an existing file cannot be read.
fn load_history(path: &Path) -> Result<Vec<EvaluationReport>> {
    if !path.exists() {
        return Ok(Vec::new());
    }

    let raw =
        fs::read(path).with_context(|| format!("reading evaluation log {}", path.display()))?;

    if let Ok(current) = serde_json::from_slice::<Vec<EvaluationReport>>(&raw) {
        return Ok(current);
    }

    let parse_error = match serde_json::from_slice::<Vec<LegacyHistoryEntry>>(&raw) {
        Ok(legacy) => return Ok(legacy.into_iter().map(convert_legacy_entry).collect()),
        Err(err) => err,
    };

    // Neither layout parsed: move the file aside so the next write starts clean.
    let timestamp = Utc::now().format("%Y%m%dT%H%M%S");
    let parent_dir = path.parent().unwrap_or_else(|| Path::new("."));
    let backup_path = parent_dir.join(format!("evaluations.json.corrupted.{timestamp}"));
    warn!(
        path = %path.display(),
        backup = %backup_path.display(),
        error = %parse_error,
        "Evaluation history file is corrupted; backing up and starting fresh"
    );

    // A failed backup is only logged; the caller still gets an empty history.
    if let Err(e) = fs::rename(path, &backup_path) {
        warn!(
            path = %path.display(),
            error = %e,
            "Failed to backup corrupted evaluation history"
        );
    }

    Ok(Vec::new())
}

fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result<PathBuf> {
    let path = report_dir.join("evaluations.json");
    let mut entries = load_history(&path)?;
    entries.push(report.clone());

    let blob = serde_json::to_vec_pretty(&entries).context("serialising evaluation log")?;
    fs::write(&path, blob).with_context(|| format!("writing evaluation log {}", path.display()))?;
    Ok(path)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::eval::{
        EvaluationStageTimings, PerformanceTimings, RetrievedSummary, StageLatencyBreakdown,
    };
    use chrono::Utc;
    use tempfile::tempdir;

    /// Latency fixture: `ms` is used as the average and, cast to an integer, as
    /// both percentiles.
    fn latency(ms: f64) -> LatencyStats {
        let whole_ms = ms as u128;
        LatencyStats {
            avg: ms,
            p50: whole_ms,
            p95: whole_ms,
        }
    }

    /// Per-stage latency fixture with a distinct value for every stage.
    fn sample_stage_latency() -> StageLatencyBreakdown {
        let [embed, collect_candidates, graph_expansion, chunk_attach, rerank, assemble] =
            [9.0, 10.0, 11.0, 12.0, 13.0, 14.0].map(latency);
        StageLatencyBreakdown {
            embed,
            collect_candidates,
            graph_expansion,
            chunk_attach,
            rerank,
            assemble,
        }
    }

    /// Evaluation stage timings fixture with a distinct value for every stage.
    fn sample_eval_stage() -> EvaluationStageTimings {
        EvaluationStageTimings {
            prepare_slice_ms: 1,
            prepare_db_ms: 2,
            prepare_corpus_ms: 3,
            prepare_namespace_ms: 4,
            run_queries_ms: 5,
            summarize_ms: 6,
            finalize_ms: 7,
        }
    }

    /// Case fixture with a single retrieved entry; `is_impossible` selects the
    /// question id and `matched` drives every match flag.
    fn sample_case(is_impossible: bool, matched: bool) -> CaseSummary {
        let question_id = if is_impossible {
            "llm-q"
        } else {
            "retrieval-q"
        };
        let top_hit = RetrievedSummary {
            rank: 1,
            entity_id: "entity1".into(),
            source_id: "src1".into(),
            entity_name: "Entity".into(),
            score: 1.0,
            matched,
            entity_description: None,
            entity_category: None,
            chunk_text_match: Some(matched),
            chunk_id_match: Some(matched),
        };

        CaseSummary {
            question_id: question_id.into(),
            question: "Who is the hero?".into(),
            paragraph_id: "p1".into(),
            paragraph_title: "Hero".into(),
            expected_source: "src1".into(),
            answers: vec!["answer".into()],
            matched,
            entity_match: matched,
            chunk_text_match: matched,
            chunk_id_match: matched,
            ndcg: None,
            reciprocal_rank: None,
            is_impossible,
            has_verified_chunks: !is_impossible,
            match_rank: matched.then_some(1),
            latency_ms: 42,
            retrieved: vec![top_hit],
        }
    }

    /// Summary fixture with one matched retrieval case, plus one unmatched
    /// impossible case when `include_llm` is set.
    fn sample_summary(include_llm: bool) -> EvaluationSummary {
        let cases: Vec<CaseSummary> = std::iter::once(sample_case(false, true))
            .chain(include_llm.then(|| sample_case(true, false)))
            .collect();
        let case_count = cases.len();
        let perf = PerformanceTimings {
            openai_base_url: "https://example.com".into(),
            ingestion_ms: 100,
            namespace_seed_ms: Some(50),
            evaluation_stage_ms: sample_eval_stage(),
            stage_latency: sample_stage_latency(),
        };

        EvaluationSummary {
            generated_at: Utc::now(),
            k: 5,
            limit: Some(10),
            run_label: Some("test".into()),
            total_cases: case_count,
            correct: 1,
            precision: 1.0,
            correct_at_1: 1,
            correct_at_2: 1,
            correct_at_3: 1,
            precision_at_1: 1.0,
            precision_at_2: 1.0,
            precision_at_3: 1.0,
            duration_ms: 100,
            dataset_id: "ds".into(),
            dataset_label: "Dataset".into(),
            dataset_includes_unanswerable: include_llm,
            dataset_source: "dev".into(),
            includes_impossible_cases: include_llm,
            require_verified_chunks: !include_llm,
            filtered_questions: 0,
            retrieval_cases: 1,
            retrieval_correct: 1,
            retrieval_precision: 1.0,
            average_ndcg: 0.0,
            mrr: 0.0,
            llm_cases: if include_llm { 1 } else { 0 },
            llm_answered: 0,
            llm_precision: 0.0,
            slice_id: "slice".into(),
            slice_seed: 1,
            slice_total_cases: case_count,
            slice_window_offset: 0,
            slice_window_length: case_count,
            slice_cases: case_count,
            slice_positive_paragraphs: 1,
            slice_negative_paragraphs: 0,
            slice_total_paragraphs: 1,
            slice_negative_multiplier: 1.0,
            namespace_reused: true,
            corpus_paragraphs: 1,
            ingestion_cache_path: "/cache".into(),
            ingestion_reused: true,
            ingestion_embeddings_reused: true,
            ingestion_fingerprint: "fp".into(),
            positive_paragraphs_reused: 1,
            negative_paragraphs_reused: 0,
            latency_ms: latency(10.0),
            perf,
            embedding_backend: "fastembed".into(),
            embedding_model: Some("model".into()),
            embedding_dimension: 32,
            rerank_enabled: true,
            rerank_pool_size: Some(4),
            rerank_keep_top: 5,
            concurrency: 2,
            detailed_report: true,
            retrieval_strategy: "initial".into(),
            chunk_result_cap: 5,
            chunk_rrf_k: 60.0,
            chunk_rrf_vector_weight: 1.0,
            chunk_rrf_fts_weight: 1.0,
            chunk_rrf_use_vector: true,
            chunk_rrf_use_fts: true,
            ingest_chunk_min_tokens: 256,
            ingest_chunk_max_tokens: 512,
            ingest_chunk_overlap_tokens: 50,
            ingest_chunks_only: false,
            chunk_vector_take: 50,
            chunk_fts_take: 50,
            chunk_avg_chars_per_token: 4,
            max_chunks_per_entity: 4,
            cases,
        }
    }

    /// The rendered markdown carries both LLM headings when an impossible case
    /// is part of the summary.
    #[test]
    fn markdown_includes_llm_section() {
        let rendered = render_markdown(&EvaluationReport::from_summary(&sample_summary(true), 5));
        assert!(rendered.contains("LLM Mode Metrics"));
        assert!(rendered.contains("LLM-Only Cases (sample)"));
    }

    /// The rendered markdown carries no LLM headings for a retrieval-only summary.
    #[test]
    fn markdown_hides_llm_section_when_not_present() {
        let rendered = render_markdown(&EvaluationReport::from_summary(&sample_summary(false), 5));
        assert!(!rendered.contains("LLM Mode Metrics"));
        assert!(!rendered.contains("LLM-Only Cases"));
    }

    /// Writing reports into a fresh directory yields a one-entry history whose
    /// strategy, concurrency and query timing mirror the summary.
    #[test]
    fn evaluations_history_captures_strategy_and_concurrency() {
        let dir = tempdir().unwrap();
        let summary = sample_summary(false);
        let outcome = write_reports(&summary, dir.path(), 5).expect("writing consolidated reports");

        let raw =
            std::fs::read_to_string(&outcome.history_path).expect("reading evaluations history");
        let history: Vec<EvaluationReport> =
            serde_json::from_str(&raw).expect("parsing evaluations history");
        assert_eq!(history.len(), 1);

        let stored = &history[0];
        assert_eq!(stored.retrieval.concurrency, summary.concurrency);
        assert_eq!(stored.retrieval.strategy, summary.retrieval_strategy);
        assert_eq!(
            stored.performance.evaluation_stages_ms.run_queries_ms,
            summary.perf.evaluation_stage_ms.run_queries_ms
        );
    }
}