use std::{ fs, path::{Path, PathBuf}, }; use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; use crate::eval::{ format_timestamp, CaseSummary, EvaluationStageTimings, EvaluationSummary, LatencyStats, StageLatencyBreakdown, }; use chrono::Utc; use tracing::warn; #[derive(Debug)] pub struct ReportPaths { pub json: PathBuf, pub markdown: PathBuf, } #[derive(Debug, Serialize)] pub struct EvaluationReport { pub overview: OverviewSection, pub dataset: DatasetSection, pub slice: SliceSection, pub retrieval: RetrievalSection, #[serde(skip_serializing_if = "Option::is_none")] pub llm: Option, pub performance: PerformanceSection, #[serde(skip_serializing_if = "Vec::is_empty")] pub misses: Vec, #[serde(skip_serializing_if = "Vec::is_empty")] pub llm_cases: Vec, pub detailed_report: bool, } #[derive(Debug, Serialize)] pub struct OverviewSection { pub generated_at: String, pub run_label: Option, pub total_cases: usize, pub filtered_questions: usize, } #[derive(Debug, Serialize)] pub struct DatasetSection { pub id: String, pub label: String, pub source: String, pub includes_unanswerable: bool, pub require_verified_chunks: bool, pub embedding_backend: String, pub embedding_model: Option, pub embedding_dimension: usize, } #[derive(Debug, Serialize)] pub struct SliceSection { pub id: String, pub seed: u64, pub window_offset: usize, pub window_length: usize, pub slice_cases: usize, pub ledger_total_cases: usize, pub positives: usize, pub negatives: usize, pub total_paragraphs: usize, pub negative_multiplier: f32, } #[derive(Debug, Serialize)] pub struct RetrievalSection { pub k: usize, pub cases: usize, pub correct: usize, pub precision: f64, pub precision_at_1: f64, pub precision_at_2: f64, pub precision_at_3: f64, pub mrr: f64, pub average_ndcg: f64, pub latency: LatencyStats, pub concurrency: usize, pub strategy: String, pub rerank_enabled: bool, pub rerank_pool_size: Option, pub rerank_keep_top: usize, } #[derive(Debug, Serialize)] pub struct LlmSection { pub cases: usize, pub answered: usize, pub precision: f64, } #[derive(Debug, Serialize)] pub struct PerformanceSection { pub openai_base_url: String, pub ingestion_ms: u128, pub namespace_seed_ms: Option, pub evaluation_stages_ms: EvaluationStageTimings, pub stage_latency: StageLatencyBreakdown, pub namespace_reused: bool, pub ingestion_reused: bool, pub embeddings_reused: bool, pub ingestion_cache_path: String, pub corpus_paragraphs: usize, pub positive_paragraphs_reused: usize, pub negative_paragraphs_reused: usize, } #[derive(Debug, Serialize)] pub struct MissEntry { pub question_id: String, pub paragraph_title: String, pub expected_source: String, pub entity_match: bool, pub chunk_text_match: bool, pub chunk_id_match: bool, pub retrieved: Vec, } #[derive(Debug, Serialize)] pub struct LlmCaseEntry { pub question_id: String, pub answered: bool, #[serde(skip_serializing_if = "Option::is_none")] pub match_rank: Option, pub retrieved: Vec, } #[derive(Debug, Serialize)] pub struct RetrievedSnippet { pub rank: usize, pub source_id: String, pub entity_name: String, pub matched: bool, } impl EvaluationReport { pub fn from_summary(summary: &EvaluationSummary, sample: usize) -> Self { let overview = OverviewSection { generated_at: format_timestamp(&summary.generated_at), run_label: summary.run_label.clone(), total_cases: summary.total_cases, filtered_questions: summary.filtered_questions, }; let dataset = DatasetSection { id: summary.dataset_id.clone(), label: summary.dataset_label.clone(), source: summary.dataset_source.clone(), includes_unanswerable: summary.includes_impossible_cases, require_verified_chunks: summary.require_verified_chunks, embedding_backend: summary.embedding_backend.clone(), embedding_model: summary.embedding_model.clone(), embedding_dimension: summary.embedding_dimension, }; let slice = SliceSection { id: summary.slice_id.clone(), seed: summary.slice_seed, window_offset: summary.slice_window_offset, window_length: summary.slice_window_length, slice_cases: summary.slice_cases, ledger_total_cases: summary.slice_total_cases, positives: summary.slice_positive_paragraphs, negatives: summary.slice_negative_paragraphs, total_paragraphs: summary.slice_total_paragraphs, negative_multiplier: summary.slice_negative_multiplier, }; let retrieval = RetrievalSection { k: summary.k, cases: summary.retrieval_cases, correct: summary.retrieval_correct, precision: summary.retrieval_precision, precision_at_1: summary.precision_at_1, precision_at_2: summary.precision_at_2, precision_at_3: summary.precision_at_3, mrr: summary.mrr, average_ndcg: summary.average_ndcg, latency: summary.latency_ms.clone(), concurrency: summary.concurrency, strategy: summary.retrieval_strategy.clone(), rerank_enabled: summary.rerank_enabled, rerank_pool_size: summary.rerank_pool_size, rerank_keep_top: summary.rerank_keep_top, }; let llm = if summary.llm_cases > 0 { Some(LlmSection { cases: summary.llm_cases, answered: summary.llm_answered, precision: summary.llm_precision, }) } else { None }; let performance = PerformanceSection { openai_base_url: summary.perf.openai_base_url.clone(), ingestion_ms: summary.perf.ingestion_ms, namespace_seed_ms: summary.perf.namespace_seed_ms, evaluation_stages_ms: summary.perf.evaluation_stage_ms.clone(), stage_latency: summary.perf.stage_latency.clone(), namespace_reused: summary.namespace_reused, ingestion_reused: summary.ingestion_reused, embeddings_reused: summary.ingestion_embeddings_reused, ingestion_cache_path: summary.ingestion_cache_path.clone(), corpus_paragraphs: summary.corpus_paragraphs, positive_paragraphs_reused: summary.positive_paragraphs_reused, negative_paragraphs_reused: summary.negative_paragraphs_reused, }; let misses = summary .cases .iter() .filter(|case| !case.matched && !case.is_impossible) .take(sample) .map(MissEntry::from_case) .collect(); let llm_cases = if llm.is_some() { summary .cases .iter() .filter(|case| case.is_impossible) .take(sample) .map(LlmCaseEntry::from_case) .collect() } else { Vec::new() }; Self { overview, dataset, slice, retrieval, llm, performance, misses, llm_cases, detailed_report: summary.detailed_report, } } } impl MissEntry { fn from_case(case: &CaseSummary) -> Self { Self { question_id: case.question_id.clone(), paragraph_title: case.paragraph_title.clone(), expected_source: case.expected_source.clone(), entity_match: case.entity_match, chunk_text_match: case.chunk_text_match, chunk_id_match: case.chunk_id_match, retrieved: case .retrieved .iter() .take(3) .map(RetrievedSnippet::from_summary) .collect(), } } } impl LlmCaseEntry { fn from_case(case: &CaseSummary) -> Self { Self { question_id: case.question_id.clone(), answered: case.matched, match_rank: case.match_rank, retrieved: case .retrieved .iter() .take(3) .map(RetrievedSnippet::from_summary) .collect(), } } } impl RetrievedSnippet { fn from_summary(entry: &crate::eval::RetrievedSummary) -> Self { Self { rank: entry.rank, source_id: entry.source_id.clone(), entity_name: entry.entity_name.clone(), matched: entry.matched, } } } pub fn write_reports( summary: &EvaluationSummary, report_dir: &Path, sample: usize, ) -> Result { fs::create_dir_all(report_dir) .with_context(|| format!("creating report directory {}", report_dir.display()))?; let dataset_dir = dataset_report_dir(report_dir, &summary.dataset_id); fs::create_dir_all(&dataset_dir).with_context(|| { format!( "creating dataset report directory {}", dataset_dir.display() ) })?; let stem = build_report_stem(summary); let report = EvaluationReport::from_summary(summary, sample); let json_path = dataset_dir.join(format!("{stem}.json")); let json_blob = serde_json::to_string_pretty(&report).context("serialising JSON report")?; fs::write(&json_path, &json_blob) .with_context(|| format!("writing JSON report to {}", json_path.display()))?; let md_path = dataset_dir.join(format!("{stem}.md")); let markdown = render_markdown(&report); fs::write(&md_path, &markdown) .with_context(|| format!("writing Markdown report to {}", md_path.display()))?; // Keep a latest.json pointer to simplify automation. let latest_json = dataset_dir.join("latest.json"); fs::write(&latest_json, json_blob) .with_context(|| format!("writing latest JSON report to {}", latest_json.display()))?; let latest_md = dataset_dir.join("latest.md"); fs::write(&latest_md, markdown) .with_context(|| format!("writing latest Markdown report to {}", latest_md.display()))?; record_history(summary, &dataset_dir)?; Ok(ReportPaths { json: json_path, markdown: md_path, }) } fn render_markdown(report: &EvaluationReport) -> String { let mut md = String::new(); md.push_str(&format!( "# Retrieval Evaluation (k={})\\n\\n", report.retrieval.k )); md.push_str("## Overview\\n\\n"); md.push_str("| Metric | Value |\\n| --- | --- |\\n"); md.push_str(&format!( "| Generated | {} |\\n", report.overview.generated_at )); md.push_str(&format!( "| Run Label | {} |\\n", report .overview .run_label .as_deref() .filter(|label| !label.is_empty()) .unwrap_or("-") )); md.push_str(&format!( "| Total Cases | {} |\\n", report.overview.total_cases )); md.push_str(&format!( "| Filtered Questions | {} |\\n", report.overview.filtered_questions )); md.push_str("\\n## Dataset & Slice\\n\\n"); md.push_str("| Metric | Value |\\n| --- | --- |\\n"); md.push_str(&format!( "| Dataset | {} (`{}`) |\\n", report.dataset.label, report.dataset.id )); md.push_str(&format!( "| Dataset Source | {} |\\n", report.dataset.source )); md.push_str(&format!( "| Includes Unanswerable | {} |\\n", bool_badge(report.dataset.includes_unanswerable) )); md.push_str(&format!( "| Require Verified Chunks | {} |\\n", bool_badge(report.dataset.require_verified_chunks) )); let embedding_label = if let Some(model) = report.dataset.embedding_model.as_ref() { format!("{} ({model})", report.dataset.embedding_backend) } else { report.dataset.embedding_backend.clone() }; md.push_str(&format!("| Embedding | {} |\\n", embedding_label)); md.push_str(&format!( "| Embedding Dim | {} |\\n", report.dataset.embedding_dimension )); md.push_str(&format!("| Slice ID | `{}` |\\n", report.slice.id)); md.push_str(&format!("| Slice Seed | {} |\\n", report.slice.seed)); md.push_str(&format!( "| Slice Window (offset/length) | {}/{} |\\n", report.slice.window_offset, report.slice.window_length )); md.push_str(&format!( "| Slice Questions (window/ledger) | {}/{} |\\n", report.slice.slice_cases, report.slice.ledger_total_cases )); md.push_str(&format!( "| Slice Positives / Negatives | {}/{} |\\n", report.slice.positives, report.slice.negatives )); md.push_str(&format!( "| Slice Paragraphs | {} |\\n", report.slice.total_paragraphs )); md.push_str(&format!( "| Negative Multiplier | {:.2} |\\n", report.slice.negative_multiplier )); md.push_str("\\n## Retrieval Metrics\\n\\n"); md.push_str("| Metric | Value |\\n| --- | --- |\\n"); md.push_str(&format!("| Cases | {} |\\n", report.retrieval.cases)); md.push_str(&format!( "| Correct@{} | {}/{} |\\n", report.retrieval.k, report.retrieval.correct, report.retrieval.cases )); md.push_str(&format!( "| Precision@{} | {:.3} |\\n", report.retrieval.k, report.retrieval.precision )); md.push_str(&format!( "| Precision@1/2/3 | {:.3} / {:.3} / {:.3} |\\n", report.retrieval.precision_at_1, report.retrieval.precision_at_2, report.retrieval.precision_at_3 )); md.push_str(&format!( "| MRR | {:.3} |\\n", report.retrieval.mrr )); md.push_str(&format!( "| NDCG | {:.3} |\\n", report.retrieval.average_ndcg )); md.push_str(&format!( "| Latency Avg / P50 / P95 (ms) | {:.1} / {} / {} |\\n", report.retrieval.latency.avg, report.retrieval.latency.p50, report.retrieval.latency.p95 )); md.push_str(&format!( "| Strategy | `{}` |\\n", report.retrieval.strategy )); md.push_str(&format!( "| Concurrency | {} |\\n", report.retrieval.concurrency )); if report.retrieval.rerank_enabled { let pool = report .retrieval .rerank_pool_size .map(|size| size.to_string()) .unwrap_or_else(|| "?".into()); md.push_str(&format!( "| Rerank | enabled (pool {pool}, keep top {}) |\\n", report.retrieval.rerank_keep_top )); } else { md.push_str("| Rerank | disabled |\\n"); } if let Some(llm) = &report.llm { md.push_str("\\n## LLM Mode Metrics\\n\\n"); md.push_str("| Metric | Value |\\n| --- | --- |\\n"); md.push_str(&format!("| Cases | {} |\\n", llm.cases)); md.push_str(&format!("| Answered | {} |\\n", llm.answered)); md.push_str(&format!("| Precision | {:.3} |\\n", llm.precision)); } md.push_str("\\n## Performance\\n\\n"); md.push_str("| Metric | Value |\\n| --- | --- |\\n"); md.push_str(&format!( "| OpenAI Base URL | {} |\\n", report.performance.openai_base_url )); md.push_str(&format!( "| Ingestion Duration | {} ms |\\n", report.performance.ingestion_ms )); if let Some(seed) = report.performance.namespace_seed_ms { md.push_str(&format!("| Namespace Seed | {} ms |\\n", seed)); } md.push_str(&format!( "| Namespace State | {} |\\n", if report.performance.namespace_reused { "reused" } else { "seeded" } )); md.push_str(&format!( "| Corpus Paragraphs | {} |\\n", report.performance.corpus_paragraphs )); if report.detailed_report { md.push_str(&format!( "| Ingestion Cache | `{}` |\\n", report.performance.ingestion_cache_path )); md.push_str(&format!( "| Ingestion Reused | {} |\\n", bool_badge(report.performance.ingestion_reused) )); md.push_str(&format!( "| Embeddings Reused | {} |\\n", bool_badge(report.performance.embeddings_reused) )); } md.push_str(&format!( "| Positives Cached | {} |\\n", report.performance.positive_paragraphs_reused )); md.push_str(&format!( "| Negatives Cached | {} |\\n", report.performance.negative_paragraphs_reused )); md.push_str("\\n## Retrieval Stage Timings\\n\\n"); md.push_str("| Stage | Avg (ms) | P50 (ms) | P95 (ms) |\\n| --- | --- | --- | --- |\\n"); write_stage_row(&mut md, "Embed", &report.performance.stage_latency.embed); write_stage_row( &mut md, "Collect Candidates", &report.performance.stage_latency.collect_candidates, ); write_stage_row( &mut md, "Graph Expansion", &report.performance.stage_latency.graph_expansion, ); write_stage_row( &mut md, "Chunk Attach", &report.performance.stage_latency.chunk_attach, ); write_stage_row(&mut md, "Rerank", &report.performance.stage_latency.rerank); write_stage_row( &mut md, "Assemble", &report.performance.stage_latency.assemble, ); if report.misses.is_empty() { md.push_str("\\n_All evaluated retrieval queries matched within the top-k window._\\n"); if report.detailed_report { md.push_str( "\\nSuccess measures were captured for each query (entity, chunk text, chunk ID).\\n", ); } } else { md.push_str("\\n## Missed Retrieval Queries (sample)\\n\\n"); if report.detailed_report { md.push_str( "| Question ID | Paragraph | Expected Source | Entity Match | Chunk Text | Chunk ID | Top Retrieved |\\n", ); md.push_str("| --- | --- | --- | --- | --- | --- | --- |\\n"); } else { md.push_str("| Question ID | Paragraph | Expected Source | Top Retrieved |\\n"); md.push_str("| --- | --- | --- | --- |\\n"); } for case in &report.misses { let retrieved = render_retrieved(&case.retrieved); if report.detailed_report { md.push_str(&format!( "| `{}` | {} | `{}` | {} | {} | {} | {} |\\n", case.question_id, case.paragraph_title, case.expected_source, bool_badge(case.entity_match), bool_badge(case.chunk_text_match), bool_badge(case.chunk_id_match), retrieved )); } else { md.push_str(&format!( "| `{}` | {} | `{}` | {} |\\n", case.question_id, case.paragraph_title, case.expected_source, retrieved )); } } } if report.llm.is_some() { md.push_str("\\n## LLM-Only Cases (sample)\\n\\n"); if report.llm_cases.is_empty() { md.push_str("All LLM-only cases matched within the evaluation window.\\n"); } else { md.push_str("| Question ID | Answered | Match Rank | Top Retrieved |\\n"); md.push_str("| --- | --- | --- | --- |\\n"); for case in &report.llm_cases { let retrieved = render_retrieved(&case.retrieved); let rank = case .match_rank .map(|rank| rank.to_string()) .unwrap_or_else(|| "-".into()); md.push_str(&format!( "| `{}` | {} | {} | {} |\\n", case.question_id, bool_badge(case.answered), rank, retrieved )); } } } md } fn write_stage_row(buf: &mut String, label: &str, stats: &LatencyStats) { buf.push_str(&format!( "| {} | {:.1} | {} | {} |\n", label, stats.avg, stats.p50, stats.p95 )); } fn bool_badge(value: bool) -> &'static str { if value { "✅" } else { "⚪" } } fn render_retrieved(entries: &[RetrievedSnippet]) -> String { if entries.is_empty() { "-".to_string() } else { entries .iter() .map(|entry| format!("{} (rank {})", entry.source_id, entry.rank)) .take(3) .collect::>() .join("
") } } fn build_report_stem(summary: &EvaluationSummary) -> String { let timestamp = summary.generated_at.format("%Y%m%dT%H%M%S"); let backend = sanitize_component(&summary.embedding_backend); let dataset_component = sanitize_component(&summary.dataset_id); let model_component = summary .embedding_model .as_ref() .map(|model| sanitize_component(model)); match model_component { Some(model) => format!( "precision_at_{}_{}_{}_{}_{}", summary.k, dataset_component, timestamp, backend, model ), None => format!( "precision_at_{}_{}_{}_{}", summary.k, dataset_component, timestamp, backend ), } } fn sanitize_component(input: &str) -> String { input .chars() .map(|ch| if ch.is_ascii_alphanumeric() { ch } else { '_' }) .collect() } pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf { report_dir.join(sanitize_component(dataset_id)) } #[derive(Debug, Serialize, Deserialize)] struct HistoryEntry { generated_at: String, run_label: Option, dataset_id: String, dataset_label: String, slice_id: String, slice_seed: u64, slice_window_offset: usize, slice_window_length: usize, slice_cases: usize, slice_total_cases: usize, k: usize, limit: Option, precision: f64, precision_at_1: f64, precision_at_2: f64, precision_at_3: f64, #[serde(default)] mrr: f64, #[serde(default)] average_ndcg: f64, #[serde(default)] retrieval_cases: usize, #[serde(default)] retrieval_precision: f64, #[serde(default)] llm_cases: usize, #[serde(default)] llm_precision: f64, duration_ms: u128, latency_ms: LatencyStats, embedding_backend: String, embedding_model: Option, ingestion_reused: bool, ingestion_embeddings_reused: bool, rerank_enabled: bool, rerank_keep_top: usize, rerank_pool_size: Option, delta: Option, openai_base_url: String, ingestion_ms: u128, #[serde(default)] namespace_seed_ms: Option, } #[derive(Debug, Serialize, Deserialize)] struct HistoryDelta { precision: f64, precision_at_1: f64, latency_avg_ms: f64, } fn record_history(summary: &EvaluationSummary, report_dir: &Path) -> Result<()> { let path = report_dir.join("evaluations.json"); let mut entries: Vec = if path.exists() { let contents = fs::read(&path) .with_context(|| format!("reading evaluation log {}", path.display()))?; match serde_json::from_slice(&contents) { Ok(entries) => entries, Err(err) => { let timestamp = Utc::now().format("%Y%m%dT%H%M%S"); let backup_path = report_dir.join(format!("evaluations.json.corrupted.{}", timestamp)); warn!( path = %path.display(), backup = %backup_path.display(), error = %err, "Evaluation history file is corrupted; backing up and starting fresh" ); if let Err(e) = fs::rename(&path, &backup_path) { warn!( path = %path.display(), error = %e, "Failed to backup corrupted evaluation history" ); } Vec::new() } } } else { Vec::new() }; let delta = entries.last().map(|prev| HistoryDelta { precision: summary.precision - prev.precision, precision_at_1: summary.precision_at_1 - prev.precision_at_1, latency_avg_ms: summary.latency_ms.avg - prev.latency_ms.avg, }); let entry = HistoryEntry { generated_at: format_timestamp(&summary.generated_at), run_label: summary.run_label.clone(), dataset_id: summary.dataset_id.clone(), dataset_label: summary.dataset_label.clone(), slice_id: summary.slice_id.clone(), slice_seed: summary.slice_seed, slice_window_offset: summary.slice_window_offset, slice_window_length: summary.slice_window_length, slice_cases: summary.slice_cases, slice_total_cases: summary.slice_total_cases, k: summary.k, limit: summary.limit, precision: summary.precision, precision_at_1: summary.precision_at_1, precision_at_2: summary.precision_at_2, precision_at_3: summary.precision_at_3, mrr: summary.mrr, average_ndcg: summary.average_ndcg, retrieval_cases: summary.retrieval_cases, retrieval_precision: summary.retrieval_precision, llm_cases: summary.llm_cases, llm_precision: summary.llm_precision, duration_ms: summary.duration_ms, latency_ms: summary.latency_ms.clone(), embedding_backend: summary.embedding_backend.clone(), embedding_model: summary.embedding_model.clone(), ingestion_reused: summary.ingestion_reused, ingestion_embeddings_reused: summary.ingestion_embeddings_reused, rerank_enabled: summary.rerank_enabled, rerank_keep_top: summary.rerank_keep_top, rerank_pool_size: summary.rerank_pool_size, delta, openai_base_url: summary.perf.openai_base_url.clone(), ingestion_ms: summary.perf.ingestion_ms, namespace_seed_ms: summary.perf.namespace_seed_ms, }; entries.push(entry); let blob = serde_json::to_vec_pretty(&entries).context("serialising evaluation log")?; fs::write(&path, blob).with_context(|| format!("writing evaluation log {}", path.display()))?; Ok(()) } #[cfg(test)] mod tests { use super::*; use crate::eval::{ EvaluationStageTimings, PerformanceTimings, RetrievedSummary, StageLatencyBreakdown, }; use chrono::Utc; fn latency(ms: f64) -> LatencyStats { LatencyStats { avg: ms, p50: ms as u128, p95: ms as u128, } } fn sample_stage_latency() -> StageLatencyBreakdown { StageLatencyBreakdown { embed: latency(9.0), collect_candidates: latency(10.0), graph_expansion: latency(11.0), chunk_attach: latency(12.0), rerank: latency(13.0), assemble: latency(14.0), } } fn sample_eval_stage() -> EvaluationStageTimings { EvaluationStageTimings { prepare_slice_ms: 1, prepare_db_ms: 2, prepare_corpus_ms: 3, prepare_namespace_ms: 4, run_queries_ms: 5, summarize_ms: 6, finalize_ms: 7, } } fn sample_case(is_impossible: bool, matched: bool) -> CaseSummary { CaseSummary { question_id: if is_impossible { "llm-q".into() } else { "retrieval-q".into() }, question: "Who is the hero?".into(), paragraph_id: "p1".into(), paragraph_title: "Hero".into(), expected_source: "src1".into(), answers: vec!["answer".into()], matched, entity_match: matched, chunk_text_match: matched, chunk_id_match: matched, ndcg: None, reciprocal_rank: None, is_impossible, has_verified_chunks: !is_impossible, match_rank: if matched { Some(1) } else { None }, latency_ms: 42, retrieved: vec![RetrievedSummary { rank: 1, entity_id: "entity1".into(), source_id: "src1".into(), entity_name: "Entity".into(), score: 1.0, matched, entity_description: None, entity_category: None, chunk_text_match: Some(matched), chunk_id_match: Some(matched), }], } } fn sample_summary(include_llm: bool) -> EvaluationSummary { let mut cases = vec![sample_case(false, true)]; if include_llm { cases.push(sample_case(true, false)); } EvaluationSummary { generated_at: Utc::now(), k: 5, limit: Some(10), run_label: Some("test".into()), total_cases: cases.len(), correct: 1, precision: 1.0, correct_at_1: 1, correct_at_2: 1, correct_at_3: 1, precision_at_1: 1.0, precision_at_2: 1.0, precision_at_3: 1.0, duration_ms: 100, dataset_id: "ds".into(), dataset_label: "Dataset".into(), dataset_includes_unanswerable: include_llm, dataset_source: "dev".into(), includes_impossible_cases: include_llm, require_verified_chunks: !include_llm, filtered_questions: 0, retrieval_cases: 1, retrieval_correct: 1, retrieval_precision: 1.0, average_ndcg: 0.0, mrr: 0.0, llm_cases: if include_llm { 1 } else { 0 }, llm_answered: 0, llm_precision: 0.0, slice_id: "slice".into(), slice_seed: 1, slice_total_cases: cases.len(), slice_window_offset: 0, slice_window_length: cases.len(), slice_cases: cases.len(), slice_positive_paragraphs: 1, slice_negative_paragraphs: 0, slice_total_paragraphs: 1, slice_negative_multiplier: 1.0, namespace_reused: true, corpus_paragraphs: 1, ingestion_cache_path: "/cache".into(), ingestion_reused: true, ingestion_embeddings_reused: true, ingestion_fingerprint: "fp".into(), positive_paragraphs_reused: 1, negative_paragraphs_reused: 0, latency_ms: latency(10.0), perf: PerformanceTimings { openai_base_url: "https://example.com".into(), ingestion_ms: 100, namespace_seed_ms: Some(50), evaluation_stage_ms: sample_eval_stage(), stage_latency: sample_stage_latency(), }, embedding_backend: "fastembed".into(), embedding_model: Some("model".into()), embedding_dimension: 32, rerank_enabled: true, rerank_pool_size: Some(4), rerank_keep_top: 5, concurrency: 2, detailed_report: true, retrieval_strategy: "initial".into(), chunk_vector_take: 50, chunk_fts_take: 50, chunk_token_budget: 10_000, chunk_avg_chars_per_token: 4, max_chunks_per_entity: 4, cases, } } #[test] fn markdown_includes_llm_section() { let summary = sample_summary(true); let report = EvaluationReport::from_summary(&summary, 5); let md = render_markdown(&report); assert!(md.contains("LLM Mode Metrics")); assert!(md.contains("LLM-Only Cases (sample)")); } #[test] fn markdown_hides_llm_section_when_not_present() { let summary = sample_summary(false); let report = EvaluationReport::from_summary(&summary, 5); let md = render_markdown(&report); assert!(!md.contains("LLM Mode Metrics")); assert!(!md.contains("LLM-Only Cases")); } }