Files
minne/eval/src/report.rs
T
Per Stark e77f2d51e8 beir-rff
2025-12-08 20:39:12 +01:00

1188 lines
38 KiB
Rust

use std::{
fs,
path::{Path, PathBuf},
};
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use crate::eval::{
format_timestamp, CaseSummary, EvaluationStageTimings, EvaluationSummary, LatencyStats,
StageLatencyBreakdown,
};
use chrono::Utc;
use tracing::warn;
#[derive(Debug)]
pub struct ReportPaths {
pub json: PathBuf,
pub markdown: PathBuf,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationReport {
pub overview: OverviewSection,
pub dataset: DatasetSection,
pub slice: SliceSection,
pub retrieval: RetrievalSection,
#[serde(skip_serializing_if = "Option::is_none")]
pub llm: Option<LlmSection>,
pub performance: PerformanceSection,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub misses: Vec<MissEntry>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub llm_cases: Vec<LlmCaseEntry>,
#[serde(default)]
pub detailed_report: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OverviewSection {
pub generated_at: String,
pub run_label: Option<String>,
pub total_cases: usize,
pub filtered_questions: usize,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetSection {
pub id: String,
pub label: String,
pub source: String,
pub includes_unanswerable: bool,
pub require_verified_chunks: bool,
pub embedding_backend: String,
pub embedding_model: Option<String>,
pub embedding_dimension: usize,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SliceSection {
pub id: String,
pub seed: u64,
pub window_offset: usize,
pub window_length: usize,
pub slice_cases: usize,
pub ledger_total_cases: usize,
pub positives: usize,
pub negatives: usize,
pub total_paragraphs: usize,
pub negative_multiplier: f32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetrievalSection {
pub k: usize,
pub cases: usize,
pub correct: usize,
pub precision: f64,
pub precision_at_1: f64,
pub precision_at_2: f64,
pub precision_at_3: f64,
pub mrr: f64,
pub average_ndcg: f64,
pub latency: LatencyStats,
pub concurrency: usize,
pub strategy: String,
pub rerank_enabled: bool,
pub rerank_pool_size: Option<usize>,
pub rerank_keep_top: usize,
pub chunk_result_cap: usize,
#[serde(default = "default_chunk_rrf_k")]
pub chunk_rrf_k: f32,
#[serde(default = "default_chunk_rrf_weight")]
pub chunk_rrf_vector_weight: f32,
#[serde(default = "default_chunk_rrf_weight")]
pub chunk_rrf_fts_weight: f32,
#[serde(default = "default_chunk_rrf_use")]
pub chunk_rrf_use_vector: bool,
#[serde(default = "default_chunk_rrf_use")]
pub chunk_rrf_use_fts: bool,
#[serde(default)]
pub chunk_vector_take: usize,
#[serde(default)]
pub chunk_fts_take: usize,
pub ingest_chunk_min_tokens: usize,
pub ingest_chunk_max_tokens: usize,
pub ingest_chunk_overlap_tokens: usize,
pub ingest_chunks_only: bool,
}
const fn default_chunk_rrf_k() -> f32 {
60.0
}
const fn default_chunk_rrf_weight() -> f32 {
1.0
}
const fn default_chunk_rrf_use() -> bool {
true
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmSection {
pub cases: usize,
pub answered: usize,
pub precision: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceSection {
pub openai_base_url: String,
pub ingestion_ms: u128,
pub namespace_seed_ms: Option<u128>,
pub evaluation_stages_ms: EvaluationStageTimings,
pub stage_latency: StageLatencyBreakdown,
pub namespace_reused: bool,
pub ingestion_reused: bool,
pub embeddings_reused: bool,
pub ingestion_cache_path: String,
pub corpus_paragraphs: usize,
pub positive_paragraphs_reused: usize,
pub negative_paragraphs_reused: usize,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MissEntry {
pub question_id: String,
pub paragraph_title: String,
pub expected_source: String,
pub entity_match: bool,
pub chunk_text_match: bool,
pub chunk_id_match: bool,
pub retrieved: Vec<RetrievedSnippet>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmCaseEntry {
pub question_id: String,
pub answered: bool,
#[serde(skip_serializing_if = "Option::is_none")]
pub match_rank: Option<usize>,
pub retrieved: Vec<RetrievedSnippet>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetrievedSnippet {
pub rank: usize,
pub source_id: String,
pub entity_name: String,
pub matched: bool,
}
#[derive(Debug)]
pub struct ReportOutcome {
pub record: EvaluationReport,
pub paths: ReportPaths,
pub history_path: PathBuf,
}
impl EvaluationReport {
pub fn from_summary(summary: &EvaluationSummary, sample: usize) -> Self {
let overview = OverviewSection {
generated_at: format_timestamp(&summary.generated_at),
run_label: summary.run_label.clone(),
total_cases: summary.total_cases,
filtered_questions: summary.filtered_questions,
};
let dataset = DatasetSection {
id: summary.dataset_id.clone(),
label: summary.dataset_label.clone(),
source: summary.dataset_source.clone(),
includes_unanswerable: summary.includes_impossible_cases,
require_verified_chunks: summary.require_verified_chunks,
embedding_backend: summary.embedding_backend.clone(),
embedding_model: summary.embedding_model.clone(),
embedding_dimension: summary.embedding_dimension,
};
let slice = SliceSection {
id: summary.slice_id.clone(),
seed: summary.slice_seed,
window_offset: summary.slice_window_offset,
window_length: summary.slice_window_length,
slice_cases: summary.slice_cases,
ledger_total_cases: summary.slice_total_cases,
positives: summary.slice_positive_paragraphs,
negatives: summary.slice_negative_paragraphs,
total_paragraphs: summary.slice_total_paragraphs,
negative_multiplier: summary.slice_negative_multiplier,
};
let retrieval = RetrievalSection {
k: summary.k,
cases: summary.retrieval_cases,
correct: summary.retrieval_correct,
precision: summary.retrieval_precision,
precision_at_1: summary.precision_at_1,
precision_at_2: summary.precision_at_2,
precision_at_3: summary.precision_at_3,
mrr: summary.mrr,
average_ndcg: summary.average_ndcg,
latency: summary.latency_ms.clone(),
concurrency: summary.concurrency,
strategy: summary.retrieval_strategy.clone(),
rerank_enabled: summary.rerank_enabled,
rerank_pool_size: summary.rerank_pool_size,
rerank_keep_top: summary.rerank_keep_top,
chunk_result_cap: summary.chunk_result_cap,
chunk_rrf_k: summary.chunk_rrf_k,
chunk_rrf_vector_weight: summary.chunk_rrf_vector_weight,
chunk_rrf_fts_weight: summary.chunk_rrf_fts_weight,
chunk_rrf_use_vector: summary.chunk_rrf_use_vector,
chunk_rrf_use_fts: summary.chunk_rrf_use_fts,
chunk_vector_take: summary.chunk_vector_take,
chunk_fts_take: summary.chunk_fts_take,
ingest_chunk_min_tokens: summary.ingest_chunk_min_tokens,
ingest_chunk_max_tokens: summary.ingest_chunk_max_tokens,
ingest_chunk_overlap_tokens: summary.ingest_chunk_overlap_tokens,
ingest_chunks_only: summary.ingest_chunks_only,
};
let llm = if summary.llm_cases > 0 {
Some(LlmSection {
cases: summary.llm_cases,
answered: summary.llm_answered,
precision: summary.llm_precision,
})
} else {
None
};
let performance = PerformanceSection {
openai_base_url: summary.perf.openai_base_url.clone(),
ingestion_ms: summary.perf.ingestion_ms,
namespace_seed_ms: summary.perf.namespace_seed_ms,
evaluation_stages_ms: summary.perf.evaluation_stage_ms.clone(),
stage_latency: summary.perf.stage_latency.clone(),
namespace_reused: summary.namespace_reused,
ingestion_reused: summary.ingestion_reused,
embeddings_reused: summary.ingestion_embeddings_reused,
ingestion_cache_path: summary.ingestion_cache_path.clone(),
corpus_paragraphs: summary.corpus_paragraphs,
positive_paragraphs_reused: summary.positive_paragraphs_reused,
negative_paragraphs_reused: summary.negative_paragraphs_reused,
};
let (misses, llm_cases) = if summary.detailed_report {
let misses = summary
.cases
.iter()
.filter(|case| !case.matched && !case.is_impossible)
.take(sample)
.map(MissEntry::from_case)
.collect();
let llm_cases = if llm.is_some() {
summary
.cases
.iter()
.filter(|case| case.is_impossible)
.take(sample)
.map(LlmCaseEntry::from_case)
.collect()
} else {
Vec::new()
};
(misses, llm_cases)
} else {
(Vec::new(), Vec::new())
};
Self {
overview,
dataset,
slice,
retrieval,
llm,
performance,
misses,
llm_cases,
detailed_report: summary.detailed_report,
}
}
}
impl MissEntry {
fn from_case(case: &CaseSummary) -> Self {
Self {
question_id: case.question_id.clone(),
paragraph_title: case.paragraph_title.clone(),
expected_source: case.expected_source.clone(),
entity_match: case.entity_match,
chunk_text_match: case.chunk_text_match,
chunk_id_match: case.chunk_id_match,
retrieved: case
.retrieved
.iter()
.take(3)
.map(RetrievedSnippet::from_summary)
.collect(),
}
}
}
impl LlmCaseEntry {
fn from_case(case: &CaseSummary) -> Self {
Self {
question_id: case.question_id.clone(),
answered: case.matched,
match_rank: case.match_rank,
retrieved: case
.retrieved
.iter()
.take(3)
.map(RetrievedSnippet::from_summary)
.collect(),
}
}
}
impl RetrievedSnippet {
fn from_summary(entry: &crate::eval::RetrievedSummary) -> Self {
Self {
rank: entry.rank,
source_id: entry.source_id.clone(),
entity_name: entry.entity_name.clone(),
matched: entry.matched,
}
}
}
pub fn write_reports(
summary: &EvaluationSummary,
report_dir: &Path,
sample: usize,
) -> Result<ReportOutcome> {
fs::create_dir_all(report_dir)
.with_context(|| format!("creating report directory {}", report_dir.display()))?;
let dataset_dir = dataset_report_dir(report_dir, &summary.dataset_id);
fs::create_dir_all(&dataset_dir).with_context(|| {
format!(
"creating dataset report directory {}",
dataset_dir.display()
)
})?;
let stem = build_report_stem(summary);
let report = EvaluationReport::from_summary(summary, sample);
let json_path = dataset_dir.join(format!("{stem}.json"));
let json_blob = serde_json::to_string_pretty(&report).context("serialising JSON report")?;
fs::write(&json_path, &json_blob)
.with_context(|| format!("writing JSON report to {}", json_path.display()))?;
let md_path = dataset_dir.join(format!("{stem}.md"));
let markdown = render_markdown(&report);
fs::write(&md_path, &markdown)
.with_context(|| format!("writing Markdown report to {}", md_path.display()))?;
// Keep a latest.json pointer to simplify automation.
let latest_json = dataset_dir.join("latest.json");
fs::write(&latest_json, json_blob)
.with_context(|| format!("writing latest JSON report to {}", latest_json.display()))?;
let latest_md = dataset_dir.join("latest.md");
fs::write(&latest_md, markdown)
.with_context(|| format!("writing latest Markdown report to {}", latest_md.display()))?;
let history_path = record_history(&report, &dataset_dir)?;
Ok(ReportOutcome {
record: report,
paths: ReportPaths {
json: json_path,
markdown: md_path,
},
history_path,
})
}
fn render_markdown(report: &EvaluationReport) -> String {
let mut md = String::new();
md.push_str(&format!(
"# Retrieval Evaluation (k={})\\n\\n",
report.retrieval.k
));
md.push_str("## Overview\\n\\n");
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
md.push_str(&format!(
"| Generated | {} |\\n",
report.overview.generated_at
));
md.push_str(&format!(
"| Run Label | {} |\\n",
report
.overview
.run_label
.as_deref()
.filter(|label| !label.is_empty())
.unwrap_or("-")
));
md.push_str(&format!(
"| Total Cases | {} |\\n",
report.overview.total_cases
));
md.push_str(&format!(
"| Filtered Questions | {} |\\n",
report.overview.filtered_questions
));
md.push_str("\\n## Dataset & Slice\\n\\n");
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
md.push_str(&format!(
"| Dataset | {} (`{}`) |\\n",
report.dataset.label, report.dataset.id
));
md.push_str(&format!(
"| Dataset Source | {} |\\n",
report.dataset.source
));
md.push_str(&format!(
"| Includes Unanswerable | {} |\\n",
bool_badge(report.dataset.includes_unanswerable)
));
md.push_str(&format!(
"| Require Verified Chunks | {} |\\n",
bool_badge(report.dataset.require_verified_chunks)
));
let embedding_label = if let Some(model) = report.dataset.embedding_model.as_ref() {
format!("{} ({model})", report.dataset.embedding_backend)
} else {
report.dataset.embedding_backend.clone()
};
md.push_str(&format!("| Embedding | {} |\\n", embedding_label));
md.push_str(&format!(
"| Embedding Dim | {} |\\n",
report.dataset.embedding_dimension
));
md.push_str(&format!("| Slice ID | `{}` |\\n", report.slice.id));
md.push_str(&format!("| Slice Seed | {} |\\n", report.slice.seed));
md.push_str(&format!(
"| Slice Window (offset/length) | {}/{} |\\n",
report.slice.window_offset, report.slice.window_length
));
md.push_str(&format!(
"| Slice Questions (window/ledger) | {}/{} |\\n",
report.slice.slice_cases, report.slice.ledger_total_cases
));
md.push_str(&format!(
"| Slice Positives / Negatives | {}/{} |\\n",
report.slice.positives, report.slice.negatives
));
md.push_str(&format!(
"| Slice Paragraphs | {} |\\n",
report.slice.total_paragraphs
));
md.push_str(&format!(
"| Negative Multiplier | {:.2} |\\n",
report.slice.negative_multiplier
));
md.push_str("\\n## Retrieval Metrics\\n\\n");
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
md.push_str(&format!("| Cases | {} |\\n", report.retrieval.cases));
md.push_str(&format!(
"| Correct@{} | {}/{} |\\n",
report.retrieval.k, report.retrieval.correct, report.retrieval.cases
));
md.push_str(&format!(
"| Precision@{} | {:.3} |\\n",
report.retrieval.k, report.retrieval.precision
));
md.push_str(&format!(
"| Precision@1/2/3 | {:.3} / {:.3} / {:.3} |\\n",
report.retrieval.precision_at_1,
report.retrieval.precision_at_2,
report.retrieval.precision_at_3
));
md.push_str(&format!("| MRR | {:.3} |\\n", report.retrieval.mrr));
md.push_str(&format!(
"| NDCG | {:.3} |\\n",
report.retrieval.average_ndcg
));
md.push_str(&format!(
"| Latency Avg / P50 / P95 (ms) | {:.1} / {} / {} |\\n",
report.retrieval.latency.avg, report.retrieval.latency.p50, report.retrieval.latency.p95
));
md.push_str(&format!(
"| Strategy | `{}` |\\n",
report.retrieval.strategy
));
md.push_str(&format!(
"| Concurrency | {} |\\n",
report.retrieval.concurrency
));
if report.retrieval.rerank_enabled {
let pool = report
.retrieval
.rerank_pool_size
.map(|size| size.to_string())
.unwrap_or_else(|| "?".into());
md.push_str(&format!(
"| Rerank | enabled (pool {pool}, keep top {}) |\\n",
report.retrieval.rerank_keep_top
));
} else {
md.push_str("| Rerank | disabled |\\n");
}
if let Some(llm) = &report.llm {
md.push_str("\\n## LLM Mode Metrics\\n\\n");
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
md.push_str(&format!("| Cases | {} |\\n", llm.cases));
md.push_str(&format!("| Answered | {} |\\n", llm.answered));
md.push_str(&format!("| Precision | {:.3} |\\n", llm.precision));
}
md.push_str("\\n## Performance\\n\\n");
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
md.push_str(&format!(
"| OpenAI Base URL | {} |\\n",
report.performance.openai_base_url
));
md.push_str(&format!(
"| Ingestion Duration | {} ms |\\n",
report.performance.ingestion_ms
));
if let Some(seed) = report.performance.namespace_seed_ms {
md.push_str(&format!("| Namespace Seed | {} ms |\\n", seed));
}
md.push_str(&format!(
"| Namespace State | {} |\\n",
if report.performance.namespace_reused {
"reused"
} else {
"seeded"
}
));
md.push_str(&format!(
"| Corpus Paragraphs | {} |\\n",
report.performance.corpus_paragraphs
));
if report.detailed_report {
md.push_str(&format!(
"| Ingestion Cache | `{}` |\\n",
report.performance.ingestion_cache_path
));
md.push_str(&format!(
"| Ingestion Reused | {} |\\n",
bool_badge(report.performance.ingestion_reused)
));
md.push_str(&format!(
"| Embeddings Reused | {} |\\n",
bool_badge(report.performance.embeddings_reused)
));
}
md.push_str(&format!(
"| Positives Cached | {} |\\n",
report.performance.positive_paragraphs_reused
));
md.push_str(&format!(
"| Negatives Cached | {} |\\n",
report.performance.negative_paragraphs_reused
));
md.push_str("\\n## Retrieval Stage Timings\\n\\n");
md.push_str("| Stage | Avg (ms) | P50 (ms) | P95 (ms) |\\n| --- | --- | --- | --- |\\n");
write_stage_row(&mut md, "Embed", &report.performance.stage_latency.embed);
write_stage_row(
&mut md,
"Collect Candidates",
&report.performance.stage_latency.collect_candidates,
);
write_stage_row(
&mut md,
"Graph Expansion",
&report.performance.stage_latency.graph_expansion,
);
write_stage_row(
&mut md,
"Chunk Attach",
&report.performance.stage_latency.chunk_attach,
);
write_stage_row(&mut md, "Rerank", &report.performance.stage_latency.rerank);
write_stage_row(
&mut md,
"Assemble",
&report.performance.stage_latency.assemble,
);
if report.misses.is_empty() {
if report.detailed_report {
md.push_str(
"\\n_All evaluated retrieval queries matched within the top-k window._\\n\
\\nSuccess measures were captured for each query (entity, chunk text, chunk ID).\\n",
);
} else {
md.push_str(
"\\n_Misses omitted. Re-run with `--detailed-report` to see sampled failures._\\n",
);
}
} else {
md.push_str("\\n## Missed Retrieval Queries (sample)\\n\\n");
if report.detailed_report {
md.push_str(
"| Question ID | Paragraph | Expected Source | Entity Match | Chunk Text | Chunk ID | Top Retrieved |\\n",
);
md.push_str("| --- | --- | --- | --- | --- | --- | --- |\\n");
} else {
md.push_str("| Question ID | Paragraph | Expected Source | Top Retrieved |\\n");
md.push_str("| --- | --- | --- | --- |\\n");
}
for case in &report.misses {
let retrieved = render_retrieved(&case.retrieved);
if report.detailed_report {
md.push_str(&format!(
"| `{}` | {} | `{}` | {} | {} | {} | {} |\\n",
case.question_id,
case.paragraph_title,
case.expected_source,
bool_badge(case.entity_match),
bool_badge(case.chunk_text_match),
bool_badge(case.chunk_id_match),
retrieved
));
} else {
md.push_str(&format!(
"| `{}` | {} | `{}` | {} |\\n",
case.question_id, case.paragraph_title, case.expected_source, retrieved
));
}
}
}
if report.llm.is_some() {
md.push_str("\\n## LLM-Only Cases (sample)\\n\\n");
if report.llm_cases.is_empty() {
if report.detailed_report {
md.push_str("All LLM-only cases matched within the evaluation window.\\n");
} else {
md.push_str(
"LLM-only cases omitted. Re-run with `--detailed-report` to see samples.\\n",
);
}
} else {
md.push_str("| Question ID | Answered | Match Rank | Top Retrieved |\\n");
md.push_str("| --- | --- | --- | --- |\\n");
for case in &report.llm_cases {
let retrieved = render_retrieved(&case.retrieved);
let rank = case
.match_rank
.map(|rank| rank.to_string())
.unwrap_or_else(|| "-".into());
md.push_str(&format!(
"| `{}` | {} | {} | {} |\\n",
case.question_id,
bool_badge(case.answered),
rank,
retrieved
));
}
}
}
md
}
fn write_stage_row(buf: &mut String, label: &str, stats: &LatencyStats) {
buf.push_str(&format!(
"| {} | {:.1} | {} | {} |\n",
label, stats.avg, stats.p50, stats.p95
));
}
fn bool_badge(value: bool) -> &'static str {
if value {
""
} else {
""
}
}
fn render_retrieved(entries: &[RetrievedSnippet]) -> String {
if entries.is_empty() {
"-".to_string()
} else {
entries
.iter()
.map(|entry| format!("{} (rank {})", entry.source_id, entry.rank))
.take(3)
.collect::<Vec<_>>()
.join("<br>")
}
}
fn build_report_stem(summary: &EvaluationSummary) -> String {
let timestamp = summary.generated_at.format("%Y%m%dT%H%M%S");
let backend = sanitize_component(&summary.embedding_backend);
let dataset_component = sanitize_component(&summary.dataset_id);
let model_component = summary
.embedding_model
.as_ref()
.map(|model| sanitize_component(model));
match model_component {
Some(model) => format!(
"precision_at_{}_{}_{}_{}_{}",
summary.k, dataset_component, timestamp, backend, model
),
None => format!(
"precision_at_{}_{}_{}_{}",
summary.k, dataset_component, timestamp, backend
),
}
}
fn sanitize_component(input: &str) -> String {
input
.chars()
.map(|ch| if ch.is_ascii_alphanumeric() { ch } else { '_' })
.collect()
}
pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf {
report_dir.join(sanitize_component(dataset_id))
}
#[derive(Debug, Serialize, Deserialize)]
struct LegacyHistoryEntry {
generated_at: String,
run_label: Option<String>,
dataset_id: String,
dataset_label: String,
slice_id: String,
slice_seed: u64,
slice_window_offset: usize,
slice_window_length: usize,
slice_cases: usize,
slice_total_cases: usize,
k: usize,
limit: Option<usize>,
precision: f64,
precision_at_1: f64,
precision_at_2: f64,
precision_at_3: f64,
#[serde(default)]
mrr: f64,
#[serde(default)]
average_ndcg: f64,
#[serde(default)]
retrieval_cases: usize,
#[serde(default)]
retrieval_precision: f64,
#[serde(default)]
llm_cases: usize,
#[serde(default)]
llm_precision: f64,
duration_ms: u128,
latency_ms: LatencyStats,
embedding_backend: String,
embedding_model: Option<String>,
ingestion_reused: bool,
ingestion_embeddings_reused: bool,
rerank_enabled: bool,
rerank_keep_top: usize,
rerank_pool_size: Option<usize>,
#[serde(default)]
chunk_result_cap: Option<usize>,
#[serde(default)]
ingest_chunk_min_tokens: Option<usize>,
#[serde(default)]
ingest_chunk_max_tokens: Option<usize>,
#[serde(default)]
ingest_chunk_overlap_tokens: Option<usize>,
#[serde(default)]
ingest_chunks_only: Option<bool>,
#[serde(default)]
delta: Option<LegacyHistoryDelta>,
openai_base_url: String,
ingestion_ms: u128,
#[serde(default)]
namespace_seed_ms: Option<u128>,
}
#[derive(Debug, Serialize, Deserialize)]
struct LegacyHistoryDelta {
precision: f64,
precision_at_1: f64,
latency_avg_ms: f64,
}
fn default_stage_latency() -> StageLatencyBreakdown {
StageLatencyBreakdown {
embed: LatencyStats::default(),
collect_candidates: LatencyStats::default(),
graph_expansion: LatencyStats::default(),
chunk_attach: LatencyStats::default(),
rerank: LatencyStats::default(),
assemble: LatencyStats::default(),
}
}
fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport {
let overview = OverviewSection {
generated_at: entry.generated_at,
run_label: entry.run_label,
total_cases: entry.slice_cases,
filtered_questions: 0,
};
let dataset = DatasetSection {
id: entry.dataset_id,
label: entry.dataset_label,
source: String::new(),
includes_unanswerable: entry.llm_cases > 0,
require_verified_chunks: true,
embedding_backend: entry.embedding_backend,
embedding_model: entry.embedding_model,
embedding_dimension: 0,
};
let slice = SliceSection {
id: entry.slice_id,
seed: entry.slice_seed,
window_offset: entry.slice_window_offset,
window_length: entry.slice_window_length,
slice_cases: entry.slice_cases,
ledger_total_cases: entry.slice_total_cases,
positives: 0,
negatives: 0,
total_paragraphs: 0,
negative_multiplier: 0.0,
};
let retrieval_cases = if entry.retrieval_cases > 0 {
entry.retrieval_cases
} else {
entry.slice_cases.saturating_sub(entry.llm_cases)
};
let retrieval_precision = if entry.retrieval_precision > 0.0 {
entry.retrieval_precision
} else {
entry.precision
};
let retrieval = RetrievalSection {
k: entry.k,
cases: retrieval_cases,
correct: 0,
precision: retrieval_precision,
precision_at_1: entry.precision_at_1,
precision_at_2: entry.precision_at_2,
precision_at_3: entry.precision_at_3,
mrr: entry.mrr,
average_ndcg: entry.average_ndcg,
latency: entry.latency_ms,
concurrency: 0,
strategy: "unknown".into(),
rerank_enabled: entry.rerank_enabled,
rerank_pool_size: entry.rerank_pool_size,
rerank_keep_top: entry.rerank_keep_top,
chunk_result_cap: entry.chunk_result_cap.unwrap_or(5),
chunk_rrf_k: default_chunk_rrf_k(),
chunk_rrf_vector_weight: default_chunk_rrf_weight(),
chunk_rrf_fts_weight: default_chunk_rrf_weight(),
chunk_rrf_use_vector: default_chunk_rrf_use(),
chunk_rrf_use_fts: default_chunk_rrf_use(),
chunk_vector_take: 0,
chunk_fts_take: 0,
ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256),
ingest_chunk_max_tokens: entry.ingest_chunk_max_tokens.unwrap_or(512),
ingest_chunk_overlap_tokens: entry.ingest_chunk_overlap_tokens.unwrap_or(50),
ingest_chunks_only: entry.ingest_chunks_only.unwrap_or(false),
};
let llm = if entry.llm_cases > 0 {
Some(LlmSection {
cases: entry.llm_cases,
answered: 0,
precision: entry.llm_precision,
})
} else {
None
};
let performance = PerformanceSection {
openai_base_url: entry.openai_base_url,
ingestion_ms: entry.ingestion_ms,
namespace_seed_ms: entry.namespace_seed_ms,
evaluation_stages_ms: EvaluationStageTimings::default(),
stage_latency: default_stage_latency(),
namespace_reused: false,
ingestion_reused: entry.ingestion_reused,
embeddings_reused: entry.ingestion_embeddings_reused,
ingestion_cache_path: String::new(),
corpus_paragraphs: 0,
positive_paragraphs_reused: 0,
negative_paragraphs_reused: 0,
};
EvaluationReport {
overview,
dataset,
slice,
retrieval,
llm,
performance,
misses: Vec::new(),
llm_cases: Vec::new(),
detailed_report: false,
}
}
fn load_history(path: &Path) -> Result<Vec<EvaluationReport>> {
if !path.exists() {
return Ok(Vec::new());
}
let contents =
fs::read(path).with_context(|| format!("reading evaluation log {}", path.display()))?;
if let Ok(entries) = serde_json::from_slice::<Vec<EvaluationReport>>(&contents) {
return Ok(entries);
}
match serde_json::from_slice::<Vec<LegacyHistoryEntry>>(&contents) {
Ok(entries) => Ok(entries.into_iter().map(convert_legacy_entry).collect()),
Err(err) => {
let timestamp = Utc::now().format("%Y%m%dT%H%M%S");
let backup_path = path
.parent()
.unwrap_or_else(|| Path::new("."))
.join(format!("evaluations.json.corrupted.{timestamp}"));
warn!(
path = %path.display(),
backup = %backup_path.display(),
error = %err,
"Evaluation history file is corrupted; backing up and starting fresh"
);
if let Err(e) = fs::rename(path, &backup_path) {
warn!(
path = %path.display(),
error = %e,
"Failed to backup corrupted evaluation history"
);
}
Ok(Vec::new())
}
}
}
fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result<PathBuf> {
let path = report_dir.join("evaluations.json");
let mut entries = load_history(&path)?;
entries.push(report.clone());
let blob = serde_json::to_vec_pretty(&entries).context("serialising evaluation log")?;
fs::write(&path, blob).with_context(|| format!("writing evaluation log {}", path.display()))?;
Ok(path)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::eval::{
EvaluationStageTimings, PerformanceTimings, RetrievedSummary, StageLatencyBreakdown,
};
use chrono::Utc;
use tempfile::tempdir;
fn latency(ms: f64) -> LatencyStats {
LatencyStats {
avg: ms,
p50: ms as u128,
p95: ms as u128,
}
}
fn sample_stage_latency() -> StageLatencyBreakdown {
StageLatencyBreakdown {
embed: latency(9.0),
collect_candidates: latency(10.0),
graph_expansion: latency(11.0),
chunk_attach: latency(12.0),
rerank: latency(13.0),
assemble: latency(14.0),
}
}
fn sample_eval_stage() -> EvaluationStageTimings {
EvaluationStageTimings {
prepare_slice_ms: 1,
prepare_db_ms: 2,
prepare_corpus_ms: 3,
prepare_namespace_ms: 4,
run_queries_ms: 5,
summarize_ms: 6,
finalize_ms: 7,
}
}
fn sample_case(is_impossible: bool, matched: bool) -> CaseSummary {
CaseSummary {
question_id: if is_impossible {
"llm-q".into()
} else {
"retrieval-q".into()
},
question: "Who is the hero?".into(),
paragraph_id: "p1".into(),
paragraph_title: "Hero".into(),
expected_source: "src1".into(),
answers: vec!["answer".into()],
matched,
entity_match: matched,
chunk_text_match: matched,
chunk_id_match: matched,
ndcg: None,
reciprocal_rank: None,
is_impossible,
has_verified_chunks: !is_impossible,
match_rank: if matched { Some(1) } else { None },
latency_ms: 42,
retrieved: vec![RetrievedSummary {
rank: 1,
entity_id: "entity1".into(),
source_id: "src1".into(),
entity_name: "Entity".into(),
score: 1.0,
matched,
entity_description: None,
entity_category: None,
chunk_text_match: Some(matched),
chunk_id_match: Some(matched),
}],
}
}
fn sample_summary(include_llm: bool) -> EvaluationSummary {
let mut cases = vec![sample_case(false, true)];
if include_llm {
cases.push(sample_case(true, false));
}
EvaluationSummary {
generated_at: Utc::now(),
k: 5,
limit: Some(10),
run_label: Some("test".into()),
total_cases: cases.len(),
correct: 1,
precision: 1.0,
correct_at_1: 1,
correct_at_2: 1,
correct_at_3: 1,
precision_at_1: 1.0,
precision_at_2: 1.0,
precision_at_3: 1.0,
duration_ms: 100,
dataset_id: "ds".into(),
dataset_label: "Dataset".into(),
dataset_includes_unanswerable: include_llm,
dataset_source: "dev".into(),
includes_impossible_cases: include_llm,
require_verified_chunks: !include_llm,
filtered_questions: 0,
retrieval_cases: 1,
retrieval_correct: 1,
retrieval_precision: 1.0,
average_ndcg: 0.0,
mrr: 0.0,
llm_cases: if include_llm { 1 } else { 0 },
llm_answered: 0,
llm_precision: 0.0,
slice_id: "slice".into(),
slice_seed: 1,
slice_total_cases: cases.len(),
slice_window_offset: 0,
slice_window_length: cases.len(),
slice_cases: cases.len(),
slice_positive_paragraphs: 1,
slice_negative_paragraphs: 0,
slice_total_paragraphs: 1,
slice_negative_multiplier: 1.0,
namespace_reused: true,
corpus_paragraphs: 1,
ingestion_cache_path: "/cache".into(),
ingestion_reused: true,
ingestion_embeddings_reused: true,
ingestion_fingerprint: "fp".into(),
positive_paragraphs_reused: 1,
negative_paragraphs_reused: 0,
latency_ms: latency(10.0),
perf: PerformanceTimings {
openai_base_url: "https://example.com".into(),
ingestion_ms: 100,
namespace_seed_ms: Some(50),
evaluation_stage_ms: sample_eval_stage(),
stage_latency: sample_stage_latency(),
},
embedding_backend: "fastembed".into(),
embedding_model: Some("model".into()),
embedding_dimension: 32,
rerank_enabled: true,
rerank_pool_size: Some(4),
rerank_keep_top: 5,
concurrency: 2,
detailed_report: true,
retrieval_strategy: "initial".into(),
chunk_result_cap: 5,
chunk_rrf_k: 60.0,
chunk_rrf_vector_weight: 1.0,
chunk_rrf_fts_weight: 1.0,
chunk_rrf_use_vector: true,
chunk_rrf_use_fts: true,
ingest_chunk_min_tokens: 256,
ingest_chunk_max_tokens: 512,
ingest_chunk_overlap_tokens: 50,
ingest_chunks_only: false,
chunk_vector_take: 50,
chunk_fts_take: 50,
chunk_avg_chars_per_token: 4,
max_chunks_per_entity: 4,
cases,
}
}
#[test]
fn markdown_includes_llm_section() {
let summary = sample_summary(true);
let report = EvaluationReport::from_summary(&summary, 5);
let md = render_markdown(&report);
assert!(md.contains("LLM Mode Metrics"));
assert!(md.contains("LLM-Only Cases (sample)"));
}
#[test]
fn markdown_hides_llm_section_when_not_present() {
let summary = sample_summary(false);
let report = EvaluationReport::from_summary(&summary, 5);
let md = render_markdown(&report);
assert!(!md.contains("LLM Mode Metrics"));
assert!(!md.contains("LLM-Only Cases"));
}
#[test]
fn evaluations_history_captures_strategy_and_concurrency() {
let tmp = tempdir().unwrap();
let summary = sample_summary(false);
let outcome = write_reports(&summary, tmp.path(), 5).expect("writing consolidated reports");
let contents =
std::fs::read_to_string(&outcome.history_path).expect("reading evaluations history");
let entries: Vec<EvaluationReport> =
serde_json::from_str(&contents).expect("parsing evaluations history");
assert_eq!(entries.len(), 1);
let stored = &entries[0];
assert_eq!(stored.retrieval.concurrency, summary.concurrency);
assert_eq!(stored.retrieval.strategy, summary.retrieval_strategy);
assert_eq!(
stored.performance.evaluation_stages_ms.run_queries_ms,
summary.perf.evaluation_stage_ms.run_queries_ms
);
}
}