minne/eval/src/report.rs
Commit 0eda65b07e (Per Stark, 2025-11-18 11:50:15 +01:00): benchmarks: v1 - Benchmarking ingestion, retrieval precision and performance


use std::{
    fs,
    path::{Path, PathBuf},
};

use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};

use crate::eval::{format_timestamp, CaseSummary, EvaluationSummary, LatencyStats};
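
/// Paths of the per-run JSON and Markdown reports produced by [`write_reports`].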
#[derive(Debug)]
pub struct ReportPaths {
    pub json: PathBuf,
    pub markdown: PathBuf,
}
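
/// Writes the JSON and Markdown reports for an evaluation run into a per-dataset
/// subdirectory of `report_dir`, refreshes the `latest.json`/`latest.md` copies,
/// appends the run to the history log, and returns the paths of the run-specific files.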
pub fn write_reports(
    summary: &EvaluationSummary,
    report_dir: &Path,
    sample: usize,
) -> Result<ReportPaths> {
    fs::create_dir_all(report_dir)
        .with_context(|| format!("creating report directory {}", report_dir.display()))?;
    let dataset_dir = dataset_report_dir(report_dir, &summary.dataset_id);
    fs::create_dir_all(&dataset_dir).with_context(|| {
        format!(
            "creating dataset report directory {}",
            dataset_dir.display()
        )
    })?;

    let stem = build_report_stem(summary);
    let json_path = dataset_dir.join(format!("{stem}.json"));
    let json_blob = serde_json::to_string_pretty(summary).context("serialising JSON report")?;
    fs::write(&json_path, &json_blob)
        .with_context(|| format!("writing JSON report to {}", json_path.display()))?;

    let md_path = dataset_dir.join(format!("{stem}.md"));
    let markdown = render_markdown(summary, sample);
    fs::write(&md_path, &markdown)
        .with_context(|| format!("writing Markdown report to {}", md_path.display()))?;

    // Keep latest.json / latest.md copies to simplify automation.
    let latest_json = dataset_dir.join("latest.json");
    fs::write(&latest_json, json_blob)
        .with_context(|| format!("writing latest JSON report to {}", latest_json.display()))?;
    let latest_md = dataset_dir.join("latest.md");
    fs::write(&latest_md, markdown)
        .with_context(|| format!("writing latest Markdown report to {}", latest_md.display()))?;

    record_history(summary, &dataset_dir)?;

    Ok(ReportPaths {
        json: json_path,
        markdown: md_path,
    })
}
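
/// Renders the Markdown report: a metric/value summary table, a per-stage retrieval
/// latency table, and a sample of missed queries capped at `sample` rows (or a note
/// when every query matched within the top-k window).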
fn render_markdown(summary: &EvaluationSummary, sample: usize) -> String {
    let mut md = String::new();
    md.push_str(&format!("# Retrieval Precision@{}\n\n", summary.k));
    md.push_str("| Metric | Value |\n");
    md.push_str("| --- | --- |\n");
    md.push_str(&format!(
        "| Generated | {} |\n",
        format_timestamp(&summary.generated_at)
    ));
    md.push_str(&format!(
        "| Dataset | {} (`{}`) |\n",
        summary.dataset_label, summary.dataset_id
    ));
    md.push_str(&format!(
        "| Run Label | {} |\n",
        summary
            .run_label
            .as_deref()
            .filter(|label| !label.is_empty())
            .unwrap_or("-")
    ));
    md.push_str(&format!(
        "| Unanswerable Included | {} |\n",
        if summary.dataset_includes_unanswerable {
            "yes"
        } else {
            "no"
        }
    ));
    md.push_str(&format!(
        "| Dataset Source | {} |\n",
        summary.dataset_source
    ));
    md.push_str(&format!(
        "| OpenAI Base URL | {} |\n",
        summary.perf.openai_base_url
    ));
    md.push_str(&format!("| Slice ID | `{}` |\n", summary.slice_id));
    md.push_str(&format!("| Slice Seed | {} |\n", summary.slice_seed));
    md.push_str(&format!(
        "| Slice Total Questions | {} |\n",
        summary.slice_total_cases
    ));
    md.push_str(&format!(
        "| Slice Window (offset/length) | {}/{} |\n",
        summary.slice_window_offset, summary.slice_window_length
    ));
    md.push_str(&format!(
        "| Slice Window Questions | {} |\n",
        summary.slice_cases
    ));
    md.push_str(&format!(
        "| Slice Negatives | {} |\n",
        summary.slice_negative_paragraphs
    ));
    md.push_str(&format!(
        "| Slice Total Paragraphs | {} |\n",
        summary.slice_total_paragraphs
    ));
    md.push_str(&format!(
        "| Slice Negative Multiplier | {:.2} |\n",
        summary.slice_negative_multiplier
    ));
    md.push_str(&format!(
        "| Namespace State | {} |\n",
        if summary.namespace_reused {
            "reused"
        } else {
            "seeded"
        }
    ));
    md.push_str(&format!(
        "| Corpus Paragraphs | {} |\n",
        summary.corpus_paragraphs
    ));
    md.push_str(&format!(
        "| Ingestion Duration | {} ms |\n",
        summary.perf.ingestion_ms
    ));
    if let Some(seed) = summary.perf.namespace_seed_ms {
        md.push_str(&format!("| Namespace Seed | {} ms |\n", seed));
    }
    if summary.detailed_report {
        md.push_str(&format!(
            "| Ingestion Cache | `{}` |\n",
            summary.ingestion_cache_path
        ));
        md.push_str(&format!(
            "| Ingestion Reused | {} |\n",
            if summary.ingestion_reused {
                "yes"
            } else {
                "no"
            }
        ));
        md.push_str(&format!(
            "| Embeddings Reused | {} |\n",
            if summary.ingestion_embeddings_reused {
                "yes"
            } else {
                "no"
            }
        ));
    }
    md.push_str(&format!(
        "| Positives Cached | {} |\n",
        summary.positive_paragraphs_reused
    ));
    md.push_str(&format!(
        "| Negatives Cached | {} |\n",
        summary.negative_paragraphs_reused
    ));
    let embedding_label = if let Some(model) = summary.embedding_model.as_ref() {
        format!("{} ({model})", summary.embedding_backend)
    } else {
        summary.embedding_backend.clone()
    };
    md.push_str(&format!("| Embedding | {} |\n", embedding_label));
    md.push_str(&format!(
        "| Embedding Dim | {} |\n",
        summary.embedding_dimension
    ));
    if let Some(limit) = summary.limit {
        md.push_str(&format!(
            "| Evaluated Queries | {} (limit {}) |\n",
            summary.total_cases, limit
        ));
    } else {
        md.push_str(&format!(
            "| Evaluated Queries | {} |\n",
            summary.total_cases
        ));
    }
    if summary.rerank_enabled {
        let pool = summary
            .rerank_pool_size
            .map(|size| size.to_string())
            .unwrap_or_else(|| "?".to_string());
        md.push_str(&format!(
            "| Rerank | enabled (pool {pool}, keep top {}) |\n",
            summary.rerank_keep_top
        ));
    } else {
        md.push_str("| Rerank | disabled |\n");
    }
    md.push_str(&format!("| Concurrency | {} |\n", summary.concurrency));
    md.push_str(&format!(
        "| Correct@{} | {}/{} |\n",
        summary.k, summary.correct, summary.total_cases
    ));
    md.push_str(&format!(
        "| Precision@{} | {:.3} |\n",
        summary.k, summary.precision
    ));
    md.push_str(&format!(
        "| Precision@1 | {:.3} |\n",
        summary.precision_at_1
    ));
    md.push_str(&format!(
        "| Precision@2 | {:.3} |\n",
        summary.precision_at_2
    ));
    md.push_str(&format!(
        "| Precision@3 | {:.3} |\n",
        summary.precision_at_3
    ));
    md.push_str(&format!("| Duration | {} ms |\n", summary.duration_ms));
    md.push_str(&format!(
        "| Latency Avg (ms) | {:.1} |\n",
        summary.latency_ms.avg
    ));
    md.push_str(&format!(
        "| Latency P50 (ms) | {} |\n",
        summary.latency_ms.p50
    ));
    md.push_str(&format!(
        "| Latency P95 (ms) | {} |\n",
        summary.latency_ms.p95
    ));

    md.push_str("\n## Retrieval Stage Timings\n\n");
    md.push_str("| Stage | Avg (ms) | P50 (ms) | P95 (ms) |\n");
    md.push_str("| --- | --- | --- | --- |\n");
    write_stage_row(
        &mut md,
        "Collect Candidates",
        &summary.perf.stage_latency.collect_candidates,
    );
    write_stage_row(
        &mut md,
        "Graph Expansion",
        &summary.perf.stage_latency.graph_expansion,
    );
    write_stage_row(
        &mut md,
        "Chunk Attach",
        &summary.perf.stage_latency.chunk_attach,
    );
    write_stage_row(&mut md, "Rerank", &summary.perf.stage_latency.rerank);
    write_stage_row(&mut md, "Assemble", &summary.perf.stage_latency.assemble);
    let misses: Vec<&CaseSummary> = summary.cases.iter().filter(|case| !case.matched).collect();
    if !misses.is_empty() {
        md.push_str("\n## Missed Queries (sample)\n\n");
        if summary.detailed_report {
            md.push_str(
                "| Question ID | Paragraph | Expected Source | Entity Match | Chunk Text | Chunk ID | Top Retrieved |\n",
            );
            md.push_str("| --- | --- | --- | --- | --- | --- | --- |\n");
        } else {
            md.push_str("| Question ID | Paragraph | Expected Source | Top Retrieved |\n");
            md.push_str("| --- | --- | --- | --- |\n");
        }
        for case in misses.iter().take(sample) {
            let retrieved = case
                .retrieved
                .iter()
                .map(|entry| format!("{} (rank {})", entry.source_id, entry.rank))
                .take(3)
                .collect::<Vec<_>>()
                .join("<br>");
            if summary.detailed_report {
                md.push_str(&format!(
                    "| `{}` | {} | `{}` | {} | {} | {} | {} |\n",
                    case.question_id,
                    case.paragraph_title,
                    case.expected_source,
                    bool_badge(case.entity_match),
                    bool_badge(case.chunk_text_match),
                    bool_badge(case.chunk_id_match),
                    retrieved
                ));
            } else {
                md.push_str(&format!(
                    "| `{}` | {} | `{}` | {} |\n",
                    case.question_id, case.paragraph_title, case.expected_source, retrieved
                ));
            }
        }
    } else {
        md.push_str("\n_All evaluated queries matched within the top-k window._\n");
        if summary.detailed_report {
            md.push_str(
                "\nSuccess measures were captured for each query (entity, chunk text, chunk ID).\n",
            );
        }
    }

    md
}
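
/// Appends a single stage-latency row (avg/p50/p95) to the Markdown table.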
fn write_stage_row(buf: &mut String, label: &str, stats: &LatencyStats) {
    buf.push_str(&format!(
        "| {} | {:.1} | {} | {} |\n",
        label, stats.avg, stats.p50, stats.p95
    ));
}
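
/// Maps a per-case match flag to the badge shown in the detailed miss table.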
fn bool_badge(value: bool) -> &'static str {
    if value {
        "✅"
    } else {
        "❌"
    }
}
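
/// Builds the report file stem: `precision_at_{k}_{dataset}_{timestamp}_{backend}`,
/// with the sanitised embedding model appended when one is configured.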
fn build_report_stem(summary: &EvaluationSummary) -> String {
    let timestamp = summary.generated_at.format("%Y%m%dT%H%M%S");
    let backend = sanitize_component(&summary.embedding_backend);
    let dataset_component = sanitize_component(&summary.dataset_id);
    let model_component = summary
        .embedding_model
        .as_ref()
        .map(|model| sanitize_component(model));
    match model_component {
        Some(model) => format!(
            "precision_at_{}_{}_{}_{}_{}",
            summary.k, dataset_component, timestamp, backend, model
        ),
        None => format!(
            "precision_at_{}_{}_{}_{}",
            summary.k, dataset_component, timestamp, backend
        ),
    }
}
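
/// Replaces every character that is not ASCII alphanumeric with `_` so the value is
/// safe to use in file and directory names.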
fn sanitize_component(input: &str) -> String {
    input
        .chars()
        .map(|ch| if ch.is_ascii_alphanumeric() { ch } else { '_' })
        .collect()
}
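
/// Returns the per-dataset report directory (sanitised dataset id) under `report_dir`.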
pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf {
    report_dir.join(sanitize_component(dataset_id))
}
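
/// A single run recorded in the per-dataset `evaluations.json` history log.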
#[derive(Debug, Serialize, Deserialize)]
struct HistoryEntry {
    generated_at: String,
    run_label: Option<String>,
    dataset_id: String,
    dataset_label: String,
    slice_id: String,
    slice_seed: u64,
    slice_window_offset: usize,
    slice_window_length: usize,
    slice_cases: usize,
    slice_total_cases: usize,
    k: usize,
    limit: Option<usize>,
    precision: f64,
    precision_at_1: f64,
    precision_at_2: f64,
    precision_at_3: f64,
    duration_ms: u128,
    latency_ms: LatencyStats,
    embedding_backend: String,
    embedding_model: Option<String>,
    ingestion_reused: bool,
    ingestion_embeddings_reused: bool,
    rerank_enabled: bool,
    rerank_keep_top: usize,
    rerank_pool_size: Option<usize>,
    delta: Option<HistoryDelta>,
    openai_base_url: String,
    ingestion_ms: u128,
    #[serde(default)]
    namespace_seed_ms: Option<u128>,
}
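
/// Change in headline metrics relative to the previous entry in the history log.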
#[derive(Debug, Serialize, Deserialize)]
struct HistoryDelta {
    precision: f64,
    precision_at_1: f64,
    latency_avg_ms: f64,
}
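
/// Appends the current run to `evaluations.json` in the dataset report directory,
/// computing deltas against the most recent prior entry when one exists.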
fn record_history(summary: &EvaluationSummary, report_dir: &Path) -> Result<()> {
    let path = report_dir.join("evaluations.json");
    let mut entries: Vec<HistoryEntry> = if path.exists() {
        let contents = fs::read(&path)
            .with_context(|| format!("reading evaluation log {}", path.display()))?;
        serde_json::from_slice(&contents).unwrap_or_default()
    } else {
        Vec::new()
    };

    let delta = entries.last().map(|prev| HistoryDelta {
        precision: summary.precision - prev.precision,
        precision_at_1: summary.precision_at_1 - prev.precision_at_1,
        latency_avg_ms: summary.latency_ms.avg - prev.latency_ms.avg,
    });

    let entry = HistoryEntry {
        generated_at: format_timestamp(&summary.generated_at),
        run_label: summary.run_label.clone(),
        dataset_id: summary.dataset_id.clone(),
        dataset_label: summary.dataset_label.clone(),
        slice_id: summary.slice_id.clone(),
        slice_seed: summary.slice_seed,
        slice_window_offset: summary.slice_window_offset,
        slice_window_length: summary.slice_window_length,
        slice_cases: summary.slice_cases,
        slice_total_cases: summary.slice_total_cases,
        k: summary.k,
        limit: summary.limit,
        precision: summary.precision,
        precision_at_1: summary.precision_at_1,
        precision_at_2: summary.precision_at_2,
        precision_at_3: summary.precision_at_3,
        duration_ms: summary.duration_ms,
        latency_ms: summary.latency_ms.clone(),
        embedding_backend: summary.embedding_backend.clone(),
        embedding_model: summary.embedding_model.clone(),
        ingestion_reused: summary.ingestion_reused,
        ingestion_embeddings_reused: summary.ingestion_embeddings_reused,
        rerank_enabled: summary.rerank_enabled,
        rerank_keep_top: summary.rerank_keep_top,
        rerank_pool_size: summary.rerank_pool_size,
        delta,
        openai_base_url: summary.perf.openai_base_url.clone(),
        ingestion_ms: summary.perf.ingestion_ms,
        namespace_seed_ms: summary.perf.namespace_seed_ms,
    };
    entries.push(entry);

    let blob = serde_json::to_vec_pretty(&entries).context("serialising evaluation log")?;
    fs::write(&path, blob).with_context(|| format!("writing evaluation log {}", path.display()))?;
    Ok(())
}