// Mirror of https://github.com/perstarkse/minne.git
// Synced 2026-03-14 06:16:19 +01:00 · 457 lines · 14 KiB · Rust
use std::{
    fs, io,
    path::{Path, PathBuf},
};

use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};

use crate::eval::{format_timestamp, CaseSummary, EvaluationSummary, LatencyStats};
|
|
|
|
/// Locations of the timestamped report files produced by [`write_reports`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReportPaths {
    /// Path of the pretty-printed JSON report.
    pub json: PathBuf,
    /// Path of the rendered Markdown report.
    pub markdown: PathBuf,
}
|
|
|
|
pub fn write_reports(
|
|
summary: &EvaluationSummary,
|
|
report_dir: &Path,
|
|
sample: usize,
|
|
) -> Result<ReportPaths> {
|
|
fs::create_dir_all(report_dir)
|
|
.with_context(|| format!("creating report directory {}", report_dir.display()))?;
|
|
let dataset_dir = dataset_report_dir(report_dir, &summary.dataset_id);
|
|
fs::create_dir_all(&dataset_dir).with_context(|| {
|
|
format!(
|
|
"creating dataset report directory {}",
|
|
dataset_dir.display()
|
|
)
|
|
})?;
|
|
|
|
let stem = build_report_stem(summary);
|
|
|
|
let json_path = dataset_dir.join(format!("{stem}.json"));
|
|
let json_blob = serde_json::to_string_pretty(summary).context("serialising JSON report")?;
|
|
fs::write(&json_path, &json_blob)
|
|
.with_context(|| format!("writing JSON report to {}", json_path.display()))?;
|
|
|
|
let md_path = dataset_dir.join(format!("{stem}.md"));
|
|
let markdown = render_markdown(summary, sample);
|
|
fs::write(&md_path, &markdown)
|
|
.with_context(|| format!("writing Markdown report to {}", md_path.display()))?;
|
|
|
|
// Keep a latest.json pointer to simplify automation.
|
|
let latest_json = dataset_dir.join("latest.json");
|
|
fs::write(&latest_json, json_blob)
|
|
.with_context(|| format!("writing latest JSON report to {}", latest_json.display()))?;
|
|
let latest_md = dataset_dir.join("latest.md");
|
|
fs::write(&latest_md, markdown)
|
|
.with_context(|| format!("writing latest Markdown report to {}", latest_md.display()))?;
|
|
|
|
record_history(summary, &dataset_dir)?;
|
|
|
|
Ok(ReportPaths {
|
|
json: json_path,
|
|
markdown: md_path,
|
|
})
|
|
}
|
|
|
|
fn render_markdown(summary: &EvaluationSummary, sample: usize) -> String {
|
|
let mut md = String::new();
|
|
|
|
md.push_str(&format!("# Retrieval Precision@{}\n\n", summary.k));
|
|
md.push_str("| Metric | Value |\n");
|
|
md.push_str("| --- | --- |\n");
|
|
md.push_str(&format!(
|
|
"| Generated | {} |\n",
|
|
format_timestamp(&summary.generated_at)
|
|
));
|
|
md.push_str(&format!(
|
|
"| Dataset | {} (`{}`) |\n",
|
|
summary.dataset_label, summary.dataset_id
|
|
));
|
|
md.push_str(&format!(
|
|
"| Run Label | {} |\n",
|
|
summary
|
|
.run_label
|
|
.as_deref()
|
|
.filter(|label| !label.is_empty())
|
|
.unwrap_or("-")
|
|
));
|
|
md.push_str(&format!(
|
|
"| Unanswerable Included | {} |\n",
|
|
if summary.dataset_includes_unanswerable {
|
|
"yes"
|
|
} else {
|
|
"no"
|
|
}
|
|
));
|
|
md.push_str(&format!(
|
|
"| Dataset Source | {} |\n",
|
|
summary.dataset_source
|
|
));
|
|
md.push_str(&format!(
|
|
"| OpenAI Base URL | {} |\n",
|
|
summary.perf.openai_base_url
|
|
));
|
|
md.push_str(&format!("| Slice ID | `{}` |\n", summary.slice_id));
|
|
md.push_str(&format!("| Slice Seed | {} |\n", summary.slice_seed));
|
|
md.push_str(&format!(
|
|
"| Slice Total Questions | {} |\n",
|
|
summary.slice_total_cases
|
|
));
|
|
md.push_str(&format!(
|
|
"| Slice Window (offset/length) | {}/{} |\n",
|
|
summary.slice_window_offset, summary.slice_window_length
|
|
));
|
|
md.push_str(&format!(
|
|
"| Slice Window Questions | {} |\n",
|
|
summary.slice_cases
|
|
));
|
|
md.push_str(&format!(
|
|
"| Slice Negatives | {} |\n",
|
|
summary.slice_negative_paragraphs
|
|
));
|
|
md.push_str(&format!(
|
|
"| Slice Total Paragraphs | {} |\n",
|
|
summary.slice_total_paragraphs
|
|
));
|
|
md.push_str(&format!(
|
|
"| Slice Negative Multiplier | {:.2} |\n",
|
|
summary.slice_negative_multiplier
|
|
));
|
|
md.push_str(&format!(
|
|
"| Namespace State | {} |\n",
|
|
if summary.namespace_reused {
|
|
"reused"
|
|
} else {
|
|
"seeded"
|
|
}
|
|
));
|
|
md.push_str(&format!(
|
|
"| Corpus Paragraphs | {} |\n",
|
|
summary.corpus_paragraphs
|
|
));
|
|
md.push_str(&format!(
|
|
"| Ingestion Duration | {} ms |\n",
|
|
summary.perf.ingestion_ms
|
|
));
|
|
if let Some(seed) = summary.perf.namespace_seed_ms {
|
|
md.push_str(&format!("| Namespace Seed | {} ms |\n", seed));
|
|
}
|
|
if summary.detailed_report {
|
|
md.push_str(&format!(
|
|
"| Ingestion Cache | `{}` |\n",
|
|
summary.ingestion_cache_path
|
|
));
|
|
md.push_str(&format!(
|
|
"| Ingestion Reused | {} |\n",
|
|
if summary.ingestion_reused {
|
|
"yes"
|
|
} else {
|
|
"no"
|
|
}
|
|
));
|
|
md.push_str(&format!(
|
|
"| Embeddings Reused | {} |\n",
|
|
if summary.ingestion_embeddings_reused {
|
|
"yes"
|
|
} else {
|
|
"no"
|
|
}
|
|
));
|
|
}
|
|
md.push_str(&format!(
|
|
"| Positives Cached | {} |
|
|
",
|
|
summary.positive_paragraphs_reused
|
|
));
|
|
md.push_str(&format!(
|
|
"| Negatives Cached | {} |
|
|
",
|
|
summary.negative_paragraphs_reused
|
|
));
|
|
let embedding_label = if let Some(model) = summary.embedding_model.as_ref() {
|
|
format!("{} ({model})", summary.embedding_backend)
|
|
} else {
|
|
summary.embedding_backend.clone()
|
|
};
|
|
md.push_str(&format!("| Embedding | {} |\n", embedding_label));
|
|
md.push_str(&format!(
|
|
"| Embedding Dim | {} |\n",
|
|
summary.embedding_dimension
|
|
));
|
|
if let Some(limit) = summary.limit {
|
|
md.push_str(&format!(
|
|
"| Evaluated Queries | {} (limit {}) |\n",
|
|
summary.total_cases, limit
|
|
));
|
|
} else {
|
|
md.push_str(&format!(
|
|
"| Evaluated Queries | {} |\n",
|
|
summary.total_cases
|
|
));
|
|
}
|
|
if summary.rerank_enabled {
|
|
let pool = summary
|
|
.rerank_pool_size
|
|
.map(|size| size.to_string())
|
|
.unwrap_or_else(|| "?".to_string());
|
|
md.push_str(&format!(
|
|
"| Rerank | enabled (pool {pool}, keep top {}) |\n",
|
|
summary.rerank_keep_top
|
|
));
|
|
} else {
|
|
md.push_str("| Rerank | disabled |\n");
|
|
}
|
|
md.push_str(&format!("| Concurrency | {} |\n", summary.concurrency));
|
|
md.push_str(&format!(
|
|
"| Correct@{} | {}/{} |\n",
|
|
summary.k, summary.correct, summary.total_cases
|
|
));
|
|
md.push_str(&format!(
|
|
"| Precision@{} | {:.3} |\n",
|
|
summary.k, summary.precision
|
|
));
|
|
md.push_str(&format!(
|
|
"| Precision@1 | {:.3} |\n",
|
|
summary.precision_at_1
|
|
));
|
|
md.push_str(&format!(
|
|
"| Precision@2 | {:.3} |\n",
|
|
summary.precision_at_2
|
|
));
|
|
md.push_str(&format!(
|
|
"| Precision@3 | {:.3} |\n",
|
|
summary.precision_at_3
|
|
));
|
|
md.push_str(&format!("| Duration | {} ms |\n", summary.duration_ms));
|
|
md.push_str(&format!(
|
|
"| Latency Avg (ms) | {:.1} |\n",
|
|
summary.latency_ms.avg
|
|
));
|
|
md.push_str(&format!(
|
|
"| Latency P50 (ms) | {} |\n",
|
|
summary.latency_ms.p50
|
|
));
|
|
md.push_str(&format!(
|
|
"| Latency P95 (ms) | {} |\n",
|
|
summary.latency_ms.p95
|
|
));
|
|
|
|
md.push_str("\n## Retrieval Stage Timings\n\n");
|
|
md.push_str("| Stage | Avg (ms) | P50 (ms) | P95 (ms) |\n");
|
|
md.push_str("| --- | --- | --- | --- |\n");
|
|
write_stage_row(
|
|
&mut md,
|
|
"Collect Candidates",
|
|
&summary.perf.stage_latency.collect_candidates,
|
|
);
|
|
write_stage_row(
|
|
&mut md,
|
|
"Graph Expansion",
|
|
&summary.perf.stage_latency.graph_expansion,
|
|
);
|
|
write_stage_row(
|
|
&mut md,
|
|
"Chunk Attach",
|
|
&summary.perf.stage_latency.chunk_attach,
|
|
);
|
|
write_stage_row(&mut md, "Rerank", &summary.perf.stage_latency.rerank);
|
|
write_stage_row(&mut md, "Assemble", &summary.perf.stage_latency.assemble);
|
|
|
|
let misses: Vec<&CaseSummary> = summary.cases.iter().filter(|case| !case.matched).collect();
|
|
if !misses.is_empty() {
|
|
md.push_str("\n## Missed Queries (sample)\n\n");
|
|
if summary.detailed_report {
|
|
md.push_str(
|
|
"| Question ID | Paragraph | Expected Source | Entity Match | Chunk Text | Chunk ID | Top Retrieved |\n",
|
|
);
|
|
md.push_str("| --- | --- | --- | --- | --- | --- | --- |\n");
|
|
} else {
|
|
md.push_str("| Question ID | Paragraph | Expected Source | Top Retrieved |\n");
|
|
md.push_str("| --- | --- | --- | --- |\n");
|
|
}
|
|
|
|
for case in misses.iter().take(sample) {
|
|
let retrieved = case
|
|
.retrieved
|
|
.iter()
|
|
.map(|entry| format!("{} (rank {})", entry.source_id, entry.rank))
|
|
.take(3)
|
|
.collect::<Vec<_>>()
|
|
.join("<br>");
|
|
if summary.detailed_report {
|
|
md.push_str(&format!(
|
|
"| `{}` | {} | `{}` | {} | {} | {} | {} |\n",
|
|
case.question_id,
|
|
case.paragraph_title,
|
|
case.expected_source,
|
|
bool_badge(case.entity_match),
|
|
bool_badge(case.chunk_text_match),
|
|
bool_badge(case.chunk_id_match),
|
|
retrieved
|
|
));
|
|
} else {
|
|
md.push_str(&format!(
|
|
"| `{}` | {} | `{}` | {} |\n",
|
|
case.question_id, case.paragraph_title, case.expected_source, retrieved
|
|
));
|
|
}
|
|
}
|
|
} else {
|
|
md.push_str("\n_All evaluated queries matched within the top-k window._\n");
|
|
if summary.detailed_report {
|
|
md.push_str(
|
|
"\nSuccess measures were captured for each query (entity, chunk text, chunk ID).\n",
|
|
);
|
|
}
|
|
}
|
|
|
|
md
|
|
}
|
|
|
|
fn write_stage_row(buf: &mut String, label: &str, stats: &LatencyStats) {
|
|
buf.push_str(&format!(
|
|
"| {} | {:.1} | {} | {} |\n",
|
|
label, stats.avg, stats.p50, stats.p95
|
|
));
|
|
}
|
|
|
|
/// Returns the emoji used for a match flag in the detailed missed-queries table:
/// `✅` when `value` is true, `⚪` otherwise.
fn bool_badge(value: bool) -> &'static str {
    if value {
        "✅"
    } else {
        "⚪"
    }
}
|
|
|
|
fn build_report_stem(summary: &EvaluationSummary) -> String {
|
|
let timestamp = summary.generated_at.format("%Y%m%dT%H%M%S");
|
|
let backend = sanitize_component(&summary.embedding_backend);
|
|
let dataset_component = sanitize_component(&summary.dataset_id);
|
|
let model_component = summary
|
|
.embedding_model
|
|
.as_ref()
|
|
.map(|model| sanitize_component(model));
|
|
|
|
match model_component {
|
|
Some(model) => format!(
|
|
"precision_at_{}_{}_{}_{}_{}",
|
|
summary.k, dataset_component, timestamp, backend, model
|
|
),
|
|
None => format!(
|
|
"precision_at_{}_{}_{}_{}",
|
|
summary.k, dataset_component, timestamp, backend
|
|
),
|
|
}
|
|
}
|
|
|
|
fn sanitize_component(input: &str) -> String {
|
|
input
|
|
.chars()
|
|
.map(|ch| if ch.is_ascii_alphanumeric() { ch } else { '_' })
|
|
.collect()
|
|
}
|
|
|
|
pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf {
|
|
report_dir.join(sanitize_component(dataset_id))
|
|
}
|
|
|
|
#[derive(Debug, Serialize, Deserialize)]
|
|
struct HistoryEntry {
|
|
generated_at: String,
|
|
run_label: Option<String>,
|
|
dataset_id: String,
|
|
dataset_label: String,
|
|
slice_id: String,
|
|
slice_seed: u64,
|
|
slice_window_offset: usize,
|
|
slice_window_length: usize,
|
|
slice_cases: usize,
|
|
slice_total_cases: usize,
|
|
k: usize,
|
|
limit: Option<usize>,
|
|
precision: f64,
|
|
precision_at_1: f64,
|
|
precision_at_2: f64,
|
|
precision_at_3: f64,
|
|
duration_ms: u128,
|
|
latency_ms: LatencyStats,
|
|
embedding_backend: String,
|
|
embedding_model: Option<String>,
|
|
ingestion_reused: bool,
|
|
ingestion_embeddings_reused: bool,
|
|
rerank_enabled: bool,
|
|
rerank_keep_top: usize,
|
|
rerank_pool_size: Option<usize>,
|
|
delta: Option<HistoryDelta>,
|
|
openai_base_url: String,
|
|
ingestion_ms: u128,
|
|
#[serde(default)]
|
|
namespace_seed_ms: Option<u128>,
|
|
}
|
|
|
|
#[derive(Debug, Serialize, Deserialize)]
|
|
struct HistoryDelta {
|
|
precision: f64,
|
|
precision_at_1: f64,
|
|
latency_avg_ms: f64,
|
|
}
|
|
|
|
fn record_history(summary: &EvaluationSummary, report_dir: &Path) -> Result<()> {
|
|
let path = report_dir.join("evaluations.json");
|
|
let mut entries: Vec<HistoryEntry> = if path.exists() {
|
|
let contents = fs::read(&path)
|
|
.with_context(|| format!("reading evaluation log {}", path.display()))?;
|
|
serde_json::from_slice(&contents).unwrap_or_default()
|
|
} else {
|
|
Vec::new()
|
|
};
|
|
|
|
let delta = entries.last().map(|prev| HistoryDelta {
|
|
precision: summary.precision - prev.precision,
|
|
precision_at_1: summary.precision_at_1 - prev.precision_at_1,
|
|
latency_avg_ms: summary.latency_ms.avg - prev.latency_ms.avg,
|
|
});
|
|
|
|
let entry = HistoryEntry {
|
|
generated_at: format_timestamp(&summary.generated_at),
|
|
run_label: summary.run_label.clone(),
|
|
dataset_id: summary.dataset_id.clone(),
|
|
dataset_label: summary.dataset_label.clone(),
|
|
slice_id: summary.slice_id.clone(),
|
|
slice_seed: summary.slice_seed,
|
|
slice_window_offset: summary.slice_window_offset,
|
|
slice_window_length: summary.slice_window_length,
|
|
slice_cases: summary.slice_cases,
|
|
slice_total_cases: summary.slice_total_cases,
|
|
k: summary.k,
|
|
limit: summary.limit,
|
|
precision: summary.precision,
|
|
precision_at_1: summary.precision_at_1,
|
|
precision_at_2: summary.precision_at_2,
|
|
precision_at_3: summary.precision_at_3,
|
|
duration_ms: summary.duration_ms,
|
|
latency_ms: summary.latency_ms.clone(),
|
|
embedding_backend: summary.embedding_backend.clone(),
|
|
embedding_model: summary.embedding_model.clone(),
|
|
ingestion_reused: summary.ingestion_reused,
|
|
ingestion_embeddings_reused: summary.ingestion_embeddings_reused,
|
|
rerank_enabled: summary.rerank_enabled,
|
|
rerank_keep_top: summary.rerank_keep_top,
|
|
rerank_pool_size: summary.rerank_pool_size,
|
|
delta,
|
|
openai_base_url: summary.perf.openai_base_url.clone(),
|
|
ingestion_ms: summary.perf.ingestion_ms,
|
|
namespace_seed_ms: summary.perf.namespace_seed_ms,
|
|
};
|
|
|
|
entries.push(entry);
|
|
|
|
let blob = serde_json::to_vec_pretty(&entries).context("serialising evaluation log")?;
|
|
fs::write(&path, blob).with_context(|| format!("writing evaluation log {}", path.display()))?;
|
|
Ok(())
|
|
}
|