mirror of https://github.com/perstarkse/minne.git
synced 2026-03-26 11:21:35 +01:00
991 lines · 32 KiB · Rust
use std::{
|
|
fs,
|
|
path::{Path, PathBuf},
|
|
};
|
|
|
|
use anyhow::{Context, Result};
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
use crate::eval::{
|
|
format_timestamp, CaseSummary, EvaluationStageTimings, EvaluationSummary, LatencyStats,
|
|
StageLatencyBreakdown,
|
|
};
|
|
use chrono::Utc;
|
|
use tracing::warn;
|
|
|
|
/// Filesystem locations of the two artifacts produced by [`write_reports`].
#[derive(Debug)]
pub struct ReportPaths {
    /// Path of the timestamped JSON report.
    pub json: PathBuf,
    /// Path of the timestamped Markdown report.
    pub markdown: PathBuf,
}
|
|
|
|
/// Serializable report assembled from an `EvaluationSummary`.
///
/// This is the structure written to the JSON artifact and the input to
/// the Markdown renderer. Empty/absent sections are omitted from the JSON
/// output via the `skip_serializing_if` attributes.
#[derive(Debug, Serialize)]
pub struct EvaluationReport {
    pub overview: OverviewSection,
    pub dataset: DatasetSection,
    pub slice: SliceSection,
    pub retrieval: RetrievalSection,
    /// Present only when the summary contained LLM-mode cases.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub llm: Option<LlmSection>,
    pub performance: PerformanceSection,
    /// Sampled retrieval misses (size capped by the `sample` argument).
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub misses: Vec<MissEntry>,
    /// Sampled LLM-only cases; empty when `llm` is `None`.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub llm_cases: Vec<LlmCaseEntry>,
    /// When `true`, the Markdown rendering includes extra detail columns/rows.
    pub detailed_report: bool,
}
|
|
|
|
/// Top-level run metadata for the report.
#[derive(Debug, Serialize)]
pub struct OverviewSection {
    /// Human-readable timestamp, formatted via `format_timestamp`.
    pub generated_at: String,
    /// Optional operator-supplied label for the run.
    pub run_label: Option<String>,
    pub total_cases: usize,
    /// Number of questions excluded before evaluation.
    pub filtered_questions: usize,
}
|
|
|
|
/// Dataset identity and the embedding configuration used for the run.
#[derive(Debug, Serialize)]
pub struct DatasetSection {
    pub id: String,
    pub label: String,
    pub source: String,
    /// Whether the dataset includes unanswerable ("impossible") questions.
    pub includes_unanswerable: bool,
    pub require_verified_chunks: bool,
    pub embedding_backend: String,
    /// Specific model name, when the backend exposes one.
    pub embedding_model: Option<String>,
    pub embedding_dimension: usize,
}
|
|
|
|
/// Description of the dataset slice (window) that was evaluated.
#[derive(Debug, Serialize)]
pub struct SliceSection {
    pub id: String,
    /// Seed used to derive the slice selection.
    pub seed: u64,
    pub window_offset: usize,
    pub window_length: usize,
    /// Cases inside the evaluated window.
    pub slice_cases: usize,
    /// Total cases recorded in the slice ledger.
    pub ledger_total_cases: usize,
    pub positives: usize,
    pub negatives: usize,
    pub total_paragraphs: usize,
    /// Ratio of negative to positive paragraphs requested for the slice.
    pub negative_multiplier: f32,
}
|
|
|
|
/// Retrieval-quality metrics and the retrieval configuration used.
#[derive(Debug, Serialize)]
pub struct RetrievalSection {
    /// Top-k cutoff used for the headline precision metric.
    pub k: usize,
    pub cases: usize,
    pub correct: usize,
    /// Precision at `k`.
    pub precision: f64,
    pub precision_at_1: f64,
    pub precision_at_2: f64,
    pub precision_at_3: f64,
    pub mrr: f64,
    pub average_ndcg: f64,
    /// Per-query latency statistics in milliseconds.
    pub latency: LatencyStats,
    pub concurrency: usize,
    pub strategy: String,
    pub rerank_enabled: bool,
    /// Candidate pool size fed to the reranker, when known.
    pub rerank_pool_size: Option<usize>,
    pub rerank_keep_top: usize,
}
|
|
|
|
/// Metrics for the LLM-mode (unanswerable-question) cases.
#[derive(Debug, Serialize)]
pub struct LlmSection {
    pub cases: usize,
    pub answered: usize,
    pub precision: f64,
}
|
|
|
|
/// Timing, caching, and environment information for the run.
#[derive(Debug, Serialize)]
pub struct PerformanceSection {
    pub openai_base_url: String,
    pub ingestion_ms: u128,
    /// Time spent seeding the namespace; absent when seeding was skipped.
    pub namespace_seed_ms: Option<u128>,
    pub evaluation_stages_ms: EvaluationStageTimings,
    /// Per-stage retrieval latency breakdown.
    pub stage_latency: StageLatencyBreakdown,
    pub namespace_reused: bool,
    pub ingestion_reused: bool,
    pub embeddings_reused: bool,
    pub ingestion_cache_path: String,
    pub corpus_paragraphs: usize,
    pub positive_paragraphs_reused: usize,
    pub negative_paragraphs_reused: usize,
}
|
|
|
|
/// One sampled retrieval miss, shown in the "Missed Retrieval Queries" table.
#[derive(Debug, Serialize)]
pub struct MissEntry {
    pub question_id: String,
    pub paragraph_title: String,
    pub expected_source: String,
    /// Whether the expected entity was found at all.
    pub entity_match: bool,
    pub chunk_text_match: bool,
    pub chunk_id_match: bool,
    /// Top retrieved snippets (truncated to three in `from_case`).
    pub retrieved: Vec<RetrievedSnippet>,
}
|
|
|
|
/// One sampled LLM-only case, shown in the "LLM-Only Cases" table.
#[derive(Debug, Serialize)]
pub struct LlmCaseEntry {
    pub question_id: String,
    pub answered: bool,
    /// Rank at which the match occurred, when one occurred.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub match_rank: Option<usize>,
    /// Top retrieved snippets (truncated to three in `from_case`).
    pub retrieved: Vec<RetrievedSnippet>,
}
|
|
|
|
/// Compact view of one retrieved result, used in the report's case tables.
#[derive(Debug, Serialize)]
pub struct RetrievedSnippet {
    /// 1-based position in the retrieval ranking.
    pub rank: usize,
    pub source_id: String,
    pub entity_name: String,
    /// Whether this result matched the expected answer.
    pub matched: bool,
}
|
|
|
|
impl EvaluationReport {
    /// Assemble a report from an evaluation summary.
    ///
    /// `sample` caps how many missed-retrieval rows and LLM-only rows are
    /// included in the detail tables; the aggregate sections are unaffected.
    pub fn from_summary(summary: &EvaluationSummary, sample: usize) -> Self {
        let overview = OverviewSection {
            generated_at: format_timestamp(&summary.generated_at),
            run_label: summary.run_label.clone(),
            total_cases: summary.total_cases,
            filtered_questions: summary.filtered_questions,
        };

        let dataset = DatasetSection {
            id: summary.dataset_id.clone(),
            label: summary.dataset_label.clone(),
            source: summary.dataset_source.clone(),
            includes_unanswerable: summary.includes_impossible_cases,
            require_verified_chunks: summary.require_verified_chunks,
            embedding_backend: summary.embedding_backend.clone(),
            embedding_model: summary.embedding_model.clone(),
            embedding_dimension: summary.embedding_dimension,
        };

        let slice = SliceSection {
            id: summary.slice_id.clone(),
            seed: summary.slice_seed,
            window_offset: summary.slice_window_offset,
            window_length: summary.slice_window_length,
            slice_cases: summary.slice_cases,
            ledger_total_cases: summary.slice_total_cases,
            positives: summary.slice_positive_paragraphs,
            negatives: summary.slice_negative_paragraphs,
            total_paragraphs: summary.slice_total_paragraphs,
            negative_multiplier: summary.slice_negative_multiplier,
        };

        let retrieval = RetrievalSection {
            k: summary.k,
            cases: summary.retrieval_cases,
            correct: summary.retrieval_correct,
            precision: summary.retrieval_precision,
            precision_at_1: summary.precision_at_1,
            precision_at_2: summary.precision_at_2,
            precision_at_3: summary.precision_at_3,
            mrr: summary.mrr,
            average_ndcg: summary.average_ndcg,
            latency: summary.latency_ms.clone(),
            concurrency: summary.concurrency,
            strategy: summary.retrieval_strategy.clone(),
            rerank_enabled: summary.rerank_enabled,
            rerank_pool_size: summary.rerank_pool_size,
            rerank_keep_top: summary.rerank_keep_top,
        };

        // The LLM section exists only when LLM-mode cases were evaluated.
        let llm = if summary.llm_cases > 0 {
            Some(LlmSection {
                cases: summary.llm_cases,
                answered: summary.llm_answered,
                precision: summary.llm_precision,
            })
        } else {
            None
        };

        let performance = PerformanceSection {
            openai_base_url: summary.perf.openai_base_url.clone(),
            ingestion_ms: summary.perf.ingestion_ms,
            namespace_seed_ms: summary.perf.namespace_seed_ms,
            evaluation_stages_ms: summary.perf.evaluation_stage_ms.clone(),
            stage_latency: summary.perf.stage_latency.clone(),
            namespace_reused: summary.namespace_reused,
            ingestion_reused: summary.ingestion_reused,
            embeddings_reused: summary.ingestion_embeddings_reused,
            ingestion_cache_path: summary.ingestion_cache_path.clone(),
            corpus_paragraphs: summary.corpus_paragraphs,
            positive_paragraphs_reused: summary.positive_paragraphs_reused,
            negative_paragraphs_reused: summary.negative_paragraphs_reused,
        };

        // Misses are answerable cases that did not match; capped at `sample`.
        let misses = summary
            .cases
            .iter()
            .filter(|case| !case.matched && !case.is_impossible)
            .take(sample)
            .map(MissEntry::from_case)
            .collect();

        // LLM case rows are only gathered when the LLM section is present.
        let llm_cases = if llm.is_some() {
            summary
                .cases
                .iter()
                .filter(|case| case.is_impossible)
                .take(sample)
                .map(LlmCaseEntry::from_case)
                .collect()
        } else {
            Vec::new()
        };

        Self {
            overview,
            dataset,
            slice,
            retrieval,
            llm,
            performance,
            misses,
            llm_cases,
            detailed_report: summary.detailed_report,
        }
    }
}
|
|
|
|
impl MissEntry {
|
|
fn from_case(case: &CaseSummary) -> Self {
|
|
Self {
|
|
question_id: case.question_id.clone(),
|
|
paragraph_title: case.paragraph_title.clone(),
|
|
expected_source: case.expected_source.clone(),
|
|
entity_match: case.entity_match,
|
|
chunk_text_match: case.chunk_text_match,
|
|
chunk_id_match: case.chunk_id_match,
|
|
retrieved: case
|
|
.retrieved
|
|
.iter()
|
|
.take(3)
|
|
.map(RetrievedSnippet::from_summary)
|
|
.collect(),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl LlmCaseEntry {
|
|
fn from_case(case: &CaseSummary) -> Self {
|
|
Self {
|
|
question_id: case.question_id.clone(),
|
|
answered: case.matched,
|
|
match_rank: case.match_rank,
|
|
retrieved: case
|
|
.retrieved
|
|
.iter()
|
|
.take(3)
|
|
.map(RetrievedSnippet::from_summary)
|
|
.collect(),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl RetrievedSnippet {
|
|
fn from_summary(entry: &crate::eval::RetrievedSummary) -> Self {
|
|
Self {
|
|
rank: entry.rank,
|
|
source_id: entry.source_id.clone(),
|
|
entity_name: entry.entity_name.clone(),
|
|
matched: entry.matched,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Write the JSON and Markdown report artifacts for `summary`.
///
/// Creates `report_dir` and a sanitized per-dataset subdirectory, writes a
/// timestamped `<stem>.json` / `<stem>.md` pair plus `latest.json` /
/// `latest.md` copies, and appends the run to the dataset's history log.
/// `sample` caps the number of detail rows in the report.
///
/// Returns the paths of the timestamped JSON and Markdown files. Errors
/// carry context describing which filesystem step failed.
pub fn write_reports(
    summary: &EvaluationSummary,
    report_dir: &Path,
    sample: usize,
) -> Result<ReportPaths> {
    fs::create_dir_all(report_dir)
        .with_context(|| format!("creating report directory {}", report_dir.display()))?;
    let dataset_dir = dataset_report_dir(report_dir, &summary.dataset_id);
    fs::create_dir_all(&dataset_dir).with_context(|| {
        format!(
            "creating dataset report directory {}",
            dataset_dir.display()
        )
    })?;

    let stem = build_report_stem(summary);
    let report = EvaluationReport::from_summary(summary, sample);

    let json_path = dataset_dir.join(format!("{stem}.json"));
    let json_blob = serde_json::to_string_pretty(&report).context("serialising JSON report")?;
    fs::write(&json_path, &json_blob)
        .with_context(|| format!("writing JSON report to {}", json_path.display()))?;

    let md_path = dataset_dir.join(format!("{stem}.md"));
    let markdown = render_markdown(&report);
    fs::write(&md_path, &markdown)
        .with_context(|| format!("writing Markdown report to {}", md_path.display()))?;

    // Keep a latest.json pointer to simplify automation.
    let latest_json = dataset_dir.join("latest.json");
    fs::write(&latest_json, json_blob)
        .with_context(|| format!("writing latest JSON report to {}", latest_json.display()))?;
    let latest_md = dataset_dir.join("latest.md");
    fs::write(&latest_md, markdown)
        .with_context(|| format!("writing latest Markdown report to {}", latest_md.display()))?;

    // Append this run to the per-dataset evaluations.json log.
    record_history(summary, &dataset_dir)?;

    Ok(ReportPaths {
        json: json_path,
        markdown: md_path,
    })
}
|
|
|
|
fn render_markdown(report: &EvaluationReport) -> String {
|
|
let mut md = String::new();
|
|
|
|
md.push_str(&format!(
|
|
"# Retrieval Evaluation (k={})\\n\\n",
|
|
report.retrieval.k
|
|
));
|
|
|
|
md.push_str("## Overview\\n\\n");
|
|
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
|
|
md.push_str(&format!(
|
|
"| Generated | {} |\\n",
|
|
report.overview.generated_at
|
|
));
|
|
md.push_str(&format!(
|
|
"| Run Label | {} |\\n",
|
|
report
|
|
.overview
|
|
.run_label
|
|
.as_deref()
|
|
.filter(|label| !label.is_empty())
|
|
.unwrap_or("-")
|
|
));
|
|
md.push_str(&format!(
|
|
"| Total Cases | {} |\\n",
|
|
report.overview.total_cases
|
|
));
|
|
md.push_str(&format!(
|
|
"| Filtered Questions | {} |\\n",
|
|
report.overview.filtered_questions
|
|
));
|
|
|
|
md.push_str("\\n## Dataset & Slice\\n\\n");
|
|
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
|
|
md.push_str(&format!(
|
|
"| Dataset | {} (`{}`) |\\n",
|
|
report.dataset.label, report.dataset.id
|
|
));
|
|
md.push_str(&format!(
|
|
"| Dataset Source | {} |\\n",
|
|
report.dataset.source
|
|
));
|
|
md.push_str(&format!(
|
|
"| Includes Unanswerable | {} |\\n",
|
|
bool_badge(report.dataset.includes_unanswerable)
|
|
));
|
|
md.push_str(&format!(
|
|
"| Require Verified Chunks | {} |\\n",
|
|
bool_badge(report.dataset.require_verified_chunks)
|
|
));
|
|
let embedding_label = if let Some(model) = report.dataset.embedding_model.as_ref() {
|
|
format!("{} ({model})", report.dataset.embedding_backend)
|
|
} else {
|
|
report.dataset.embedding_backend.clone()
|
|
};
|
|
md.push_str(&format!("| Embedding | {} |\\n", embedding_label));
|
|
md.push_str(&format!(
|
|
"| Embedding Dim | {} |\\n",
|
|
report.dataset.embedding_dimension
|
|
));
|
|
md.push_str(&format!("| Slice ID | `{}` |\\n", report.slice.id));
|
|
md.push_str(&format!("| Slice Seed | {} |\\n", report.slice.seed));
|
|
md.push_str(&format!(
|
|
"| Slice Window (offset/length) | {}/{} |\\n",
|
|
report.slice.window_offset, report.slice.window_length
|
|
));
|
|
md.push_str(&format!(
|
|
"| Slice Questions (window/ledger) | {}/{} |\\n",
|
|
report.slice.slice_cases, report.slice.ledger_total_cases
|
|
));
|
|
md.push_str(&format!(
|
|
"| Slice Positives / Negatives | {}/{} |\\n",
|
|
report.slice.positives, report.slice.negatives
|
|
));
|
|
md.push_str(&format!(
|
|
"| Slice Paragraphs | {} |\\n",
|
|
report.slice.total_paragraphs
|
|
));
|
|
md.push_str(&format!(
|
|
"| Negative Multiplier | {:.2} |\\n",
|
|
report.slice.negative_multiplier
|
|
));
|
|
|
|
md.push_str("\\n## Retrieval Metrics\\n\\n");
|
|
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
|
|
md.push_str(&format!("| Cases | {} |\\n", report.retrieval.cases));
|
|
md.push_str(&format!(
|
|
"| Correct@{} | {}/{} |\\n",
|
|
report.retrieval.k, report.retrieval.correct, report.retrieval.cases
|
|
));
|
|
md.push_str(&format!(
|
|
"| Precision@{} | {:.3} |\\n",
|
|
report.retrieval.k, report.retrieval.precision
|
|
));
|
|
md.push_str(&format!(
|
|
"| Precision@1/2/3 | {:.3} / {:.3} / {:.3} |\\n",
|
|
report.retrieval.precision_at_1,
|
|
report.retrieval.precision_at_2,
|
|
report.retrieval.precision_at_3
|
|
));
|
|
md.push_str(&format!(
|
|
"| MRR | {:.3} |\\n",
|
|
report.retrieval.mrr
|
|
));
|
|
md.push_str(&format!(
|
|
"| NDCG | {:.3} |\\n",
|
|
report.retrieval.average_ndcg
|
|
));
|
|
md.push_str(&format!(
|
|
"| Latency Avg / P50 / P95 (ms) | {:.1} / {} / {} |\\n",
|
|
report.retrieval.latency.avg, report.retrieval.latency.p50, report.retrieval.latency.p95
|
|
));
|
|
md.push_str(&format!(
|
|
"| Strategy | `{}` |\\n",
|
|
report.retrieval.strategy
|
|
));
|
|
md.push_str(&format!(
|
|
"| Concurrency | {} |\\n",
|
|
report.retrieval.concurrency
|
|
));
|
|
if report.retrieval.rerank_enabled {
|
|
let pool = report
|
|
.retrieval
|
|
.rerank_pool_size
|
|
.map(|size| size.to_string())
|
|
.unwrap_or_else(|| "?".into());
|
|
md.push_str(&format!(
|
|
"| Rerank | enabled (pool {pool}, keep top {}) |\\n",
|
|
report.retrieval.rerank_keep_top
|
|
));
|
|
} else {
|
|
md.push_str("| Rerank | disabled |\\n");
|
|
}
|
|
|
|
if let Some(llm) = &report.llm {
|
|
md.push_str("\\n## LLM Mode Metrics\\n\\n");
|
|
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
|
|
md.push_str(&format!("| Cases | {} |\\n", llm.cases));
|
|
md.push_str(&format!("| Answered | {} |\\n", llm.answered));
|
|
md.push_str(&format!("| Precision | {:.3} |\\n", llm.precision));
|
|
}
|
|
|
|
md.push_str("\\n## Performance\\n\\n");
|
|
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
|
|
md.push_str(&format!(
|
|
"| OpenAI Base URL | {} |\\n",
|
|
report.performance.openai_base_url
|
|
));
|
|
md.push_str(&format!(
|
|
"| Ingestion Duration | {} ms |\\n",
|
|
report.performance.ingestion_ms
|
|
));
|
|
if let Some(seed) = report.performance.namespace_seed_ms {
|
|
md.push_str(&format!("| Namespace Seed | {} ms |\\n", seed));
|
|
}
|
|
md.push_str(&format!(
|
|
"| Namespace State | {} |\\n",
|
|
if report.performance.namespace_reused {
|
|
"reused"
|
|
} else {
|
|
"seeded"
|
|
}
|
|
));
|
|
md.push_str(&format!(
|
|
"| Corpus Paragraphs | {} |\\n",
|
|
report.performance.corpus_paragraphs
|
|
));
|
|
if report.detailed_report {
|
|
md.push_str(&format!(
|
|
"| Ingestion Cache | `{}` |\\n",
|
|
report.performance.ingestion_cache_path
|
|
));
|
|
md.push_str(&format!(
|
|
"| Ingestion Reused | {} |\\n",
|
|
bool_badge(report.performance.ingestion_reused)
|
|
));
|
|
md.push_str(&format!(
|
|
"| Embeddings Reused | {} |\\n",
|
|
bool_badge(report.performance.embeddings_reused)
|
|
));
|
|
}
|
|
md.push_str(&format!(
|
|
"| Positives Cached | {} |\\n",
|
|
report.performance.positive_paragraphs_reused
|
|
));
|
|
md.push_str(&format!(
|
|
"| Negatives Cached | {} |\\n",
|
|
report.performance.negative_paragraphs_reused
|
|
));
|
|
|
|
md.push_str("\\n## Retrieval Stage Timings\\n\\n");
|
|
md.push_str("| Stage | Avg (ms) | P50 (ms) | P95 (ms) |\\n| --- | --- | --- | --- |\\n");
|
|
write_stage_row(&mut md, "Embed", &report.performance.stage_latency.embed);
|
|
write_stage_row(
|
|
&mut md,
|
|
"Collect Candidates",
|
|
&report.performance.stage_latency.collect_candidates,
|
|
);
|
|
write_stage_row(
|
|
&mut md,
|
|
"Graph Expansion",
|
|
&report.performance.stage_latency.graph_expansion,
|
|
);
|
|
write_stage_row(
|
|
&mut md,
|
|
"Chunk Attach",
|
|
&report.performance.stage_latency.chunk_attach,
|
|
);
|
|
write_stage_row(&mut md, "Rerank", &report.performance.stage_latency.rerank);
|
|
write_stage_row(
|
|
&mut md,
|
|
"Assemble",
|
|
&report.performance.stage_latency.assemble,
|
|
);
|
|
|
|
if report.misses.is_empty() {
|
|
md.push_str("\\n_All evaluated retrieval queries matched within the top-k window._\\n");
|
|
if report.detailed_report {
|
|
md.push_str(
|
|
"\\nSuccess measures were captured for each query (entity, chunk text, chunk ID).\\n",
|
|
);
|
|
}
|
|
} else {
|
|
md.push_str("\\n## Missed Retrieval Queries (sample)\\n\\n");
|
|
if report.detailed_report {
|
|
md.push_str(
|
|
"| Question ID | Paragraph | Expected Source | Entity Match | Chunk Text | Chunk ID | Top Retrieved |\\n",
|
|
);
|
|
md.push_str("| --- | --- | --- | --- | --- | --- | --- |\\n");
|
|
} else {
|
|
md.push_str("| Question ID | Paragraph | Expected Source | Top Retrieved |\\n");
|
|
md.push_str("| --- | --- | --- | --- |\\n");
|
|
}
|
|
for case in &report.misses {
|
|
let retrieved = render_retrieved(&case.retrieved);
|
|
if report.detailed_report {
|
|
md.push_str(&format!(
|
|
"| `{}` | {} | `{}` | {} | {} | {} | {} |\\n",
|
|
case.question_id,
|
|
case.paragraph_title,
|
|
case.expected_source,
|
|
bool_badge(case.entity_match),
|
|
bool_badge(case.chunk_text_match),
|
|
bool_badge(case.chunk_id_match),
|
|
retrieved
|
|
));
|
|
} else {
|
|
md.push_str(&format!(
|
|
"| `{}` | {} | `{}` | {} |\\n",
|
|
case.question_id, case.paragraph_title, case.expected_source, retrieved
|
|
));
|
|
}
|
|
}
|
|
}
|
|
|
|
if report.llm.is_some() {
|
|
md.push_str("\\n## LLM-Only Cases (sample)\\n\\n");
|
|
if report.llm_cases.is_empty() {
|
|
md.push_str("All LLM-only cases matched within the evaluation window.\\n");
|
|
} else {
|
|
md.push_str("| Question ID | Answered | Match Rank | Top Retrieved |\\n");
|
|
md.push_str("| --- | --- | --- | --- |\\n");
|
|
for case in &report.llm_cases {
|
|
let retrieved = render_retrieved(&case.retrieved);
|
|
let rank = case
|
|
.match_rank
|
|
.map(|rank| rank.to_string())
|
|
.unwrap_or_else(|| "-".into());
|
|
md.push_str(&format!(
|
|
"| `{}` | {} | {} | {} |\\n",
|
|
case.question_id,
|
|
bool_badge(case.answered),
|
|
rank,
|
|
retrieved
|
|
));
|
|
}
|
|
}
|
|
}
|
|
|
|
md
|
|
}
|
|
fn write_stage_row(buf: &mut String, label: &str, stats: &LatencyStats) {
|
|
buf.push_str(&format!(
|
|
"| {} | {:.1} | {} | {} |\n",
|
|
label, stats.avg, stats.p50, stats.p95
|
|
));
|
|
}
|
|
|
|
/// Map a boolean to the emoji badge used in the Markdown tables.
fn bool_badge(value: bool) -> &'static str {
    match value {
        true => "✅",
        false => "⚪",
    }
}
|
|
|
|
fn render_retrieved(entries: &[RetrievedSnippet]) -> String {
|
|
if entries.is_empty() {
|
|
"-".to_string()
|
|
} else {
|
|
entries
|
|
.iter()
|
|
.map(|entry| format!("{} (rank {})", entry.source_id, entry.rank))
|
|
.take(3)
|
|
.collect::<Vec<_>>()
|
|
.join("<br>")
|
|
}
|
|
}
|
|
|
|
fn build_report_stem(summary: &EvaluationSummary) -> String {
|
|
let timestamp = summary.generated_at.format("%Y%m%dT%H%M%S");
|
|
let backend = sanitize_component(&summary.embedding_backend);
|
|
let dataset_component = sanitize_component(&summary.dataset_id);
|
|
let model_component = summary
|
|
.embedding_model
|
|
.as_ref()
|
|
.map(|model| sanitize_component(model));
|
|
|
|
match model_component {
|
|
Some(model) => format!(
|
|
"precision_at_{}_{}_{}_{}_{}",
|
|
summary.k, dataset_component, timestamp, backend, model
|
|
),
|
|
None => format!(
|
|
"precision_at_{}_{}_{}_{}",
|
|
summary.k, dataset_component, timestamp, backend
|
|
),
|
|
}
|
|
}
|
|
|
|
/// Replace every character that is not ASCII alphanumeric with `_`,
/// producing a string safe to use as a file or directory name component.
fn sanitize_component(input: &str) -> String {
    let mut sanitized = String::with_capacity(input.len());
    for ch in input.chars() {
        if ch.is_ascii_alphanumeric() {
            sanitized.push(ch);
        } else {
            sanitized.push('_');
        }
    }
    sanitized
}
|
|
|
|
pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf {
|
|
report_dir.join(sanitize_component(dataset_id))
|
|
}
|
|
|
|
/// One row of the append-only `evaluations.json` history log.
///
/// Fields annotated `#[serde(default)]` were added after the log format
/// first shipped; the defaults let older log files still deserialize.
#[derive(Debug, Serialize, Deserialize)]
struct HistoryEntry {
    generated_at: String,
    run_label: Option<String>,
    dataset_id: String,
    dataset_label: String,
    slice_id: String,
    slice_seed: u64,
    slice_window_offset: usize,
    slice_window_length: usize,
    slice_cases: usize,
    slice_total_cases: usize,
    k: usize,
    limit: Option<usize>,
    precision: f64,
    precision_at_1: f64,
    precision_at_2: f64,
    precision_at_3: f64,
    #[serde(default)]
    mrr: f64,
    #[serde(default)]
    average_ndcg: f64,
    #[serde(default)]
    retrieval_cases: usize,
    #[serde(default)]
    retrieval_precision: f64,
    #[serde(default)]
    llm_cases: usize,
    #[serde(default)]
    llm_precision: f64,
    duration_ms: u128,
    latency_ms: LatencyStats,
    embedding_backend: String,
    embedding_model: Option<String>,
    ingestion_reused: bool,
    ingestion_embeddings_reused: bool,
    rerank_enabled: bool,
    rerank_keep_top: usize,
    rerank_pool_size: Option<usize>,
    /// Change relative to the previous entry; `None` for the first entry.
    delta: Option<HistoryDelta>,
    openai_base_url: String,
    ingestion_ms: u128,
    #[serde(default)]
    namespace_seed_ms: Option<u128>,
}
|
|
|
|
/// Differences between a history entry and the entry immediately before it
/// (new value minus previous value).
#[derive(Debug, Serialize, Deserialize)]
struct HistoryDelta {
    precision: f64,
    precision_at_1: f64,
    latency_avg_ms: f64,
}
|
|
|
|
/// Append this run's summary to the per-dataset `evaluations.json` log.
///
/// The log is a growing JSON array of [`HistoryEntry`] values. If the
/// existing file cannot be parsed, it is renamed to a timestamped
/// `evaluations.json.corrupted.*` backup (best effort — a failed rename is
/// only logged) and the log restarts empty rather than failing the run.
/// The new entry's `delta` compares precision and latency against the most
/// recent prior entry, when one exists.
fn record_history(summary: &EvaluationSummary, report_dir: &Path) -> Result<()> {
    let path = report_dir.join("evaluations.json");
    // Load existing entries; recover from a corrupted log instead of erroring.
    let mut entries: Vec<HistoryEntry> = if path.exists() {
        let contents = fs::read(&path)
            .with_context(|| format!("reading evaluation log {}", path.display()))?;
        match serde_json::from_slice(&contents) {
            Ok(entries) => entries,
            Err(err) => {
                let timestamp = Utc::now().format("%Y%m%dT%H%M%S");
                let backup_path =
                    report_dir.join(format!("evaluations.json.corrupted.{}", timestamp));
                warn!(
                    path = %path.display(),
                    backup = %backup_path.display(),
                    error = %err,
                    "Evaluation history file is corrupted; backing up and starting fresh"
                );
                // Best effort: keep going even if the backup rename fails.
                if let Err(e) = fs::rename(&path, &backup_path) {
                    warn!(
                        path = %path.display(),
                        error = %e,
                        "Failed to backup corrupted evaluation history"
                    );
                }
                Vec::new()
            }
        }
    } else {
        Vec::new()
    };

    // Delta vs. the most recent entry; None when the log is empty.
    let delta = entries.last().map(|prev| HistoryDelta {
        precision: summary.precision - prev.precision,
        precision_at_1: summary.precision_at_1 - prev.precision_at_1,
        latency_avg_ms: summary.latency_ms.avg - prev.latency_ms.avg,
    });

    let entry = HistoryEntry {
        generated_at: format_timestamp(&summary.generated_at),
        run_label: summary.run_label.clone(),
        dataset_id: summary.dataset_id.clone(),
        dataset_label: summary.dataset_label.clone(),
        slice_id: summary.slice_id.clone(),
        slice_seed: summary.slice_seed,
        slice_window_offset: summary.slice_window_offset,
        slice_window_length: summary.slice_window_length,
        slice_cases: summary.slice_cases,
        slice_total_cases: summary.slice_total_cases,
        k: summary.k,
        limit: summary.limit,
        precision: summary.precision,
        precision_at_1: summary.precision_at_1,
        precision_at_2: summary.precision_at_2,
        precision_at_3: summary.precision_at_3,
        mrr: summary.mrr,
        average_ndcg: summary.average_ndcg,
        retrieval_cases: summary.retrieval_cases,
        retrieval_precision: summary.retrieval_precision,
        llm_cases: summary.llm_cases,
        llm_precision: summary.llm_precision,
        duration_ms: summary.duration_ms,
        latency_ms: summary.latency_ms.clone(),
        embedding_backend: summary.embedding_backend.clone(),
        embedding_model: summary.embedding_model.clone(),
        ingestion_reused: summary.ingestion_reused,
        ingestion_embeddings_reused: summary.ingestion_embeddings_reused,
        rerank_enabled: summary.rerank_enabled,
        rerank_keep_top: summary.rerank_keep_top,
        rerank_pool_size: summary.rerank_pool_size,
        delta,
        openai_base_url: summary.perf.openai_base_url.clone(),
        ingestion_ms: summary.perf.ingestion_ms,
        namespace_seed_ms: summary.perf.namespace_seed_ms,
    };

    entries.push(entry);

    // Rewrite the whole log; it stays human-readable (pretty-printed).
    let blob = serde_json::to_vec_pretty(&entries).context("serialising evaluation log")?;
    fs::write(&path, blob).with_context(|| format!("writing evaluation log {}", path.display()))?;
    Ok(())
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::eval::{
        EvaluationStageTimings, PerformanceTimings, RetrievedSummary, StageLatencyBreakdown,
    };
    use chrono::Utc;

    /// Latency stats where avg/p50/p95 all equal `ms`.
    fn latency(ms: f64) -> LatencyStats {
        LatencyStats {
            avg: ms,
            p50: ms as u128,
            p95: ms as u128,
        }
    }

    /// Stage latency breakdown with distinct values per stage.
    fn sample_stage_latency() -> StageLatencyBreakdown {
        StageLatencyBreakdown {
            embed: latency(9.0),
            collect_candidates: latency(10.0),
            graph_expansion: latency(11.0),
            chunk_attach: latency(12.0),
            rerank: latency(13.0),
            assemble: latency(14.0),
        }
    }

    /// Evaluation-stage timings with distinct values per stage.
    fn sample_eval_stage() -> EvaluationStageTimings {
        EvaluationStageTimings {
            prepare_slice_ms: 1,
            prepare_db_ms: 2,
            prepare_corpus_ms: 3,
            prepare_namespace_ms: 4,
            run_queries_ms: 5,
            summarize_ms: 6,
            finalize_ms: 7,
        }
    }

    /// A single case summary with one retrieved result; `matched` drives
    /// every match-related field consistently.
    fn sample_case(is_impossible: bool, matched: bool) -> CaseSummary {
        CaseSummary {
            question_id: if is_impossible {
                "llm-q".into()
            } else {
                "retrieval-q".into()
            },
            question: "Who is the hero?".into(),
            paragraph_id: "p1".into(),
            paragraph_title: "Hero".into(),
            expected_source: "src1".into(),
            answers: vec!["answer".into()],
            matched,
            entity_match: matched,
            chunk_text_match: matched,
            chunk_id_match: matched,
            ndcg: None,
            reciprocal_rank: None,
            is_impossible,
            has_verified_chunks: !is_impossible,
            match_rank: if matched { Some(1) } else { None },
            latency_ms: 42,
            retrieved: vec![RetrievedSummary {
                rank: 1,
                entity_id: "entity1".into(),
                source_id: "src1".into(),
                entity_name: "Entity".into(),
                score: 1.0,
                matched,
                entity_description: None,
                entity_category: None,
                chunk_text_match: Some(matched),
                chunk_id_match: Some(matched),
            }],
        }
    }

    /// A full summary fixture; `include_llm` adds an unmatched LLM-mode
    /// case and toggles the LLM-related summary fields.
    fn sample_summary(include_llm: bool) -> EvaluationSummary {
        let mut cases = vec![sample_case(false, true)];
        if include_llm {
            cases.push(sample_case(true, false));
        }
        EvaluationSummary {
            generated_at: Utc::now(),
            k: 5,
            limit: Some(10),
            run_label: Some("test".into()),
            total_cases: cases.len(),
            correct: 1,
            precision: 1.0,
            correct_at_1: 1,
            correct_at_2: 1,
            correct_at_3: 1,
            precision_at_1: 1.0,
            precision_at_2: 1.0,
            precision_at_3: 1.0,
            duration_ms: 100,
            dataset_id: "ds".into(),
            dataset_label: "Dataset".into(),
            dataset_includes_unanswerable: include_llm,
            dataset_source: "dev".into(),
            includes_impossible_cases: include_llm,
            require_verified_chunks: !include_llm,
            filtered_questions: 0,
            retrieval_cases: 1,
            retrieval_correct: 1,
            retrieval_precision: 1.0,
            average_ndcg: 0.0,
            mrr: 0.0,
            llm_cases: if include_llm { 1 } else { 0 },
            llm_answered: 0,
            llm_precision: 0.0,
            slice_id: "slice".into(),
            slice_seed: 1,
            slice_total_cases: cases.len(),
            slice_window_offset: 0,
            slice_window_length: cases.len(),
            slice_cases: cases.len(),
            slice_positive_paragraphs: 1,
            slice_negative_paragraphs: 0,
            slice_total_paragraphs: 1,
            slice_negative_multiplier: 1.0,
            namespace_reused: true,
            corpus_paragraphs: 1,
            ingestion_cache_path: "/cache".into(),
            ingestion_reused: true,
            ingestion_embeddings_reused: true,
            ingestion_fingerprint: "fp".into(),
            positive_paragraphs_reused: 1,
            negative_paragraphs_reused: 0,
            latency_ms: latency(10.0),
            perf: PerformanceTimings {
                openai_base_url: "https://example.com".into(),
                ingestion_ms: 100,
                namespace_seed_ms: Some(50),
                evaluation_stage_ms: sample_eval_stage(),
                stage_latency: sample_stage_latency(),
            },
            embedding_backend: "fastembed".into(),
            embedding_model: Some("model".into()),
            embedding_dimension: 32,
            rerank_enabled: true,
            rerank_pool_size: Some(4),
            rerank_keep_top: 5,
            concurrency: 2,
            detailed_report: true,
            retrieval_strategy: "initial".into(),
            chunk_vector_take: 50,
            chunk_fts_take: 50,
            chunk_token_budget: 10_000,
            chunk_avg_chars_per_token: 4,
            max_chunks_per_entity: 4,
            cases,
        }
    }

    // With LLM cases present, both LLM sections must be rendered.
    #[test]
    fn markdown_includes_llm_section() {
        let summary = sample_summary(true);
        let report = EvaluationReport::from_summary(&summary, 5);
        let md = render_markdown(&report);
        assert!(md.contains("LLM Mode Metrics"));
        assert!(md.contains("LLM-Only Cases (sample)"));
    }

    // Without LLM cases, neither LLM section should appear.
    #[test]
    fn markdown_hides_llm_section_when_not_present() {
        let summary = sample_summary(false);
        let report = EvaluationReport::from_summary(&summary, 5);
        let md = render_markdown(&report);
        assert!(!md.contains("LLM Mode Metrics"));
        assert!(!md.contains("LLM-Only Cases"));
    }
}
|