benchmarks: ready for hybrid revised

This commit is contained in:
Per Stark
2025-12-03 11:38:07 +01:00
parent 5e5053039a
commit c60efb2af7
22 changed files with 760 additions and 476 deletions
+263 -109
View File
@@ -19,7 +19,7 @@ pub struct ReportPaths {
pub markdown: PathBuf,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationReport {
pub overview: OverviewSection,
pub dataset: DatasetSection,
@@ -28,14 +28,15 @@ pub struct EvaluationReport {
#[serde(skip_serializing_if = "Option::is_none")]
pub llm: Option<LlmSection>,
pub performance: PerformanceSection,
#[serde(skip_serializing_if = "Vec::is_empty")]
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub misses: Vec<MissEntry>,
#[serde(skip_serializing_if = "Vec::is_empty")]
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub llm_cases: Vec<LlmCaseEntry>,
#[serde(default)]
pub detailed_report: bool,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OverviewSection {
pub generated_at: String,
pub run_label: Option<String>,
@@ -43,7 +44,7 @@ pub struct OverviewSection {
pub filtered_questions: usize,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetSection {
pub id: String,
pub label: String,
@@ -55,7 +56,7 @@ pub struct DatasetSection {
pub embedding_dimension: usize,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SliceSection {
pub id: String,
pub seed: u64,
@@ -69,7 +70,7 @@ pub struct SliceSection {
pub negative_multiplier: f32,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetrievalSection {
pub k: usize,
pub cases: usize,
@@ -86,16 +87,21 @@ pub struct RetrievalSection {
pub rerank_enabled: bool,
pub rerank_pool_size: Option<usize>,
pub rerank_keep_top: usize,
pub chunk_result_cap: usize,
pub ingest_chunk_min_tokens: usize,
pub ingest_chunk_max_tokens: usize,
pub ingest_chunk_overlap_tokens: usize,
pub ingest_chunks_only: bool,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmSection {
pub cases: usize,
pub answered: usize,
pub precision: f64,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceSection {
pub openai_base_url: String,
pub ingestion_ms: u128,
@@ -111,7 +117,7 @@ pub struct PerformanceSection {
pub negative_paragraphs_reused: usize,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MissEntry {
pub question_id: String,
pub paragraph_title: String,
@@ -122,7 +128,7 @@ pub struct MissEntry {
pub retrieved: Vec<RetrievedSnippet>,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmCaseEntry {
pub question_id: String,
pub answered: bool,
@@ -131,7 +137,7 @@ pub struct LlmCaseEntry {
pub retrieved: Vec<RetrievedSnippet>,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetrievedSnippet {
pub rank: usize,
pub source_id: String,
@@ -139,6 +145,13 @@ pub struct RetrievedSnippet {
pub matched: bool,
}
#[derive(Debug)]
pub struct ReportOutcome {
pub record: EvaluationReport,
pub paths: ReportPaths,
pub history_path: PathBuf,
}
impl EvaluationReport {
pub fn from_summary(summary: &EvaluationSummary, sample: usize) -> Self {
let overview = OverviewSection {
@@ -188,6 +201,11 @@ impl EvaluationReport {
rerank_enabled: summary.rerank_enabled,
rerank_pool_size: summary.rerank_pool_size,
rerank_keep_top: summary.rerank_keep_top,
chunk_result_cap: summary.chunk_result_cap,
ingest_chunk_min_tokens: summary.ingest_chunk_min_tokens,
ingest_chunk_max_tokens: summary.ingest_chunk_max_tokens,
ingest_chunk_overlap_tokens: summary.ingest_chunk_overlap_tokens,
ingest_chunks_only: summary.ingest_chunks_only,
};
let llm = if summary.llm_cases > 0 {
@@ -215,24 +233,30 @@ impl EvaluationReport {
negative_paragraphs_reused: summary.negative_paragraphs_reused,
};
let misses = summary
.cases
.iter()
.filter(|case| !case.matched && !case.is_impossible)
.take(sample)
.map(MissEntry::from_case)
.collect();
let llm_cases = if llm.is_some() {
summary
let (misses, llm_cases) = if summary.detailed_report {
let misses = summary
.cases
.iter()
.filter(|case| case.is_impossible)
.filter(|case| !case.matched && !case.is_impossible)
.take(sample)
.map(LlmCaseEntry::from_case)
.collect()
.map(MissEntry::from_case)
.collect();
let llm_cases = if llm.is_some() {
summary
.cases
.iter()
.filter(|case| case.is_impossible)
.take(sample)
.map(LlmCaseEntry::from_case)
.collect()
} else {
Vec::new()
};
(misses, llm_cases)
} else {
Vec::new()
(Vec::new(), Vec::new())
};
Self {
@@ -299,7 +323,7 @@ pub fn write_reports(
summary: &EvaluationSummary,
report_dir: &Path,
sample: usize,
) -> Result<ReportPaths> {
) -> Result<ReportOutcome> {
fs::create_dir_all(report_dir)
.with_context(|| format!("creating report directory {}", report_dir.display()))?;
let dataset_dir = dataset_report_dir(report_dir, &summary.dataset_id);
@@ -331,11 +355,15 @@ pub fn write_reports(
fs::write(&latest_md, markdown)
.with_context(|| format!("writing latest Markdown report to {}", latest_md.display()))?;
record_history(summary, &dataset_dir)?;
let history_path = record_history(&report, &dataset_dir)?;
Ok(ReportPaths {
json: json_path,
markdown: md_path,
Ok(ReportOutcome {
record: report,
paths: ReportPaths {
json: json_path,
markdown: md_path,
},
history_path,
})
}
@@ -555,10 +583,14 @@ fn render_markdown(report: &EvaluationReport) -> String {
);
if report.misses.is_empty() {
md.push_str("\\n_All evaluated retrieval queries matched within the top-k window._\\n");
if report.detailed_report {
md.push_str(
"\\nSuccess measures were captured for each query (entity, chunk text, chunk ID).\\n",
"\\n_All evaluated retrieval queries matched within the top-k window._\\n\
\\nSuccess measures were captured for each query (entity, chunk text, chunk ID).\\n",
);
} else {
md.push_str(
"\\n_Misses omitted. Re-run with `--detailed-report` to see sampled failures._\\n",
);
}
} else {
@@ -597,7 +629,11 @@ fn render_markdown(report: &EvaluationReport) -> String {
if report.llm.is_some() {
md.push_str("\\n## LLM-Only Cases (sample)\\n\\n");
if report.llm_cases.is_empty() {
md.push_str("All LLM-only cases matched within the evaluation window.\\n");
if report.detailed_report {
md.push_str("All LLM-only cases matched within the evaluation window.\\n");
} else {
md.push_str("LLM-only cases omitted. Re-run with `--detailed-report` to see samples.\\n");
}
} else {
md.push_str("| Question ID | Answered | Match Rank | Top Retrieved |\\n");
md.push_str("| --- | --- | --- | --- |\\n");
@@ -681,7 +717,7 @@ pub fn dataset_report_dir(report_dir: &Path, dataset_id: &str) -> PathBuf {
}
#[derive(Debug, Serialize, Deserialize)]
struct HistoryEntry {
struct LegacyHistoryEntry {
generated_at: String,
run_label: Option<String>,
dataset_id: String,
@@ -719,7 +755,18 @@ struct HistoryEntry {
rerank_enabled: bool,
rerank_keep_top: usize,
rerank_pool_size: Option<usize>,
delta: Option<HistoryDelta>,
#[serde(default)]
chunk_result_cap: Option<usize>,
#[serde(default)]
ingest_chunk_min_tokens: Option<usize>,
#[serde(default)]
ingest_chunk_max_tokens: Option<usize>,
#[serde(default)]
ingest_chunk_overlap_tokens: Option<usize>,
#[serde(default)]
ingest_chunks_only: Option<bool>,
#[serde(default)]
delta: Option<LegacyHistoryDelta>,
openai_base_url: String,
ingestion_ms: u128,
#[serde(default)]
@@ -727,92 +774,173 @@ struct HistoryEntry {
}
#[derive(Debug, Serialize, Deserialize)]
struct HistoryDelta {
struct LegacyHistoryDelta {
precision: f64,
precision_at_1: f64,
latency_avg_ms: f64,
}
fn record_history(summary: &EvaluationSummary, report_dir: &Path) -> Result<()> {
let path = report_dir.join("evaluations.json");
let mut entries: Vec<HistoryEntry> = if path.exists() {
let contents = fs::read(&path)
.with_context(|| format!("reading evaluation log {}", path.display()))?;
match serde_json::from_slice(&contents) {
Ok(entries) => entries,
Err(err) => {
let timestamp = Utc::now().format("%Y%m%dT%H%M%S");
let backup_path =
report_dir.join(format!("evaluations.json.corrupted.{}", timestamp));
fn default_stage_latency() -> StageLatencyBreakdown {
StageLatencyBreakdown {
embed: LatencyStats::default(),
collect_candidates: LatencyStats::default(),
graph_expansion: LatencyStats::default(),
chunk_attach: LatencyStats::default(),
rerank: LatencyStats::default(),
assemble: LatencyStats::default(),
}
}
fn convert_legacy_entry(entry: LegacyHistoryEntry) -> EvaluationReport {
let overview = OverviewSection {
generated_at: entry.generated_at,
run_label: entry.run_label,
total_cases: entry.slice_cases,
filtered_questions: 0,
};
let dataset = DatasetSection {
id: entry.dataset_id,
label: entry.dataset_label,
source: String::new(),
includes_unanswerable: entry.llm_cases > 0,
require_verified_chunks: true,
embedding_backend: entry.embedding_backend,
embedding_model: entry.embedding_model,
embedding_dimension: 0,
};
let slice = SliceSection {
id: entry.slice_id,
seed: entry.slice_seed,
window_offset: entry.slice_window_offset,
window_length: entry.slice_window_length,
slice_cases: entry.slice_cases,
ledger_total_cases: entry.slice_total_cases,
positives: 0,
negatives: 0,
total_paragraphs: 0,
negative_multiplier: 0.0,
};
let retrieval_cases = if entry.retrieval_cases > 0 {
entry.retrieval_cases
} else {
entry.slice_cases.saturating_sub(entry.llm_cases)
};
let retrieval_precision = if entry.retrieval_precision > 0.0 {
entry.retrieval_precision
} else {
entry.precision
};
let retrieval = RetrievalSection {
k: entry.k,
cases: retrieval_cases,
correct: 0,
precision: retrieval_precision,
precision_at_1: entry.precision_at_1,
precision_at_2: entry.precision_at_2,
precision_at_3: entry.precision_at_3,
mrr: entry.mrr,
average_ndcg: entry.average_ndcg,
latency: entry.latency_ms,
concurrency: 0,
strategy: "unknown".into(),
rerank_enabled: entry.rerank_enabled,
rerank_pool_size: entry.rerank_pool_size,
rerank_keep_top: entry.rerank_keep_top,
chunk_result_cap: entry.chunk_result_cap.unwrap_or(5),
ingest_chunk_min_tokens: entry.ingest_chunk_min_tokens.unwrap_or(256),
ingest_chunk_max_tokens: entry.ingest_chunk_max_tokens.unwrap_or(512),
ingest_chunk_overlap_tokens: entry.ingest_chunk_overlap_tokens.unwrap_or(50),
ingest_chunks_only: entry.ingest_chunks_only.unwrap_or(false),
};
let llm = if entry.llm_cases > 0 {
Some(LlmSection {
cases: entry.llm_cases,
answered: 0,
precision: entry.llm_precision,
})
} else {
None
};
let performance = PerformanceSection {
openai_base_url: entry.openai_base_url,
ingestion_ms: entry.ingestion_ms,
namespace_seed_ms: entry.namespace_seed_ms,
evaluation_stages_ms: EvaluationStageTimings::default(),
stage_latency: default_stage_latency(),
namespace_reused: false,
ingestion_reused: entry.ingestion_reused,
embeddings_reused: entry.ingestion_embeddings_reused,
ingestion_cache_path: String::new(),
corpus_paragraphs: 0,
positive_paragraphs_reused: 0,
negative_paragraphs_reused: 0,
};
EvaluationReport {
overview,
dataset,
slice,
retrieval,
llm,
performance,
misses: Vec::new(),
llm_cases: Vec::new(),
detailed_report: false,
}
}
fn load_history(path: &Path) -> Result<Vec<EvaluationReport>> {
if !path.exists() {
return Ok(Vec::new());
}
let contents =
fs::read(path).with_context(|| format!("reading evaluation log {}", path.display()))?;
if let Ok(entries) = serde_json::from_slice::<Vec<EvaluationReport>>(&contents) {
return Ok(entries);
}
match serde_json::from_slice::<Vec<LegacyHistoryEntry>>(&contents) {
Ok(entries) => Ok(entries.into_iter().map(convert_legacy_entry).collect()),
Err(err) => {
let timestamp = Utc::now().format("%Y%m%dT%H%M%S");
let backup_path = path
.parent()
.unwrap_or_else(|| Path::new("."))
.join(format!("evaluations.json.corrupted.{timestamp}"));
warn!(
path = %path.display(),
backup = %backup_path.display(),
error = %err,
"Evaluation history file is corrupted; backing up and starting fresh"
);
if let Err(e) = fs::rename(path, &backup_path) {
warn!(
path = %path.display(),
backup = %backup_path.display(),
error = %err,
"Evaluation history file is corrupted; backing up and starting fresh"
error = %e,
"Failed to backup corrupted evaluation history"
);
if let Err(e) = fs::rename(&path, &backup_path) {
warn!(
path = %path.display(),
error = %e,
"Failed to backup corrupted evaluation history"
);
}
Vec::new()
}
Ok(Vec::new())
}
} else {
Vec::new()
};
}
}
let delta = entries.last().map(|prev| HistoryDelta {
precision: summary.precision - prev.precision,
precision_at_1: summary.precision_at_1 - prev.precision_at_1,
latency_avg_ms: summary.latency_ms.avg - prev.latency_ms.avg,
});
let entry = HistoryEntry {
generated_at: format_timestamp(&summary.generated_at),
run_label: summary.run_label.clone(),
dataset_id: summary.dataset_id.clone(),
dataset_label: summary.dataset_label.clone(),
slice_id: summary.slice_id.clone(),
slice_seed: summary.slice_seed,
slice_window_offset: summary.slice_window_offset,
slice_window_length: summary.slice_window_length,
slice_cases: summary.slice_cases,
slice_total_cases: summary.slice_total_cases,
k: summary.k,
limit: summary.limit,
precision: summary.precision,
precision_at_1: summary.precision_at_1,
precision_at_2: summary.precision_at_2,
precision_at_3: summary.precision_at_3,
mrr: summary.mrr,
average_ndcg: summary.average_ndcg,
retrieval_cases: summary.retrieval_cases,
retrieval_precision: summary.retrieval_precision,
llm_cases: summary.llm_cases,
llm_precision: summary.llm_precision,
duration_ms: summary.duration_ms,
latency_ms: summary.latency_ms.clone(),
embedding_backend: summary.embedding_backend.clone(),
embedding_model: summary.embedding_model.clone(),
ingestion_reused: summary.ingestion_reused,
ingestion_embeddings_reused: summary.ingestion_embeddings_reused,
rerank_enabled: summary.rerank_enabled,
rerank_keep_top: summary.rerank_keep_top,
rerank_pool_size: summary.rerank_pool_size,
delta,
openai_base_url: summary.perf.openai_base_url.clone(),
ingestion_ms: summary.perf.ingestion_ms,
namespace_seed_ms: summary.perf.namespace_seed_ms,
};
entries.push(entry);
fn record_history(report: &EvaluationReport, report_dir: &Path) -> Result<PathBuf> {
let path = report_dir.join("evaluations.json");
let mut entries = load_history(&path)?;
entries.push(report.clone());
let blob = serde_json::to_vec_pretty(&entries).context("serialising evaluation log")?;
fs::write(&path, blob).with_context(|| format!("writing evaluation log {}", path.display()))?;
Ok(())
Ok(path)
}
#[cfg(test)]
@@ -822,6 +950,7 @@ mod tests {
EvaluationStageTimings, PerformanceTimings, RetrievedSummary, StageLatencyBreakdown,
};
use chrono::Utc;
use tempfile::tempdir;
fn latency(ms: f64) -> LatencyStats {
LatencyStats {
@@ -961,9 +1090,13 @@ mod tests {
concurrency: 2,
detailed_report: true,
retrieval_strategy: "initial".into(),
chunk_result_cap: 5,
ingest_chunk_min_tokens: 256,
ingest_chunk_max_tokens: 512,
ingest_chunk_overlap_tokens: 50,
ingest_chunks_only: false,
chunk_vector_take: 50,
chunk_fts_take: 50,
chunk_token_budget: 10_000,
chunk_avg_chars_per_token: 4,
max_chunks_per_entity: 4,
cases,
@@ -987,4 +1120,25 @@ mod tests {
assert!(!md.contains("LLM Mode Metrics"));
assert!(!md.contains("LLM-Only Cases"));
}
#[test]
fn evaluations_history_captures_strategy_and_concurrency() {
let tmp = tempdir().unwrap();
let summary = sample_summary(false);
let outcome =
write_reports(&summary, tmp.path(), 5).expect("writing consolidated reports");
let contents =
std::fs::read_to_string(&outcome.history_path).expect("reading evaluations history");
let entries: Vec<EvaluationReport> =
serde_json::from_str(&contents).expect("parsing evaluations history");
assert_eq!(entries.len(), 1);
let stored = &entries[0];
assert_eq!(stored.retrieval.concurrency, summary.concurrency);
assert_eq!(stored.retrieval.strategy, summary.retrieval_strategy);
assert_eq!(
stored.performance.evaluation_stages_ms.run_queries_ms,
summary.perf.evaluation_stage_ms.run_queries_ms
);
}
}