benchmarks: fin

2026-04-26 10:48:37 +02:00 · 2025-12-08 21:57:53 +01:00
parent 0cb1abc6db
commit a8d10f265c
39 changed files with 774 additions and 714 deletions
--- a/evaluations/src/types.rs
+++ b/evaluations/src/types.rs
@@ -0,0 +1,461 @@
+use std::collections::HashSet;
+
+use chrono::{DateTime, Utc};
+use common::storage::types::StoredObject;
+use retrieval_pipeline::{
+    PipelineDiagnostics, PipelineStageTimings, RetrievedChunk, RetrievedEntity, StrategyOutput,
+};
+use serde::{Deserialize, Serialize};
+use unicode_normalization::UnicodeNormalization;
+
+#[derive(Debug, Serialize)]
+pub struct EvaluationSummary {
+    pub generated_at: DateTime<Utc>,
+    pub k: usize,
+    pub limit: Option<usize>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub run_label: Option<String>,
+    pub total_cases: usize,
+    pub correct: usize,
+    pub precision: f64,
+    pub correct_at_1: usize,
+    pub correct_at_2: usize,
+    pub correct_at_3: usize,
+    pub precision_at_1: f64,
+    pub precision_at_2: f64,
+    pub precision_at_3: f64,
+    pub mrr: f64,
+    pub average_ndcg: f64,
+    pub duration_ms: u128,
+    pub dataset_id: String,
+    pub dataset_label: String,
+    pub dataset_includes_unanswerable: bool,
+    pub dataset_source: String,
+    pub includes_impossible_cases: bool,
+    pub require_verified_chunks: bool,
+    pub filtered_questions: usize,
+    pub retrieval_cases: usize,
+    pub retrieval_correct: usize,
+    pub retrieval_precision: f64,
+    pub llm_cases: usize,
+    pub llm_answered: usize,
+    pub llm_precision: f64,
+    pub slice_id: String,
+    pub slice_seed: u64,
+    pub slice_total_cases: usize,
+    pub slice_window_offset: usize,
+    pub slice_window_length: usize,
+    pub slice_cases: usize,
+    pub slice_positive_paragraphs: usize,
+    pub slice_negative_paragraphs: usize,
+    pub slice_total_paragraphs: usize,
+    pub slice_negative_multiplier: f32,
+    pub namespace_reused: bool,
+    pub corpus_paragraphs: usize,
+    pub ingestion_cache_path: String,
+    pub ingestion_reused: bool,
+    pub ingestion_embeddings_reused: bool,
+    pub ingestion_fingerprint: String,
+    pub positive_paragraphs_reused: usize,
+    pub negative_paragraphs_reused: usize,
+    pub latency_ms: LatencyStats,
+    pub perf: PerformanceTimings,
+    pub embedding_backend: String,
+    pub embedding_model: Option<String>,
+    pub embedding_dimension: usize,
+    pub rerank_enabled: bool,
+    pub rerank_pool_size: Option<usize>,
+    pub rerank_keep_top: usize,
+    pub concurrency: usize,
+    pub detailed_report: bool,
+    pub retrieval_strategy: String,
+    pub chunk_result_cap: usize,
+    pub chunk_rrf_k: f32,
+    pub chunk_rrf_vector_weight: f32,
+    pub chunk_rrf_fts_weight: f32,
+    pub chunk_rrf_use_vector: bool,
+    pub chunk_rrf_use_fts: bool,
+    pub ingest_chunk_min_tokens: usize,
+    pub ingest_chunk_max_tokens: usize,
+    pub ingest_chunks_only: bool,
+    pub ingest_chunk_overlap_tokens: usize,
+    pub chunk_vector_take: usize,
+    pub chunk_fts_take: usize,
+    pub chunk_avg_chars_per_token: usize,
+    pub max_chunks_per_entity: usize,
+    pub cases: Vec<CaseSummary>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct CaseSummary {
+    pub question_id: String,
+    pub question: String,
+    pub paragraph_id: String,
+    pub paragraph_title: String,
+    pub expected_source: String,
+    pub answers: Vec<String>,
+    pub matched: bool,
+    pub entity_match: bool,
+    pub chunk_text_match: bool,
+    pub chunk_id_match: bool,
+    pub is_impossible: bool,
+    pub has_verified_chunks: bool,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub match_rank: Option<usize>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub reciprocal_rank: Option<f64>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub ndcg: Option<f64>,
+    pub latency_ms: u128,
+    pub retrieved: Vec<RetrievedSummary>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LatencyStats {
+    pub avg: f64,
+    pub p50: u128,
+    pub p95: u128,
+}
+
+impl Default for LatencyStats {
+    fn default() -> Self {
+        Self {
+            avg: 0.0,
+            p50: 0,
+            p95: 0,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct StageLatencyBreakdown {
+    pub embed: LatencyStats,
+    pub collect_candidates: LatencyStats,
+    pub graph_expansion: LatencyStats,
+    pub chunk_attach: LatencyStats,
+    pub rerank: LatencyStats,
+    pub assemble: LatencyStats,
+}
+
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
+pub struct EvaluationStageTimings {
+    pub prepare_slice_ms: u128,
+    pub prepare_db_ms: u128,
+    pub prepare_corpus_ms: u128,
+    pub prepare_namespace_ms: u128,
+    pub run_queries_ms: u128,
+    pub summarize_ms: u128,
+    pub finalize_ms: u128,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct PerformanceTimings {
+    pub openai_base_url: String,
+    pub ingestion_ms: u128,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub namespace_seed_ms: Option<u128>,
+    pub evaluation_stage_ms: EvaluationStageTimings,
+    pub stage_latency: StageLatencyBreakdown,
+}
+
+#[derive(Debug, Serialize)]
+pub struct RetrievedSummary {
+    pub rank: usize,
+    pub entity_id: String,
+    pub source_id: String,
+    pub entity_name: String,
+    pub score: f32,
+    pub matched: bool,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub entity_description: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub entity_category: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub chunk_text_match: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub chunk_id_match: Option<bool>,
+}
+
+#[derive(Debug, Clone)]
+pub struct EvaluationCandidate {
+    pub entity_id: String,
+    pub source_id: String,
+    pub entity_name: String,
+    pub entity_description: Option<String>,
+    pub entity_category: Option<String>,
+    pub score: f32,
+    pub chunks: Vec<RetrievedChunk>,
+}
+
+impl EvaluationCandidate {
+    fn from_entity(entity: RetrievedEntity) -> Self {
+        let entity_category = Some(format!("{:?}", entity.entity.entity_type));
+        Self {
+            entity_id: entity.entity.get_id().to_string(),
+            source_id: entity.entity.source_id.clone(),
+            entity_name: entity.entity.name.clone(),
+            entity_description: Some(entity.entity.description.clone()),
+            entity_category,
+            score: entity.score,
+            chunks: entity.chunks,
+        }
+    }
+
+    fn from_chunk(chunk: RetrievedChunk) -> Self {
+        let snippet = chunk_snippet(&chunk.chunk.chunk);
+        Self {
+            entity_id: chunk.chunk.get_id().to_string(),
+            source_id: chunk.chunk.source_id.clone(),
+            entity_name: chunk.chunk.source_id.clone(),
+            entity_description: Some(snippet),
+            entity_category: Some("Chunk".to_string()),
+            score: chunk.score,
+            chunks: vec![chunk],
+        }
+    }
+}
+
+pub fn adapt_strategy_output(output: StrategyOutput) -> Vec<EvaluationCandidate> {
+    match output {
+        StrategyOutput::Entities(entities) => entities
+            .into_iter()
+            .map(EvaluationCandidate::from_entity)
+            .collect(),
+        StrategyOutput::Chunks(chunks) => chunks
+            .into_iter()
+            .map(EvaluationCandidate::from_chunk)
+            .collect(),
+    }
+}
+
+#[derive(Debug, Serialize)]
+pub struct CaseDiagnostics {
+    pub question_id: String,
+    pub question: String,
+    pub paragraph_id: String,
+    pub paragraph_title: String,
+    pub expected_source: String,
+    pub expected_chunk_ids: Vec<String>,
+    pub answers: Vec<String>,
+    pub entity_match: bool,
+    pub chunk_text_match: bool,
+    pub chunk_id_match: bool,
+    pub failure_reasons: Vec<String>,
+    pub missing_expected_chunk_ids: Vec<String>,
+    pub attached_chunk_ids: Vec<String>,
+    pub retrieved: Vec<EntityDiagnostics>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub pipeline: Option<PipelineDiagnostics>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct EntityDiagnostics {
+    pub rank: usize,
+    pub entity_id: String,
+    pub source_id: String,
+    pub name: String,
+    pub score: f32,
+    pub entity_match: bool,
+    pub chunk_text_match: bool,
+    pub chunk_id_match: bool,
+    pub chunks: Vec<ChunkDiagnosticsEntry>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct ChunkDiagnosticsEntry {
+    pub chunk_id: String,
+    pub score: f32,
+    pub contains_answer: bool,
+    pub expected_chunk: bool,
+    pub snippet: String,
+}
+
+pub fn text_contains_answer(text: &str, answers: &[String]) -> bool {
+    if answers.is_empty() {
+        return true;
+    }
+    let haystack = normalize_for_match(text);
+    answers
+        .iter()
+        .map(|needle| normalize_for_match(needle))
+        .any(|needle| !needle.is_empty() && haystack.contains(&needle))
+}
+
+fn normalize_for_match(input: &str) -> String {
+    // NFKC normalize, lowercase, and collapse whitespace/punctuation to a single space
+    // to reduce false negatives from formatting or punctuation differences.
+    let mut out = String::with_capacity(input.len());
+    let mut last_space = false;
+    for ch in input.nfkc().flat_map(|c| c.to_lowercase()) {
+        let is_space = ch.is_whitespace();
+        let is_punct = ch.is_ascii_punctuation()
+            || matches!(
+                ch,
+                '“' | '”' | '‘' | '’' | '«' | '»' | '–' | '—' | '…' | '·' | '•'
+            );
+        if is_space || is_punct {
+            if !last_space {
+                out.push(' ');
+                last_space = true;
+            }
+        } else {
+            out.push(ch);
+            last_space = false;
+        }
+    }
+
+    let trimmed = out.trim();
+    if trimmed.is_empty() {
+        return String::new();
+    }
+
+    trimmed
+        .trim_matches(|c: char| c.is_ascii_punctuation() || c.is_whitespace())
+        .to_string()
+}
+
+fn chunk_snippet(text: &str) -> String {
+    const MAX_CHARS: usize = 160;
+    let trimmed = text.trim();
+    if trimmed.chars().count() <= MAX_CHARS {
+        return trimmed.to_string();
+    }
+    let mut acc = String::with_capacity(MAX_CHARS + 3);
+    for (idx, ch) in trimmed.chars().enumerate() {
+        if idx >= MAX_CHARS {
+            break;
+        }
+        acc.push(ch);
+    }
+    acc.push_str("...");
+    acc
+}
+
+pub fn compute_latency_stats(latencies: &[u128]) -> LatencyStats {
+    if latencies.is_empty() {
+        return LatencyStats {
+            avg: 0.0,
+            p50: 0,
+            p95: 0,
+        };
+    }
+    let mut sorted = latencies.to_vec();
+    sorted.sort_unstable();
+    let sum: u128 = sorted.iter().copied().sum();
+    let avg = sum as f64 / (sorted.len() as f64);
+    let p50 = percentile(&sorted, 0.50);
+    let p95 = percentile(&sorted, 0.95);
+    LatencyStats { avg, p50, p95 }
+}
+
+pub fn build_stage_latency_breakdown(samples: &[PipelineStageTimings]) -> StageLatencyBreakdown {
+    fn collect_stage<F>(samples: &[PipelineStageTimings], selector: F) -> Vec<u128>
+    where
+        F: Fn(&PipelineStageTimings) -> u128,
+    {
+        samples.iter().map(selector).collect()
+    }
+
+    StageLatencyBreakdown {
+        embed: compute_latency_stats(&collect_stage(samples, |entry| entry.embed_ms())),
+        collect_candidates: compute_latency_stats(&collect_stage(samples, |entry| {
+            entry.collect_candidates_ms()
+        })),
+        graph_expansion: compute_latency_stats(&collect_stage(samples, |entry| {
+            entry.graph_expansion_ms()
+        })),
+        chunk_attach: compute_latency_stats(&collect_stage(samples, |entry| {
+            entry.chunk_attach_ms()
+        })),
+        rerank: compute_latency_stats(&collect_stage(samples, |entry| entry.rerank_ms())),
+        assemble: compute_latency_stats(&collect_stage(samples, |entry| entry.assemble_ms())),
+    }
+}
+
+fn percentile(sorted: &[u128], fraction: f64) -> u128 {
+    if sorted.is_empty() {
+        return 0;
+    }
+    let clamped = fraction.clamp(0.0, 1.0);
+    let idx = (clamped * (sorted.len() as f64 - 1.0)).round() as usize;
+    sorted[idx.min(sorted.len() - 1)]
+}
+
+pub fn build_case_diagnostics(
+    summary: &CaseSummary,
+    expected_chunk_ids: &[String],
+    answers_lower: &[String],
+    candidates: &[EvaluationCandidate],
+    pipeline_stats: Option<PipelineDiagnostics>,
+) -> CaseDiagnostics {
+    let expected_set: HashSet<&str> = expected_chunk_ids.iter().map(|id| id.as_str()).collect();
+    let mut seen_chunks: HashSet<String> = HashSet::new();
+    let mut attached_chunk_ids = Vec::new();
+    let mut entity_diagnostics = Vec::new();
+
+    for (idx, candidate) in candidates.iter().enumerate() {
+        let mut chunk_entries = Vec::new();
+        for chunk in &candidate.chunks {
+            let contains_answer = text_contains_answer(&chunk.chunk.chunk, answers_lower);
+            let expected_chunk = expected_set.contains(chunk.chunk.get_id());
+            seen_chunks.insert(chunk.chunk.get_id().to_string());
+            attached_chunk_ids.push(chunk.chunk.get_id().to_string());
+            chunk_entries.push(ChunkDiagnosticsEntry {
+                chunk_id: chunk.chunk.get_id().to_string(),
+                score: chunk.score,
+                contains_answer,
+                expected_chunk,
+                snippet: chunk_snippet(&chunk.chunk.chunk),
+            });
+        }
+        entity_diagnostics.push(EntityDiagnostics {
+            rank: idx + 1,
+            entity_id: candidate.entity_id.clone(),
+            source_id: candidate.source_id.clone(),
+            name: candidate.entity_name.clone(),
+            score: candidate.score,
+            entity_match: candidate.source_id == summary.expected_source,
+            chunk_text_match: chunk_entries.iter().any(|entry| entry.contains_answer),
+            chunk_id_match: chunk_entries.iter().any(|entry| entry.expected_chunk),
+            chunks: chunk_entries,
+        });
+    }
+
+    let missing_expected_chunk_ids = expected_chunk_ids
+        .iter()
+        .filter(|id| !seen_chunks.contains(id.as_str()))
+        .cloned()
+        .collect::<Vec<_>>();
+
+    let mut failure_reasons = Vec::new();
+    if !summary.entity_match {
+        failure_reasons.push("entity_miss".to_string());
+    }
+    if !summary.chunk_text_match {
+        failure_reasons.push("chunk_text_missing".to_string());
+    }
+    if !summary.chunk_id_match {
+        failure_reasons.push("chunk_id_missing".to_string());
+    }
+    if !missing_expected_chunk_ids.is_empty() {
+        failure_reasons.push("expected_chunk_absent".to_string());
+    }
+
+    CaseDiagnostics {
+        question_id: summary.question_id.clone(),
+        question: summary.question.clone(),
+        paragraph_id: summary.paragraph_id.clone(),
+        paragraph_title: summary.paragraph_title.clone(),
+        expected_source: summary.expected_source.clone(),
+        expected_chunk_ids: expected_chunk_ids.to_vec(),
+        answers: summary.answers.clone(),
+        entity_match: summary.entity_match,
+        chunk_text_match: summary.chunk_text_match,
+        chunk_id_match: summary.chunk_id_match,
+        failure_reasons,
+        missing_expected_chunk_ids,
+        attached_chunk_ids,
+        retrieved: entity_diagnostics,
+        pipeline: pipeline_stats,
+    }
+}