use std::collections::HashSet;

use chrono::{DateTime, Utc};
use common::storage::types::StoredObject;
use retrieval_pipeline::{
    PipelineDiagnostics, PipelineStageTimings, RetrievedChunk, RetrievedEntity, StrategyOutput,
};
use serde::{Deserialize, Serialize};
use unicode_normalization::UnicodeNormalization;

#[derive(Debug, Serialize)]
pub struct EvaluationSummary {
    pub generated_at: DateTime<Utc>,
    pub k: usize,
    pub limit: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub run_label: Option<String>,
    pub total_cases: usize,
    pub correct: usize,
    pub precision: f64,
    pub correct_at_1: usize,
    pub correct_at_2: usize,
    pub correct_at_3: usize,
    pub precision_at_1: f64,
    pub precision_at_2: f64,
    pub precision_at_3: f64,
    pub mrr: f64,
    pub average_ndcg: f64,
    pub duration_ms: u128,
    pub dataset_id: String,
    pub dataset_label: String,
    pub dataset_includes_unanswerable: bool,
    pub dataset_source: String,
    pub includes_impossible_cases: bool,
    pub require_verified_chunks: bool,
    pub filtered_questions: usize,
    pub retrieval_cases: usize,
    pub retrieval_correct: usize,
    pub retrieval_precision: f64,
    pub llm_cases: usize,
    pub llm_answered: usize,
    pub llm_precision: f64,
    pub slice_id: String,
    pub slice_seed: u64,
    pub slice_total_cases: usize,
    pub slice_window_offset: usize,
    pub slice_window_length: usize,
    pub slice_cases: usize,
    pub slice_positive_paragraphs: usize,
    pub slice_negative_paragraphs: usize,
    pub slice_total_paragraphs: usize,
    pub slice_negative_multiplier: f32,
    pub namespace_reused: bool,
    pub corpus_paragraphs: usize,
    pub ingestion_cache_path: String,
    pub ingestion_reused: bool,
    pub ingestion_embeddings_reused: bool,
    pub ingestion_fingerprint: String,
    pub positive_paragraphs_reused: usize,
    pub negative_paragraphs_reused: usize,
    pub latency_ms: LatencyStats,
    pub perf: PerformanceTimings,
    pub embedding_backend: String,
    pub embedding_model: Option<String>,
    pub embedding_dimension: usize,
    pub rerank_enabled: bool,
    pub rerank_pool_size: Option<usize>,
    pub rerank_keep_top: usize,
    pub concurrency: usize,
    pub detailed_report: bool,
    pub retrieval_strategy: String,
    pub chunk_result_cap: usize,
    pub chunk_rrf_k: f32,
    pub chunk_rrf_vector_weight: f32,
    pub chunk_rrf_fts_weight: f32,
    pub chunk_rrf_use_vector: bool,
    pub chunk_rrf_use_fts: bool,
    pub ingest_chunk_min_tokens: usize,
    pub ingest_chunk_max_tokens: usize,
    pub ingest_chunks_only: bool,
    pub ingest_chunk_overlap_tokens: usize,
    pub chunk_vector_take: usize,
    pub chunk_fts_take: usize,
    pub chunk_avg_chars_per_token: usize,
    pub max_chunks_per_entity: usize,
    pub cases: Vec<CaseSummary>,
}

#[derive(Debug, Serialize)]
pub struct CaseSummary {
    pub question_id: String,
    pub question: String,
    pub paragraph_id: String,
    pub paragraph_title: String,
    pub expected_source: String,
    pub answers: Vec<String>,
    pub matched: bool,
    pub entity_match: bool,
    pub chunk_text_match: bool,
    pub chunk_id_match: bool,
    pub is_impossible: bool,
    pub has_verified_chunks: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub match_rank: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reciprocal_rank: Option<f64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub ndcg: Option<f64>,
    pub latency_ms: u128,
    pub retrieved: Vec<RetrievedSummary>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyStats {
    pub avg: f64,
    pub p50: u128,
    pub p95: u128,
}

impl Default for LatencyStats {
    fn default() -> Self {
        Self {
            avg: 0.0,
            p50: 0,
            p95: 0,
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct StageLatencyBreakdown {
    pub embed: LatencyStats,
    pub collect_candidates: LatencyStats,
    pub graph_expansion: LatencyStats,
    pub chunk_attach: LatencyStats,
    pub rerank: LatencyStats,
    pub assemble: LatencyStats,
}
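// Illustrative sketch only: a quick check of the serde derives and the zeroed
// `Default` on `LatencyStats` above. It assumes `serde_json` is available as a
// dev-dependency, which this file does not confirm.
#[cfg(test)]
mod latency_stats_serde_tests {
    use super::*;

    #[test]
    fn latency_stats_round_trip_through_json() {
        let zeroed = LatencyStats::default();
        assert_eq!(zeroed.p50, 0);
        assert_eq!(zeroed.p95, 0);

        let stats = LatencyStats {
            avg: 12.5,
            p50: 10,
            p95: 42,
        };
        let json = serde_json::to_string(&stats).expect("serialize");
        let back: LatencyStats = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(back.p50, 10);
        assert_eq!(back.p95, 42);
    }
}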
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct EvaluationStageTimings {
    pub prepare_slice_ms: u128,
    pub prepare_db_ms: u128,
    pub prepare_corpus_ms: u128,
    pub prepare_namespace_ms: u128,
    pub run_queries_ms: u128,
    pub summarize_ms: u128,
    pub finalize_ms: u128,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct PerformanceTimings {
    pub openai_base_url: String,
    pub ingestion_ms: u128,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub namespace_seed_ms: Option<u128>,
    pub evaluation_stage_ms: EvaluationStageTimings,
    pub stage_latency: StageLatencyBreakdown,
}

#[derive(Debug, Serialize)]
pub struct RetrievedSummary {
    pub rank: usize,
    pub entity_id: String,
    pub source_id: String,
    pub entity_name: String,
    pub score: f32,
    pub matched: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub entity_description: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub entity_category: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub chunk_text_match: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub chunk_id_match: Option<bool>,
}

#[derive(Debug, Clone)]
pub struct EvaluationCandidate {
    pub entity_id: String,
    pub source_id: String,
    pub entity_name: String,
    pub entity_description: Option<String>,
    pub entity_category: Option<String>,
    pub score: f32,
    pub chunks: Vec<RetrievedChunk>,
}

impl EvaluationCandidate {
    fn from_entity(entity: RetrievedEntity) -> Self {
        let entity_category = Some(format!("{:?}", entity.entity.entity_type));
        Self {
            entity_id: entity.entity.get_id().to_string(),
            source_id: entity.entity.source_id.clone(),
            entity_name: entity.entity.name.clone(),
            entity_description: Some(entity.entity.description.clone()),
            entity_category,
            score: entity.score,
            chunks: entity.chunks,
        }
    }

    fn from_chunk(chunk: RetrievedChunk) -> Self {
        let snippet = chunk_snippet(&chunk.chunk.chunk);
        Self {
            entity_id: chunk.chunk.get_id().to_string(),
            source_id: chunk.chunk.source_id.clone(),
            entity_name: chunk.chunk.source_id.clone(),
            entity_description: Some(snippet),
            entity_category: Some("Chunk".to_string()),
            score: chunk.score,
            chunks: vec![chunk],
        }
    }
}

fn candidates_from_entities(entities: Vec<RetrievedEntity>) -> Vec<EvaluationCandidate> {
    entities
        .into_iter()
        .map(EvaluationCandidate::from_entity)
        .collect()
}

fn candidates_from_chunks(chunks: Vec<RetrievedChunk>) -> Vec<EvaluationCandidate> {
    chunks
        .into_iter()
        .map(EvaluationCandidate::from_chunk)
        .collect()
}

pub fn adapt_strategy_output(output: StrategyOutput) -> Vec<EvaluationCandidate> {
    match output {
        StrategyOutput::Entities(entities) => candidates_from_entities(entities),
        StrategyOutput::Chunks(chunks) => candidates_from_chunks(chunks),
        StrategyOutput::Search(search_result) => {
            let mut candidates = candidates_from_entities(search_result.entities);
            candidates.extend(candidates_from_chunks(search_result.chunks));
            candidates.sort_by(|a, b| b.score.total_cmp(&a.score));
            candidates
        }
    }
}
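// Illustrative sketch only: `adapt_strategy_output` merges entity and chunk
// candidates and orders them by descending score with `f32::total_cmp`, which
// is total even over NaN. This exercises the same comparator on hand-built
// candidates; the `candidate` helper is hypothetical test scaffolding (chunk
// lists left empty for brevity), not part of the pipeline.
#[cfg(test)]
mod candidate_ordering_tests {
    use super::*;

    fn candidate(id: &str, score: f32) -> EvaluationCandidate {
        EvaluationCandidate {
            entity_id: id.to_string(),
            source_id: id.to_string(),
            entity_name: id.to_string(),
            entity_description: None,
            entity_category: None,
            score,
            chunks: Vec::new(),
        }
    }

    #[test]
    fn merged_candidates_sort_by_descending_score() {
        let mut merged = vec![candidate("low", 0.2), candidate("high", 0.9)];
        // Same comparator as the StrategyOutput::Search arm above.
        merged.sort_by(|a, b| b.score.total_cmp(&a.score));
        assert_eq!(merged[0].entity_id, "high");
        assert_eq!(merged[1].entity_id, "low");
    }
}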
#[derive(Debug, Serialize)]
pub struct CaseDiagnostics {
    pub question_id: String,
    pub question: String,
    pub paragraph_id: String,
    pub paragraph_title: String,
    pub expected_source: String,
    pub expected_chunk_ids: Vec<String>,
    pub answers: Vec<String>,
    pub entity_match: bool,
    pub chunk_text_match: bool,
    pub chunk_id_match: bool,
    pub failure_reasons: Vec<String>,
    pub missing_expected_chunk_ids: Vec<String>,
    pub attached_chunk_ids: Vec<String>,
    pub retrieved: Vec<EntityDiagnostics>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub pipeline: Option<PipelineDiagnostics>,
}

#[derive(Debug, Serialize)]
pub struct EntityDiagnostics {
    pub rank: usize,
    pub entity_id: String,
    pub source_id: String,
    pub name: String,
    pub score: f32,
    pub entity_match: bool,
    pub chunk_text_match: bool,
    pub chunk_id_match: bool,
    pub chunks: Vec<ChunkDiagnosticsEntry>,
}

#[derive(Debug, Serialize)]
pub struct ChunkDiagnosticsEntry {
    pub chunk_id: String,
    pub score: f32,
    pub contains_answer: bool,
    pub expected_chunk: bool,
    pub snippet: String,
}

pub fn text_contains_answer(text: &str, answers: &[String]) -> bool {
    if answers.is_empty() {
        return true;
    }
    let haystack = normalize_for_match(text);
    answers
        .iter()
        .map(|needle| normalize_for_match(needle))
        .any(|needle| !needle.is_empty() && haystack.contains(&needle))
}

fn normalize_for_match(input: &str) -> String {
    // NFKC normalize, lowercase, and collapse whitespace/punctuation to a single space
    // to reduce false negatives from formatting or punctuation differences.
    let mut out = String::with_capacity(input.len());
    let mut last_space = false;
    for ch in input.nfkc().flat_map(|c| c.to_lowercase()) {
        let is_space = ch.is_whitespace();
        let is_punct = ch.is_ascii_punctuation()
            || matches!(
                ch,
                '“' | '”' | '‘' | '’' | '«' | '»' | '–' | '—' | '…' | '·' | '•'
            );
        if is_space || is_punct {
            if !last_space {
                out.push(' ');
                last_space = true;
            }
        } else {
            out.push(ch);
            last_space = false;
        }
    }
    let trimmed = out.trim();
    if trimmed.is_empty() {
        return String::new();
    }
    trimmed
        .trim_matches(|c: char| c.is_ascii_punctuation() || c.is_whitespace())
        .to_string()
}

fn chunk_snippet(text: &str) -> String {
    const MAX_CHARS: usize = 160;
    let trimmed = text.trim();
    if trimmed.chars().count() <= MAX_CHARS {
        return trimmed.to_string();
    }
    let mut acc = String::with_capacity(MAX_CHARS + 3);
    for (idx, ch) in trimmed.chars().enumerate() {
        if idx >= MAX_CHARS {
            break;
        }
        acc.push(ch);
    }
    acc.push_str("...");
    acc
}

pub fn compute_latency_stats(latencies: &[u128]) -> LatencyStats {
    if latencies.is_empty() {
        return LatencyStats {
            avg: 0.0,
            p50: 0,
            p95: 0,
        };
    }
    let mut sorted = latencies.to_vec();
    sorted.sort_unstable();
    let sum: u128 = sorted.iter().copied().sum();
    let avg = sum as f64 / (sorted.len() as f64);
    let p50 = percentile(&sorted, 0.50);
    let p95 = percentile(&sorted, 0.95);
    LatencyStats { avg, p50, p95 }
}

pub fn build_stage_latency_breakdown(samples: &[PipelineStageTimings]) -> StageLatencyBreakdown {
    fn collect_stage<F>(samples: &[PipelineStageTimings], selector: F) -> Vec<u128>
    where
        F: Fn(&PipelineStageTimings) -> u128,
    {
        samples.iter().map(selector).collect()
    }

    StageLatencyBreakdown {
        embed: compute_latency_stats(&collect_stage(samples, |entry| entry.embed_ms())),
        collect_candidates: compute_latency_stats(&collect_stage(samples, |entry| {
            entry.collect_candidates_ms()
        })),
        graph_expansion: compute_latency_stats(&collect_stage(samples, |entry| {
            entry.graph_expansion_ms()
        })),
        chunk_attach: compute_latency_stats(&collect_stage(samples, |entry| {
            entry.chunk_attach_ms()
        })),
        rerank: compute_latency_stats(&collect_stage(samples, |entry| entry.rerank_ms())),
        assemble: compute_latency_stats(&collect_stage(samples, |entry| entry.assemble_ms())),
    }
}

fn percentile(sorted: &[u128], fraction: f64) -> u128 {
    if sorted.is_empty() {
        return 0;
    }
    let clamped = fraction.clamp(0.0, 1.0);
    let idx = (clamped * (sorted.len() as f64 - 1.0)).round() as usize;
    sorted[idx.min(sorted.len() - 1)]
}
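// Illustrative sketch only: the expected values below are derived from the code
// as written (rounded-index selection in `percentile`, NFKC + punctuation
// collapsing in `normalize_for_match`); they are worked examples, not a spec.
#[cfg(test)]
mod matching_and_stats_tests {
    use super::*;

    #[test]
    fn punctuation_and_case_differences_still_match() {
        let answers = vec!["Apollo 11".to_string()];
        // "APOLLO-11," normalizes to "apollo 11", so the needle is found.
        assert!(text_contains_answer("...landed during APOLLO-11, in 1969.", &answers));
        assert!(!text_contains_answer("the Gemini program", &answers));
    }

    #[test]
    fn empty_answer_list_is_treated_as_trivially_matched() {
        assert!(text_contains_answer("anything", &[]));
    }

    #[test]
    fn latency_stats_use_rounded_percentile_indices() {
        let latencies: Vec<u128> = (1u128..=100).collect();
        let stats = compute_latency_stats(&latencies);
        assert!((stats.avg - 50.5).abs() < 1e-9);
        // round(0.50 * 99) = 50 -> value 51; round(0.95 * 99) = 94 -> value 95.
        assert_eq!(stats.p50, 51);
        assert_eq!(stats.p95, 95);
    }
}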
pub fn build_case_diagnostics(
    summary: &CaseSummary,
    expected_chunk_ids: &[String],
    answers_lower: &[String],
    candidates: &[EvaluationCandidate],
    pipeline_stats: Option<PipelineDiagnostics>,
) -> CaseDiagnostics {
    let expected_set: HashSet<&str> = expected_chunk_ids.iter().map(|id| id.as_str()).collect();
    let mut seen_chunks: HashSet<String> = HashSet::new();
    let mut attached_chunk_ids = Vec::new();
    let mut entity_diagnostics = Vec::new();
    for (idx, candidate) in candidates.iter().enumerate() {
        let mut chunk_entries = Vec::new();
        for chunk in &candidate.chunks {
            let contains_answer = text_contains_answer(&chunk.chunk.chunk, answers_lower);
            let expected_chunk = expected_set.contains(chunk.chunk.get_id());
            seen_chunks.insert(chunk.chunk.get_id().to_string());
            attached_chunk_ids.push(chunk.chunk.get_id().to_string());
            chunk_entries.push(ChunkDiagnosticsEntry {
                chunk_id: chunk.chunk.get_id().to_string(),
                score: chunk.score,
                contains_answer,
                expected_chunk,
                snippet: chunk_snippet(&chunk.chunk.chunk),
            });
        }
        entity_diagnostics.push(EntityDiagnostics {
            rank: idx + 1,
            entity_id: candidate.entity_id.clone(),
            source_id: candidate.source_id.clone(),
            name: candidate.entity_name.clone(),
            score: candidate.score,
            entity_match: candidate.source_id == summary.expected_source,
            chunk_text_match: chunk_entries.iter().any(|entry| entry.contains_answer),
            chunk_id_match: chunk_entries.iter().any(|entry| entry.expected_chunk),
            chunks: chunk_entries,
        });
    }
    let missing_expected_chunk_ids = expected_chunk_ids
        .iter()
        .filter(|id| !seen_chunks.contains(id.as_str()))
        .cloned()
        .collect::<Vec<_>>();
    let mut failure_reasons = Vec::new();
    if !summary.entity_match {
        failure_reasons.push("entity_miss".to_string());
    }
    if !summary.chunk_text_match {
        failure_reasons.push("chunk_text_missing".to_string());
    }
    if !summary.chunk_id_match {
        failure_reasons.push("chunk_id_missing".to_string());
    }
    if !missing_expected_chunk_ids.is_empty() {
        failure_reasons.push("expected_chunk_absent".to_string());
    }
    CaseDiagnostics {
        question_id: summary.question_id.clone(),
        question: summary.question.clone(),
        paragraph_id: summary.paragraph_id.clone(),
        paragraph_title: summary.paragraph_title.clone(),
        expected_source: summary.expected_source.clone(),
        expected_chunk_ids: expected_chunk_ids.to_vec(),
        answers: summary.answers.clone(),
        entity_match: summary.entity_match,
        chunk_text_match: summary.chunk_text_match,
        chunk_id_match: summary.chunk_id_match,
        failure_reasons,
        missing_expected_chunk_ids,
        attached_chunk_ids,
        retrieved: entity_diagnostics,
        pipeline: pipeline_stats,
    }
}
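// Illustrative sketch only: a minimal check of `build_case_diagnostics` failure
// tagging, using a hand-built `CaseSummary` and a single candidate with no
// attached chunks. All field values here are hypothetical test fixtures.
#[cfg(test)]
mod case_diagnostics_tests {
    use super::*;

    #[test]
    fn missing_expected_chunks_are_reported_as_failure_reasons() {
        let summary = CaseSummary {
            question_id: "q1".to_string(),
            question: "who?".to_string(),
            paragraph_id: "p1".to_string(),
            paragraph_title: "Title".to_string(),
            expected_source: "p1".to_string(),
            answers: vec!["ada".to_string()],
            matched: false,
            entity_match: true,
            chunk_text_match: false,
            chunk_id_match: false,
            is_impossible: false,
            has_verified_chunks: true,
            match_rank: None,
            reciprocal_rank: None,
            ndcg: None,
            latency_ms: 0,
            retrieved: Vec::new(),
        };
        let candidate = EvaluationCandidate {
            entity_id: "e1".to_string(),
            source_id: "p1".to_string(),
            entity_name: "Entity".to_string(),
            entity_description: None,
            entity_category: None,
            score: 1.0,
            chunks: Vec::new(),
        };
        let expected = vec!["c1".to_string()];
        let diag =
            build_case_diagnostics(&summary, &expected, &summary.answers, &[candidate], None);
        // The expected chunk never appeared among attached chunks...
        assert_eq!(diag.missing_expected_chunk_ids, vec!["c1".to_string()]);
        // ...so both the chunk-level misses and the absence are tagged.
        assert!(diag.failure_reasons.contains(&"chunk_text_missing".to_string()));
        assert!(diag.failure_reasons.contains(&"expected_chunk_absent".to_string()));
        // The candidate's source matches the expected source, so the entity hit stands.
        assert!(diag.retrieved[0].entity_match);
    }
}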