// minne/evaluations/src/types.rs

use std::collections::HashSet;
use chrono::{DateTime, Utc};
use common::storage::types::StoredObject;
use retrieval_pipeline::{
    PipelineDiagnostics, PipelineStageTimings, RetrievedChunk, RetrievedEntity, StrategyOutput,
};
use serde::{Deserialize, Serialize};
use unicode_normalization::UnicodeNormalization;
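
/// Top-level report for a single evaluation run: dataset and slice metadata,
/// aggregate retrieval metrics (precision@k, MRR, mean nDCG), latency and
/// stage timings, the effective pipeline configuration, and per-case results.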
#[derive(Debug, Serialize)]
pub struct EvaluationSummary {
    pub generated_at: DateTime<Utc>,
    pub k: usize,
    pub limit: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub run_label: Option<String>,
    pub total_cases: usize,
    pub correct: usize,
    pub precision: f64,
    pub correct_at_1: usize,
    pub correct_at_2: usize,
    pub correct_at_3: usize,
    pub precision_at_1: f64,
    pub precision_at_2: f64,
    pub precision_at_3: f64,
    pub mrr: f64,
    pub average_ndcg: f64,
    pub duration_ms: u128,
    pub dataset_id: String,
    pub dataset_label: String,
    pub dataset_includes_unanswerable: bool,
    pub dataset_source: String,
    pub includes_impossible_cases: bool,
    pub require_verified_chunks: bool,
    pub filtered_questions: usize,
    pub retrieval_cases: usize,
    pub retrieval_correct: usize,
    pub retrieval_precision: f64,
    pub llm_cases: usize,
    pub llm_answered: usize,
    pub llm_precision: f64,
    pub slice_id: String,
    pub slice_seed: u64,
    pub slice_total_cases: usize,
    pub slice_window_offset: usize,
    pub slice_window_length: usize,
    pub slice_cases: usize,
    pub slice_positive_paragraphs: usize,
    pub slice_negative_paragraphs: usize,
    pub slice_total_paragraphs: usize,
    pub slice_negative_multiplier: f32,
    pub namespace_reused: bool,
    pub corpus_paragraphs: usize,
    pub ingestion_cache_path: String,
    pub ingestion_reused: bool,
    pub ingestion_embeddings_reused: bool,
    pub ingestion_fingerprint: String,
    pub positive_paragraphs_reused: usize,
    pub negative_paragraphs_reused: usize,
    pub latency_ms: LatencyStats,
    pub perf: PerformanceTimings,
    pub embedding_backend: String,
    pub embedding_model: Option<String>,
    pub embedding_dimension: usize,
    pub rerank_enabled: bool,
    pub rerank_pool_size: Option<usize>,
    pub rerank_keep_top: usize,
    pub concurrency: usize,
    pub detailed_report: bool,
    pub retrieval_strategy: String,
    pub chunk_result_cap: usize,
    pub chunk_rrf_k: f32,
    pub chunk_rrf_vector_weight: f32,
    pub chunk_rrf_fts_weight: f32,
    pub chunk_rrf_use_vector: bool,
    pub chunk_rrf_use_fts: bool,
    pub ingest_chunk_min_tokens: usize,
    pub ingest_chunk_max_tokens: usize,
    pub ingest_chunks_only: bool,
    pub ingest_chunk_overlap_tokens: usize,
    pub chunk_vector_take: usize,
    pub chunk_fts_take: usize,
    pub chunk_avg_chars_per_token: usize,
    pub max_chunks_per_entity: usize,
    pub cases: Vec<CaseSummary>,
}
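
/// Per-question result: the expected source and answers, whether the entity,
/// chunk text, or chunk id matched, rank-based metrics, and the retrieved
/// candidates.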
#[derive(Debug, Serialize)]
pub struct CaseSummary {
    pub question_id: String,
    pub question: String,
    pub paragraph_id: String,
    pub paragraph_title: String,
    pub expected_source: String,
    pub answers: Vec<String>,
    pub matched: bool,
    pub entity_match: bool,
    pub chunk_text_match: bool,
    pub chunk_id_match: bool,
    pub is_impossible: bool,
    pub has_verified_chunks: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub match_rank: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reciprocal_rank: Option<f64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub ndcg: Option<f64>,
    pub latency_ms: u128,
    pub retrieved: Vec<RetrievedSummary>,
}
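
/// Mean, median (p50), and 95th-percentile (p95) latency in milliseconds.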
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct LatencyStats {
    pub avg: f64,
    pub p50: u128,
    pub p95: u128,
}
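
/// Latency statistics per retrieval pipeline stage, aggregated across all
/// evaluated queries.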
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct StageLatencyBreakdown {
    pub embed: LatencyStats,
    pub collect_candidates: LatencyStats,
    pub graph_expansion: LatencyStats,
    pub chunk_attach: LatencyStats,
    pub rerank: LatencyStats,
    pub assemble: LatencyStats,
}
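
/// Elapsed milliseconds for each phase of the evaluation harness itself.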
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct EvaluationStageTimings {
    pub prepare_slice_ms: u128,
    pub prepare_db_ms: u128,
    pub prepare_corpus_ms: u128,
    pub prepare_namespace_ms: u128,
    pub run_queries_ms: u128,
    pub summarize_ms: u128,
    pub finalize_ms: u128,
}
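
/// Timing context for a run: the configured OpenAI base URL, ingestion time,
/// harness phase timings, and the per-stage latency breakdown.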
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct PerformanceTimings {
    pub openai_base_url: String,
    pub ingestion_ms: u128,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub namespace_seed_ms: Option<u128>,
    pub evaluation_stage_ms: EvaluationStageTimings,
    pub stage_latency: StageLatencyBreakdown,
}
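
/// Compact view of one retrieved candidate as serialized in a case summary.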
#[derive(Debug, Serialize)]
pub struct RetrievedSummary {
    pub rank: usize,
    pub entity_id: String,
    pub source_id: String,
    pub entity_name: String,
    pub score: f32,
    pub matched: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub entity_description: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub entity_category: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub chunk_text_match: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub chunk_id_match: Option<bool>,
}
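
/// Strategy-agnostic retrieval candidate: both entity hits and bare chunk
/// hits are adapted into this shape so scoring code can treat them uniformly.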
#[derive(Debug, Clone)]
pub struct EvaluationCandidate {
    pub entity_id: String,
    pub source_id: String,
    pub entity_name: String,
    pub entity_description: Option<String>,
    pub entity_category: Option<String>,
    pub score: f32,
    pub chunks: Vec<RetrievedChunk>,
}

impl EvaluationCandidate {
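    /// Adapts a retrieved graph entity, preserving its score and attached chunks.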
    fn from_entity(entity: RetrievedEntity) -> Self {
        let entity_category = Some(format!("{:?}", entity.entity.entity_type));
        Self {
            entity_id: entity.entity.get_id().to_string(),
            source_id: entity.entity.source_id.clone(),
            entity_name: entity.entity.name.clone(),
            entity_description: Some(entity.entity.description.clone()),
            entity_category,
            score: entity.score,
            chunks: entity.chunks,
        }
    }
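
    /// Adapts a bare chunk hit; the chunk's source id doubles as the entity
    /// name and a text snippet serves as the description.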
    fn from_chunk(chunk: RetrievedChunk) -> Self {
        let snippet = chunk_snippet(&chunk.chunk.chunk);
        Self {
            entity_id: chunk.chunk.get_id().to_string(),
            source_id: chunk.chunk.source_id.clone(),
            entity_name: chunk.chunk.source_id.clone(),
            entity_description: Some(snippet),
            entity_category: Some("Chunk".to_string()),
            score: chunk.score,
            chunks: vec![chunk],
        }
    }
}
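
// Bulk adapters used by `adapt_strategy_output`.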
fn candidates_from_entities(entities: Vec<RetrievedEntity>) -> Vec<EvaluationCandidate> {
    entities
        .into_iter()
        .map(EvaluationCandidate::from_entity)
        .collect()
}

fn candidates_from_chunks(chunks: Vec<RetrievedChunk>) -> Vec<EvaluationCandidate> {
    chunks
        .into_iter()
        .map(EvaluationCandidate::from_chunk)
        .collect()
}
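
/// Flattens any `StrategyOutput` variant into one ranked candidate list;
/// mixed search output is merged and re-sorted by descending score.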
pub fn adapt_strategy_output(output: StrategyOutput) -> Vec<EvaluationCandidate> {
    match output {
        StrategyOutput::Entities(entities) => candidates_from_entities(entities),
        StrategyOutput::Chunks(chunks) => candidates_from_chunks(chunks),
        StrategyOutput::Search(search_result) => {
            let mut candidates = candidates_from_entities(search_result.entities);
            candidates.extend(candidates_from_chunks(search_result.chunks));
            candidates.sort_by(|a, b| b.score.total_cmp(&a.score));
            candidates
        }
    }
}
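
/// Per-case diagnostic record: expected vs. retrieved chunks, derived failure
/// reasons, and optional pipeline diagnostics.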
#[derive(Debug, Serialize)]
pub struct CaseDiagnostics {
    pub question_id: String,
    pub question: String,
    pub paragraph_id: String,
    pub paragraph_title: String,
    pub expected_source: String,
    pub expected_chunk_ids: Vec<String>,
    pub answers: Vec<String>,
    pub entity_match: bool,
    pub chunk_text_match: bool,
    pub chunk_id_match: bool,
    pub failure_reasons: Vec<String>,
    pub missing_expected_chunk_ids: Vec<String>,
    pub attached_chunk_ids: Vec<String>,
    pub retrieved: Vec<EntityDiagnostics>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub pipeline: Option<PipelineDiagnostics>,
}
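
/// Diagnostics for one retrieved candidate, including its attached chunks.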
#[derive(Debug, Serialize)]
pub struct EntityDiagnostics {
    pub rank: usize,
    pub entity_id: String,
    pub source_id: String,
    pub name: String,
    pub score: f32,
    pub entity_match: bool,
    pub chunk_text_match: bool,
    pub chunk_id_match: bool,
    pub chunks: Vec<ChunkDiagnosticsEntry>,
}
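
/// Diagnostics for a single attached chunk.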
#[derive(Debug, Serialize)]
pub struct ChunkDiagnosticsEntry {
    pub chunk_id: String,
    pub score: f32,
    pub contains_answer: bool,
    pub expected_chunk: bool,
    pub snippet: String,
}
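
/// Returns true when any answer occurs in `text` after both sides are
/// normalized; an empty answer list counts as a trivial match.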
pub fn text_contains_answer(text: &str, answers: &[String]) -> bool {
    if answers.is_empty() {
        return true;
    }
    let haystack = normalize_for_match(text);
    answers
        .iter()
        .map(|needle| normalize_for_match(needle))
        .any(|needle| !needle.is_empty() && haystack.contains(&needle))
}

fn normalize_for_match(input: &str) -> String {
    // NFKC normalize, lowercase, and collapse whitespace/punctuation to a single space
    // to reduce false negatives from formatting or punctuation differences.
    let mut out = String::with_capacity(input.len());
    let mut last_space = false;
    for ch in input.nfkc().flat_map(|c| c.to_lowercase()) {
        let is_space = ch.is_whitespace();
        let is_punct = ch.is_ascii_punctuation()
            || matches!(
                ch,
                '“' | '”' | '‘' | '’' | '«' | '»' | '–' | '—' | '…' | '·' | '•'
            );
        if is_space || is_punct {
            if !last_space {
                out.push(' ');
                last_space = true;
            }
        } else {
            out.push(ch);
            last_space = false;
        }
    }
    let trimmed = out.trim();
    if trimmed.is_empty() {
        return String::new();
    }
    trimmed
        .trim_matches(|c: char| c.is_ascii_punctuation() || c.is_whitespace())
        .to_string()
}
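
/// Trims chunk text and truncates it to at most 160 characters, appending
/// "..." when truncated.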
fn chunk_snippet(text: &str) -> String {
    const MAX_CHARS: usize = 160;
    let trimmed = text.trim();
    if trimmed.chars().count() <= MAX_CHARS {
        return trimmed.to_string();
    }
    let mut acc: String = trimmed.chars().take(MAX_CHARS).collect();
    acc.push_str("...");
    acc
}
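
/// Computes mean, p50, and p95 over raw latency samples in milliseconds;
/// an empty sample set yields all zeros.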
pub fn compute_latency_stats(latencies: &[u128]) -> LatencyStats {
    if latencies.is_empty() {
        return LatencyStats::default();
    }
    let mut sorted = latencies.to_vec();
    sorted.sort_unstable();
    let sum: u128 = sorted.iter().copied().sum();
    let avg = sum as f64 / (sorted.len() as f64);
    let p50 = percentile(&sorted, 0.50);
    let p95 = percentile(&sorted, 0.95);
    LatencyStats { avg, p50, p95 }
}
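
/// Aggregates per-query pipeline stage timings into per-stage latency stats.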
pub fn build_stage_latency_breakdown(samples: &[PipelineStageTimings]) -> StageLatencyBreakdown {
    fn collect_stage<F>(samples: &[PipelineStageTimings], selector: F) -> Vec<u128>
    where
        F: Fn(&PipelineStageTimings) -> u128,
    {
        samples.iter().map(selector).collect()
    }

    StageLatencyBreakdown {
        embed: compute_latency_stats(&collect_stage(samples, |entry| entry.embed_ms())),
        collect_candidates: compute_latency_stats(&collect_stage(samples, |entry| {
            entry.collect_candidates_ms()
        })),
        graph_expansion: compute_latency_stats(&collect_stage(samples, |entry| {
            entry.graph_expansion_ms()
        })),
        chunk_attach: compute_latency_stats(&collect_stage(samples, |entry| {
            entry.chunk_attach_ms()
        })),
        rerank: compute_latency_stats(&collect_stage(samples, |entry| entry.rerank_ms())),
        assemble: compute_latency_stats(&collect_stage(samples, |entry| entry.assemble_ms())),
    }
}
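
/// Nearest-rank percentile over an already-sorted slice; `fraction` is
/// clamped to [0, 1] and mapped to a rounded index.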
fn percentile(sorted: &[u128], fraction: f64) -> u128 {
    if sorted.is_empty() {
        return 0;
    }
    let clamped = fraction.clamp(0.0, 1.0);
    let idx = (clamped * (sorted.len() as f64 - 1.0)).round() as usize;
    sorted[idx.min(sorted.len() - 1)]
}
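
/// Builds the detailed diagnostics for one case: walks every retrieved
/// candidate and its chunks, flags answer and expected-chunk matches,
/// collects missing expected chunk ids, and derives coarse failure reasons.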
pub fn build_case_diagnostics(
    summary: &CaseSummary,
    expected_chunk_ids: &[String],
    answers_lower: &[String],
    candidates: &[EvaluationCandidate],
    pipeline_stats: Option<PipelineDiagnostics>,
) -> CaseDiagnostics {
    let expected_set: HashSet<&str> = expected_chunk_ids.iter().map(|id| id.as_str()).collect();
    let mut seen_chunks: HashSet<String> = HashSet::new();
    let mut attached_chunk_ids = Vec::new();
    let mut entity_diagnostics = Vec::new();
    for (idx, candidate) in candidates.iter().enumerate() {
        let mut chunk_entries = Vec::new();
        for chunk in &candidate.chunks {
            let contains_answer = text_contains_answer(&chunk.chunk.chunk, answers_lower);
            let expected_chunk = expected_set.contains(chunk.chunk.get_id());
            seen_chunks.insert(chunk.chunk.get_id().to_string());
            attached_chunk_ids.push(chunk.chunk.get_id().to_string());
            chunk_entries.push(ChunkDiagnosticsEntry {
                chunk_id: chunk.chunk.get_id().to_string(),
                score: chunk.score,
                contains_answer,
                expected_chunk,
                snippet: chunk_snippet(&chunk.chunk.chunk),
            });
        }
        entity_diagnostics.push(EntityDiagnostics {
            rank: idx + 1,
            entity_id: candidate.entity_id.clone(),
            source_id: candidate.source_id.clone(),
            name: candidate.entity_name.clone(),
            score: candidate.score,
            entity_match: candidate.source_id == summary.expected_source,
            chunk_text_match: chunk_entries.iter().any(|entry| entry.contains_answer),
            chunk_id_match: chunk_entries.iter().any(|entry| entry.expected_chunk),
            chunks: chunk_entries,
        });
    }
    let missing_expected_chunk_ids = expected_chunk_ids
        .iter()
        .filter(|id| !seen_chunks.contains(id.as_str()))
        .cloned()
        .collect::<Vec<_>>();
    let mut failure_reasons = Vec::new();
    if !summary.entity_match {
        failure_reasons.push("entity_miss".to_string());
    }
    if !summary.chunk_text_match {
        failure_reasons.push("chunk_text_missing".to_string());
    }
    if !summary.chunk_id_match {
        failure_reasons.push("chunk_id_missing".to_string());
    }
    if !missing_expected_chunk_ids.is_empty() {
        failure_reasons.push("expected_chunk_absent".to_string());
    }
    CaseDiagnostics {
        question_id: summary.question_id.clone(),
        question: summary.question.clone(),
        paragraph_id: summary.paragraph_id.clone(),
        paragraph_title: summary.paragraph_title.clone(),
        expected_source: summary.expected_source.clone(),
        expected_chunk_ids: expected_chunk_ids.to_vec(),
        answers: summary.answers.clone(),
        entity_match: summary.entity_match,
        chunk_text_match: summary.chunk_text_match,
        chunk_id_match: summary.chunk_id_match,
        failure_reasons,
        missing_expected_chunk_ids,
        attached_chunk_ids,
        retrieved: entity_diagnostics,
        pipeline: pipeline_stats,
    }
}