// Mirror of https://github.com/perstarkse/minne.git
// Synced 2026-03-24 18:31:45 +01:00 (476 lines, 15 KiB, Rust)
use std::collections::HashSet;

use chrono::{DateTime, Utc};
use common::storage::types::StoredObject;
use retrieval_pipeline::{
    PipelineDiagnostics, PipelineStageTimings, RetrievedChunk, RetrievedEntity, StrategyOutput,
};
use serde::{Deserialize, Serialize};
use unicode_normalization::UnicodeNormalization;

#[derive(Debug, Serialize)]
|
||
pub struct EvaluationSummary {
|
||
pub generated_at: DateTime<Utc>,
|
||
pub k: usize,
|
||
pub limit: Option<usize>,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub run_label: Option<String>,
|
||
pub total_cases: usize,
|
||
pub correct: usize,
|
||
pub precision: f64,
|
||
pub correct_at_1: usize,
|
||
pub correct_at_2: usize,
|
||
pub correct_at_3: usize,
|
||
pub precision_at_1: f64,
|
||
pub precision_at_2: f64,
|
||
pub precision_at_3: f64,
|
||
pub mrr: f64,
|
||
pub average_ndcg: f64,
|
||
pub duration_ms: u128,
|
||
pub dataset_id: String,
|
||
pub dataset_label: String,
|
||
pub dataset_includes_unanswerable: bool,
|
||
pub dataset_source: String,
|
||
pub includes_impossible_cases: bool,
|
||
pub require_verified_chunks: bool,
|
||
pub filtered_questions: usize,
|
||
pub retrieval_cases: usize,
|
||
pub retrieval_correct: usize,
|
||
pub retrieval_precision: f64,
|
||
pub llm_cases: usize,
|
||
pub llm_answered: usize,
|
||
pub llm_precision: f64,
|
||
pub slice_id: String,
|
||
pub slice_seed: u64,
|
||
pub slice_total_cases: usize,
|
||
pub slice_window_offset: usize,
|
||
pub slice_window_length: usize,
|
||
pub slice_cases: usize,
|
||
pub slice_positive_paragraphs: usize,
|
||
pub slice_negative_paragraphs: usize,
|
||
pub slice_total_paragraphs: usize,
|
||
pub slice_negative_multiplier: f32,
|
||
pub namespace_reused: bool,
|
||
pub corpus_paragraphs: usize,
|
||
pub ingestion_cache_path: String,
|
||
pub ingestion_reused: bool,
|
||
pub ingestion_embeddings_reused: bool,
|
||
pub ingestion_fingerprint: String,
|
||
pub positive_paragraphs_reused: usize,
|
||
pub negative_paragraphs_reused: usize,
|
||
pub latency_ms: LatencyStats,
|
||
pub perf: PerformanceTimings,
|
||
pub embedding_backend: String,
|
||
pub embedding_model: Option<String>,
|
||
pub embedding_dimension: usize,
|
||
pub rerank_enabled: bool,
|
||
pub rerank_pool_size: Option<usize>,
|
||
pub rerank_keep_top: usize,
|
||
pub concurrency: usize,
|
||
pub detailed_report: bool,
|
||
pub retrieval_strategy: String,
|
||
pub chunk_result_cap: usize,
|
||
pub chunk_rrf_k: f32,
|
||
pub chunk_rrf_vector_weight: f32,
|
||
pub chunk_rrf_fts_weight: f32,
|
||
pub chunk_rrf_use_vector: bool,
|
||
pub chunk_rrf_use_fts: bool,
|
||
pub ingest_chunk_min_tokens: usize,
|
||
pub ingest_chunk_max_tokens: usize,
|
||
pub ingest_chunks_only: bool,
|
||
pub ingest_chunk_overlap_tokens: usize,
|
||
pub chunk_vector_take: usize,
|
||
pub chunk_fts_take: usize,
|
||
pub chunk_avg_chars_per_token: usize,
|
||
pub max_chunks_per_entity: usize,
|
||
pub cases: Vec<CaseSummary>,
|
||
}
|
||
|
||
#[derive(Debug, Serialize)]
|
||
pub struct CaseSummary {
|
||
pub question_id: String,
|
||
pub question: String,
|
||
pub paragraph_id: String,
|
||
pub paragraph_title: String,
|
||
pub expected_source: String,
|
||
pub answers: Vec<String>,
|
||
pub matched: bool,
|
||
pub entity_match: bool,
|
||
pub chunk_text_match: bool,
|
||
pub chunk_id_match: bool,
|
||
pub is_impossible: bool,
|
||
pub has_verified_chunks: bool,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub match_rank: Option<usize>,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub reciprocal_rank: Option<f64>,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub ndcg: Option<f64>,
|
||
pub latency_ms: u128,
|
||
pub retrieved: Vec<RetrievedSummary>,
|
||
}
|
||
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
pub struct LatencyStats {
|
||
pub avg: f64,
|
||
pub p50: u128,
|
||
pub p95: u128,
|
||
}
|
||
|
||
impl Default for LatencyStats {
|
||
fn default() -> Self {
|
||
Self {
|
||
avg: 0.0,
|
||
p50: 0,
|
||
p95: 0,
|
||
}
|
||
}
|
||
}
|
||
|
||
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||
pub struct StageLatencyBreakdown {
|
||
pub embed: LatencyStats,
|
||
pub collect_candidates: LatencyStats,
|
||
pub graph_expansion: LatencyStats,
|
||
pub chunk_attach: LatencyStats,
|
||
pub rerank: LatencyStats,
|
||
pub assemble: LatencyStats,
|
||
}
|
||
|
||
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
|
||
pub struct EvaluationStageTimings {
|
||
pub prepare_slice_ms: u128,
|
||
pub prepare_db_ms: u128,
|
||
pub prepare_corpus_ms: u128,
|
||
pub prepare_namespace_ms: u128,
|
||
pub run_queries_ms: u128,
|
||
pub summarize_ms: u128,
|
||
pub finalize_ms: u128,
|
||
}
|
||
|
||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||
pub struct PerformanceTimings {
|
||
pub openai_base_url: String,
|
||
pub ingestion_ms: u128,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub namespace_seed_ms: Option<u128>,
|
||
pub evaluation_stage_ms: EvaluationStageTimings,
|
||
pub stage_latency: StageLatencyBreakdown,
|
||
}
|
||
|
||
#[derive(Debug, Serialize)]
|
||
pub struct RetrievedSummary {
|
||
pub rank: usize,
|
||
pub entity_id: String,
|
||
pub source_id: String,
|
||
pub entity_name: String,
|
||
pub score: f32,
|
||
pub matched: bool,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub entity_description: Option<String>,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub entity_category: Option<String>,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub chunk_text_match: Option<bool>,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub chunk_id_match: Option<bool>,
|
||
}
|
||
|
||
#[derive(Debug, Clone)]
|
||
pub struct EvaluationCandidate {
|
||
pub entity_id: String,
|
||
pub source_id: String,
|
||
pub entity_name: String,
|
||
pub entity_description: Option<String>,
|
||
pub entity_category: Option<String>,
|
||
pub score: f32,
|
||
pub chunks: Vec<RetrievedChunk>,
|
||
}
|
||
|
||
impl EvaluationCandidate {
|
||
fn from_entity(entity: RetrievedEntity) -> Self {
|
||
let entity_category = Some(format!("{:?}", entity.entity.entity_type));
|
||
Self {
|
||
entity_id: entity.entity.get_id().to_string(),
|
||
source_id: entity.entity.source_id.clone(),
|
||
entity_name: entity.entity.name.clone(),
|
||
entity_description: Some(entity.entity.description.clone()),
|
||
entity_category,
|
||
score: entity.score,
|
||
chunks: entity.chunks,
|
||
}
|
||
}
|
||
|
||
fn from_chunk(chunk: RetrievedChunk) -> Self {
|
||
let snippet = chunk_snippet(&chunk.chunk.chunk);
|
||
Self {
|
||
entity_id: chunk.chunk.get_id().to_string(),
|
||
source_id: chunk.chunk.source_id.clone(),
|
||
entity_name: chunk.chunk.source_id.clone(),
|
||
entity_description: Some(snippet),
|
||
entity_category: Some("Chunk".to_string()),
|
||
score: chunk.score,
|
||
chunks: vec![chunk],
|
||
}
|
||
}
|
||
}
|
||
|
||
fn candidates_from_entities(entities: Vec<RetrievedEntity>) -> Vec<EvaluationCandidate> {
|
||
entities
|
||
.into_iter()
|
||
.map(EvaluationCandidate::from_entity)
|
||
.collect()
|
||
}
|
||
|
||
fn candidates_from_chunks(chunks: Vec<RetrievedChunk>) -> Vec<EvaluationCandidate> {
|
||
chunks
|
||
.into_iter()
|
||
.map(EvaluationCandidate::from_chunk)
|
||
.collect()
|
||
}
|
||
|
||
pub fn adapt_strategy_output(output: StrategyOutput) -> Vec<EvaluationCandidate> {
|
||
match output {
|
||
StrategyOutput::Entities(entities) => candidates_from_entities(entities),
|
||
StrategyOutput::Chunks(chunks) => candidates_from_chunks(chunks),
|
||
StrategyOutput::Search(search_result) => {
|
||
let mut candidates = candidates_from_entities(search_result.entities);
|
||
candidates.extend(candidates_from_chunks(search_result.chunks));
|
||
candidates.sort_by(|a, b| b.score.total_cmp(&a.score));
|
||
candidates
|
||
}
|
||
}
|
||
}
|
||
|
||
#[derive(Debug, Serialize)]
|
||
pub struct CaseDiagnostics {
|
||
pub question_id: String,
|
||
pub question: String,
|
||
pub paragraph_id: String,
|
||
pub paragraph_title: String,
|
||
pub expected_source: String,
|
||
pub expected_chunk_ids: Vec<String>,
|
||
pub answers: Vec<String>,
|
||
pub entity_match: bool,
|
||
pub chunk_text_match: bool,
|
||
pub chunk_id_match: bool,
|
||
pub failure_reasons: Vec<String>,
|
||
pub missing_expected_chunk_ids: Vec<String>,
|
||
pub attached_chunk_ids: Vec<String>,
|
||
pub retrieved: Vec<EntityDiagnostics>,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub pipeline: Option<PipelineDiagnostics>,
|
||
}
|
||
|
||
#[derive(Debug, Serialize)]
|
||
pub struct EntityDiagnostics {
|
||
pub rank: usize,
|
||
pub entity_id: String,
|
||
pub source_id: String,
|
||
pub name: String,
|
||
pub score: f32,
|
||
pub entity_match: bool,
|
||
pub chunk_text_match: bool,
|
||
pub chunk_id_match: bool,
|
||
pub chunks: Vec<ChunkDiagnosticsEntry>,
|
||
}
|
||
|
||
#[derive(Debug, Serialize)]
|
||
pub struct ChunkDiagnosticsEntry {
|
||
pub chunk_id: String,
|
||
pub score: f32,
|
||
pub contains_answer: bool,
|
||
pub expected_chunk: bool,
|
||
pub snippet: String,
|
||
}
|
||
|
||
pub fn text_contains_answer(text: &str, answers: &[String]) -> bool {
|
||
if answers.is_empty() {
|
||
return true;
|
||
}
|
||
let haystack = normalize_for_match(text);
|
||
answers
|
||
.iter()
|
||
.map(|needle| normalize_for_match(needle))
|
||
.any(|needle| !needle.is_empty() && haystack.contains(&needle))
|
||
}
|
||
|
||
fn normalize_for_match(input: &str) -> String {
|
||
// NFKC normalize, lowercase, and collapse whitespace/punctuation to a single space
|
||
// to reduce false negatives from formatting or punctuation differences.
|
||
let mut out = String::with_capacity(input.len());
|
||
let mut last_space = false;
|
||
for ch in input.nfkc().flat_map(|c| c.to_lowercase()) {
|
||
let is_space = ch.is_whitespace();
|
||
let is_punct = ch.is_ascii_punctuation()
|
||
|| matches!(
|
||
ch,
|
||
'“' | '”' | '‘' | '’' | '«' | '»' | '–' | '—' | '…' | '·' | '•'
|
||
);
|
||
if is_space || is_punct {
|
||
if !last_space {
|
||
out.push(' ');
|
||
last_space = true;
|
||
}
|
||
} else {
|
||
out.push(ch);
|
||
last_space = false;
|
||
}
|
||
}
|
||
|
||
let trimmed = out.trim();
|
||
if trimmed.is_empty() {
|
||
return String::new();
|
||
}
|
||
|
||
trimmed
|
||
.trim_matches(|c: char| c.is_ascii_punctuation() || c.is_whitespace())
|
||
.to_string()
|
||
}
|
||
|
||
/// Returns a display snippet of `text`.
///
/// Surrounding whitespace is trimmed first. Text of at most 160 characters
/// (Unicode scalar values, not bytes) is returned as is; longer text is cut
/// after the 160th character and suffixed with `"..."`.
fn chunk_snippet(text: &str) -> String {
    const MAX_CHARS: usize = 160;

    let trimmed = text.trim();
    // `nth(MAX_CHARS)` finds the byte offset of the first character past the
    // limit in a single pass. `None` means the text already fits.
    match trimmed.char_indices().nth(MAX_CHARS) {
        None => trimmed.to_string(),
        Some((cut, _)) => {
            let mut snippet = String::with_capacity(cut + 3);
            snippet.push_str(&trimmed[..cut]);
            snippet.push_str("...");
            snippet
        }
    }
}

pub fn compute_latency_stats(latencies: &[u128]) -> LatencyStats {
|
||
if latencies.is_empty() {
|
||
return LatencyStats {
|
||
avg: 0.0,
|
||
p50: 0,
|
||
p95: 0,
|
||
};
|
||
}
|
||
let mut sorted = latencies.to_vec();
|
||
sorted.sort_unstable();
|
||
let sum: u128 = sorted.iter().copied().sum();
|
||
let avg = sum as f64 / (sorted.len() as f64);
|
||
let p50 = percentile(&sorted, 0.50);
|
||
let p95 = percentile(&sorted, 0.95);
|
||
LatencyStats { avg, p50, p95 }
|
||
}
|
||
|
||
pub fn build_stage_latency_breakdown(samples: &[PipelineStageTimings]) -> StageLatencyBreakdown {
|
||
fn collect_stage<F>(samples: &[PipelineStageTimings], selector: F) -> Vec<u128>
|
||
where
|
||
F: Fn(&PipelineStageTimings) -> u128,
|
||
{
|
||
samples.iter().map(selector).collect()
|
||
}
|
||
|
||
StageLatencyBreakdown {
|
||
embed: compute_latency_stats(&collect_stage(samples, |entry| entry.embed_ms())),
|
||
collect_candidates: compute_latency_stats(&collect_stage(samples, |entry| {
|
||
entry.collect_candidates_ms()
|
||
})),
|
||
graph_expansion: compute_latency_stats(&collect_stage(samples, |entry| {
|
||
entry.graph_expansion_ms()
|
||
})),
|
||
chunk_attach: compute_latency_stats(&collect_stage(samples, |entry| {
|
||
entry.chunk_attach_ms()
|
||
})),
|
||
rerank: compute_latency_stats(&collect_stage(samples, |entry| entry.rerank_ms())),
|
||
assemble: compute_latency_stats(&collect_stage(samples, |entry| entry.assemble_ms())),
|
||
}
|
||
}
|
||
|
||
/// Picks the value at the given percentile from an ascending-sorted slice.
///
/// `fraction` is clamped to `0.0..=1.0`. The index is
/// `round(fraction * (len - 1))`, i.e. the nearest sample is returned without
/// interpolation. An empty slice yields `0`.
fn percentile(sorted: &[u128], fraction: f64) -> u128 {
    // `checked_sub` doubles as the emptiness test.
    let last = match sorted.len().checked_sub(1) {
        Some(last) => last,
        None => return 0,
    };

    let rank = (fraction.clamp(0.0, 1.0) * last as f64).round() as usize;
    sorted[rank.min(last)]
}

pub fn build_case_diagnostics(
|
||
summary: &CaseSummary,
|
||
expected_chunk_ids: &[String],
|
||
answers_lower: &[String],
|
||
candidates: &[EvaluationCandidate],
|
||
pipeline_stats: Option<PipelineDiagnostics>,
|
||
) -> CaseDiagnostics {
|
||
let expected_set: HashSet<&str> = expected_chunk_ids.iter().map(|id| id.as_str()).collect();
|
||
let mut seen_chunks: HashSet<String> = HashSet::new();
|
||
let mut attached_chunk_ids = Vec::new();
|
||
let mut entity_diagnostics = Vec::new();
|
||
|
||
for (idx, candidate) in candidates.iter().enumerate() {
|
||
let mut chunk_entries = Vec::new();
|
||
for chunk in &candidate.chunks {
|
||
let contains_answer = text_contains_answer(&chunk.chunk.chunk, answers_lower);
|
||
let expected_chunk = expected_set.contains(chunk.chunk.get_id());
|
||
seen_chunks.insert(chunk.chunk.get_id().to_string());
|
||
attached_chunk_ids.push(chunk.chunk.get_id().to_string());
|
||
chunk_entries.push(ChunkDiagnosticsEntry {
|
||
chunk_id: chunk.chunk.get_id().to_string(),
|
||
score: chunk.score,
|
||
contains_answer,
|
||
expected_chunk,
|
||
snippet: chunk_snippet(&chunk.chunk.chunk),
|
||
});
|
||
}
|
||
entity_diagnostics.push(EntityDiagnostics {
|
||
rank: idx + 1,
|
||
entity_id: candidate.entity_id.clone(),
|
||
source_id: candidate.source_id.clone(),
|
||
name: candidate.entity_name.clone(),
|
||
score: candidate.score,
|
||
entity_match: candidate.source_id == summary.expected_source,
|
||
chunk_text_match: chunk_entries.iter().any(|entry| entry.contains_answer),
|
||
chunk_id_match: chunk_entries.iter().any(|entry| entry.expected_chunk),
|
||
chunks: chunk_entries,
|
||
});
|
||
}
|
||
|
||
let missing_expected_chunk_ids = expected_chunk_ids
|
||
.iter()
|
||
.filter(|id| !seen_chunks.contains(id.as_str()))
|
||
.cloned()
|
||
.collect::<Vec<_>>();
|
||
|
||
let mut failure_reasons = Vec::new();
|
||
if !summary.entity_match {
|
||
failure_reasons.push("entity_miss".to_string());
|
||
}
|
||
if !summary.chunk_text_match {
|
||
failure_reasons.push("chunk_text_missing".to_string());
|
||
}
|
||
if !summary.chunk_id_match {
|
||
failure_reasons.push("chunk_id_missing".to_string());
|
||
}
|
||
if !missing_expected_chunk_ids.is_empty() {
|
||
failure_reasons.push("expected_chunk_absent".to_string());
|
||
}
|
||
|
||
CaseDiagnostics {
|
||
question_id: summary.question_id.clone(),
|
||
question: summary.question.clone(),
|
||
paragraph_id: summary.paragraph_id.clone(),
|
||
paragraph_title: summary.paragraph_title.clone(),
|
||
expected_source: summary.expected_source.clone(),
|
||
expected_chunk_ids: expected_chunk_ids.to_vec(),
|
||
answers: summary.answers.clone(),
|
||
entity_match: summary.entity_match,
|
||
chunk_text_match: summary.chunk_text_match,
|
||
chunk_id_match: summary.chunk_id_match,
|
||
failure_reasons,
|
||
missing_expected_chunk_ids,
|
||
attached_chunk_ids,
|
||
retrieved: entity_diagnostics,
|
||
pipeline: pipeline_stats,
|
||
}
|
||
}
|