mirror of
https://github.com/perstarkse/minne.git
synced 2026-04-26 10:48:37 +02:00
benchmarks: fin
This commit is contained in:
461
evaluations/src/types.rs
Normal file
461
evaluations/src/types.rs
Normal file
@@ -0,0 +1,461 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use common::storage::types::StoredObject;
|
||||
use retrieval_pipeline::{
|
||||
PipelineDiagnostics, PipelineStageTimings, RetrievedChunk, RetrievedEntity, StrategyOutput,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
|
||||
/// Aggregate report for one evaluation run, serialized to JSON for benchmark
/// output.
///
/// Carries overall and rank-bucketed accuracy metrics, dataset and slice
/// provenance, cache-reuse flags, latency/performance timings, and the
/// retrieval configuration that produced the run, followed by the per-question
/// breakdown in `cases`.
#[derive(Debug, Serialize)]
pub struct EvaluationSummary {
    // --- Run identity ---
    pub generated_at: DateTime<Utc>,
    /// Retrieval depth (top-k) used when scoring — TODO confirm against runner.
    pub k: usize,
    /// Optional cap on the number of cases evaluated.
    pub limit: Option<usize>,
    /// Optional human-readable tag for the run; omitted from JSON when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub run_label: Option<String>,

    // --- Accuracy metrics ---
    pub total_cases: usize,
    pub correct: usize,
    pub precision: f64,
    // Hit counts and precision at rank cutoffs 1..=3.
    pub correct_at_1: usize,
    pub correct_at_2: usize,
    pub correct_at_3: usize,
    pub precision_at_1: f64,
    pub precision_at_2: f64,
    pub precision_at_3: f64,
    /// Mean reciprocal rank across cases.
    pub mrr: f64,
    pub average_ndcg: f64,
    /// Duration of the whole run, in milliseconds.
    pub duration_ms: u128,

    // --- Dataset provenance ---
    pub dataset_id: String,
    pub dataset_label: String,
    pub dataset_includes_unanswerable: bool,
    pub dataset_source: String,
    pub includes_impossible_cases: bool,
    pub require_verified_chunks: bool,
    /// Number of questions filtered out before evaluation.
    pub filtered_questions: usize,

    // --- Retrieval-only vs. LLM-answer scoring splits ---
    pub retrieval_cases: usize,
    pub retrieval_correct: usize,
    pub retrieval_precision: f64,
    pub llm_cases: usize,
    pub llm_answered: usize,
    pub llm_precision: f64,

    // --- Dataset slice used for this run (seeded, windowed selection) ---
    pub slice_id: String,
    pub slice_seed: u64,
    pub slice_total_cases: usize,
    pub slice_window_offset: usize,
    pub slice_window_length: usize,
    pub slice_cases: usize,
    pub slice_positive_paragraphs: usize,
    pub slice_negative_paragraphs: usize,
    pub slice_total_paragraphs: usize,
    pub slice_negative_multiplier: f32,

    // --- Corpus / ingestion cache reuse ---
    pub namespace_reused: bool,
    pub corpus_paragraphs: usize,
    pub ingestion_cache_path: String,
    pub ingestion_reused: bool,
    pub ingestion_embeddings_reused: bool,
    pub ingestion_fingerprint: String,
    pub positive_paragraphs_reused: usize,
    pub negative_paragraphs_reused: usize,

    // --- Timing ---
    /// End-to-end per-case latency distribution.
    pub latency_ms: LatencyStats,
    /// Detailed phase/stage timings for the run.
    pub perf: PerformanceTimings,

    // --- Embedding / retrieval configuration ---
    pub embedding_backend: String,
    pub embedding_model: Option<String>,
    pub embedding_dimension: usize,
    pub rerank_enabled: bool,
    pub rerank_pool_size: Option<usize>,
    pub rerank_keep_top: usize,
    pub concurrency: usize,
    pub detailed_report: bool,
    pub retrieval_strategy: String,
    pub chunk_result_cap: usize,
    // Reciprocal-rank-fusion parameters for chunk retrieval.
    pub chunk_rrf_k: f32,
    pub chunk_rrf_vector_weight: f32,
    pub chunk_rrf_fts_weight: f32,
    pub chunk_rrf_use_vector: bool,
    pub chunk_rrf_use_fts: bool,
    // Ingestion-time chunking parameters.
    pub ingest_chunk_min_tokens: usize,
    pub ingest_chunk_max_tokens: usize,
    pub ingest_chunks_only: bool,
    pub ingest_chunk_overlap_tokens: usize,
    pub chunk_vector_take: usize,
    pub chunk_fts_take: usize,
    pub chunk_avg_chars_per_token: usize,
    pub max_chunks_per_entity: usize,

    /// Per-question results.
    pub cases: Vec<CaseSummary>,
}
|
||||
|
||||
/// Per-question result row, collected into `EvaluationSummary::cases`.
#[derive(Debug, Serialize)]
pub struct CaseSummary {
    pub question_id: String,
    pub question: String,
    pub paragraph_id: String,
    pub paragraph_title: String,
    /// Source id retrieval must surface for the case to count as an entity match.
    pub expected_source: String,
    /// Gold answer strings for the question.
    pub answers: Vec<String>,
    /// Overall hit flag for this case.
    pub matched: bool,
    // Which of the three match criteria fired (same trio as CaseDiagnostics).
    pub entity_match: bool,
    pub chunk_text_match: bool,
    pub chunk_id_match: bool,
    /// True for cases marked unanswerable in the dataset.
    pub is_impossible: bool,
    pub has_verified_chunks: bool,
    /// Rank of the first match (presumably 1-based, matching
    /// `RetrievedSummary::rank`); omitted when nothing matched.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub match_rank: Option<usize>,
    /// Reciprocal of the match rank; omitted when nothing matched.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reciprocal_rank: Option<f64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub ndcg: Option<f64>,
    /// End-to-end latency for this case, in milliseconds.
    pub latency_ms: u128,
    /// Ranked retrieval results for this case.
    pub retrieved: Vec<RetrievedSummary>,
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LatencyStats {
|
||||
pub avg: f64,
|
||||
pub p50: u128,
|
||||
pub p95: u128,
|
||||
}
|
||||
|
||||
impl Default for LatencyStats {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
avg: 0.0,
|
||||
p50: 0,
|
||||
p95: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Latency distribution per retrieval-pipeline stage, one `LatencyStats` per
/// stage; populated from `PipelineStageTimings` samples by
/// `build_stage_latency_breakdown`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct StageLatencyBreakdown {
    /// Query embedding stage.
    pub embed: LatencyStats,
    /// Candidate collection stage.
    pub collect_candidates: LatencyStats,
    /// Graph expansion stage.
    pub graph_expansion: LatencyStats,
    /// Chunk attachment stage.
    pub chunk_attach: LatencyStats,
    /// Rerank stage.
    pub rerank: LatencyStats,
    /// Result assembly stage.
    pub assemble: LatencyStats,
}
|
||||
|
||||
/// Wall-clock milliseconds spent in each phase of an evaluation run
/// (preparation phases, query execution, summarization, finalization).
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct EvaluationStageTimings {
    pub prepare_slice_ms: u128,
    pub prepare_db_ms: u128,
    pub prepare_corpus_ms: u128,
    pub prepare_namespace_ms: u128,
    pub run_queries_ms: u128,
    pub summarize_ms: u128,
    pub finalize_ms: u128,
}
|
||||
|
||||
/// Performance/timing report for a run: ingestion cost, optional namespace
/// seeding, per-phase evaluation timings, and per-stage latency distributions.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct PerformanceTimings {
    /// Base URL of the OpenAI-compatible endpoint used for the run.
    pub openai_base_url: String,
    /// Total ingestion time, in milliseconds.
    pub ingestion_ms: u128,
    /// Namespace seeding time; omitted from JSON when seeding did not run.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub namespace_seed_ms: Option<u128>,
    pub evaluation_stage_ms: EvaluationStageTimings,
    pub stage_latency: StageLatencyBreakdown,
}
|
||||
|
||||
/// One ranked retrieval result as reported inside `CaseSummary::retrieved`.
#[derive(Debug, Serialize)]
pub struct RetrievedSummary {
    /// Position in the ranked result list.
    pub rank: usize,
    pub entity_id: String,
    pub source_id: String,
    pub entity_name: String,
    /// Retrieval score for this result.
    pub score: f32,
    /// Whether this result satisfied the case's match criteria.
    pub matched: bool,
    // Optional detail fields, omitted from JSON when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub entity_description: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub entity_category: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub chunk_text_match: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub chunk_id_match: Option<bool>,
}
|
||||
|
||||
/// Strategy-agnostic retrieval result used for scoring.
///
/// Both entity-based and chunk-based strategy outputs are adapted into this
/// shape by `adapt_strategy_output`; for chunk-only strategies the "entity"
/// fields are synthesized from the chunk itself.
#[derive(Debug, Clone)]
pub struct EvaluationCandidate {
    pub entity_id: String,
    /// Source id; compared against a case's expected source to decide an
    /// entity match (see `build_case_diagnostics`).
    pub source_id: String,
    pub entity_name: String,
    pub entity_description: Option<String>,
    pub entity_category: Option<String>,
    /// Retrieval score from the strategy.
    pub score: f32,
    /// Chunks attached to this candidate.
    pub chunks: Vec<RetrievedChunk>,
}
|
||||
|
||||
impl EvaluationCandidate {
|
||||
fn from_entity(entity: RetrievedEntity) -> Self {
|
||||
let entity_category = Some(format!("{:?}", entity.entity.entity_type));
|
||||
Self {
|
||||
entity_id: entity.entity.get_id().to_string(),
|
||||
source_id: entity.entity.source_id.clone(),
|
||||
entity_name: entity.entity.name.clone(),
|
||||
entity_description: Some(entity.entity.description.clone()),
|
||||
entity_category,
|
||||
score: entity.score,
|
||||
chunks: entity.chunks,
|
||||
}
|
||||
}
|
||||
|
||||
fn from_chunk(chunk: RetrievedChunk) -> Self {
|
||||
let snippet = chunk_snippet(&chunk.chunk.chunk);
|
||||
Self {
|
||||
entity_id: chunk.chunk.get_id().to_string(),
|
||||
source_id: chunk.chunk.source_id.clone(),
|
||||
entity_name: chunk.chunk.source_id.clone(),
|
||||
entity_description: Some(snippet),
|
||||
entity_category: Some("Chunk".to_string()),
|
||||
score: chunk.score,
|
||||
chunks: vec![chunk],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn adapt_strategy_output(output: StrategyOutput) -> Vec<EvaluationCandidate> {
|
||||
match output {
|
||||
StrategyOutput::Entities(entities) => entities
|
||||
.into_iter()
|
||||
.map(EvaluationCandidate::from_entity)
|
||||
.collect(),
|
||||
StrategyOutput::Chunks(chunks) => chunks
|
||||
.into_iter()
|
||||
.map(EvaluationCandidate::from_chunk)
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Detailed failure-analysis record for one case, built by
/// `build_case_diagnostics`; intended for the detailed report output.
#[derive(Debug, Serialize)]
pub struct CaseDiagnostics {
    pub question_id: String,
    pub question: String,
    pub paragraph_id: String,
    pub paragraph_title: String,
    pub expected_source: String,
    /// Chunk ids the case expects retrieval to surface.
    pub expected_chunk_ids: Vec<String>,
    pub answers: Vec<String>,
    // Match criteria copied from the corresponding CaseSummary.
    pub entity_match: bool,
    pub chunk_text_match: bool,
    pub chunk_id_match: bool,
    /// Machine-readable reason tags, e.g. "entity_miss", "chunk_text_missing",
    /// "chunk_id_missing", "expected_chunk_absent".
    pub failure_reasons: Vec<String>,
    /// Expected chunk ids that never appeared in any retrieved candidate.
    pub missing_expected_chunk_ids: Vec<String>,
    /// Every chunk id attached to any retrieved candidate, in rank order
    /// (may contain duplicates if a chunk is attached to several candidates).
    pub attached_chunk_ids: Vec<String>,
    /// Per-candidate diagnostics, in rank order.
    pub retrieved: Vec<EntityDiagnostics>,
    /// Optional pipeline-level diagnostics; omitted from JSON when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub pipeline: Option<PipelineDiagnostics>,
}
|
||||
|
||||
/// Diagnostics for one retrieved candidate within `CaseDiagnostics::retrieved`.
#[derive(Debug, Serialize)]
pub struct EntityDiagnostics {
    /// 1-based rank of the candidate.
    pub rank: usize,
    pub entity_id: String,
    pub source_id: String,
    pub name: String,
    pub score: f32,
    /// Whether this candidate's source id equals the case's expected source.
    pub entity_match: bool,
    /// Whether any attached chunk's text contains a gold answer.
    pub chunk_text_match: bool,
    /// Whether any attached chunk is one of the expected chunk ids.
    pub chunk_id_match: bool,
    /// Per-chunk diagnostics for the chunks attached to this candidate.
    pub chunks: Vec<ChunkDiagnosticsEntry>,
}
|
||||
|
||||
/// Diagnostics for one chunk attached to a retrieved candidate.
#[derive(Debug, Serialize)]
pub struct ChunkDiagnosticsEntry {
    pub chunk_id: String,
    pub score: f32,
    /// True when the chunk text contains one of the gold answers
    /// (normalized comparison via `text_contains_answer`).
    pub contains_answer: bool,
    /// True when this chunk id is in the case's expected chunk ids.
    pub expected_chunk: bool,
    /// Truncated preview of the chunk text (see `chunk_snippet`).
    pub snippet: String,
}
|
||||
|
||||
pub fn text_contains_answer(text: &str, answers: &[String]) -> bool {
|
||||
if answers.is_empty() {
|
||||
return true;
|
||||
}
|
||||
let haystack = normalize_for_match(text);
|
||||
answers
|
||||
.iter()
|
||||
.map(|needle| normalize_for_match(needle))
|
||||
.any(|needle| !needle.is_empty() && haystack.contains(&needle))
|
||||
}
|
||||
|
||||
/// Canonicalizes text for the substring matching done in
/// `text_contains_answer`.
///
/// Pipeline (order matters): NFKC-normalize, lowercase, collapse every run of
/// whitespace/punctuation into a single space, then strip separator
/// characters from both ends. Strings differing only in casing, Unicode
/// compatibility forms, or punctuation/spacing normalize to the same value.
fn normalize_for_match(input: &str) -> String {
    // NFKC normalize, lowercase, and collapse whitespace/punctuation to a single space
    // to reduce false negatives from formatting or punctuation differences.
    let mut out = String::with_capacity(input.len());
    // Tracks whether the previous pushed char was the collapsed space, so a
    // run of separators emits exactly one ' '.
    let mut last_space = false;
    for ch in input.nfkc().flat_map(|c| c.to_lowercase()) {
        let is_space = ch.is_whitespace();
        // Treat common typographic quotes/dashes/ellipsis like ASCII punctuation.
        let is_punct = ch.is_ascii_punctuation()
            || matches!(
                ch,
                '“' | '”' | '‘' | '’' | '«' | '»' | '–' | '—' | '…' | '·' | '•'
            );
        if is_space || is_punct {
            if !last_space {
                out.push(' ');
                last_space = true;
            }
        } else {
            out.push(ch);
            last_space = false;
        }
    }

    let trimmed = out.trim();
    if trimmed.is_empty() {
        return String::new();
    }

    // The collapse loop can leave one leading/trailing space; trim the edges
    // of any residual separator characters as well.
    trimmed
        .trim_matches(|c: char| c.is_ascii_punctuation() || c.is_whitespace())
        .to_string()
}
|
||||
|
||||
/// Returns a display snippet of `text`: trimmed, and truncated to at most
/// `MAX_CHARS` characters with a trailing `"..."` marker when longer.
///
/// Counts `char`s rather than bytes so multi-byte UTF-8 text is never split
/// mid-codepoint.
fn chunk_snippet(text: &str) -> String {
    const MAX_CHARS: usize = 160;
    let trimmed = text.trim();
    if trimmed.chars().count() <= MAX_CHARS {
        return trimmed.to_string();
    }
    // Idiomatic truncation: take the first MAX_CHARS chars instead of a
    // manual enumerate/break loop.
    let mut snippet: String = trimmed.chars().take(MAX_CHARS).collect();
    snippet.push_str("...");
    snippet
}
|
||||
|
||||
pub fn compute_latency_stats(latencies: &[u128]) -> LatencyStats {
|
||||
if latencies.is_empty() {
|
||||
return LatencyStats {
|
||||
avg: 0.0,
|
||||
p50: 0,
|
||||
p95: 0,
|
||||
};
|
||||
}
|
||||
let mut sorted = latencies.to_vec();
|
||||
sorted.sort_unstable();
|
||||
let sum: u128 = sorted.iter().copied().sum();
|
||||
let avg = sum as f64 / (sorted.len() as f64);
|
||||
let p50 = percentile(&sorted, 0.50);
|
||||
let p95 = percentile(&sorted, 0.95);
|
||||
LatencyStats { avg, p50, p95 }
|
||||
}
|
||||
|
||||
pub fn build_stage_latency_breakdown(samples: &[PipelineStageTimings]) -> StageLatencyBreakdown {
|
||||
fn collect_stage<F>(samples: &[PipelineStageTimings], selector: F) -> Vec<u128>
|
||||
where
|
||||
F: Fn(&PipelineStageTimings) -> u128,
|
||||
{
|
||||
samples.iter().map(selector).collect()
|
||||
}
|
||||
|
||||
StageLatencyBreakdown {
|
||||
embed: compute_latency_stats(&collect_stage(samples, |entry| entry.embed_ms())),
|
||||
collect_candidates: compute_latency_stats(&collect_stage(samples, |entry| {
|
||||
entry.collect_candidates_ms()
|
||||
})),
|
||||
graph_expansion: compute_latency_stats(&collect_stage(samples, |entry| {
|
||||
entry.graph_expansion_ms()
|
||||
})),
|
||||
chunk_attach: compute_latency_stats(&collect_stage(samples, |entry| {
|
||||
entry.chunk_attach_ms()
|
||||
})),
|
||||
rerank: compute_latency_stats(&collect_stage(samples, |entry| entry.rerank_ms())),
|
||||
assemble: compute_latency_stats(&collect_stage(samples, |entry| entry.assemble_ms())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Nearest-rank percentile lookup over an ascending-sorted slice.
///
/// `fraction` is clamped to `[0.0, 1.0]`; the index is `round(fraction *
/// (len - 1))`. Returns 0 for an empty slice.
fn percentile(sorted: &[u128], fraction: f64) -> u128 {
    let Some(last_idx) = sorted.len().checked_sub(1) else {
        return 0;
    };
    let clamped = fraction.clamp(0.0, 1.0);
    let idx = (clamped * last_idx as f64).round() as usize;
    // Defensive clamp; round() already keeps idx within bounds.
    sorted[idx.min(last_idx)]
}
|
||||
|
||||
pub fn build_case_diagnostics(
|
||||
summary: &CaseSummary,
|
||||
expected_chunk_ids: &[String],
|
||||
answers_lower: &[String],
|
||||
candidates: &[EvaluationCandidate],
|
||||
pipeline_stats: Option<PipelineDiagnostics>,
|
||||
) -> CaseDiagnostics {
|
||||
let expected_set: HashSet<&str> = expected_chunk_ids.iter().map(|id| id.as_str()).collect();
|
||||
let mut seen_chunks: HashSet<String> = HashSet::new();
|
||||
let mut attached_chunk_ids = Vec::new();
|
||||
let mut entity_diagnostics = Vec::new();
|
||||
|
||||
for (idx, candidate) in candidates.iter().enumerate() {
|
||||
let mut chunk_entries = Vec::new();
|
||||
for chunk in &candidate.chunks {
|
||||
let contains_answer = text_contains_answer(&chunk.chunk.chunk, answers_lower);
|
||||
let expected_chunk = expected_set.contains(chunk.chunk.get_id());
|
||||
seen_chunks.insert(chunk.chunk.get_id().to_string());
|
||||
attached_chunk_ids.push(chunk.chunk.get_id().to_string());
|
||||
chunk_entries.push(ChunkDiagnosticsEntry {
|
||||
chunk_id: chunk.chunk.get_id().to_string(),
|
||||
score: chunk.score,
|
||||
contains_answer,
|
||||
expected_chunk,
|
||||
snippet: chunk_snippet(&chunk.chunk.chunk),
|
||||
});
|
||||
}
|
||||
entity_diagnostics.push(EntityDiagnostics {
|
||||
rank: idx + 1,
|
||||
entity_id: candidate.entity_id.clone(),
|
||||
source_id: candidate.source_id.clone(),
|
||||
name: candidate.entity_name.clone(),
|
||||
score: candidate.score,
|
||||
entity_match: candidate.source_id == summary.expected_source,
|
||||
chunk_text_match: chunk_entries.iter().any(|entry| entry.contains_answer),
|
||||
chunk_id_match: chunk_entries.iter().any(|entry| entry.expected_chunk),
|
||||
chunks: chunk_entries,
|
||||
});
|
||||
}
|
||||
|
||||
let missing_expected_chunk_ids = expected_chunk_ids
|
||||
.iter()
|
||||
.filter(|id| !seen_chunks.contains(id.as_str()))
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut failure_reasons = Vec::new();
|
||||
if !summary.entity_match {
|
||||
failure_reasons.push("entity_miss".to_string());
|
||||
}
|
||||
if !summary.chunk_text_match {
|
||||
failure_reasons.push("chunk_text_missing".to_string());
|
||||
}
|
||||
if !summary.chunk_id_match {
|
||||
failure_reasons.push("chunk_id_missing".to_string());
|
||||
}
|
||||
if !missing_expected_chunk_ids.is_empty() {
|
||||
failure_reasons.push("expected_chunk_absent".to_string());
|
||||
}
|
||||
|
||||
CaseDiagnostics {
|
||||
question_id: summary.question_id.clone(),
|
||||
question: summary.question.clone(),
|
||||
paragraph_id: summary.paragraph_id.clone(),
|
||||
paragraph_title: summary.paragraph_title.clone(),
|
||||
expected_source: summary.expected_source.clone(),
|
||||
expected_chunk_ids: expected_chunk_ids.to_vec(),
|
||||
answers: summary.answers.clone(),
|
||||
entity_match: summary.entity_match,
|
||||
chunk_text_match: summary.chunk_text_match,
|
||||
chunk_id_match: summary.chunk_id_match,
|
||||
failure_reasons,
|
||||
missing_expected_chunk_ids,
|
||||
attached_chunk_ids,
|
||||
retrieved: entity_diagnostics,
|
||||
pipeline: pipeline_stats,
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user