From a8d10f265ce9f3edb9e41a9cefa1bdc9b6401d7b Mon Sep 17 00:00:00 2001 From: Per Stark Date: Mon, 8 Dec 2025 21:57:53 +0100 Subject: [PATCH] benchmarks: fin --- .cargo/config.toml | 2 +- eval/src/eval/mod.rs | 560 ------------------ eval/src/slice.rs | 28 - {eval => evaluations}/Cargo.toml | 2 +- {eval => evaluations}/manifest.yaml | 0 {eval => evaluations}/src/args.rs | 150 ++--- {eval => evaluations}/src/cache.rs | 0 evaluations/src/cases.rs | 187 ++++++ .../src/corpus}/config.rs | 10 +- .../ingest => evaluations/src/corpus}/mod.rs | 8 +- .../src/corpus}/orchestrator.rs | 16 +- .../src/corpus}/store.rs | 0 {eval => evaluations}/src/datasets/beir.rs | 0 {eval => evaluations}/src/datasets/mod.rs | 0 {eval => evaluations}/src/datasets/nq.rs | 0 {eval => evaluations}/src/datasets/squad.rs | 0 {eval => evaluations}/src/db_helpers.rs | 18 +- evaluations/src/eval.rs | 128 ++++ {eval => evaluations}/src/inspection.rs | 9 +- {eval => evaluations}/src/main.rs | 8 +- evaluations/src/namespace.rs | 224 +++++++ {eval => evaluations}/src/openai.rs | 0 {eval => evaluations}/src/perf.rs | 0 .../src}/pipeline/context.rs | 6 +- .../eval => evaluations/src}/pipeline/mod.rs | 2 +- .../src}/pipeline/stages/finalize.rs | 0 .../src}/pipeline/stages/mod.rs | 0 .../src}/pipeline/stages/prepare_corpus.rs | 16 +- .../src}/pipeline/stages/prepare_db.rs | 0 .../src}/pipeline/stages/prepare_namespace.rs | 6 +- .../src}/pipeline/stages/prepare_slice.rs | 10 +- .../src}/pipeline/stages/run_queries.rs | 0 .../src}/pipeline/stages/summarize.rs | 8 +- .../src}/pipeline/state.rs | 0 {eval => evaluations}/src/report.rs | 0 evaluations/src/settings.rs | 63 ++ .../src/slices.rs => evaluations/src/slice.rs | 27 + {eval => evaluations}/src/snapshot.rs | 0 {eval/src/eval => evaluations/src}/types.rs | 0 39 files changed, 774 insertions(+), 714 deletions(-) delete mode 100644 eval/src/eval/mod.rs delete mode 100644 eval/src/slice.rs rename {eval => evaluations}/Cargo.toml (98%) rename {eval => evaluations}/manifest.yaml (100%) rename {eval => evaluations}/src/args.rs (94%) rename {eval => evaluations}/src/cache.rs (100%) create mode 100644 evaluations/src/cases.rs rename {eval/src/ingest => evaluations/src/corpus}/config.rs (76%) rename {eval/src/ingest => evaluations/src/corpus}/mod.rs (72%) rename {eval/src/ingest => evaluations/src/corpus}/orchestrator.rs (98%) rename {eval/src/ingest => evaluations/src/corpus}/store.rs (100%) rename {eval => evaluations}/src/datasets/beir.rs (100%) rename {eval => evaluations}/src/datasets/mod.rs (100%) rename {eval => evaluations}/src/datasets/nq.rs (100%) rename {eval => evaluations}/src/datasets/squad.rs (100%) rename {eval => evaluations}/src/db_helpers.rs (92%) create mode 100644 evaluations/src/eval.rs rename {eval => evaluations}/src/inspection.rs (95%) rename {eval => evaluations}/src/main.rs (99%) create mode 100644 evaluations/src/namespace.rs rename {eval => evaluations}/src/openai.rs (100%) rename {eval => evaluations}/src/perf.rs (100%) rename {eval/src/eval => evaluations/src}/pipeline/context.rs (97%) rename {eval/src/eval => evaluations/src}/pipeline/mod.rs (90%) rename {eval/src/eval => evaluations/src}/pipeline/stages/finalize.rs (100%) rename {eval/src/eval => evaluations/src}/pipeline/stages/mod.rs (100%) rename {eval/src/eval => evaluations/src}/pipeline/stages/prepare_corpus.rs (90%) rename {eval/src/eval => evaluations/src}/pipeline/stages/prepare_db.rs (100%) rename {eval/src/eval => evaluations/src}/pipeline/stages/prepare_namespace.rs (98%) rename {eval/src/eval => evaluations/src}/pipeline/stages/prepare_slice.rs (89%) rename {eval/src/eval => evaluations/src}/pipeline/stages/run_queries.rs (100%) rename {eval/src/eval => evaluations/src}/pipeline/stages/summarize.rs (96%) rename {eval/src/eval => evaluations/src}/pipeline/state.rs (100%) rename {eval => evaluations}/src/report.rs (100%) create mode 100644 evaluations/src/settings.rs rename eval/src/slices.rs => evaluations/src/slice.rs (97%) rename {eval => evaluations}/src/snapshot.rs (100%) rename {eval/src/eval => evaluations/src}/types.rs (100%) diff --git a/.cargo/config.toml b/.cargo/config.toml index d830e40..61f4796 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,2 +1,2 @@ [alias] -eval = "run -p eval --" +eval = "run -p evaluations --" diff --git a/eval/src/eval/mod.rs b/eval/src/eval/mod.rs deleted file mode 100644 index 0ab7254..0000000 --- a/eval/src/eval/mod.rs +++ /dev/null @@ -1,560 +0,0 @@ -mod pipeline; -mod types; - -pub use pipeline::run_evaluation; -pub use types::*; - -use std::{collections::HashMap, path::Path}; - -use anyhow::{anyhow, Context, Result}; -use chrono::{DateTime, SecondsFormat, Utc}; -use common::{ - error::AppError, - storage::{ - db::SurrealDbClient, - types::{system_settings::SystemSettings, user::User, StoredObject}, - }, -}; -use serde::Deserialize; -use tokio::io::AsyncWriteExt; -use tracing::{info, warn}; - -use crate::{ - args::{self, Config}, - datasets::{self, ConvertedDataset}, - ingest, - slice::{self}, - snapshot::{self, DbSnapshotState}, -}; - -pub(crate) struct SeededCase { - question_id: String, - question: String, - expected_source: String, - answers: Vec, - paragraph_id: String, - paragraph_title: String, - expected_chunk_ids: Vec, - is_impossible: bool, - has_verified_chunks: bool, -} - -pub(crate) fn cases_from_manifest(manifest: &ingest::CorpusManifest) -> Vec { - let mut title_map = HashMap::new(); - for paragraph in &manifest.paragraphs { - title_map.insert(paragraph.paragraph_id.as_str(), paragraph.title.clone()); - } - - let include_impossible = manifest.metadata.include_unanswerable; - let require_verified_chunks = manifest.metadata.require_verified_chunks; - - manifest - .questions - .iter() - .filter(|question| { - should_include_question(question, include_impossible, require_verified_chunks) - }) - .map(|question| { - let title = title_map - .get(question.paragraph_id.as_str()) - .cloned() - .unwrap_or_else(|| "Untitled".to_string()); - SeededCase { - question_id: question.question_id.clone(), - question: question.question_text.clone(), - expected_source: question.text_content_id.clone(), - answers: question.answers.clone(), - paragraph_id: question.paragraph_id.clone(), - paragraph_title: title, - expected_chunk_ids: question.matching_chunk_ids.clone(), - is_impossible: question.is_impossible, - has_verified_chunks: !question.matching_chunk_ids.is_empty(), - } - }) - .collect() -} - -fn should_include_question( - question: &ingest::CorpusQuestion, - include_impossible: bool, - require_verified_chunks: bool, -) -> bool { - if !include_impossible && question.is_impossible { - return false; - } - if require_verified_chunks && question.matching_chunk_ids.is_empty() { - return false; - } - true -} - -pub async fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> { - let ledger_limit = ledger_target(config); - let slice_settings = slice::slice_config_with_limit(config, ledger_limit); - let slice = - slice::resolve_slice(dataset, &slice_settings).context("resolving dataset slice")?; - info!( - slice = slice.manifest.slice_id.as_str(), - cases = slice.manifest.case_count, - positives = slice.manifest.positive_paragraphs, - negatives = slice.manifest.negative_paragraphs, - total_paragraphs = slice.manifest.total_paragraphs, - "Slice ledger ready" - ); - println!( - "Slice `{}` now contains {} questions ({} positives, {} negatives)", - slice.manifest.slice_id, - slice.manifest.case_count, - slice.manifest.positive_paragraphs, - slice.manifest.negative_paragraphs - ); - Ok(()) -} - -pub(crate) fn ledger_target(config: &Config) -> Option { - match (config.slice_grow, config.limit) { - (Some(grow), Some(limit)) => Some(limit.max(grow)), - (Some(grow), None) => Some(grow), - (None, limit) => limit, - } -} - -pub(crate) async fn write_chunk_diagnostics(path: &Path, cases: &[CaseDiagnostics]) -> Result<()> { - args::ensure_parent(path)?; - let mut file = tokio::fs::File::create(path) - .await - .with_context(|| format!("creating diagnostics file {}", path.display()))?; - for case in cases { - let line = serde_json::to_vec(case).context("serialising chunk diagnostics entry")?; - file.write_all(&line).await?; - file.write_all(b"\n").await?; - } - file.flush().await?; - Ok(()) -} - -pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> Result<()> { - // Create a dummy embedding for cache warming - let dummy_embedding: Vec = (0..dimension).map(|i| (i as f32).sin()).collect(); - - info!("Warming HNSW caches with sample queries"); - - // Warm up chunk embedding index - just query the embedding table to load HNSW index - let _ = db - .client - .query( - r#"SELECT chunk_id - FROM text_chunk_embedding - WHERE embedding <|1,1|> $embedding - LIMIT 5"#, - ) - .bind(("embedding", dummy_embedding.clone())) - .await - .context("warming text chunk HNSW cache")?; - - // Warm up entity embedding index - let _ = db - .client - .query( - r#"SELECT entity_id - FROM knowledge_entity_embedding - WHERE embedding <|1,1|> $embedding - LIMIT 5"#, - ) - .bind(("embedding", dummy_embedding)) - .await - .context("warming knowledge entity HNSW cache")?; - - info!("HNSW cache warming completed"); - Ok(()) -} - -pub(crate) async fn ensure_eval_user(db: &SurrealDbClient) -> Result { - let timestamp = datasets::base_timestamp(); - let user = User { - id: "eval-user".to_string(), - created_at: timestamp, - updated_at: timestamp, - email: "eval-retrieval@minne.dev".to_string(), - password: "not-used".to_string(), - anonymous: false, - api_key: None, - admin: false, - timezone: "UTC".to_string(), - }; - - if let Some(existing) = db.get_item::(&user.get_id()).await? { - return Ok(existing); - } - - db.store_item(user.clone()) - .await - .context("storing evaluation user")?; - Ok(user) -} - -pub fn format_timestamp(timestamp: &DateTime) -> String { - timestamp.to_rfc3339_opts(SecondsFormat::Secs, true) -} - -pub(crate) fn sanitize_model_code(code: &str) -> String { - code.chars() - .map(|ch| { - if ch.is_ascii_alphanumeric() { - ch.to_ascii_lowercase() - } else { - '_' - } - }) - .collect() -} - -pub(crate) async fn connect_eval_db( - config: &Config, - namespace: &str, - database: &str, -) -> Result { - match SurrealDbClient::new( - &config.db_endpoint, - &config.db_username, - &config.db_password, - namespace, - database, - ) - .await - { - Ok(client) => { - info!( - endpoint = %config.db_endpoint, - namespace, - database, - auth = "root", - "Connected to SurrealDB" - ); - Ok(client) - } - Err(root_err) => { - info!( - endpoint = %config.db_endpoint, - namespace, - database, - "Root authentication failed; trying namespace-level auth" - ); - let namespace_client = SurrealDbClient::new_with_namespace_user( - &config.db_endpoint, - namespace, - &config.db_username, - &config.db_password, - database, - ) - .await - .map_err(|ns_err| { - anyhow!( - "failed to connect to SurrealDB via root ({root_err}) or namespace ({ns_err}) credentials" - ) - })?; - info!( - endpoint = %config.db_endpoint, - namespace, - database, - auth = "namespace", - "Connected to SurrealDB" - ); - Ok(namespace_client) - } - } -} - -pub(crate) async fn namespace_has_corpus(db: &SurrealDbClient) -> Result { - #[derive(Deserialize)] - struct CountRow { - count: i64, - } - - let mut response = db - .client - .query("SELECT count() AS count FROM text_chunk") - .await - .context("checking namespace corpus state")?; - let rows: Vec = response.take(0).unwrap_or_default(); - Ok(rows.first().map(|row| row.count).unwrap_or(0) > 0) -} - -pub(crate) async fn can_reuse_namespace( - db: &SurrealDbClient, - descriptor: &snapshot::Descriptor, - namespace: &str, - database: &str, - dataset_id: &str, - slice_id: &str, - ingestion_fingerprint: &str, - slice_case_count: usize, -) -> Result { - let state = match descriptor.load_db_state().await? { - Some(state) => state, - None => { - info!("No namespace state recorded; reseeding corpus from cached shards"); - return Ok(false); - } - }; - - if state.slice_case_count != slice_case_count { - info!( - requested_cases = slice_case_count, - stored_cases = state.slice_case_count, - "Skipping live namespace reuse; cached state does not match requested window" - ); - return Ok(false); - } - - if state.dataset_id != dataset_id - || state.slice_id != slice_id - || state.ingestion_fingerprint != ingestion_fingerprint - || state.namespace.as_deref() != Some(namespace) - || state.database.as_deref() != Some(database) - { - info!( - namespace, - database, "Cached namespace metadata mismatch; rebuilding corpus from ingestion cache" - ); - return Ok(false); - } - - if namespace_has_corpus(db).await? { - Ok(true) - } else { - info!( - namespace, - database, - "Namespace metadata matches but tables are empty; reseeding from ingestion cache" - ); - Ok(false) - } -} - -fn sanitize_identifier(input: &str) -> String { - let mut cleaned: String = input - .chars() - .map(|ch| { - if ch.is_ascii_alphanumeric() { - ch.to_ascii_lowercase() - } else { - '_' - } - }) - .collect(); - if cleaned.is_empty() { - cleaned.push('x'); - } - if cleaned.len() > 64 { - cleaned.truncate(64); - } - cleaned -} - -pub(crate) fn default_namespace(dataset_id: &str, limit: Option) -> String { - let dataset_component = sanitize_identifier(dataset_id); - let limit_component = match limit { - Some(value) if value > 0 => format!("limit{}", value), - _ => "all".to_string(), - }; - format!("eval_{}_{}", dataset_component, limit_component) -} - -pub(crate) fn default_database() -> String { - "retrieval_eval".to_string() -} - -pub(crate) async fn record_namespace_state( - descriptor: &snapshot::Descriptor, - dataset_id: &str, - slice_id: &str, - ingestion_fingerprint: &str, - namespace: &str, - database: &str, - slice_case_count: usize, -) { - let state = DbSnapshotState { - dataset_id: dataset_id.to_string(), - slice_id: slice_id.to_string(), - ingestion_fingerprint: ingestion_fingerprint.to_string(), - snapshot_hash: descriptor.metadata_hash().to_string(), - updated_at: Utc::now(), - namespace: Some(namespace.to_string()), - database: Some(database.to_string()), - slice_case_count, - }; - if let Err(err) = descriptor.store_db_state(&state).await { - warn!(error = %err, "Failed to record namespace state"); - } -} - -pub(crate) async fn enforce_system_settings( - db: &SurrealDbClient, - mut settings: SystemSettings, - provider_dimension: usize, - config: &Config, -) -> Result { - let mut updated_settings = settings.clone(); - let mut needs_settings_update = false; - - if provider_dimension != settings.embedding_dimensions as usize { - updated_settings.embedding_dimensions = provider_dimension as u32; - needs_settings_update = true; - } - if let Some(query_override) = config.query_model.as_deref() { - if settings.query_model != query_override { - info!( - model = query_override, - "Overriding system query model for this run" - ); - updated_settings.query_model = query_override.to_string(); - needs_settings_update = true; - } - } - if needs_settings_update { - settings = SystemSettings::update(db, updated_settings) - .await - .context("updating system settings overrides")?; - } - Ok(settings) -} - -pub(crate) async fn load_or_init_system_settings( - db: &SurrealDbClient, - _dimension: usize, -) -> Result<(SystemSettings, bool)> { - match SystemSettings::get_current(db).await { - Ok(settings) => Ok((settings, false)), - Err(AppError::NotFound(_)) => { - info!("System settings missing; applying database migrations for namespace"); - db.apply_migrations() - .await - .context("applying database migrations after missing system settings")?; - let settings = SystemSettings::get_current(db) - .await - .context("loading system settings after migrations")?; - Ok((settings, true)) - } - Err(err) => Err(err).context("loading system settings"), - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::ingest::store::{CorpusParagraph, EmbeddedKnowledgeEntity, EmbeddedTextChunk}; - use crate::ingest::{CorpusManifest, CorpusMetadata, CorpusQuestion, MANIFEST_VERSION}; - use chrono::Utc; - use common::storage::types::text_content::TextContent; - - fn sample_manifest() -> CorpusManifest { - let paragraphs = vec![ - CorpusParagraph { - paragraph_id: "p1".to_string(), - title: "Alpha".to_string(), - text_content: TextContent::new( - "alpha context".to_string(), - None, - "test".to_string(), - None, - None, - "user".to_string(), - ), - entities: Vec::::new(), - relationships: Vec::new(), - chunks: Vec::::new(), - }, - CorpusParagraph { - paragraph_id: "p2".to_string(), - title: "Beta".to_string(), - text_content: TextContent::new( - "beta context".to_string(), - None, - "test".to_string(), - None, - None, - "user".to_string(), - ), - entities: Vec::::new(), - relationships: Vec::new(), - chunks: Vec::::new(), - }, - ]; - let questions = vec![ - CorpusQuestion { - question_id: "q1".to_string(), - paragraph_id: "p1".to_string(), - text_content_id: "tc-alpha".to_string(), - question_text: "What is Alpha?".to_string(), - answers: vec!["Alpha".to_string()], - is_impossible: false, - matching_chunk_ids: vec!["chunk-alpha".to_string()], - }, - CorpusQuestion { - question_id: "q2".to_string(), - paragraph_id: "p1".to_string(), - text_content_id: "tc-alpha".to_string(), - question_text: "Unanswerable?".to_string(), - answers: Vec::new(), - is_impossible: true, - matching_chunk_ids: Vec::new(), - }, - CorpusQuestion { - question_id: "q3".to_string(), - paragraph_id: "p2".to_string(), - text_content_id: "tc-beta".to_string(), - question_text: "Where is Beta?".to_string(), - answers: vec!["Beta".to_string()], - is_impossible: false, - matching_chunk_ids: Vec::new(), - }, - ]; - CorpusManifest { - version: MANIFEST_VERSION, - metadata: CorpusMetadata { - dataset_id: "ds".to_string(), - dataset_label: "Dataset".to_string(), - slice_id: "slice".to_string(), - include_unanswerable: true, - require_verified_chunks: true, - ingestion_fingerprint: "fp".to_string(), - embedding_backend: "test".to_string(), - embedding_model: None, - embedding_dimension: 3, - converted_checksum: "chk".to_string(), - generated_at: Utc::now(), - paragraph_count: paragraphs.len(), - question_count: questions.len(), - chunk_min_tokens: 1, - chunk_max_tokens: 10, - chunk_only: false, - }, - paragraphs, - questions, - } - } - - #[test] - fn cases_respect_mode_filters() { - let mut manifest = sample_manifest(); - manifest.metadata.include_unanswerable = false; - manifest.metadata.require_verified_chunks = true; - - let strict_cases = cases_from_manifest(&manifest); - assert_eq!(strict_cases.len(), 1); - assert_eq!(strict_cases[0].question_id, "q1"); - assert_eq!(strict_cases[0].paragraph_title, "Alpha"); - - let mut llm_manifest = manifest.clone(); - llm_manifest.metadata.include_unanswerable = true; - llm_manifest.metadata.require_verified_chunks = false; - - let llm_cases = cases_from_manifest(&llm_manifest); - let ids: Vec<_> = llm_cases - .iter() - .map(|case| case.question_id.as_str()) - .collect(); - assert_eq!(ids, vec!["q1", "q2", "q3"]); - } -} diff --git a/eval/src/slice.rs b/eval/src/slice.rs deleted file mode 100644 index dda9d80..0000000 --- a/eval/src/slice.rs +++ /dev/null @@ -1,28 +0,0 @@ -use crate::slices::SliceConfig as CoreSliceConfig; - -pub use crate::slices::*; - -use crate::args::Config; - -impl<'a> From<&'a Config> for CoreSliceConfig<'a> { - fn from(config: &'a Config) -> Self { - slice_config_with_limit(config, None) - } -} - -pub fn slice_config_with_limit<'a>( - config: &'a Config, - limit_override: Option, -) -> CoreSliceConfig<'a> { - CoreSliceConfig { - cache_dir: config.cache_dir.as_path(), - force_convert: config.force_convert, - explicit_slice: config.slice.as_deref(), - limit: limit_override.or(config.limit), - corpus_limit: config.corpus_limit, - slice_seed: config.slice_seed, - llm_mode: config.llm_mode, - negative_multiplier: config.negative_multiplier, - require_verified_chunks: config.retrieval.require_verified_chunks, - } -} diff --git a/eval/Cargo.toml b/evaluations/Cargo.toml similarity index 98% rename from eval/Cargo.toml rename to evaluations/Cargo.toml index 33cf249..7920a23 100644 --- a/eval/Cargo.toml +++ b/evaluations/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "eval" +name = "evaluations" version = "0.1.0" edition = "2021" diff --git a/eval/manifest.yaml b/evaluations/manifest.yaml similarity index 100% rename from eval/manifest.yaml rename to evaluations/manifest.yaml diff --git a/eval/src/args.rs b/evaluations/src/args.rs similarity index 94% rename from eval/src/args.rs rename to evaluations/src/args.rs index 2d0d642..9f6838c 100644 --- a/eval/src/args.rs +++ b/evaluations/src/args.rs @@ -15,15 +15,15 @@ fn workspace_root() -> PathBuf { } fn default_report_dir() -> PathBuf { - workspace_root().join("eval/reports") + workspace_root().join("evaluations/reports") } fn default_cache_dir() -> PathBuf { - workspace_root().join("eval/cache") + workspace_root().join("evaluations/cache") } fn default_ingestion_cache_dir() -> PathBuf { - workspace_root().join("eval/cache/ingested") + workspace_root().join("evaluations/cache/ingested") } pub const DEFAULT_SLICE_SEED: u64 = 0x5eed_2025; @@ -135,6 +135,72 @@ impl Default for RetrievalSettings { } } +#[derive(Debug, Clone, Args)] +pub struct IngestConfig { + /// Directory for ingestion corpora caches + #[arg(long, default_value_os_t = default_ingestion_cache_dir())] + pub ingestion_cache_dir: PathBuf, + + /// Minimum tokens per chunk for ingestion + #[arg(long, default_value_t = 256)] + pub ingest_chunk_min_tokens: usize, + + /// Maximum tokens per chunk for ingestion + #[arg(long, default_value_t = 512)] + pub ingest_chunk_max_tokens: usize, + + /// Overlap between chunks during ingestion (tokens) + #[arg(long, default_value_t = 50)] + pub ingest_chunk_overlap_tokens: usize, + + /// Run ingestion in chunk-only mode (skip analyzer/graph generation) + #[arg(long)] + pub ingest_chunks_only: bool, + + /// Number of paragraphs to ingest concurrently + #[arg(long, default_value_t = 10)] + pub ingestion_batch_size: usize, + + /// Maximum retries for ingestion failures per paragraph + #[arg(long, default_value_t = 3)] + pub ingestion_max_retries: usize, + + /// Recompute embeddings for cached corpora without re-running ingestion + #[arg(long, alias = "refresh-embeddings")] + pub refresh_embeddings_only: bool, + + /// Delete cached paragraph shards before rebuilding the ingestion corpus + #[arg(long)] + pub slice_reset_ingestion: bool, +} + +#[derive(Debug, Clone, Args)] +pub struct DatabaseArgs { + /// SurrealDB server endpoint + #[arg(long, default_value = "ws://127.0.0.1:8000", env = "EVAL_DB_ENDPOINT")] + pub db_endpoint: String, + + /// SurrealDB root username + #[arg(long, default_value = "root_user", env = "EVAL_DB_USERNAME")] + pub db_username: String, + + /// SurrealDB root password + #[arg(long, default_value = "root_password", env = "EVAL_DB_PASSWORD")] + pub db_password: String, + + /// Override the namespace used on the SurrealDB server + #[arg(long, env = "EVAL_DB_NAMESPACE")] + pub db_namespace: Option, + + /// Override the database used on the SurrealDB server + #[arg(long, env = "EVAL_DB_DATABASE")] + pub db_database: Option, + + /// Path to inspect DB state + #[arg(long)] + pub inspect_db_state: Option, +} + #[derive(Parser, Debug, Clone)] #[command(author, version, about, long_about = None)] pub struct Config { @@ -205,37 +271,8 @@ pub struct Config { #[arg(long, default_value_os_t = default_cache_dir())] pub cache_dir: PathBuf, - /// Directory for ingestion corpora caches - #[arg(long, default_value_os_t = default_ingestion_cache_dir())] - pub ingestion_cache_dir: PathBuf, - - /// Minimum tokens per chunk for ingestion - #[arg(long, default_value_t = 256)] - pub ingest_chunk_min_tokens: usize, - - /// Maximum tokens per chunk for ingestion - #[arg(long, default_value_t = 512)] - pub ingest_chunk_max_tokens: usize, - - /// Overlap between chunks during ingestion (tokens) - #[arg(long, default_value_t = 50)] - pub ingest_chunk_overlap_tokens: usize, - - /// Run ingestion in chunk-only mode (skip analyzer/graph generation) - #[arg(long)] - pub ingest_chunks_only: bool, - - /// Number of paragraphs to ingest concurrently - #[arg(long, default_value_t = 10)] - pub ingestion_batch_size: usize, - - /// Maximum retries for ingestion failures per paragraph - #[arg(long, default_value_t = 3)] - pub ingestion_max_retries: usize, - - /// Recompute embeddings for cached corpora without re-running ingestion - #[arg(long, alias = "refresh-embeddings")] - pub refresh_embeddings_only: bool, + #[command(flatten)] + pub ingest: IngestConfig, /// Include entity descriptions and categories in JSON reports #[arg(long)] @@ -261,12 +298,8 @@ pub struct Config { #[arg(long, default_value_t = 0)] pub slice_offset: usize, - /// Delete cached paragraph shards before rebuilding the ingestion corpus - #[arg(long)] - pub slice_reset_ingestion: bool, - /// Target negative-to-positive paragraph ratio for slice growth - #[arg(long, default_value_t = crate::slices::DEFAULT_NEGATIVE_MULTIPLIER)] + #[arg(long, default_value_t = crate::slice::DEFAULT_NEGATIVE_MULTIPLIER)] pub negative_multiplier: f32, /// Annotate the run; label is stored in JSON/Markdown reports @@ -301,29 +334,8 @@ pub struct Config { #[arg(long, alias = "perf-log")] pub perf_log_console: bool, - /// SurrealDB server endpoint - #[arg(long, default_value = "ws://127.0.0.1:8000", env = "EVAL_DB_ENDPOINT")] - pub db_endpoint: String, - - /// SurrealDB root username - #[arg(long, default_value = "root_user", env = "EVAL_DB_USERNAME")] - pub db_username: String, - - /// SurrealDB root password - #[arg(long, default_value = "root_password", env = "EVAL_DB_PASSWORD")] - pub db_password: String, - - /// Override the namespace used on the SurrealDB server - #[arg(long, env = "EVAL_DB_NAMESPACE")] - pub db_namespace: Option, - - /// Override the database used on the SurrealDB server - #[arg(long, env = "EVAL_DB_DATABASE")] - pub db_database: Option, - - /// Path to inspect DB state - #[arg(long)] - pub inspect_db_state: Option, + #[command(flatten)] + pub database: DatabaseArgs, // Computed fields (not arguments) #[arg(skip)] @@ -377,21 +389,21 @@ impl Config { } // Validations - if self.ingest_chunk_min_tokens == 0 - || self.ingest_chunk_min_tokens >= self.ingest_chunk_max_tokens + if self.ingest.ingest_chunk_min_tokens == 0 + || self.ingest.ingest_chunk_min_tokens >= self.ingest.ingest_chunk_max_tokens { return Err(anyhow!( "--ingest-chunk-min-tokens must be greater than zero and less than --ingest-chunk-max-tokens (got {} >= {})", - self.ingest_chunk_min_tokens, - self.ingest_chunk_max_tokens + self.ingest.ingest_chunk_min_tokens, + self.ingest.ingest_chunk_max_tokens )); } - if self.ingest_chunk_overlap_tokens >= self.ingest_chunk_min_tokens { + if self.ingest.ingest_chunk_overlap_tokens >= self.ingest.ingest_chunk_min_tokens { return Err(anyhow!( "--ingest-chunk-overlap-tokens ({}) must be less than --ingest-chunk-min-tokens ({})", - self.ingest_chunk_overlap_tokens, - self.ingest_chunk_min_tokens + self.ingest.ingest_chunk_overlap_tokens, + self.ingest.ingest_chunk_min_tokens )); } diff --git a/eval/src/cache.rs b/evaluations/src/cache.rs similarity index 100% rename from eval/src/cache.rs rename to evaluations/src/cache.rs diff --git a/evaluations/src/cases.rs b/evaluations/src/cases.rs new file mode 100644 index 0000000..e7c734f --- /dev/null +++ b/evaluations/src/cases.rs @@ -0,0 +1,187 @@ +//! Case generation from corpus manifests. + +use std::collections::HashMap; + +use crate::corpus; + +/// A test case for retrieval evaluation derived from a manifest question. +pub(crate) struct SeededCase { + pub question_id: String, + pub question: String, + pub expected_source: String, + pub answers: Vec, + pub paragraph_id: String, + pub paragraph_title: String, + pub expected_chunk_ids: Vec, + pub is_impossible: bool, + pub has_verified_chunks: bool, +} + +/// Convert a corpus manifest into seeded evaluation cases. +pub(crate) fn cases_from_manifest(manifest: &corpus::CorpusManifest) -> Vec { + let mut title_map = HashMap::new(); + for paragraph in &manifest.paragraphs { + title_map.insert(paragraph.paragraph_id.as_str(), paragraph.title.clone()); + } + + let include_impossible = manifest.metadata.include_unanswerable; + let require_verified_chunks = manifest.metadata.require_verified_chunks; + + manifest + .questions + .iter() + .filter(|question| { + should_include_question(question, include_impossible, require_verified_chunks) + }) + .map(|question| { + let title = title_map + .get(question.paragraph_id.as_str()) + .cloned() + .unwrap_or_else(|| "Untitled".to_string()); + SeededCase { + question_id: question.question_id.clone(), + question: question.question_text.clone(), + expected_source: question.text_content_id.clone(), + answers: question.answers.clone(), + paragraph_id: question.paragraph_id.clone(), + paragraph_title: title, + expected_chunk_ids: question.matching_chunk_ids.clone(), + is_impossible: question.is_impossible, + has_verified_chunks: !question.matching_chunk_ids.is_empty(), + } + }) + .collect() +} + +fn should_include_question( + question: &corpus::CorpusQuestion, + include_impossible: bool, + require_verified_chunks: bool, +) -> bool { + if !include_impossible && question.is_impossible { + return false; + } + if require_verified_chunks && question.matching_chunk_ids.is_empty() { + return false; + } + true +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::corpus::store::{CorpusParagraph, EmbeddedKnowledgeEntity, EmbeddedTextChunk}; + use crate::corpus::{CorpusManifest, CorpusMetadata, CorpusQuestion, MANIFEST_VERSION}; + use chrono::Utc; + use common::storage::types::text_content::TextContent; + + fn sample_manifest() -> CorpusManifest { + let paragraphs = vec![ + CorpusParagraph { + paragraph_id: "p1".to_string(), + title: "Alpha".to_string(), + text_content: TextContent::new( + "alpha context".to_string(), + None, + "test".to_string(), + None, + None, + "user".to_string(), + ), + entities: Vec::::new(), + relationships: Vec::new(), + chunks: Vec::::new(), + }, + CorpusParagraph { + paragraph_id: "p2".to_string(), + title: "Beta".to_string(), + text_content: TextContent::new( + "beta context".to_string(), + None, + "test".to_string(), + None, + None, + "user".to_string(), + ), + entities: Vec::::new(), + relationships: Vec::new(), + chunks: Vec::::new(), + }, + ]; + let questions = vec![ + CorpusQuestion { + question_id: "q1".to_string(), + paragraph_id: "p1".to_string(), + text_content_id: "tc-alpha".to_string(), + question_text: "What is Alpha?".to_string(), + answers: vec!["Alpha".to_string()], + is_impossible: false, + matching_chunk_ids: vec!["chunk-alpha".to_string()], + }, + CorpusQuestion { + question_id: "q2".to_string(), + paragraph_id: "p1".to_string(), + text_content_id: "tc-alpha".to_string(), + question_text: "Unanswerable?".to_string(), + answers: Vec::new(), + is_impossible: true, + matching_chunk_ids: Vec::new(), + }, + CorpusQuestion { + question_id: "q3".to_string(), + paragraph_id: "p2".to_string(), + text_content_id: "tc-beta".to_string(), + question_text: "Where is Beta?".to_string(), + answers: vec!["Beta".to_string()], + is_impossible: false, + matching_chunk_ids: Vec::new(), + }, + ]; + CorpusManifest { + version: MANIFEST_VERSION, + metadata: CorpusMetadata { + dataset_id: "ds".to_string(), + dataset_label: "Dataset".to_string(), + slice_id: "slice".to_string(), + include_unanswerable: true, + require_verified_chunks: true, + ingestion_fingerprint: "fp".to_string(), + embedding_backend: "test".to_string(), + embedding_model: None, + embedding_dimension: 3, + converted_checksum: "chk".to_string(), + generated_at: Utc::now(), + paragraph_count: paragraphs.len(), + question_count: questions.len(), + chunk_min_tokens: 1, + chunk_max_tokens: 10, + chunk_only: false, + }, + paragraphs, + questions, + } + } + + #[test] + fn cases_respect_mode_filters() { + let mut manifest = sample_manifest(); + manifest.metadata.include_unanswerable = false; + manifest.metadata.require_verified_chunks = true; + + let strict_cases = cases_from_manifest(&manifest); + assert_eq!(strict_cases.len(), 1); + assert_eq!(strict_cases[0].question_id, "q1"); + assert_eq!(strict_cases[0].paragraph_title, "Alpha"); + + let mut llm_manifest = manifest.clone(); + llm_manifest.metadata.include_unanswerable = true; + llm_manifest.metadata.require_verified_chunks = false; + + let llm_cases = cases_from_manifest(&llm_manifest); + let ids: Vec<_> = llm_cases + .iter() + .map(|case| case.question_id.as_str()) + .collect(); + assert_eq!(ids, vec!["q1", "q2", "q3"]); + } +} diff --git a/eval/src/ingest/config.rs b/evaluations/src/corpus/config.rs similarity index 76% rename from eval/src/ingest/config.rs rename to evaluations/src/corpus/config.rs index 3837a0d..a7e6045 100644 --- a/eval/src/ingest/config.rs +++ b/evaluations/src/corpus/config.rs @@ -32,11 +32,11 @@ impl CorpusCacheConfig { impl From<&Config> for CorpusCacheConfig { fn from(config: &Config) -> Self { CorpusCacheConfig::new( - config.ingestion_cache_dir.clone(), - config.force_convert || config.slice_reset_ingestion, - config.refresh_embeddings_only, - config.ingestion_batch_size, - config.ingestion_max_retries, + config.ingest.ingestion_cache_dir.clone(), + config.force_convert || config.ingest.slice_reset_ingestion, + config.ingest.refresh_embeddings_only, + config.ingest.ingestion_batch_size, + config.ingest.ingestion_max_retries, ) } } diff --git a/eval/src/ingest/mod.rs b/evaluations/src/corpus/mod.rs similarity index 72% rename from eval/src/ingest/mod.rs rename to evaluations/src/corpus/mod.rs index f63d23f..3726307 100644 --- a/eval/src/ingest/mod.rs +++ b/evaluations/src/corpus/mod.rs @@ -15,12 +15,12 @@ pub use store::{ pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig { let mut tuning = ingestion_pipeline::IngestionTuning::default(); - tuning.chunk_min_tokens = config.ingest_chunk_min_tokens; - tuning.chunk_max_tokens = config.ingest_chunk_max_tokens; - tuning.chunk_overlap_tokens = config.ingest_chunk_overlap_tokens; + tuning.chunk_min_tokens = config.ingest.ingest_chunk_min_tokens; + tuning.chunk_max_tokens = config.ingest.ingest_chunk_max_tokens; + tuning.chunk_overlap_tokens = config.ingest.ingest_chunk_overlap_tokens; ingestion_pipeline::IngestionConfig { tuning, - chunk_only: config.ingest_chunks_only, + chunk_only: config.ingest.ingest_chunks_only, } } diff --git a/eval/src/ingest/orchestrator.rs b/evaluations/src/corpus/orchestrator.rs similarity index 98% rename from eval/src/ingest/orchestrator.rs rename to evaluations/src/corpus/orchestrator.rs index da7ce3b..93656e8 100644 --- a/eval/src/ingest/orchestrator.rs +++ b/evaluations/src/corpus/orchestrator.rs @@ -26,10 +26,10 @@ use uuid::Uuid; use crate::{ datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion}, - slices::{self, ResolvedSlice, SliceParagraphKind}, + slice::{self, ResolvedSlice, SliceParagraphKind}, }; -use crate::ingest::{ +use crate::corpus::{ CorpusCacheConfig, CorpusHandle, CorpusManifest, CorpusMetadata, CorpusQuestion, EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION, @@ -58,12 +58,12 @@ impl<'a> IngestRequest<'a> { fn from_entry( slot: usize, paragraph: &'a ConvertedParagraph, - entry: &'a slices::SliceParagraphEntry, + entry: &'a slice::SliceParagraphEntry, ) -> Result { let shard_path = entry .shard_path .clone() - .unwrap_or_else(|| slices::default_shard_path(&entry.id)); + .unwrap_or_else(|| slice::default_shard_path(&entry.id)); let question_refs = match &entry.kind { SliceParagraphKind::Positive { question_ids } => question_ids .iter() @@ -94,7 +94,7 @@ impl<'a> IngestRequest<'a> { struct ParagraphPlan<'a> { slot: usize, - entry: &'a slices::SliceParagraphEntry, + entry: &'a slice::SliceParagraphEntry, paragraph: &'a ConvertedParagraph, } @@ -109,7 +109,7 @@ struct IngestionStats { pub async fn ensure_corpus( dataset: &ConvertedDataset, slice: &ResolvedSlice<'_>, - window: &slices::SliceWindow<'_>, + window: &slice::SliceWindow<'_>, cache: &CorpusCacheConfig, embedding: Arc, openai: Arc, @@ -189,7 +189,7 @@ pub async fn ensure_corpus( .entry .shard_path .clone() - .unwrap_or_else(|| slices::default_shard_path(&plan_entry.entry.id)); + .unwrap_or_else(|| slice::default_shard_path(&plan_entry.entry.id)); let shard = if cache.force_refresh { None } else { @@ -683,7 +683,7 @@ mod tests { use super::*; use crate::{ datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind}, - slices::{CaseRef, SliceCaseEntry, SliceManifest, SliceParagraphEntry, SliceParagraphKind}, + slice::{CaseRef, SliceCaseEntry, SliceManifest, SliceParagraphEntry, SliceParagraphKind}, }; use chrono::Utc; diff --git a/eval/src/ingest/store.rs b/evaluations/src/corpus/store.rs similarity index 100% rename from eval/src/ingest/store.rs rename to evaluations/src/corpus/store.rs diff --git a/eval/src/datasets/beir.rs b/evaluations/src/datasets/beir.rs similarity index 100% rename from eval/src/datasets/beir.rs rename to evaluations/src/datasets/beir.rs diff --git a/eval/src/datasets/mod.rs b/evaluations/src/datasets/mod.rs similarity index 100% rename from eval/src/datasets/mod.rs rename to evaluations/src/datasets/mod.rs diff --git a/eval/src/datasets/nq.rs b/evaluations/src/datasets/nq.rs similarity index 100% rename from eval/src/datasets/nq.rs rename to evaluations/src/datasets/nq.rs diff --git a/eval/src/datasets/squad.rs b/evaluations/src/datasets/squad.rs similarity index 100% rename from eval/src/datasets/squad.rs rename to evaluations/src/datasets/squad.rs diff --git a/eval/src/db_helpers.rs b/evaluations/src/db_helpers.rs similarity index 92% rename from eval/src/db_helpers.rs rename to evaluations/src/db_helpers.rs index 9be1fc4..2f56937 100644 --- a/eval/src/db_helpers.rs +++ b/evaluations/src/db_helpers.rs @@ -2,16 +2,6 @@ use anyhow::{Context, Result}; use common::storage::{db::SurrealDbClient, indexes::ensure_runtime_indexes}; use tracing::info; -// Remove and recreate HNSW indexes for changing embedding lengths, used at beginning if embedding length differs from default system settings. -pub async fn change_embedding_length_in_hnsw_indexes( - db: &SurrealDbClient, - dimension: usize, -) -> Result<()> { - // No-op for now; runtime indexes are created after ingestion with the correct dimension. - let _ = (db, dimension); - Ok(()) -} - // Helper functions for index management during namespace reseed pub async fn remove_all_indexes(db: &SurrealDbClient) -> Result<()> { let _ = db; @@ -46,6 +36,14 @@ pub async fn reset_namespace(db: &SurrealDbClient, namespace: &str, database: &s Ok(()) } +// Test helper to force index dimension change +pub async fn change_embedding_length_in_hnsw_indexes( + db: &SurrealDbClient, + dimension: usize, +) -> Result<()> { + recreate_indexes(db, dimension).await +} + #[cfg(test)] mod tests { use super::*; diff --git a/evaluations/src/eval.rs b/evaluations/src/eval.rs new file mode 100644 index 0000000..e16fdbd --- /dev/null +++ b/evaluations/src/eval.rs @@ -0,0 +1,128 @@ +//! Evaluation utilities module - re-exports from focused submodules. + +// Re-export types from the root types module +pub use crate::types::*; + +// Re-export from focused modules at crate root (crate-internal only) +pub(crate) use crate::cases::{cases_from_manifest, SeededCase}; +pub(crate) use crate::namespace::{ + can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user, + record_namespace_state, +}; +pub(crate) use crate::settings::{enforce_system_settings, load_or_init_system_settings}; + +use std::path::Path; + +use anyhow::{Context, Result}; +use common::storage::db::SurrealDbClient; +use tokio::io::AsyncWriteExt; +use tracing::info; + +use crate::{ + args::{self, Config}, + datasets::ConvertedDataset, + slice::{self}, +}; + +/// Grow the slice ledger to contain the target number of cases. +pub async fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> { + let ledger_limit = ledger_target(config); + let slice_settings = slice::slice_config_with_limit(config, ledger_limit); + let slice = + slice::resolve_slice(dataset, &slice_settings).context("resolving dataset slice")?; + info!( + slice = slice.manifest.slice_id.as_str(), + cases = slice.manifest.case_count, + positives = slice.manifest.positive_paragraphs, + negatives = slice.manifest.negative_paragraphs, + total_paragraphs = slice.manifest.total_paragraphs, + "Slice ledger ready" + ); + println!( + "Slice `{}` now contains {} questions ({} positives, {} negatives)", + slice.manifest.slice_id, + slice.manifest.case_count, + slice.manifest.positive_paragraphs, + slice.manifest.negative_paragraphs + ); + Ok(()) +} + +pub(crate) fn ledger_target(config: &Config) -> Option { + match (config.slice_grow, config.limit) { + (Some(grow), Some(limit)) => Some(limit.max(grow)), + (Some(grow), None) => Some(grow), + (None, limit) => limit, + } +} + +pub(crate) async fn write_chunk_diagnostics(path: &Path, cases: &[CaseDiagnostics]) -> Result<()> { + args::ensure_parent(path)?; + let mut file = tokio::fs::File::create(path) + .await + .with_context(|| format!("creating diagnostics file {}", path.display()))?; + for case in cases { + let line = serde_json::to_vec(case).context("serialising chunk diagnostics entry")?; + file.write_all(&line).await?; + file.write_all(b"\n").await?; + } + file.flush().await?; + Ok(()) +} + +pub(crate) async fn warm_hnsw_cache(db: &SurrealDbClient, dimension: usize) -> Result<()> { + // Create a dummy embedding for cache warming + let dummy_embedding: Vec = (0..dimension).map(|i| (i as f32).sin()).collect(); + + info!("Warming HNSW caches with sample queries"); + + // Warm up chunk embedding index - just query the embedding table to load HNSW index + let _ = db + .client + .query( + r#"SELECT chunk_id + FROM text_chunk_embedding + WHERE embedding <|1,1|> $embedding + LIMIT 5"#, + ) + .bind(("embedding", dummy_embedding.clone())) + .await + .context("warming text chunk HNSW cache")?; + + // Warm up entity embedding index + let _ = db + .client + .query( + r#"SELECT entity_id + FROM knowledge_entity_embedding + WHERE embedding <|1,1|> $embedding + LIMIT 5"#, + ) + .bind(("embedding", dummy_embedding)) + .await + .context("warming knowledge entity HNSW cache")?; + + info!("HNSW cache warming completed"); + Ok(()) +} + +use chrono::{DateTime, SecondsFormat, Utc}; + +pub fn format_timestamp(timestamp: &DateTime) -> String { + timestamp.to_rfc3339_opts(SecondsFormat::Secs, true) +} + +pub(crate) fn sanitize_model_code(code: &str) -> String { + code.chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() { + ch.to_ascii_lowercase() + } else { + '_' + } + }) + .collect() +} + +// Re-export run_evaluation from the pipeline module at crate root +pub use crate::pipeline::run_evaluation; diff --git a/eval/src/inspection.rs b/evaluations/src/inspection.rs similarity index 95% rename from eval/src/inspection.rs rename to evaluations/src/inspection.rs index 7b0cac0..064ed53 100644 --- a/eval/src/inspection.rs +++ b/evaluations/src/inspection.rs @@ -7,7 +7,7 @@ use std::{ use anyhow::{anyhow, Context, Result}; use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk}; -use crate::{args::Config, eval::connect_eval_db, ingest, snapshot::DbSnapshotState}; +use crate::{args::Config, eval::connect_eval_db, corpus, snapshot::DbSnapshotState}; pub async fn inspect_question(config: &Config) -> Result<()> { let question_id = config @@ -65,6 +65,7 @@ pub async fn inspect_question(config: &Config) -> Result<()> { } let db_state_path = config + .database .inspect_db_state .clone() .unwrap_or_else(|| default_state_path(config, &manifest)); @@ -109,14 +110,14 @@ struct ChunkEntry { snippet: String, } -fn load_manifest(path: &Path) -> Result { +fn load_manifest(path: &Path) -> Result { let bytes = fs::read(path).with_context(|| format!("reading ingestion manifest {}", path.display()))?; serde_json::from_slice(&bytes) .with_context(|| format!("parsing ingestion manifest {}", path.display())) } -fn build_chunk_lookup(manifest: &ingest::CorpusManifest) -> HashMap { +fn build_chunk_lookup(manifest: &corpus::CorpusManifest) -> HashMap { let mut lookup = HashMap::new(); for paragraph in &manifest.paragraphs { for chunk in ¶graph.chunks { @@ -139,7 +140,7 @@ fn build_chunk_lookup(manifest: &ingest::CorpusManifest) -> HashMap PathBuf { +fn default_state_path(config: &Config, manifest: &corpus::CorpusManifest) -> PathBuf { config .cache_dir .join("snapshots") diff --git a/eval/src/main.rs b/evaluations/src/main.rs similarity index 99% rename from eval/src/main.rs rename to evaluations/src/main.rs index aefe6f9..c490a79 100644 --- a/eval/src/main.rs +++ b/evaluations/src/main.rs @@ -1,16 +1,20 @@ mod args; mod cache; +mod cases; +mod corpus; mod datasets; mod db_helpers; mod eval; -mod ingest; mod inspection; +mod namespace; mod openai; mod perf; +mod pipeline; mod report; +mod settings; mod slice; -mod slices; mod snapshot; +mod types; use anyhow::Context; use tokio::runtime::Builder; diff --git a/evaluations/src/namespace.rs b/evaluations/src/namespace.rs new file mode 100644 index 0000000..8eaa856 --- /dev/null +++ b/evaluations/src/namespace.rs @@ -0,0 +1,224 @@ +//! Database namespace management utilities. + +use anyhow::{anyhow, Context, Result}; +use chrono::Utc; +use common::storage::{db::SurrealDbClient, types::user::User, types::StoredObject}; +use serde::Deserialize; +use tracing::{info, warn}; + +use crate::{ + args::Config, + datasets, + snapshot::{self, DbSnapshotState}, +}; + +/// Connect to the evaluation database with fallback auth strategies. +pub(crate) async fn connect_eval_db( + config: &Config, + namespace: &str, + database: &str, +) -> Result { + match SurrealDbClient::new( + &config.database.db_endpoint, + &config.database.db_username, + &config.database.db_password, + namespace, + database, + ) + .await + { + Ok(client) => { + info!( + endpoint = %config.database.db_endpoint, + namespace, + database, + auth = "root", + "Connected to SurrealDB" + ); + Ok(client) + } + Err(root_err) => { + info!( + endpoint = %config.database.db_endpoint, + namespace, + database, + "Root authentication failed; trying namespace-level auth" + ); + let namespace_client = SurrealDbClient::new_with_namespace_user( + &config.database.db_endpoint, + namespace, + &config.database.db_username, + &config.database.db_password, + database, + ) + .await + .map_err(|ns_err| { + anyhow!( + "failed to connect to SurrealDB via root ({root_err}) or namespace ({ns_err}) credentials" + ) + })?; + info!( + endpoint = %config.database.db_endpoint, + namespace, + database, + auth = "namespace", + "Connected to SurrealDB" + ); + Ok(namespace_client) + } + } +} + +/// Check if the namespace contains any corpus data. +pub(crate) async fn namespace_has_corpus(db: &SurrealDbClient) -> Result { + #[derive(Deserialize)] + struct CountRow { + count: i64, + } + + let mut response = db + .client + .query("SELECT count() AS count FROM text_chunk") + .await + .context("checking namespace corpus state")?; + let rows: Vec = response.take(0).unwrap_or_default(); + Ok(rows.first().map(|row| row.count).unwrap_or(0) > 0) +} + +/// Determine if we can reuse an existing namespace based on cached state. +pub(crate) async fn can_reuse_namespace( + db: &SurrealDbClient, + descriptor: &snapshot::Descriptor, + namespace: &str, + database: &str, + dataset_id: &str, + slice_id: &str, + ingestion_fingerprint: &str, + slice_case_count: usize, +) -> Result { + let state = match descriptor.load_db_state().await? { + Some(state) => state, + None => { + info!("No namespace state recorded; reseeding corpus from cached shards"); + return Ok(false); + } + }; + + if state.slice_case_count != slice_case_count { + info!( + requested_cases = slice_case_count, + stored_cases = state.slice_case_count, + "Skipping live namespace reuse; cached state does not match requested window" + ); + return Ok(false); + } + + if state.dataset_id != dataset_id + || state.slice_id != slice_id + || state.ingestion_fingerprint != ingestion_fingerprint + || state.namespace.as_deref() != Some(namespace) + || state.database.as_deref() != Some(database) + { + info!( + namespace, + database, "Cached namespace metadata mismatch; rebuilding corpus from ingestion cache" + ); + return Ok(false); + } + + if namespace_has_corpus(db).await? { + Ok(true) + } else { + info!( + namespace, + database, + "Namespace metadata matches but tables are empty; reseeding from ingestion cache" + ); + Ok(false) + } +} + +/// Record the current namespace state to allow future reuse checks. +pub(crate) async fn record_namespace_state( + descriptor: &snapshot::Descriptor, + dataset_id: &str, + slice_id: &str, + ingestion_fingerprint: &str, + namespace: &str, + database: &str, + slice_case_count: usize, +) { + let state = DbSnapshotState { + dataset_id: dataset_id.to_string(), + slice_id: slice_id.to_string(), + ingestion_fingerprint: ingestion_fingerprint.to_string(), + snapshot_hash: descriptor.metadata_hash().to_string(), + updated_at: Utc::now(), + namespace: Some(namespace.to_string()), + database: Some(database.to_string()), + slice_case_count, + }; + if let Err(err) = descriptor.store_db_state(&state).await { + warn!(error = %err, "Failed to record namespace state"); + } +} + +fn sanitize_identifier(input: &str) -> String { + let mut cleaned: String = input + .chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() { + ch.to_ascii_lowercase() + } else { + '_' + } + }) + .collect(); + if cleaned.is_empty() { + cleaned.push('x'); + } + if cleaned.len() > 64 { + cleaned.truncate(64); + } + cleaned +} + +/// Generate a default namespace name based on dataset and limit. +pub(crate) fn default_namespace(dataset_id: &str, limit: Option) -> String { + let dataset_component = sanitize_identifier(dataset_id); + let limit_component = match limit { + Some(value) if value > 0 => format!("limit{}", value), + _ => "all".to_string(), + }; + format!("eval_{}_{}", dataset_component, limit_component) +} + +/// Generate the default database name for evaluations. +pub(crate) fn default_database() -> String { + "retrieval_eval".to_string() +} + +/// Ensure the evaluation user exists in the database. +pub(crate) async fn ensure_eval_user(db: &SurrealDbClient) -> Result { + let timestamp = datasets::base_timestamp(); + let user = User { + id: "eval-user".to_string(), + created_at: timestamp, + updated_at: timestamp, + email: "eval-retrieval@minne.dev".to_string(), + password: "not-used".to_string(), + anonymous: false, + api_key: None, + admin: false, + timezone: "UTC".to_string(), + }; + + if let Some(existing) = db.get_item::(&user.get_id()).await? { + return Ok(existing); + } + + db.store_item(user.clone()) + .await + .context("storing evaluation user")?; + Ok(user) +} diff --git a/eval/src/openai.rs b/evaluations/src/openai.rs similarity index 100% rename from eval/src/openai.rs rename to evaluations/src/openai.rs diff --git a/eval/src/perf.rs b/evaluations/src/perf.rs similarity index 100% rename from eval/src/perf.rs rename to evaluations/src/perf.rs diff --git a/eval/src/eval/pipeline/context.rs b/evaluations/src/pipeline/context.rs similarity index 97% rename from eval/src/eval/pipeline/context.rs rename to evaluations/src/pipeline/context.rs index dad2579..99c0eef 100644 --- a/eval/src/eval/pipeline/context.rs +++ b/evaluations/src/pipeline/context.rs @@ -22,7 +22,7 @@ use crate::{ cache::EmbeddingCache, datasets::ConvertedDataset, eval::{CaseDiagnostics, CaseSummary, EvaluationStageTimings, EvaluationSummary, SeededCase}, - ingest, slice, snapshot, + corpus, slice, snapshot, }; pub(super) struct EvaluationContext<'a> { @@ -52,7 +52,7 @@ pub(super) struct EvaluationContext<'a> { pub namespace_reused: bool, pub evaluation_start: Option, pub eval_user: Option, - pub corpus_handle: Option, + pub corpus_handle: Option, pub cases: Vec, pub filtered_questions: usize, pub stage_latency_samples: Vec, @@ -145,7 +145,7 @@ impl<'a> EvaluationContext<'a> { .clone() } - pub fn corpus_handle(&self) -> &ingest::CorpusHandle { + pub fn corpus_handle(&self) -> &corpus::CorpusHandle { self.corpus_handle.as_ref().expect("corpus handle missing") } diff --git a/eval/src/eval/pipeline/mod.rs b/evaluations/src/pipeline/mod.rs similarity index 90% rename from eval/src/eval/pipeline/mod.rs rename to evaluations/src/pipeline/mod.rs index 2f62a6f..f563346 100644 --- a/eval/src/eval/pipeline/mod.rs +++ b/evaluations/src/pipeline/mod.rs @@ -4,7 +4,7 @@ mod state; use anyhow::Result; -use crate::{args::Config, datasets::ConvertedDataset, eval::EvaluationSummary}; +use crate::{args::Config, datasets::ConvertedDataset, types::EvaluationSummary}; use context::EvaluationContext; diff --git a/eval/src/eval/pipeline/stages/finalize.rs b/evaluations/src/pipeline/stages/finalize.rs similarity index 100% rename from eval/src/eval/pipeline/stages/finalize.rs rename to evaluations/src/pipeline/stages/finalize.rs diff --git a/eval/src/eval/pipeline/stages/mod.rs b/evaluations/src/pipeline/stages/mod.rs similarity index 100% rename from eval/src/eval/pipeline/stages/mod.rs rename to evaluations/src/pipeline/stages/mod.rs diff --git a/eval/src/eval/pipeline/stages/prepare_corpus.rs b/evaluations/src/pipeline/stages/prepare_corpus.rs similarity index 90% rename from eval/src/eval/pipeline/stages/prepare_corpus.rs rename to evaluations/src/pipeline/stages/prepare_corpus.rs index 96b7c4f..102a3b3 100644 --- a/eval/src/eval/pipeline/stages/prepare_corpus.rs +++ b/evaluations/src/pipeline/stages/prepare_corpus.rs @@ -3,7 +3,7 @@ use std::time::Instant; use anyhow::Context; use tracing::info; -use crate::{eval::can_reuse_namespace, ingest, slice, snapshot}; +use crate::{eval::can_reuse_namespace, corpus, slice, snapshot}; use super::super::{ context::{EvalStage, EvaluationContext}, @@ -23,7 +23,7 @@ pub(crate) async fn prepare_corpus( let started = Instant::now(); let config = ctx.config(); - let cache_settings = ingest::CorpusCacheConfig::from(config); + let cache_settings = corpus::CorpusCacheConfig::from(config); let embedding_provider = ctx.embedding_provider().clone(); let openai_client = ctx.openai_client(); let slice = ctx.slice(); @@ -31,14 +31,14 @@ pub(crate) async fn prepare_corpus( .context("selecting slice window for corpus preparation")?; let descriptor = snapshot::Descriptor::new(config, slice, ctx.embedding_provider()); - let ingestion_config = ingest::make_ingestion_config(config); - let expected_fingerprint = ingest::compute_ingestion_fingerprint( + let ingestion_config = corpus::make_ingestion_config(config); + let expected_fingerprint = corpus::compute_ingestion_fingerprint( ctx.dataset(), slice, config.converted_dataset_path.as_path(), &ingestion_config, )?; - let base_dir = ingest::cached_corpus_dir( + let base_dir = corpus::cached_corpus_dir( &cache_settings, ctx.dataset().metadata.id.as_str(), slice.manifest.slice_id.as_str(), @@ -58,14 +58,14 @@ pub(crate) async fn prepare_corpus( ) .await? { - if let Some(manifest) = ingest::load_cached_manifest(&base_dir)? { + if let Some(manifest) = corpus::load_cached_manifest(&base_dir)? { info!( cache = %base_dir.display(), namespace = ctx.namespace.as_str(), database = ctx.database.as_str(), "Namespace already seeded; reusing cached corpus manifest" ); - let corpus_handle = ingest::corpus_handle_from_manifest(manifest, base_dir); + let corpus_handle = corpus::corpus_handle_from_manifest(manifest, base_dir); ctx.corpus_handle = Some(corpus_handle); ctx.expected_fingerprint = Some(expected_fingerprint); ctx.ingestion_duration_ms = 0; @@ -94,7 +94,7 @@ pub(crate) async fn prepare_corpus( let eval_user_id = "eval-user".to_string(); let ingestion_timer = Instant::now(); let corpus_handle = { - ingest::ensure_corpus( + corpus::ensure_corpus( ctx.dataset(), slice, &window, diff --git a/eval/src/eval/pipeline/stages/prepare_db.rs b/evaluations/src/pipeline/stages/prepare_db.rs similarity index 100% rename from eval/src/eval/pipeline/stages/prepare_db.rs rename to evaluations/src/pipeline/stages/prepare_db.rs diff --git a/eval/src/eval/pipeline/stages/prepare_namespace.rs b/evaluations/src/pipeline/stages/prepare_namespace.rs similarity index 98% rename from eval/src/eval/pipeline/stages/prepare_namespace.rs rename to evaluations/src/pipeline/stages/prepare_namespace.rs index 78d18f8..ec87f75 100644 --- a/eval/src/eval/pipeline/stages/prepare_namespace.rs +++ b/evaluations/src/pipeline/stages/prepare_namespace.rs @@ -10,7 +10,7 @@ use crate::{ can_reuse_namespace, cases_from_manifest, enforce_system_settings, ensure_eval_user, record_namespace_state, warm_hnsw_cache, }, - ingest, + corpus, }; use super::super::{ @@ -47,7 +47,7 @@ pub(crate) async fn prepare_namespace( if ctx.window_offset == 0 && ctx.window_length >= base_manifest.questions.len() { base_manifest.clone() } else { - ingest::window_manifest( + corpus::window_manifest( base_manifest, ctx.window_offset, ctx.window_length, @@ -116,7 +116,7 @@ pub(crate) async fn prepare_namespace( let indexes_disabled = remove_all_indexes(ctx.db()).await.is_ok(); let seed_start = Instant::now(); - ingest::seed_manifest_into_db(ctx.db(), &manifest_for_seed) + corpus::seed_manifest_into_db(ctx.db(), &manifest_for_seed) .await .context("seeding ingestion corpus from manifest")?; namespace_seed_ms = Some(seed_start.elapsed().as_millis() as u128); diff --git a/eval/src/eval/pipeline/stages/prepare_slice.rs b/evaluations/src/pipeline/stages/prepare_slice.rs similarity index 89% rename from eval/src/eval/pipeline/stages/prepare_slice.rs rename to evaluations/src/pipeline/stages/prepare_slice.rs index 9c524e2..7396338 100644 --- a/eval/src/eval/pipeline/stages/prepare_slice.rs +++ b/evaluations/src/pipeline/stages/prepare_slice.rs @@ -43,11 +43,15 @@ pub(crate) async fn prepare_slice( ctx.window_length = window.length; ctx.window_total_cases = window.total_cases; - ctx.namespace = ctx.config().db_namespace.clone().unwrap_or_else(|| { - default_namespace(ctx.dataset().metadata.id.as_str(), ctx.config().limit) - }); + ctx.namespace = ctx + .config() + .database + .db_namespace + .clone() + .unwrap_or_else(|| default_namespace(ctx.dataset().metadata.id.as_str(), ctx.config().limit)); ctx.database = ctx .config() + .database .db_database .clone() .unwrap_or_else(default_database); diff --git a/eval/src/eval/pipeline/stages/run_queries.rs b/evaluations/src/pipeline/stages/run_queries.rs similarity index 100% rename from eval/src/eval/pipeline/stages/run_queries.rs rename to evaluations/src/pipeline/stages/run_queries.rs diff --git a/eval/src/eval/pipeline/stages/summarize.rs b/evaluations/src/pipeline/stages/summarize.rs similarity index 96% rename from eval/src/eval/pipeline/stages/summarize.rs rename to evaluations/src/pipeline/stages/summarize.rs index e4cee7d..f3d03b3 100644 --- a/eval/src/eval/pipeline/stages/summarize.rs +++ b/evaluations/src/pipeline/stages/summarize.rs @@ -207,10 +207,10 @@ pub(crate) async fn summarize( chunk_rrf_fts_weight: active_tuning.chunk_rrf_fts_weight, chunk_rrf_use_vector: active_tuning.chunk_rrf_use_vector, chunk_rrf_use_fts: active_tuning.chunk_rrf_use_fts, - ingest_chunk_min_tokens: config.ingest_chunk_min_tokens, - ingest_chunk_max_tokens: config.ingest_chunk_max_tokens, - ingest_chunks_only: config.ingest_chunks_only, - ingest_chunk_overlap_tokens: config.ingest_chunk_overlap_tokens, + ingest_chunk_min_tokens: config.ingest.ingest_chunk_min_tokens, + ingest_chunk_max_tokens: config.ingest.ingest_chunk_max_tokens, + ingest_chunks_only: config.ingest.ingest_chunks_only, + ingest_chunk_overlap_tokens: config.ingest.ingest_chunk_overlap_tokens, chunk_vector_take: active_tuning.chunk_vector_take, chunk_fts_take: active_tuning.chunk_fts_take, chunk_avg_chars_per_token: active_tuning.avg_chars_per_token, diff --git a/eval/src/eval/pipeline/state.rs b/evaluations/src/pipeline/state.rs similarity index 100% rename from eval/src/eval/pipeline/state.rs rename to evaluations/src/pipeline/state.rs diff --git a/eval/src/report.rs b/evaluations/src/report.rs similarity index 100% rename from eval/src/report.rs rename to evaluations/src/report.rs diff --git a/evaluations/src/settings.rs b/evaluations/src/settings.rs new file mode 100644 index 0000000..ba8f611 --- /dev/null +++ b/evaluations/src/settings.rs @@ -0,0 +1,63 @@ +//! System settings enforcement for evaluations. + +use anyhow::{Context, Result}; +use common::{ + error::AppError, + storage::{db::SurrealDbClient, types::system_settings::SystemSettings}, +}; +use tracing::info; + +use crate::args::Config; + +/// Enforce evaluation-specific system settings overrides. +pub(crate) async fn enforce_system_settings( + db: &SurrealDbClient, + mut settings: SystemSettings, + provider_dimension: usize, + config: &Config, +) -> Result { + let mut updated_settings = settings.clone(); + let mut needs_settings_update = false; + + if provider_dimension != settings.embedding_dimensions as usize { + updated_settings.embedding_dimensions = provider_dimension as u32; + needs_settings_update = true; + } + if let Some(query_override) = config.query_model.as_deref() { + if settings.query_model != query_override { + info!( + model = query_override, + "Overriding system query model for this run" + ); + updated_settings.query_model = query_override.to_string(); + needs_settings_update = true; + } + } + if needs_settings_update { + settings = SystemSettings::update(db, updated_settings) + .await + .context("updating system settings overrides")?; + } + Ok(settings) +} + +/// Load existing system settings or initialize them via migrations. +pub(crate) async fn load_or_init_system_settings( + db: &SurrealDbClient, + _dimension: usize, +) -> Result<(SystemSettings, bool)> { + match SystemSettings::get_current(db).await { + Ok(settings) => Ok((settings, false)), + Err(AppError::NotFound(_)) => { + info!("System settings missing; applying database migrations for namespace"); + db.apply_migrations() + .await + .context("applying database migrations after missing system settings")?; + let settings = SystemSettings::get_current(db) + .await + .context("loading system settings after migrations")?; + Ok((settings, true)) + } + Err(err) => Err(err).context("loading system settings"), + } +} diff --git a/eval/src/slices.rs b/evaluations/src/slice.rs similarity index 97% rename from eval/src/slices.rs rename to evaluations/src/slice.rs index 014f82a..e21f5ce 100644 --- a/eval/src/slices.rs +++ b/evaluations/src/slice.rs @@ -1214,3 +1214,30 @@ mod tests { Ok(()) } } + +// MARK: - Config integration (merged from slice.rs) + +use crate::args::Config; + +impl<'a> From<&'a Config> for SliceConfig<'a> { + fn from(config: &'a Config) -> Self { + slice_config_with_limit(config, None) + } +} + +pub fn slice_config_with_limit<'a>( + config: &'a Config, + limit_override: Option, +) -> SliceConfig<'a> { + SliceConfig { + cache_dir: config.cache_dir.as_path(), + force_convert: config.force_convert, + explicit_slice: config.slice.as_deref(), + limit: limit_override.or(config.limit), + corpus_limit: config.corpus_limit, + slice_seed: config.slice_seed, + llm_mode: config.llm_mode, + negative_multiplier: config.negative_multiplier, + require_verified_chunks: config.retrieval.require_verified_chunks, + } +} diff --git a/eval/src/snapshot.rs b/evaluations/src/snapshot.rs similarity index 100% rename from eval/src/snapshot.rs rename to evaluations/src/snapshot.rs diff --git a/eval/src/eval/types.rs b/evaluations/src/types.rs similarity index 100% rename from eval/src/eval/types.rs rename to evaluations/src/types.rs