use std::{ env, path::{Path, PathBuf}, }; use anyhow::{anyhow, Context, Result}; use clap::{Args, Parser, ValueEnum}; use retrieval_pipeline::RetrievalStrategy; use crate::datasets::DatasetKind; fn workspace_root() -> PathBuf { let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); manifest_dir.parent().unwrap_or(&manifest_dir).to_path_buf() } fn default_report_dir() -> PathBuf { workspace_root().join("eval/reports") } fn default_cache_dir() -> PathBuf { workspace_root().join("eval/cache") } fn default_ingestion_cache_dir() -> PathBuf { workspace_root().join("eval/cache/ingested") } pub const DEFAULT_SLICE_SEED: u64 = 0x5eed_2025; #[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] #[value(rename_all = "lowercase")] pub enum EmbeddingBackend { Hashed, FastEmbed, } impl Default for EmbeddingBackend { fn default() -> Self { Self::FastEmbed } } impl std::fmt::Display for EmbeddingBackend { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Hashed => write!(f, "hashed"), Self::FastEmbed => write!(f, "fastembed"), } } } #[derive(Debug, Clone, Args)] pub struct RetrievalSettings { /// Override chunk vector candidate cap #[arg(long)] pub chunk_vector_take: Option, /// Override chunk FTS candidate cap #[arg(long)] pub chunk_fts_take: Option, /// Override average characters per token used for budgeting #[arg(long)] pub chunk_avg_chars_per_token: Option, /// Override maximum chunks attached per entity #[arg(long)] pub max_chunks_per_entity: Option, /// Enable the FastEmbed reranking stage #[arg(long = "rerank", action = clap::ArgAction::SetTrue, default_value_t = false)] pub rerank: bool, /// Reranking engine pool size / parallelism #[arg(long, default_value_t = 4)] pub rerank_pool_size: usize, /// Keep top-N entities after reranking #[arg(long, default_value_t = 10)] pub rerank_keep_top: usize, /// Cap the number of chunks returned by retrieval (revised strategy) #[arg(long, default_value_t = 5)] pub chunk_result_cap: usize, /// Reciprocal rank fusion k value for revised chunk merging #[arg(long)] pub chunk_rrf_k: Option, /// Weight for vector ranks in revised RRF #[arg(long)] pub chunk_rrf_vector_weight: Option, /// Weight for chunk FTS ranks in revised RRF #[arg(long)] pub chunk_rrf_fts_weight: Option, /// Include vector ranks in revised RRF (default: true) #[arg(long)] pub chunk_rrf_use_vector: Option, /// Include chunk FTS ranks in revised RRF (default: true) #[arg(long)] pub chunk_rrf_use_fts: Option, /// Require verified chunks (disable with --llm-mode) #[arg(skip = true)] pub require_verified_chunks: bool, /// Select the retrieval pipeline strategy #[arg(long, default_value_t = RetrievalStrategy::Initial)] pub strategy: RetrievalStrategy, } impl Default for RetrievalSettings { fn default() -> Self { Self { chunk_vector_take: None, chunk_fts_take: None, chunk_avg_chars_per_token: None, max_chunks_per_entity: None, rerank: false, rerank_pool_size: 4, rerank_keep_top: 10, chunk_result_cap: 5, chunk_rrf_k: None, chunk_rrf_vector_weight: None, chunk_rrf_fts_weight: None, chunk_rrf_use_vector: None, chunk_rrf_use_fts: None, require_verified_chunks: true, strategy: RetrievalStrategy::Initial, } } } #[derive(Parser, Debug, Clone)] #[command(author, version, about, long_about = None)] pub struct Config { /// Convert the selected dataset and exit #[arg(long)] pub convert_only: bool, /// Regenerate the converted dataset even if it already exists #[arg(long, alias = "refresh")] pub force_convert: bool, /// Dataset to evaluate #[arg(long, default_value_t = DatasetKind::default())] pub dataset: DatasetKind, /// Enable LLM-assisted evaluation features (includes unanswerable cases) #[arg(long)] pub llm_mode: bool, /// Cap the slice corpus size (positives + negatives) #[arg(long)] pub corpus_limit: Option, /// Path to the raw dataset (defaults per dataset) #[arg(long)] pub raw: Option, /// Path to write/read the converted dataset (defaults per dataset) #[arg(long)] pub converted: Option, /// Directory to write evaluation reports #[arg(long, default_value_os_t = default_report_dir())] pub report_dir: PathBuf, /// Precision@k cutoff #[arg(long, default_value_t = 5)] pub k: usize, /// Limit the number of questions evaluated (0 = all) #[arg(long = "limit", default_value_t = 200)] pub limit_arg: usize, /// Number of mismatches to surface in the Markdown summary #[arg(long, default_value_t = 5)] pub sample: usize, /// Disable context cropping when converting datasets (ingest entire documents) #[arg(long)] pub full_context: bool, #[command(flatten)] pub retrieval: RetrievalSettings, /// Concurrency level #[arg(long, default_value_t = 1)] pub concurrency: usize, /// Embedding backend #[arg(long, default_value_t = EmbeddingBackend::FastEmbed)] pub embedding_backend: EmbeddingBackend, /// FastEmbed model code #[arg(long)] pub embedding_model: Option, /// Directory for embedding caches #[arg(long, default_value_os_t = default_cache_dir())] pub cache_dir: PathBuf, /// Directory for ingestion corpora caches #[arg(long, default_value_os_t = default_ingestion_cache_dir())] pub ingestion_cache_dir: PathBuf, /// Minimum tokens per chunk for ingestion #[arg(long, default_value_t = 256)] pub ingest_chunk_min_tokens: usize, /// Maximum tokens per chunk for ingestion #[arg(long, default_value_t = 512)] pub ingest_chunk_max_tokens: usize, /// Overlap between chunks during ingestion (tokens) #[arg(long, default_value_t = 50)] pub ingest_chunk_overlap_tokens: usize, /// Run ingestion in chunk-only mode (skip analyzer/graph generation) #[arg(long)] pub ingest_chunks_only: bool, /// Number of paragraphs to ingest concurrently #[arg(long, default_value_t = 10)] pub ingestion_batch_size: usize, /// Maximum retries for ingestion failures per paragraph #[arg(long, default_value_t = 3)] pub ingestion_max_retries: usize, /// Recompute embeddings for cached corpora without re-running ingestion #[arg(long, alias = "refresh-embeddings")] pub refresh_embeddings_only: bool, /// Include entity descriptions and categories in JSON reports #[arg(long)] pub detailed_report: bool, /// Use a cached dataset slice by id or path #[arg(long)] pub slice: Option, /// Ignore cached corpus state and rebuild the slice's SurrealDB corpus #[arg(long)] pub reseed_slice: bool, /// Slice seed #[arg(skip = DEFAULT_SLICE_SEED)] pub slice_seed: u64, /// Grow the slice ledger to contain at least this many answerable cases, then exit #[arg(long)] pub slice_grow: Option, /// Evaluate questions starting at this offset within the slice #[arg(long, default_value_t = 0)] pub slice_offset: usize, /// Delete cached paragraph shards before rebuilding the ingestion corpus #[arg(long)] pub slice_reset_ingestion: bool, /// Target negative-to-positive paragraph ratio for slice growth #[arg(long, default_value_t = crate::slices::DEFAULT_NEGATIVE_MULTIPLIER)] pub negative_multiplier: f32, /// Annotate the run; label is stored in JSON/Markdown reports #[arg(long)] pub label: Option, /// Write per-query chunk diagnostics JSONL to the provided path #[arg(long, alias = "chunk-diagnostics")] pub chunk_diagnostics_path: Option, /// Inspect an ingestion cache question and exit #[arg(long)] pub inspect_question: Option, /// Path to an ingestion cache manifest JSON for inspection mode #[arg(long)] pub inspect_manifest: Option, /// Override the SurrealDB system settings query model #[arg(long)] pub query_model: Option, /// Write structured performance telemetry JSON to the provided path #[arg(long)] pub perf_log_json: Option, /// Directory that receives timestamped perf JSON copies #[arg(long)] pub perf_log_dir: Option, /// Print per-stage performance timings to stdout after the run #[arg(long, alias = "perf-log")] pub perf_log_console: bool, /// SurrealDB server endpoint #[arg(long, default_value = "ws://127.0.0.1:8000", env = "EVAL_DB_ENDPOINT")] pub db_endpoint: String, /// SurrealDB root username #[arg(long, default_value = "root_user", env = "EVAL_DB_USERNAME")] pub db_username: String, /// SurrealDB root password #[arg(long, default_value = "root_password", env = "EVAL_DB_PASSWORD")] pub db_password: String, /// Override the namespace used on the SurrealDB server #[arg(long, env = "EVAL_DB_NAMESPACE")] pub db_namespace: Option, /// Override the database used on the SurrealDB server #[arg(long, env = "EVAL_DB_DATABASE")] pub db_database: Option, /// Path to inspect DB state #[arg(long)] pub inspect_db_state: Option, // Computed fields (not arguments) #[arg(skip)] pub raw_dataset_path: PathBuf, #[arg(skip)] pub converted_dataset_path: PathBuf, #[arg(skip)] pub limit: Option, #[arg(skip)] pub summary_sample: usize, } impl Config { pub fn context_token_limit(&self) -> Option { None } pub fn finalize(&mut self) -> Result<()> { // Handle dataset paths if let Some(raw) = &self.raw { self.raw_dataset_path = raw.clone(); } else { self.raw_dataset_path = self.dataset.default_raw_path(); } if let Some(converted) = &self.converted { self.converted_dataset_path = converted.clone(); } else { self.converted_dataset_path = self.dataset.default_converted_path(); } // Handle limit if self.limit_arg == 0 { self.limit = None; } else { self.limit = Some(self.limit_arg); } // Handle sample self.summary_sample = self.sample.max(1); // Handle retrieval settings if self.llm_mode { self.retrieval.require_verified_chunks = false; } else { self.retrieval.require_verified_chunks = true; } if self.dataset == DatasetKind::Beir { self.negative_multiplier = 9.0; } // Validations if self.ingest_chunk_min_tokens == 0 || self.ingest_chunk_min_tokens >= self.ingest_chunk_max_tokens { return Err(anyhow!( "--ingest-chunk-min-tokens must be greater than zero and less than --ingest-chunk-max-tokens (got {} >= {})", self.ingest_chunk_min_tokens, self.ingest_chunk_max_tokens )); } if self.ingest_chunk_overlap_tokens >= self.ingest_chunk_min_tokens { return Err(anyhow!( "--ingest-chunk-overlap-tokens ({}) must be less than --ingest-chunk-min-tokens ({})", self.ingest_chunk_overlap_tokens, self.ingest_chunk_min_tokens )); } if self.retrieval.rerank && self.retrieval.rerank_pool_size == 0 { return Err(anyhow!( "--rerank-pool must be greater than zero when reranking is enabled" )); } if let Some(k) = self.retrieval.chunk_rrf_k { if k <= 0.0 || !k.is_finite() { return Err(anyhow!( "--chunk-rrf-k must be a positive, finite number (got {k})" )); } } if let Some(weight) = self.retrieval.chunk_rrf_vector_weight { if weight < 0.0 || !weight.is_finite() { return Err(anyhow!( "--chunk-rrf-vector-weight must be a non-negative, finite number (got {weight})" )); } } if let Some(weight) = self.retrieval.chunk_rrf_fts_weight { if weight < 0.0 || !weight.is_finite() { return Err(anyhow!( "--chunk-rrf-fts-weight must be a non-negative, finite number (got {weight})" )); } } if self.concurrency == 0 { return Err(anyhow!("--concurrency must be greater than zero")); } if self.embedding_backend == EmbeddingBackend::Hashed && self.embedding_model.is_some() { return Err(anyhow!( "--embedding-model cannot be used with the 'hashed' embedding backend" )); } if let Some(query_model) = &self.query_model { if query_model.trim().is_empty() { return Err(anyhow!("--query-model requires a non-empty model name")); } } if let Some(grow) = self.slice_grow { if grow == 0 { return Err(anyhow!("--slice-grow must be greater than zero")); } } if self.negative_multiplier <= 0.0 || !self.negative_multiplier.is_finite() { return Err(anyhow!( "--negative-multiplier must be a positive finite number" )); } // Handle corpus limit logic if let Some(limit) = self.limit { if let Some(corpus_limit) = self.corpus_limit { if corpus_limit < limit { self.corpus_limit = Some(limit); } } else { let default_multiplier = 10usize; let mut computed = limit.saturating_mul(default_multiplier); if computed < limit { computed = limit; } let max_cap = 1_000usize; if computed > max_cap { computed = max_cap; } self.corpus_limit = Some(computed); } } // Handle perf log dir env var fallback if self.perf_log_dir.is_none() { if let Ok(dir) = env::var("EVAL_PERF_LOG_DIR") { if !dir.trim().is_empty() { self.perf_log_dir = Some(PathBuf::from(dir)); } } } Ok(()) } } pub struct ParsedArgs { pub config: Config, } pub fn parse() -> Result { let mut config = Config::parse(); config.finalize()?; Ok(ParsedArgs { config }) } pub fn ensure_parent(path: &Path) -> Result<()> { if let Some(parent) = path.parent() { std::fs::create_dir_all(parent) .with_context(|| format!("creating parent directory for {}", path.display()))?; } Ok(()) }