diff --git a/evaluations/src/args.rs b/evaluations/src/args.rs index 8bf6fea..2e385ff 100644 --- a/evaluations/src/args.rs +++ b/evaluations/src/args.rs @@ -63,7 +63,7 @@ pub struct RetrievalSettings { #[arg(long)] pub max_chunks_per_entity: Option, - /// Enable the FastEmbed reranking stage + /// Enable the `FastEmbed` reranking stage #[arg(long = "rerank", action = clap::ArgAction::SetTrue, default_value_t = false)] pub rerank: bool, @@ -171,23 +171,23 @@ pub struct IngestConfig { #[derive(Debug, Clone, Args)] pub struct DatabaseArgs { - /// SurrealDB server endpoint + /// `SurrealDB` server endpoint #[arg(long, default_value = "ws://127.0.0.1:8000", env = "EVAL_DB_ENDPOINT")] pub db_endpoint: String, - /// SurrealDB root username + /// `SurrealDB` root username #[arg(long, default_value = "root_user", env = "EVAL_DB_USERNAME")] pub db_username: String, - /// SurrealDB root password + /// `SurrealDB` root password #[arg(long, default_value = "root_password", env = "EVAL_DB_PASSWORD")] pub db_password: String, - /// Override the namespace used on the SurrealDB server + /// Override the namespace used on the `SurrealDB` server #[arg(long, env = "EVAL_DB_NAMESPACE")] pub db_namespace: Option, - /// Override the database used on the SurrealDB server + /// Override the database used on the `SurrealDB` server #[arg(long, env = "EVAL_DB_DATABASE")] pub db_database: Option, @@ -258,7 +258,7 @@ pub struct Config { #[arg(long, default_value_t = EmbeddingBackend::FastEmbed)] pub embedding_backend: EmbeddingBackend, - /// FastEmbed model code + /// `FastEmbed` model code #[arg(long)] pub embedding_model: Option, @@ -277,7 +277,7 @@ pub struct Config { #[arg(long)] pub slice: Option, - /// Ignore cached corpus state and rebuild the slice's SurrealDB corpus + /// Ignore cached corpus state and rebuild the slice's `SurrealDB` corpus #[arg(long)] pub reseed_slice: bool, @@ -313,7 +313,7 @@ pub struct Config { #[arg(long)] pub inspect_manifest: Option, - /// Override the SurrealDB system settings query model + /// Override the `SurrealDB` system settings query model #[arg(long)] pub query_model: Option, diff --git a/evaluations/src/corpus/orchestrator.rs b/evaluations/src/corpus/orchestrator.rs index 38a9e44..9a56b19 100644 --- a/evaluations/src/corpus/orchestrator.rs +++ b/evaluations/src/corpus/orchestrator.rs @@ -610,7 +610,7 @@ pub fn build_ingestion_fingerprint( checksum: &str, ingestion_config: &IngestionConfig, ) -> String { - let config_repr = format!("{:?}", ingestion_config); + let config_repr = format!("{ingestion_config:?}"); let mut hasher = Sha256::new(); hasher.update(config_repr.as_bytes()); let config_hash = format!("{:x}", hasher.finalize()); diff --git a/evaluations/src/corpus/store.rs b/evaluations/src/corpus/store.rs index d2d841b..c415472 100644 --- a/evaluations/src/corpus/store.rs +++ b/evaluations/src/corpus/store.rs @@ -576,13 +576,13 @@ fn validate_answers( } } - if !found_any { + if found_any { + Ok(matches.into_iter().collect()) + } else { Err(anyhow!( "expected answer for question '{}' was not found in ingested content", question.id )) - } else { - Ok(matches.into_iter().collect()) } } diff --git a/evaluations/src/datasets/beir.rs b/evaluations/src/datasets/beir.rs index d150b65..49c1714 100644 --- a/evaluations/src/datasets/beir.rs +++ b/evaluations/src/datasets/beir.rs @@ -59,7 +59,7 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result Result query, - None => { - missing_queries += 1; - warn!(query_id = %query_id, "Skipping qrels entry for missing query"); - continue; - } + let query = if let Some(query) = queries.get(&query_id) { query } else { + missing_queries += 1; + warn!(query_id = %query_id, "Skipping qrels entry for missing query"); + continue; }; let best = match select_best_doc(&entries) { @@ -90,31 +87,25 @@ pub fn convert_beir(raw_dir: &Path, dataset: DatasetKind) -> Result continue, }; - let paragraph_slot = match paragraph_index.get(&best.doc_id) { - Some(slot) => *slot, - None => { - missing_docs += 1; - warn!( - query_id = %query_id, - doc_id = %best.doc_id, - "Skipping qrels entry referencing missing corpus document" - ); - continue; - } + let paragraph_slot = if let Some(slot) = paragraph_index.get(&best.doc_id) { *slot } else { + missing_docs += 1; + warn!( + query_id = %query_id, + doc_id = %best.doc_id, + "Skipping qrels entry referencing missing corpus document" + ); + continue; }; let answer = answer_snippet(¶graphs[paragraph_slot].context); - let answers = match answer { - Some(snippet) => vec![snippet], - None => { - skipped_answers += 1; - warn!( - query_id = %query_id, - doc_id = %best.doc_id, - "Skipping query because no non-empty answer snippet could be derived" - ); - continue; - } + let answers = if let Some(snippet) = answer { vec![snippet] } else { + skipped_answers += 1; + warn!( + query_id = %query_id, + doc_id = %best.doc_id, + "Skipping query because no non-empty answer snippet could be derived" + ); + continue; }; let question_id = format!("{}-{query_id}", dataset.source_prefix()); diff --git a/evaluations/src/datasets/mod.rs b/evaluations/src/datasets/mod.rs index 353b567..62f8b72 100644 --- a/evaluations/src/datasets/mod.rs +++ b/evaluations/src/datasets/mod.rs @@ -99,9 +99,9 @@ struct ManifestSlice { impl DatasetCatalog { pub fn load() -> Result { let manifest_raw = fs::read_to_string(MANIFEST_PATH) - .with_context(|| format!("reading dataset manifest at {}", MANIFEST_PATH))?; + .with_context(|| format!("reading dataset manifest at {MANIFEST_PATH}"))?; let manifest: ManifestFile = serde_yaml::from_str(&manifest_raw) - .with_context(|| format!("parsing dataset manifest at {}", MANIFEST_PATH))?; + .with_context(|| format!("parsing dataset manifest at {MANIFEST_PATH}"))?; let root = Path::new(env!("CARGO_MANIFEST_DIR")); let mut datasets = BTreeMap::new(); @@ -351,15 +351,11 @@ impl DatasetKind { } pub fn default_raw_path(self) -> PathBuf { - dataset_entry_for_kind(self) - .map(|entry| entry.raw_path.clone()) - .unwrap_or_else(|err| panic!("dataset manifest missing entry for {:?}: {err}", self)) + dataset_entry_for_kind(self).map_or_else(|err| panic!("dataset manifest missing entry for {self:?}: {err}"), |entry| entry.raw_path.clone()) } pub fn default_converted_path(self) -> PathBuf { - dataset_entry_for_kind(self) - .map(|entry| entry.converted_path.clone()) - .unwrap_or_else(|err| panic!("dataset manifest missing entry for {:?}: {err}", self)) + dataset_entry_for_kind(self).map_or_else(|err| panic!("dataset manifest missing entry for {self:?}: {err}"), |entry| entry.converted_path.clone()) } } diff --git a/evaluations/src/db_helpers.rs b/evaluations/src/db_helpers.rs index 47a426e..02e7bc7 100644 --- a/evaluations/src/db_helpers.rs +++ b/evaluations/src/db_helpers.rs @@ -18,11 +18,9 @@ pub async fn recreate_indexes(db: &SurrealDbClient, dimension: usize) -> Result< pub async fn reset_namespace(db: &SurrealDbClient, namespace: &str, database: &str) -> Result<()> { let query = format!( - "REMOVE NAMESPACE {ns}; - DEFINE NAMESPACE {ns}; - DEFINE DATABASE {db};", - ns = namespace, - db = database + "REMOVE NAMESPACE {namespace}; + DEFINE NAMESPACE {namespace}; + DEFINE DATABASE {database};" ); db.client .query(query) diff --git a/evaluations/src/inspection.rs b/evaluations/src/inspection.rs index 6e5d086..ba71f0b 100644 --- a/evaluations/src/inspection.rs +++ b/evaluations/src/inspection.rs @@ -49,7 +49,7 @@ pub async fn inspect_question(config: &Config) -> Result<()> { chunk_id, entry.paragraph_title, entry.snippet ); } else { - println!(" - {} (missing from manifest)", chunk_id); + println!(" - {chunk_id} (missing from manifest)"); missing_in_manifest.push(chunk_id.clone()); } } @@ -74,18 +74,15 @@ pub async fn inspect_question(config: &Config) -> Result<()> { match connect_eval_db(config, ns, db_name).await { Ok(db) => match verify_chunks_in_db(&db, &question.matching_chunk_ids).await? { MissingChunks::None => println!( - "All matching_chunk_ids exist in namespace '{}', database '{}'", - ns, db_name + "All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'" ), MissingChunks::Missing(list) => println!( - "Missing chunks in namespace '{}', database '{}': {:?}", - ns, db_name, list + "Missing chunks in namespace '{ns}', database '{db_name}': {list:?}" ), }, Err(err) => { println!( - "Failed to connect to SurrealDB namespace '{}' / database '{}': {err}", - ns, db_name + "Failed to connect to SurrealDB namespace '{ns}' / database '{db_name}': {err}" ); } } @@ -170,7 +167,7 @@ async fn verify_chunks_in_db(db: &SurrealDbClient, chunk_ids: &[String]) -> Resu let exists = db .get_item::(chunk_id) .await - .with_context(|| format!("fetching text_chunk {}", chunk_id))? + .with_context(|| format!("fetching text_chunk {chunk_id}"))? .is_some(); if !exists { missing.push(chunk_id.clone()); diff --git a/evaluations/src/main.rs b/evaluations/src/main.rs index c490a79..eb6a1d8 100644 --- a/evaluations/src/main.rs +++ b/evaluations/src/main.rs @@ -21,7 +21,7 @@ use tokio::runtime::Builder; use tracing::info; use tracing_subscriber::{fmt, EnvFilter}; -/// Configure SurrealDB environment variables for optimal performance +/// Configure `SurrealDB` environment variables for optimal performance fn configure_surrealdb_performance(cpu_count: usize) { // Set environment variables only if they're not already set let indexing_batch_size = std::env::var("SURREAL_INDEXING_BATCH_SIZE") diff --git a/evaluations/src/namespace.rs b/evaluations/src/namespace.rs index cf0c208..a696a24 100644 --- a/evaluations/src/namespace.rs +++ b/evaluations/src/namespace.rs @@ -86,7 +86,7 @@ pub(crate) async fn namespace_has_corpus(db: &SurrealDbClient) -> Result { .await .context("checking namespace corpus state")?; let rows: Vec = response.take(0).unwrap_or_default(); - Ok(rows.first().map(|row| row.count).unwrap_or(0) > 0) + Ok(rows.first().map_or(0, |row| row.count) > 0) } /// Determine if we can reuse an existing namespace based on cached state. @@ -101,12 +101,9 @@ pub(crate) async fn can_reuse_namespace( ingestion_fingerprint: &str, slice_case_count: usize, ) -> Result { - let state = match descriptor.load_db_state().await? { - Some(state) => state, - None => { - info!("No namespace state recorded; reseeding corpus from cached shards"); - return Ok(false); - } + let state = if let Some(state) = descriptor.load_db_state().await? { state } else { + info!("No namespace state recorded; reseeding corpus from cached shards"); + return Ok(false); }; if state.slice_case_count != slice_case_count { @@ -192,10 +189,10 @@ fn sanitize_identifier(input: &str) -> String { pub(crate) fn default_namespace(dataset_id: &str, limit: Option) -> String { let dataset_component = sanitize_identifier(dataset_id); let limit_component = match limit { - Some(value) if value > 0 => format!("limit{}", value), + Some(value) if value > 0 => format!("limit{value}"), _ => "all".to_string(), }; - format!("eval_{}_{}", dataset_component, limit_component) + format!("eval_{dataset_component}_{limit_component}") } /// Generate the default database name for evaluations. diff --git a/evaluations/src/perf.rs b/evaluations/src/perf.rs index c5e4573..21bd66c 100644 --- a/evaluations/src/perf.rs +++ b/evaluations/src/perf.rs @@ -37,7 +37,7 @@ pub fn mirror_perf_outputs( .and_then(|os| os.to_str()) .unwrap_or("dataset"); let timestamp = summary.generated_at.format("%Y%m%dT%H%M%S").to_string(); - let filename = format!("perf-{}-{}.json", dataset_slug, timestamp); + let filename = format!("perf-{dataset_slug}-{timestamp}.json"); let path = dir.join(filename); let blob = serde_json::to_vec_pretty(record).context("serialising perf log JSON")?; fs::write(&path, blob) @@ -87,9 +87,7 @@ pub fn print_console_summary(record: &EvaluationReport) { } fn format_duration(value: Option) -> String { - value - .map(|ms| format!("{ms}ms")) - .unwrap_or_else(|| "-".to_string()) + value.map_or_else(|| "-".to_string(), |ms| format!("{ms}ms")) } #[cfg(test)] diff --git a/evaluations/src/pipeline/stages/finalize.rs b/evaluations/src/pipeline/stages/finalize.rs index 17e7d40..e706346 100644 --- a/evaluations/src/pipeline/stages/finalize.rs +++ b/evaluations/src/pipeline/stages/finalize.rs @@ -38,9 +38,9 @@ pub(crate) async fn finalize( } info!( - total_cases = ctx.summary.as_ref().map(|s| s.total_cases).unwrap_or(0), - correct = ctx.summary.as_ref().map(|s| s.correct).unwrap_or(0), - precision = ctx.summary.as_ref().map(|s| s.precision).unwrap_or(0.0), + total_cases = ctx.summary.as_ref().map_or(0, |s| s.total_cases), + correct = ctx.summary.as_ref().map_or(0, |s| s.correct), + precision = ctx.summary.as_ref().map_or(0.0, |s| s.precision), dataset = ctx.dataset().metadata.id.as_str(), "Evaluation complete" ); diff --git a/evaluations/src/pipeline/stages/prepare_corpus.rs b/evaluations/src/pipeline/stages/prepare_corpus.rs index 2e6e369..fdf7f5e 100644 --- a/evaluations/src/pipeline/stages/prepare_corpus.rs +++ b/evaluations/src/pipeline/stages/prepare_corpus.rs @@ -82,12 +82,11 @@ pub(crate) async fn prepare_corpus( return machine .prepare_corpus() .map_err(|(_, guard)| map_guard_error("prepare_corpus", guard)); - } else { - info!( - cache = %base_dir.display(), - "Namespace reusable but cached manifest missing; regenerating corpus" - ); } + info!( + cache = %base_dir.display(), + "Namespace reusable but cached manifest missing; regenerating corpus" + ); } } diff --git a/evaluations/src/pipeline/stages/run_queries.rs b/evaluations/src/pipeline/stages/run_queries.rs index 04d5d88..b3e5887 100644 --- a/evaluations/src/pipeline/stages/run_queries.rs +++ b/evaluations/src/pipeline/stages/run_queries.rs @@ -179,7 +179,7 @@ pub(crate) async fn run_queries( debug!(question_id = %question_id, "Evaluating query"); let query_embedding = embedding_provider.embed(&question).await.with_context(|| { - format!("generating embedding for question {}", question_id) + format!("generating embedding for question {question_id}") })?; let reranker = match rerank_pool.as_ref() { Some(pool) => pool.checkout().await, @@ -201,7 +201,7 @@ pub(crate) async fn run_queries( query_embedding, ) .await - .with_context(|| format!("running pipeline for question {}", question_id))?; + .with_context(|| format!("running pipeline for question {question_id}"))?; (outcome.results, outcome.diagnostics, outcome.stage_timings) } else { let outcome = pipeline::run_pipeline_with_embedding_with_metrics( @@ -209,7 +209,7 @@ pub(crate) async fn run_queries( query_embedding, ) .await - .with_context(|| format!("running pipeline for question {}", question_id))?; + .with_context(|| format!("running pipeline for question {question_id}"))?; (outcome.results, None, outcome.stage_timings) }; let query_latency = query_start.elapsed().as_millis(); @@ -220,7 +220,7 @@ pub(crate) async fn run_queries( let answers_lower: Vec = answers.iter().map(|ans| ans.to_ascii_lowercase()).collect(); let expected_chunk_ids_set: HashSet<&str> = - expected_chunk_ids.iter().map(|id| id.as_str()).collect(); + expected_chunk_ids.iter().map(std::string::String::as_str).collect(); let chunk_id_required = has_verified_chunks; let mut entity_hit = false; let mut chunk_text_hit = false; @@ -408,7 +408,7 @@ fn calculate_ndcg(retrieved: &[RetrievedSummary], k: usize) -> f64 { let mut idcg = 0.0; for i in 0..relevant_count { let rel = 1.0; - idcg += rel / (i as f64 + 2.0).log2(); + idcg += rel / (f64::from(i) + 2.0).log2(); } if idcg == 0.0 { diff --git a/evaluations/src/report.rs b/evaluations/src/report.rs index abc7c55..ef637f1 100644 --- a/evaluations/src/report.rs +++ b/evaluations/src/report.rs @@ -455,7 +455,7 @@ fn render_markdown(report: &EvaluationReport) -> String { } else { report.dataset.embedding_backend.clone() }; - md.push_str(&format!("| Embedding | {} |\\n", embedding_label)); + md.push_str(&format!("| Embedding | {embedding_label} |\\n")); md.push_str(&format!( "| Embedding Dim | {} |\\n", report.dataset.embedding_dimension @@ -520,9 +520,7 @@ fn render_markdown(report: &EvaluationReport) -> String { if report.retrieval.rerank_enabled { let pool = report .retrieval - .rerank_pool_size - .map(|size| size.to_string()) - .unwrap_or_else(|| "?".into()); + .rerank_pool_size.map_or_else(|| "?".into(), |size| size.to_string()); md.push_str(&format!( "| Rerank | enabled (pool {pool}, keep top {}) |\\n", report.retrieval.rerank_keep_top @@ -550,7 +548,7 @@ fn render_markdown(report: &EvaluationReport) -> String { report.performance.ingestion_ms )); if let Some(seed) = report.performance.namespace_seed_ms { - md.push_str(&format!("| Namespace Seed | {} ms |\\n", seed)); + md.push_str(&format!("| Namespace Seed | {seed} ms |\\n")); } md.push_str(&format!( "| Namespace State | {} |\\n", @@ -672,9 +670,7 @@ fn render_markdown(report: &EvaluationReport) -> String { for case in &report.llm_cases { let retrieved = render_retrieved(&case.retrieved); let rank = case - .match_rank - .map(|rank| rank.to_string()) - .unwrap_or_else(|| "-".into()); + .match_rank.map_or_else(|| "-".into(), |rank| rank.to_string()); md.push_str(&format!( "| `{}` | {} | {} | {} |\\n", case.question_id, diff --git a/evaluations/src/slice.rs b/evaluations/src/slice.rs index 270e7f1..db8bee5 100644 --- a/evaluations/src/slice.rs +++ b/evaluations/src/slice.rs @@ -124,9 +124,9 @@ pub struct SliceWindow<'a> { positive_paragraph_ids: Vec, } -impl<'a> SliceWindow<'a> { +impl SliceWindow<'_> { pub fn positive_ids(&self) -> impl Iterator { - self.positive_paragraph_ids.iter().map(|id| id.as_str()) + self.positive_paragraph_ids.iter().map(std::string::String::as_str) } } @@ -312,15 +312,13 @@ pub fn resolve_slice<'a>( if manifest .as_ref() - .map(|manifest| manifest.version != SLICE_VERSION) - .unwrap_or(false) + .is_some_and(|manifest| manifest.version != SLICE_VERSION) { warn!( slice = manifest .as_ref() - .map(|m| m.slice_id.as_str()) - .unwrap_or("unknown"), - found = manifest.as_ref().map(|m| m.version).unwrap_or(0), + .map_or("unknown", |m| m.slice_id.as_str()), + found = manifest.as_ref().map_or(0, |m| m.version), expected = SLICE_VERSION, "Slice manifest version mismatch; regenerating" ); @@ -387,7 +385,7 @@ pub fn resolve_slice<'a>( ); } - let resolved = manifest_to_resolved(dataset, &index, manifest.clone(), path.clone())?; + let resolved = manifest_to_resolved(dataset, &index, manifest.clone(), path)?; Ok(resolved) } @@ -674,7 +672,7 @@ fn ordered_question_refs_beir( } } - if grouped.values().all(|entries| entries.is_empty()) { + if grouped.values().all(std::vec::Vec::is_empty) { return Err(anyhow!( "no eligible BEIR questions found; cannot build slice" )); @@ -710,7 +708,7 @@ fn ordered_question_refs_beir( let mut shortfall = 0usize; for prefix in &prefixes { - let available = grouped.get(prefix).map(|v| v.len()).unwrap_or(0); + let available = grouped.get(prefix).map_or(0, std::vec::Vec::len); let quota = *quotas.get(prefix).unwrap_or(&0); let take = quota.min(available); let missing = quota.saturating_sub(take); @@ -766,7 +764,7 @@ fn ordered_question_refs_beir( let mut output = Vec::with_capacity(total_selected); loop { let mut progressed = false; - for queue in queues.iter_mut() { + for queue in &mut queues { if let Some(item) = queue.pop_front() { output.push(item); progressed = true; @@ -1045,10 +1043,10 @@ impl<'a> From<&'a Config> for SliceConfig<'a> { } } -pub fn slice_config_with_limit<'a>( - config: &'a Config, +pub fn slice_config_with_limit( + config: &Config, limit_override: Option, -) -> SliceConfig<'a> { +) -> SliceConfig<'_> { SliceConfig { cache_dir: config.cache_dir.as_path(), force_convert: config.force_convert, diff --git a/evaluations/src/types.rs b/evaluations/src/types.rs index b01e169..51fe19f 100644 --- a/evaluations/src/types.rs +++ b/evaluations/src/types.rs @@ -300,7 +300,7 @@ fn normalize_for_match(input: &str) -> String { // to reduce false negatives from formatting or punctuation differences. let mut out = String::with_capacity(input.len()); let mut last_space = false; - for ch in input.nfkc().flat_map(|c| c.to_lowercase()) { + for ch in input.nfkc().flat_map(char::to_lowercase) { let is_space = ch.is_whitespace(); let is_punct = ch.is_ascii_punctuation() || matches!( @@ -371,7 +371,7 @@ pub fn build_stage_latency_breakdown(samples: &[PipelineStageTimings]) -> StageL } StageLatencyBreakdown { - embed: compute_latency_stats(&collect_stage(samples, |entry| entry.embed_ms())), + embed: compute_latency_stats(&collect_stage(samples, retrieval_pipeline::StageTimings::embed_ms)), collect_candidates: compute_latency_stats(&collect_stage(samples, |entry| { entry.collect_candidates_ms() })), @@ -381,8 +381,8 @@ pub fn build_stage_latency_breakdown(samples: &[PipelineStageTimings]) -> StageL chunk_attach: compute_latency_stats(&collect_stage(samples, |entry| { entry.chunk_attach_ms() })), - rerank: compute_latency_stats(&collect_stage(samples, |entry| entry.rerank_ms())), - assemble: compute_latency_stats(&collect_stage(samples, |entry| entry.assemble_ms())), + rerank: compute_latency_stats(&collect_stage(samples, retrieval_pipeline::StageTimings::rerank_ms)), + assemble: compute_latency_stats(&collect_stage(samples, retrieval_pipeline::StageTimings::assemble_ms)), } } @@ -402,7 +402,7 @@ pub fn build_case_diagnostics( candidates: &[EvaluationCandidate], pipeline_stats: Option, ) -> CaseDiagnostics { - let expected_set: HashSet<&str> = expected_chunk_ids.iter().map(|id| id.as_str()).collect(); + let expected_set: HashSet<&str> = expected_chunk_ids.iter().map(std::string::String::as_str).collect(); let mut seen_chunks: HashSet = HashSet::new(); let mut attached_chunk_ids = Vec::new(); let mut entity_diagnostics = Vec::new();