From 09e545816ef5a5c483d90c747df036aab388fc2b Mon Sep 17 00:00:00 2001 From: Per Stark Date: Sat, 20 Jun 2026 10:10:29 +0200 Subject: [PATCH] chore: git-hooks rustfmt and clippy --- CHANGELOG.md | 7 ++- devenv.lock | 28 +++++----- devenv.nix | 21 ++++++-- devenv.yaml | 3 ++ evaluations/src/cli/status.rs | 53 +++++++++---------- evaluations/src/corpus/orchestrator.rs | 5 +- evaluations/src/corpus/store.rs | 7 +-- evaluations/src/datasets/beir.rs | 4 +- evaluations/src/datasets/beir_mix.rs | 34 ++++++------ evaluations/src/datasets/checksum.rs | 9 ++-- evaluations/src/datasets/loader.rs | 4 +- evaluations/src/datasets/mod.rs | 4 +- evaluations/src/datasets/store.rs | 26 ++++----- evaluations/src/db/connect.rs | 3 +- evaluations/src/db/mod.rs | 2 +- evaluations/src/inspection.rs | 12 ++--- evaluations/src/main.rs | 14 ++--- evaluations/src/openai.rs | 5 +- evaluations/src/perf.rs | 5 +- evaluations/src/pipeline/mod.rs | 13 +---- .../src/pipeline/stages/prepare_slice.rs | 5 +- evaluations/src/slice/mod.rs | 19 +++---- ingestion-pipeline/src/pipeline/context.rs | 5 +- ingestion-pipeline/src/pipeline/mod.rs | 22 ++------ .../src/pipeline/test_support.rs | 10 ++-- ingestion-pipeline/src/pipeline/tests.rs | 24 ++++----- json-stream-parser/src/lib.rs | 2 +- main/src/main.rs | 7 +-- main/src/worker.rs | 4 +- 29 files changed, 156 insertions(+), 201 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd85bb6..776b2fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,10 +2,9 @@ ## Unreleased -- Refactor: deduplicated test database setup across common/src/storage/ types by routing remaining inline SurrealDbClient::memory() calls through shared setup_test_db(), prepare_text_chunk_test_db(), and prepare_knowledge_entity_test_db() helpers; removed redundant apply_migrations() calls after setup_test_db() and collapsed configure_embedding_dimension + redefine_hnsw_index triplication into prepare_**test_db helpers; extracted generic ensure_fts_index helper for FTS index bootstrap replacing duplicated per-table ensure**_fts_indexes helpers -- Refactor: split knowledge-graph.js monolith into focused functions (loadGraphData, buildSvg, createSimulation, drawLinks/Nodes/Labels, createHighlighting, createZoom, attachResize); fixed dead duplicate zoom instance -- Refactor: extracted rubberbanding scroll logic in design-polish.js into standalone attachRubberbanding helper; removed dead pullDistance state - +- Fix: added pre-commit hooks to further maintain code consistency. +- Refactor: deduplicated test database setup across common/src/storage/. +- Refactor: split knowledge-graph.js monolith into focused functions. - Evaluations: simplified crate layout — linear pipeline, sharded-only converted store, in-memory ingestion, `db/` and `cli/` modules; namespace reuse state in corpus manifest (removed `cache/snapshots/`); no legacy JSON/history compatibility (re-run `--warm` after upgrade) - Performance: ingestion skips per-task index rebuild; worker runs scheduled `REBUILD INDEX` (default every 24h via `index_rebuild_interval_secs`, `0` disables) - Performance: ingestion persists all artifacts in a single SurrealDB transaction per task (atomic replace by task id) diff --git a/devenv.lock b/devenv.lock index 9e61485..4bb9366 100644 --- a/devenv.lock +++ b/devenv.lock @@ -3,10 +3,10 @@ "devenv": { "locked": { "dir": "src/modules", - "lastModified": 1771066302, + "lastModified": 1781800860, "owner": "cachix", "repo": "devenv", - "rev": "1b355dec9bddbaddbe4966d6fc30d7aa3af8575b", + "rev": "d59d872d80876d9eeb3e214d3b088bc4a14a9c4f", "type": "github" }, "original": { @@ -22,10 +22,10 @@ "rust-analyzer-src": "rust-analyzer-src" }, "locked": { - "lastModified": 1771052630, + "lastModified": 1781779700, "owner": "nix-community", "repo": "fenix", - "rev": "d0555da98576b8611c25df0c208e51e9a182d95f", + "rev": "ad30e585c7a2917325943c2b19511f5a249eff53", "type": "github" }, "original": { @@ -58,10 +58,10 @@ ] }, "locked": { - "lastModified": 1770726378, + "lastModified": 1781733627, "owner": "cachix", "repo": "git-hooks.nix", - "rev": "5eaaedde414f6eb1aea8b8525c466dc37bba95ae", + "rev": "3bbec39bc90eadfa031e6f3b77272f3f60803e39", "type": "github" }, "original": { @@ -92,10 +92,10 @@ }, "nixpkgs": { "locked": { - "lastModified": 1771008912, + "lastModified": 1781577229, "owner": "nixos", "repo": "nixpkgs", - "rev": "a82ccc39b39b621151d6732718e3e250109076fa", + "rev": "567a49d1913ce81ac6e9582e3553dd90a955875f", "type": "github" }, "original": { @@ -107,10 +107,10 @@ }, "nixpkgs_2": { "locked": { - "lastModified": 1770843696, + "lastModified": 1781607440, "owner": "nixos", "repo": "nixpkgs", - "rev": "2343bbb58f99267223bc2aac4fc9ea301a155a16", + "rev": "3e41b24abd260e8f71dbe2f5737d24122f972158", "type": "github" }, "original": { @@ -135,10 +135,10 @@ "rust-analyzer-src": { "flake": false, "locked": { - "lastModified": 1771007332, + "lastModified": 1781714865, "owner": "rust-lang", "repo": "rust-analyzer", - "rev": "bbc84d335fbbd9b3099d3e40c7469ee57dbd1873", + "rev": "abb1301c3c14a40645bb2588b1cc858fe374b527", "type": "github" }, "original": { @@ -155,10 +155,10 @@ ] }, "locked": { - "lastModified": 1771038269, + "lastModified": 1781850613, "owner": "oxalica", "repo": "rust-overlay", - "rev": "d7a86c8a4df49002446737603a3e0d7ef91a9637", + "rev": "4baecb43a008cd004e5220a777e1724bd8d43e43", "type": "github" }, "original": { diff --git a/devenv.nix b/devenv.nix index 5594b24..8f3c00e 100644 --- a/devenv.nix +++ b/devenv.nix @@ -4,19 +4,32 @@ config, inputs, ... -}: -let +}: let ortVersion = lib.removeSuffix "\n" (builtins.readFile "${toString ./.}/ort-version"); _ortVersionCheck = if pkgs.onnxruntime.version == ortVersion then null - else - throw "pkgs.onnxruntime.version (${pkgs.onnxruntime.version}) must match ort-version (${ortVersion})"; + else throw "pkgs.onnxruntime.version (${pkgs.onnxruntime.version}) must match ort-version (${ortVersion})"; in { devenv.warnOnNewVersion = false; cachix.enable = false; + git-hooks.install.enable = true; + git-hooks.hooks = { + rustfmt.enable = true; + clippy = { + enable = true; + settings.allFeatures = true; + }; + }; + + # Use pinned Rust toolchain from languages.rust for git-hooks wrappers + # (git-hooks.nix defaults to nixpkgs's cargo/clippy/rustfmt, ignoring the pin) + git-hooks.tools.cargo = lib.mkDefault config.languages.rust.toolchain.cargo; + git-hooks.tools.clippy = lib.mkDefault config.languages.rust.toolchain.clippy; + git-hooks.tools.rustfmt = lib.mkDefault config.languages.rust.toolchain.rustfmt; + packages = [ pkgs.openssl pkgs.nodejs diff --git a/devenv.yaml b/devenv.yaml index 84838c2..016a29d 100644 --- a/devenv.yaml +++ b/devenv.yaml @@ -9,3 +9,6 @@ inputs: nixpkgs: follows: nixpkgs allowUnfree: true +nixpkgs: + permittedInsecurePackages: + - "minio-2025-10-15T17-29-55Z" diff --git a/evaluations/src/cli/status.rs b/evaluations/src/cli/status.rs index b65a5f5..7ee2b85 100644 --- a/evaluations/src/cli/status.rs +++ b/evaluations/src/cli/status.rs @@ -103,10 +103,16 @@ pub async fn collect_status(config: &Config) -> Result { ready: slice_manifest .as_ref() .is_some_and(|manifest| slice::manifest_is_complete(manifest, &slice_config)), - path: manifest_path.as_ref().map(|path| path.display().to_string()), + path: manifest_path + .as_ref() + .map(|path| path.display().to_string()), cases: slice_manifest.as_ref().map(|manifest| manifest.case_count), - positives: slice_manifest.as_ref().map(|manifest| manifest.positive_paragraphs), - negatives: slice_manifest.as_ref().map(|manifest| manifest.negative_paragraphs), + positives: slice_manifest + .as_ref() + .map(|manifest| manifest.positive_paragraphs), + negatives: slice_manifest + .as_ref() + .map(|manifest| manifest.negative_paragraphs), }; let beir_paragraph_ids = slice_manifest.as_ref().map(|manifest| { @@ -159,17 +165,9 @@ pub async fn collect_status(config: &Config) -> Result { } }; - let namespace = config - .database - .db_namespace - .clone() - .unwrap_or_else(|| { - default_namespace( - config.dataset.id(), - config.limit, - config.slice.as_deref(), - ) - }); + let namespace = config.database.db_namespace.clone().unwrap_or_else(|| { + default_namespace(config.dataset.id(), config.limit, config.slice.as_deref()) + }); let database = config .database .db_database @@ -183,16 +181,17 @@ pub async fn collect_status(config: &Config) -> Result { .and_then(|manifest| manifest.metadata.namespace_seed) }); - let (seeded, namespace_seed_recorded) = match connect_eval_db(config, &namespace, &database).await { - Ok(db) => { - let has_corpus = namespace_has_corpus(&db).await.unwrap_or(false); - (has_corpus, namespace_seed.is_some()) - } - Err(err) => { - notes.push(format!("SurrealDB unavailable: {err}")); - (false, false) - } - }; + let (seeded, namespace_seed_recorded) = + match connect_eval_db(config, &namespace, &database).await { + Ok(db) => { + let has_corpus = namespace_has_corpus(&db).await.unwrap_or(false); + (has_corpus, namespace_seed.is_some()) + } + Err(err) => { + notes.push(format!("SurrealDB unavailable: {err}")); + (false, false) + } + }; let query_ready = converted_ready && slice_ledger.ready @@ -281,11 +280,7 @@ pub fn print_status(status: &EvalStatus) { ); println!( "Query-ready: {}", - if status.query_ready { - "yes" - } else { - "no" - } + if status.query_ready { "yes" } else { "no" } ); for note in &status.notes { println!("Note: {note}"); diff --git a/evaluations/src/corpus/orchestrator.rs b/evaluations/src/corpus/orchestrator.rs index 3575090..d96c338 100644 --- a/evaluations/src/corpus/orchestrator.rs +++ b/evaluations/src/corpus/orchestrator.rs @@ -701,10 +701,7 @@ mod tests { ConvertedDataset { generated_at: Utc::now(), - metadata: crate::datasets::DatasetMetadata::for_kind( - DatasetKind::default(), - false, - ), + metadata: crate::datasets::DatasetMetadata::for_kind(DatasetKind::default(), false), source: "src".to_string(), paragraphs: vec![paragraph], } diff --git a/evaluations/src/corpus/store.rs b/evaluations/src/corpus/store.rs index 294ceed..c83dff3 100644 --- a/evaluations/src/corpus/store.rs +++ b/evaluations/src/corpus/store.rs @@ -10,11 +10,8 @@ use chrono::{DateTime, Utc}; use common::storage::{ db::SurrealDbClient, types::{ - knowledge_entity::KnowledgeEntity, - knowledge_relationship::KnowledgeRelationship, - text_chunk::TextChunk, - text_content::TextContent, - StoredObject, + knowledge_entity::KnowledgeEntity, knowledge_relationship::KnowledgeRelationship, + text_chunk::TextChunk, text_content::TextContent, StoredObject, }, }; use ingestion_pipeline::{persist_artifacts, IngestionTuning, PipelineArtifacts}; diff --git a/evaluations/src/datasets/beir.rs b/evaluations/src/datasets/beir.rs index be355a0..6c891d7 100644 --- a/evaluations/src/datasets/beir.rs +++ b/evaluations/src/datasets/beir.rs @@ -190,9 +190,7 @@ pub fn convert_beir_documents( pub fn corpus_doc_id(paragraph_id: &str, dataset: DatasetKind) -> Option { let prefix = format!("{}-", dataset.source_prefix()); - paragraph_id - .strip_prefix(&prefix) - .map(str::to_string) + paragraph_id.strip_prefix(&prefix).map(str::to_string) } fn resolve_qrels_path(raw_dir: &Path) -> Result { diff --git a/evaluations/src/datasets/beir_mix.rs b/evaluations/src/datasets/beir_mix.rs index 45a8e66..c60dc9e 100644 --- a/evaluations/src/datasets/beir_mix.rs +++ b/evaluations/src/datasets/beir_mix.rs @@ -11,12 +11,9 @@ use super::{ self, build_dataset_from_catalog, paragraph_path, read_meta, store_dir_for, upsert_sharded_paragraphs, write_sharded, }, - BEIR_DATASETS, ConvertedDataset, DatasetKind, DatasetMetadata, -}; -use crate::{ - args::Config, - slice, + ConvertedDataset, DatasetKind, DatasetMetadata, BEIR_DATASETS, }; +use crate::{args::Config, slice}; pub fn subset_for_paragraph_id(paragraph_id: &str) -> Option { let mut kinds: Vec = BEIR_DATASETS.to_vec(); @@ -53,9 +50,8 @@ pub fn build_beir_mix_qrels_dataset(include_unanswerable: bool) -> Result Result { let virtual_ds = build_beir_mix_qrels_dataset(config.llm_mode)?; let slice_config = slice::slice_config_with_limit(config, slice::ledger_target(config)); - let resolved = slice::resolve_slice(&virtual_ds, &slice_config).context( - "resolving BEIR mix slice ledger (check --slice and --limit match your intent)", - )?; + let resolved = slice::resolve_slice(&virtual_ds, &slice_config) + .context("resolving BEIR mix slice ledger (check --slice and --limit match your intent)")?; let unique: HashSet = resolved .manifest @@ -83,16 +79,16 @@ pub fn prepare_beir_mix(config: &Config) -> Result }) } -pub fn materialize_subset_stores( - paragraph_ids: &HashSet, - force: bool, -) -> Result<()> { +pub fn materialize_subset_stores(paragraph_ids: &HashSet, force: bool) -> Result<()> { let mut by_subset: HashMap> = HashMap::new(); for paragraph_id in paragraph_ids { let kind = subset_for_paragraph_id(paragraph_id).with_context(|| { format!("routing BEIR mix paragraph id '{paragraph_id}' to subset store") })?; - by_subset.entry(kind).or_default().push(paragraph_id.clone()); + by_subset + .entry(kind) + .or_default() + .push(paragraph_id.clone()); } for (kind, ids) in by_subset { @@ -120,11 +116,7 @@ pub fn materialize_subset_stores( .iter() .filter_map(|paragraph_id| beir::corpus_doc_id(paragraph_id, kind)) .collect(); - let paragraphs = beir::convert_beir_documents( - &entry.raw_path, - kind, - Some(&corpus_ids), - )?; + let paragraphs = beir::convert_beir_documents(&entry.raw_path, kind, Some(&corpus_ids))?; if store_dir.join("meta.json").is_file() { upsert_sharded_paragraphs(&store_dir, ¶graphs)?; @@ -233,7 +225,11 @@ pub fn beir_subset_store_summary() -> Result> { let store_dir = store_dir_for(&entry.converted_path); if store_dir.join("meta.json").is_file() { let meta = read_meta(&store_dir)?; - summary.push((kind.id().to_string(), meta.paragraph_count, meta.question_count)); + summary.push(( + kind.id().to_string(), + meta.paragraph_count, + meta.question_count, + )); } } Ok(summary) diff --git a/evaluations/src/datasets/checksum.rs b/evaluations/src/datasets/checksum.rs index 331457a..4493a75 100644 --- a/evaluations/src/datasets/checksum.rs +++ b/evaluations/src/datasets/checksum.rs @@ -56,8 +56,8 @@ impl ChecksumSidecar { #[allow(clippy::indexing_slicing)] pub fn hash_file(path: &Path) -> Result { - let mut file = - File::open(path).with_context(|| format!("opening file {} for checksum", path.display()))?; + let mut file = File::open(path) + .with_context(|| format!("opening file {} for checksum", path.display()))?; let mut hasher = Sha256::new(); let mut buffer = vec![0u8; 65_536]; loop { @@ -176,7 +176,10 @@ fn collect_store_files(base: &Path, current: &Path, entries: &mut Vec) - for entry in fs::read_dir(current)? { let entry = entry?; let path = entry.path(); - if path.file_name().is_some_and(|name| name == "checksum.sha256") { + if path + .file_name() + .is_some_and(|name| name == "checksum.sha256") + { continue; } if path.is_dir() { diff --git a/evaluations/src/datasets/loader.rs b/evaluations/src/datasets/loader.rs index 752ad93..4994f45 100644 --- a/evaluations/src/datasets/loader.rs +++ b/evaluations/src/datasets/loader.rs @@ -186,9 +186,7 @@ fn slice_config_for_catalog_entry<'a>( limit: slice_entry.limit, corpus_limit: slice_entry.corpus_limit, slice_seed: slice_entry.seed.unwrap_or(config.slice_seed), - llm_mode: slice_entry - .include_unanswerable - .unwrap_or(config.llm_mode), + llm_mode: slice_entry.include_unanswerable.unwrap_or(config.llm_mode), negative_multiplier: slice_entry .negative_multiplier .unwrap_or(config.negative_multiplier), diff --git a/evaluations/src/datasets/mod.rs b/evaluations/src/datasets/mod.rs index 7380d77..ad546e8 100644 --- a/evaluations/src/datasets/mod.rs +++ b/evaluations/src/datasets/mod.rs @@ -222,10 +222,8 @@ fn resolve_path(root: &Path, value: &str) -> PathBuf { } } +pub use beir_mix::{beir_subset_store_summary, beir_subset_stores_ready, mix_content_checksum}; pub use checksum::store_aggregate_checksum; -pub use beir_mix::{ - beir_subset_store_summary, beir_subset_stores_ready, mix_content_checksum, -}; pub use loader::{prebuild_catalog_slices, prepare_dataset}; pub use store::{ content_checksum_for_layout, detect_layout, store_dir_for, write_sharded, ConvertedLayout, diff --git a/evaluations/src/datasets/store.rs b/evaluations/src/datasets/store.rs index a4e85d9..5335e79 100644 --- a/evaluations/src/datasets/store.rs +++ b/evaluations/src/datasets/store.rs @@ -11,8 +11,8 @@ use serde::{Deserialize, Serialize}; use tracing::info; use super::{ - checksum::store_aggregate_checksum, - ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetMetadata, + checksum::store_aggregate_checksum, ConvertedDataset, ConvertedParagraph, ConvertedQuestion, + DatasetMetadata, }; use crate::slice; @@ -50,11 +50,10 @@ pub fn store_dir_for(converted_path: &Path) -> PathBuf { converted_path .parent() .unwrap_or_else(|| Path::new(".")) - .join( - converted_path - .file_stem() - .map_or_else(|| "dataset".to_string(), |stem| stem.to_string_lossy().into()), - ) + .join(converted_path.file_stem().map_or_else( + || "dataset".to_string(), + |stem| stem.to_string_lossy().into(), + )) } pub fn detect_layout(converted_path: &Path) -> ConvertedLayout { @@ -167,8 +166,8 @@ pub fn content_checksum_for_layout(converted_path: &Path) -> Result { fn load_paragraph(store_dir: &Path, paragraph_id: &str) -> Result { let path = paragraph_path(store_dir, paragraph_id); - let raw = fs::read(&path) - .with_context(|| format!("reading sharded paragraph {}", path.display()))?; + let raw = + fs::read(&path).with_context(|| format!("reading sharded paragraph {}", path.display()))?; serde_json::from_slice(&raw) .with_context(|| format!("parsing sharded paragraph {}", path.display())) } @@ -180,7 +179,10 @@ fn load_paragraphs(store_dir: &Path, paragraph_ids: &[String]) -> Result Result { +pub fn load_sharded_partial( + store_dir: &Path, + paragraph_ids: &[String], +) -> Result { let meta = read_meta(store_dir)?; let mut paragraphs = load_paragraphs(store_dir, paragraph_ids)?; paragraphs.sort_by(|left, right| left.id.cmp(&right.id)); @@ -333,8 +335,8 @@ pub fn load_question_catalog(store_dir: &Path) -> Result { if line.trim().is_empty() { continue; } - let record: QuestionRecord = serde_json::from_str(&line) - .context("parsing question catalog record")?; + let record: QuestionRecord = + serde_json::from_str(&line).context("parsing question catalog record")?; entries.push(record); } Ok(QuestionCatalog { entries }) diff --git a/evaluations/src/db/connect.rs b/evaluations/src/db/connect.rs index d200c5e..be4cd28 100644 --- a/evaluations/src/db/connect.rs +++ b/evaluations/src/db/connect.rs @@ -132,8 +132,7 @@ pub(crate) async fn can_reuse_namespace( if seed.namespace != namespace || seed.database != database { info!( namespace, - database, - "Corpus manifest namespace metadata mismatch; reseeding" + database, "Corpus manifest namespace metadata mismatch; reseeding" ); return Ok(false); } diff --git a/evaluations/src/db/mod.rs b/evaluations/src/db/mod.rs index ee57459..382bc48 100644 --- a/evaluations/src/db/mod.rs +++ b/evaluations/src/db/mod.rs @@ -5,5 +5,5 @@ pub(crate) use connect::{ can_reuse_namespace, connect_eval_db, default_database, default_namespace, ensure_eval_user, namespace_has_corpus, record_namespace_seed, sanitize_model_code, }; -pub use lifecycle::{recreate_indexes, reset_namespace}; pub(crate) use lifecycle::warm_hnsw_cache; +pub use lifecycle::{recreate_indexes, reset_namespace}; diff --git a/evaluations/src/inspection.rs b/evaluations/src/inspection.rs index cca57b9..915fc78 100644 --- a/evaluations/src/inspection.rs +++ b/evaluations/src/inspection.rs @@ -1,8 +1,4 @@ -use std::{ - collections::HashMap, - fs, - path::Path, -}; +use std::{collections::HashMap, fs, path::Path}; use anyhow::{anyhow, Context, Result}; use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk}; @@ -72,9 +68,9 @@ pub async fn inspect_question(config: &Config) -> Result<()> { MissingChunks::None => println!( "All matching_chunk_ids exist in namespace '{ns}', database '{db_name}'" ), - MissingChunks::Missing(list) => println!( - "Missing chunks in namespace '{ns}', database '{db_name}': {list:?}" - ), + MissingChunks::Missing(list) => { + println!("Missing chunks in namespace '{ns}', database '{db_name}': {list:?}"); + } }, Err(err) => { println!( diff --git a/evaluations/src/main.rs b/evaluations/src/main.rs index 087c73c..720fbe2 100644 --- a/evaluations/src/main.rs +++ b/evaluations/src/main.rs @@ -1,7 +1,7 @@ mod args; -mod context_stats; mod cases; mod cli; +mod context_stats; mod corpus; mod datasets; mod db; @@ -129,10 +129,7 @@ async fn async_main() -> anyhow::Result<()> { let store_dir = datasets::store_dir_for(&parsed.config.converted_dataset_path); datasets::write_sharded(&dataset, &store_dir)?; datasets::prebuild_catalog_slices(&dataset, &parsed.config)?; - println!( - "Converted dataset written under {}", - store_dir.display() - ); + println!("Converted dataset written under {}", store_dir.display()); return Ok(()); } @@ -141,14 +138,13 @@ async fn async_main() -> anyhow::Result<()> { } info!(dataset = dataset_kind.id(), "Preparing converted dataset"); - let loaded = crate::datasets::prepare_dataset(dataset_kind, &parsed.config).with_context( - || { + let loaded = + crate::datasets::prepare_dataset(dataset_kind, &parsed.config).with_context(|| { format!( "preparing converted dataset at {}", parsed.config.converted_dataset_path.display() ) - }, - )?; + })?; info!( questions = loaded diff --git a/evaluations/src/openai.rs b/evaluations/src/openai.rs index 1928dc6..e40ca1a 100644 --- a/evaluations/src/openai.rs +++ b/evaluations/src/openai.rs @@ -14,10 +14,7 @@ pub fn ingestion_openai_client( )?; Ok((Arc::new(client), Some(base_url))) } else { - Ok(( - Arc::new(Client::with_config(OpenAIConfig::default())), - None, - )) + Ok((Arc::new(Client::with_config(OpenAIConfig::default())), None)) } } diff --git a/evaluations/src/perf.rs b/evaluations/src/perf.rs index de12810..19e7058 100644 --- a/evaluations/src/perf.rs +++ b/evaluations/src/perf.rs @@ -91,7 +91,10 @@ fn format_duration(value: Option) -> String { #[cfg(test)] mod tests { use super::*; - use crate::types::{EvaluationStageTimings, PerformanceTimings, LatencyStats, StageLatency, StageLatencyBreakdown}; + use crate::types::{ + EvaluationStageTimings, LatencyStats, PerformanceTimings, StageLatency, + StageLatencyBreakdown, + }; use chrono::Utc; use tempfile::tempdir; diff --git a/evaluations/src/pipeline/mod.rs b/evaluations/src/pipeline/mod.rs index f154e2a..7b5e807 100644 --- a/evaluations/src/pipeline/mod.rs +++ b/evaluations/src/pipeline/mod.rs @@ -26,12 +26,7 @@ pub async fn warm_evaluation( config: &Config, content_checksum: &str, ) -> Result<()> { - let _ctx = run_through_namespace( - dataset, - config, - Some(content_checksum.to_string()), - ) - .await?; + let _ctx = run_through_namespace(dataset, config, Some(content_checksum.to_string())).await?; Ok(()) } @@ -40,11 +35,7 @@ pub async fn run_evaluation( config: &Config, content_checksum: Option<&str>, ) -> Result { - let mut ctx = EvaluationContext::new( - dataset, - config, - content_checksum.map(str::to_string), - ); + let mut ctx = EvaluationContext::new(dataset, config, content_checksum.map(str::to_string)); stages::prepare_slice(&mut ctx).await?; stages::prepare_db(&mut ctx).await?; stages::prepare_corpus(&mut ctx).await?; diff --git a/evaluations/src/pipeline/stages/prepare_slice.rs b/evaluations/src/pipeline/stages/prepare_slice.rs index 9b2493e..fb5a373 100644 --- a/evaluations/src/pipeline/stages/prepare_slice.rs +++ b/evaluations/src/pipeline/stages/prepare_slice.rs @@ -3,7 +3,10 @@ use std::time::Instant; use anyhow::Context; use tracing::info; -use crate::{db::{default_database, default_namespace}, slice}; +use crate::{ + db::{default_database, default_namespace}, + slice, +}; use super::super::context::{EvalStage, EvaluationContext}; diff --git a/evaluations/src/slice/mod.rs b/evaluations/src/slice/mod.rs index 3b7d7d9..9be5f48 100644 --- a/evaluations/src/slice/mod.rs +++ b/evaluations/src/slice/mod.rs @@ -14,9 +14,7 @@ use tracing::{info, warn}; use crate::{ args::Config, - datasets::{ - ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind, - }, + datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind}, }; mod beir; @@ -244,8 +242,7 @@ pub fn resolve_slice<'a>( ); return Ok(resolved); } - let resolved = - materialize_slice_ledger(dataset, config, &index, slice_arg, path)?; + let resolved = materialize_slice_ledger(dataset, config, &index, slice_arg, path)?; info!( slice = %resolved.manifest.slice_id, path = %resolved.path.display(), @@ -927,10 +924,7 @@ pub fn cached_manifest_path(config: &crate::args::Config) -> Option { } pub fn manifest_is_complete(manifest: &SliceManifest, config: &SliceConfig<'_>) -> bool { - let requested_limit = config - .limit - .unwrap_or(manifest.case_count.max(1)) - .max(1); + let requested_limit = config.limit.unwrap_or(manifest.case_count.max(1)).max(1); if manifest.case_count < requested_limit { return false; } @@ -942,7 +936,9 @@ pub fn manifest_is_complete(manifest: &SliceManifest, config: &SliceConfig<'_>) let desired_negatives = desired_negative_target( manifest.positive_paragraphs, requested_corpus, - manifest.total_paragraphs.max(manifest.positive_paragraphs.max(1)), + manifest + .total_paragraphs + .max(manifest.positive_paragraphs.max(1)), config.negative_multiplier, ); manifest.negative_paragraphs >= desired_negatives @@ -978,8 +974,7 @@ pub fn ledger_target(config: &Config) -> Option { pub fn grow_slice(dataset: &ConvertedDataset, config: &Config) -> Result<()> { let ledger_limit = ledger_target(config); let slice_settings = slice_config_with_limit(config, ledger_limit); - let slice = - resolve_slice(dataset, &slice_settings).context("resolving dataset slice")?; + let slice = resolve_slice(dataset, &slice_settings).context("resolving dataset slice")?; info!( slice = slice.manifest.slice_id.as_str(), cases = slice.manifest.case_count, diff --git a/ingestion-pipeline/src/pipeline/context.rs b/ingestion-pipeline/src/pipeline/context.rs index 9014f94..cb70c48 100644 --- a/ingestion-pipeline/src/pipeline/context.rs +++ b/ingestion-pipeline/src/pipeline/context.rs @@ -109,10 +109,7 @@ impl<'a> PipelineContext<'a> { let content = self.take_text_content()?; let analysis = self.take_analysis()?; - let (entities, relationships) = self - .services - .convert_analysis(&content, &analysis) - .await?; + let (entities, relationships) = self.services.convert_analysis(&content, &analysis).await?; let chunk_range = self.chunk_token_range(); let chunk_overlap = self.chunk_overlap_tokens(); diff --git a/ingestion-pipeline/src/pipeline/mod.rs b/ingestion-pipeline/src/pipeline/mod.rs index 75d853a..35672a7 100644 --- a/ingestion-pipeline/src/pipeline/mod.rs +++ b/ingestion-pipeline/src/pipeline/mod.rs @@ -186,11 +186,7 @@ impl IngestionPipeline { } async fn artifacts_persisted(&self, task_id: &str) -> Result { - Ok(self - .db - .get_item::(task_id) - .await? - .is_some()) + Ok(self.db.get_item::(task_id).await?.is_some()) } async fn finalize_succeeded(&self, task: &IngestionTask) -> Result<(), AppError> { @@ -379,8 +375,7 @@ mod finalize_tests { persist_max_backoff_ms: 10, ..IngestionTuning::default() }; - let pipeline = - IngestionPipeline::with_services(Arc::new(db.clone()), config, services)?; + let pipeline = IngestionPipeline::with_services(Arc::new(db.clone()), config, services)?; let task = reserve_task( &db, @@ -397,9 +392,7 @@ mod finalize_tests { let processing = task.mark_processing(&db).await?; db.client - .query( - "UPDATE type::thing('ingestion_task', $id) SET worker_id = $wrong_worker;", - ) + .query("UPDATE type::thing('ingestion_task', $id) SET worker_id = $wrong_worker;") .bind(("id", processing.id.clone())) .bind(("wrong_worker", "wrong-worker")) .await?; @@ -410,9 +403,7 @@ mod finalize_tests { sleep(Duration::from_millis(5)).await; let _ = db_fix .client - .query( - "UPDATE type::thing('ingestion_task', $id) SET worker_id = $worker_id;", - ) + .query("UPDATE type::thing('ingestion_task', $id) SET worker_id = $worker_id;") .bind(("id", task_id)) .bind(("worker_id", worker_id)) .await; @@ -420,10 +411,7 @@ mod finalize_tests { pipeline.finalize_succeeded(&processing).await?; - let stored: IngestionTask = db - .get_item(&processing.id) - .await? - .context("task stored")?; + let stored: IngestionTask = db.get_item(&processing.id).await?.context("task stored")?; assert_eq!(stored.state, TaskState::Succeeded); Ok(()) diff --git a/ingestion-pipeline/src/pipeline/test_support.rs b/ingestion-pipeline/src/pipeline/test_support.rs index 805f00f..4f645ff 100644 --- a/ingestion-pipeline/src/pipeline/test_support.rs +++ b/ingestion-pipeline/src/pipeline/test_support.rs @@ -133,15 +133,15 @@ pub fn large_artifacts( } } -pub async fn persist( - db: &SurrealDbClient, - artifacts: PipelineArtifacts, -) -> Result<(), AppError> { +pub async fn persist(db: &SurrealDbClient, artifacts: PipelineArtifacts) -> Result<(), AppError> { persist_artifacts(db, &tuning(), TEST_EMBEDDING_DIM, artifacts).await?; Ok(()) } -pub async fn count_chunks_for_source(db: &SurrealDbClient, source_id: &str) -> anyhow::Result { +pub async fn count_chunks_for_source( + db: &SurrealDbClient, + source_id: &str, +) -> anyhow::Result { let chunks: Vec = db .client .query("SELECT * FROM text_chunk WHERE source_id = $source_id;") diff --git a/ingestion-pipeline/src/pipeline/tests.rs b/ingestion-pipeline/src/pipeline/tests.rs index 7483bb6..0ed229e 100644 --- a/ingestion-pipeline/src/pipeline/tests.rs +++ b/ingestion-pipeline/src/pipeline/tests.rs @@ -1,5 +1,15 @@ use std::sync::Arc; +use super::{ + config::{IngestionConfig, IngestionTuning}, + enrichment_result::LLMEnrichmentResult, + services::PipelineServices, + test_support::{ + count_chunks_for_source, count_entities_for_source, count_relationships_for_source, + persist, sample_artifacts, setup_db, + }, + IngestionPipeline, +}; use crate::pipeline::context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk}; use anyhow::{self, Context}; use async_trait::async_trait; @@ -20,16 +30,6 @@ use common::{ }; use retrieval_pipeline::{RetrievedChunk, RetrievedEntity}; use tokio::sync::Mutex; -use super::{ - config::{IngestionConfig, IngestionTuning}, - enrichment_result::LLMEnrichmentResult, - services::PipelineServices, - test_support::{ - count_chunks_for_source, count_entities_for_source, count_relationships_for_source, - persist, sample_artifacts, setup_db, - }, - IngestionPipeline, -}; pub(crate) struct MockServices { text_content: TextContent, @@ -221,9 +221,7 @@ impl PipelineServices for FailingServices { content: &TextContent, analysis: &LLMEnrichmentResult, ) -> Result<(Vec, Vec), AppError> { - self.inner - .convert_analysis(content, analysis) - .await + self.inner.convert_analysis(content, analysis).await } async fn prepare_chunks( diff --git a/json-stream-parser/src/lib.rs b/json-stream-parser/src/lib.rs index d8a35da..1555f05 100644 --- a/json-stream-parser/src/lib.rs +++ b/json-stream-parser/src/lib.rs @@ -219,7 +219,7 @@ fn add_char_into_object( } } (&Value::Bool(true) | &Value::Bool(false), &ObjectStatus::Scalar { .. }, 'e') -| (&Value::Object(_), &ObjectStatus::ValueQuoteClose, '}') => { + | (&Value::Object(_), &ObjectStatus::ValueQuoteClose, '}') => { *current_status = ObjectStatus::Closed; } diff --git a/main/src/main.rs b/main/src/main.rs index 844314e..52fb706 100644 --- a/main/src/main.rs +++ b/main/src/main.rs @@ -60,12 +60,7 @@ async fn main() -> anyhow::Result<()> { worker_embedding, )?); - run_worker_loop( - worker_db, - ingestion_pipeline, - index_rebuild_interval_secs, - ) - .await + run_worker_loop(worker_db, ingestion_pipeline, index_rebuild_interval_secs).await }); tokio::select! { diff --git a/main/src/worker.rs b/main/src/worker.rs index dac6dfb..6530dbe 100644 --- a/main/src/worker.rs +++ b/main/src/worker.rs @@ -74,9 +74,7 @@ mod tests { let db = Arc::clone(&services.db); let pipeline = Arc::new(pipeline); let worker = - tokio::spawn(async move { - ingestion_pipeline::run_worker_loop(db, pipeline, 0).await - }); + tokio::spawn(async move { ingestion_pipeline::run_worker_loop(db, pipeline, 0).await }); tokio::time::sleep(Duration::from_millis(250)).await; assert!(