diff --git a/CHANGELOG.md b/CHANGELOG.md index fe534e2..ae18b4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ## Unreleased + +## 1.0.5 (2026-06-24) + - Infra: CI workflow fixes. CI is now a nix flake check which includes compilation, caching and running tests, clippy, fmt, validation for ort version. - Docker-compose: The example now references the ghcr image, this is so we can remove the Dockerfile and reducing maintenance scope. - Refactor: web scraping now uses `servo-fetch` (pure-Rust Servo engine) and PDF rendering uses `pdfium-render` (direct PDFium bindings) — reduces Docker image size by ~300MB, improves startup latency by ~100× for PDF rendering, and provides more stable output @@ -10,7 +13,6 @@ - Docs: updated architecture, features, and installation docs to reflect the new web processing stack - Fix: added pre-commit hooks to further maintain code consistency. - Security: updated some deps because dependabot told me, good bot. -- Security: bump `async-openai` to 0.41.1 (feature-gated types, transcription API rename; removes `backoff` transitive dep) - Refactor: deduplicated test database setup across common/src/storage/. - Refactor: split knowledge-graph.js monolith into focused functions. - Evaluations: simplified crate layout — linear pipeline, sharded-only converted store, in-memory ingestion, `db/` and `cli/` modules; namespace reuse state in corpus manifest (removed `cache/snapshots/`); no legacy JSON/history compatibility (re-run `--warm` after upgrade) diff --git a/Cargo.lock b/Cargo.lock index 4811a80..1be1b27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5852,7 +5852,7 @@ checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" [[package]] name = "main" -version = "1.0.4" +version = "1.0.5" dependencies = [ "anyhow", "api-router", diff --git a/api-router/Cargo.toml b/api-router/Cargo.toml index 5d9125d..612cf68 100644 --- a/api-router/Cargo.toml +++ b/api-router/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "api-router" version = "0.1.0" -edition = "2021" +edition = "2024" license = "AGPL-3.0-or-later" [lints] diff --git a/api-router/src/error.rs b/api-router/src/error.rs index e82e49b..1782121 100644 --- a/api-router/src/error.rs +++ b/api-router/src/error.rs @@ -1,7 +1,7 @@ use axum::{ + Json, http::StatusCode, response::{IntoResponse, Response}, - Json, }; use common::error::AppError; use serde::Serialize; diff --git a/api-router/src/lib.rs b/api-router/src/lib.rs index 1bb9d66..08f2281 100644 --- a/api-router/src/lib.rs +++ b/api-router/src/lib.rs @@ -1,9 +1,9 @@ use api_state::ApiState; use axum::{ + Router, extract::{DefaultBodyLimit, FromRef}, middleware::from_fn_with_state, routing::{get, post}, - Router, }; use middleware_api_auth::api_auth; use routes::{categories::list, ingest::handle, liveness::live, readiness::ready}; diff --git a/api-router/src/routes/categories.rs b/api-router/src/routes/categories.rs index e04b0bb..be6cb77 100644 --- a/api-router/src/routes/categories.rs +++ b/api-router/src/routes/categories.rs @@ -1,4 +1,4 @@ -use axum::{extract::State, response::IntoResponse, Extension, Json}; +use axum::{Extension, Json, extract::State, response::IntoResponse}; use common::storage::types::user::User; use crate::{api_state::ApiState, error::ApiErr}; diff --git a/api-router/src/routes/ingest.rs b/api-router/src/routes/ingest.rs index ea6ab86..e14f384 100644 --- a/api-router/src/routes/ingest.rs +++ b/api-router/src/routes/ingest.rs @@ -1,4 +1,4 @@ -use axum::{extract::State, http::StatusCode, response::IntoResponse, Extension, Json}; +use axum::{Extension, Json, extract::State, http::StatusCode, response::IntoResponse}; use axum_typed_multipart::{FieldData, TryFromMultipart, TypedMultipart}; use common::{ error::AppError, @@ -6,9 +6,9 @@ use common::{ file_info::FileInfo, ingestion_payload::IngestionPayload, ingestion_task::IngestionTask, user::User, }, - utils::ingest_limits::{validate_ingest_input, IngestValidationError}, + utils::ingest_limits::{IngestValidationError, validate_ingest_input}, }; -use futures::{future::try_join_all, TryFutureExt}; +use futures::{TryFutureExt, future::try_join_all}; use serde_json::json; use tempfile::NamedTempFile; use tracing::info; diff --git a/api-router/src/routes/liveness.rs b/api-router/src/routes/liveness.rs index cacb245..a2ec409 100644 --- a/api-router/src/routes/liveness.rs +++ b/api-router/src/routes/liveness.rs @@ -1,4 +1,4 @@ -use axum::{http::StatusCode, response::IntoResponse, Json}; +use axum::{Json, http::StatusCode, response::IntoResponse}; use serde_json::json; /// Liveness probe: always returns 200 to indicate the process is running. diff --git a/api-router/src/routes/readiness.rs b/api-router/src/routes/readiness.rs index 7827d94..455330e 100644 --- a/api-router/src/routes/readiness.rs +++ b/api-router/src/routes/readiness.rs @@ -1,4 +1,4 @@ -use axum::{extract::State, http::StatusCode, response::IntoResponse, Json}; +use axum::{Json, extract::State, http::StatusCode, response::IntoResponse}; use serde_json::json; use tracing::error; diff --git a/api-router/tests/api_router_integration.rs b/api-router/tests/api_router_integration.rs index 9227221..b41344d 100644 --- a/api-router/tests/api_router_integration.rs +++ b/api-router/tests/api_router_integration.rs @@ -4,9 +4,9 @@ use std::sync::Arc; use api_router::{api_routes_v1, api_state::ApiState}; use axum::{ - body::{to_bytes, Body}, - http::{Request, StatusCode}, Router, + body::{Body, to_bytes}, + http::{Request, StatusCode}, }; use common::{ storage::{db::SurrealDbClient, store::StorageManager, types::user::User}, diff --git a/common/Cargo.toml b/common/Cargo.toml index 2a93194..d73b065 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "common" version = "0.1.0" -edition = "2021" +edition = "2024" license = "AGPL-3.0-or-later" [lints] diff --git a/common/src/storage/db.rs b/common/src/storage/db.rs index b810236..9a7744b 100644 --- a/common/src/storage/db.rs +++ b/common/src/storage/db.rs @@ -3,14 +3,14 @@ use crate::error::AppError; use axum_session::{SessionConfig, SessionError, SessionStore}; use axum_session_surreal::SessionSurrealPool; use futures::Stream; -use include_dir::{include_dir, Dir}; -use serde::de::DeserializeOwned; +use include_dir::{Dir, include_dir}; use serde::Serialize; +use serde::de::DeserializeOwned; use std::{ops::Deref, sync::Arc}; use surrealdb::{ - engine::any::{connect, Any}, - opt::auth::{Namespace, Root}, Error, Notification, Surreal, + engine::any::{Any, connect}, + opt::auth::{Namespace, Root}, }; use surrealdb_migrations::MigrationRunner; use tracing::debug; diff --git a/common/src/storage/store.rs b/common/src/storage/store.rs index b40389a..f84a738 100644 --- a/common/src/storage/store.rs +++ b/common/src/storage/store.rs @@ -2,14 +2,14 @@ use std::io::ErrorKind; use std::path::{Component, Path, PathBuf}; use std::sync::Arc; -use anyhow::{anyhow, Context, Result as AnyResult}; +use anyhow::{Context, Result as AnyResult, anyhow}; use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; use object_store::aws::AmazonS3Builder; use object_store::local::LocalFileSystem; use object_store::memory::InMemory; -use object_store::{path::Path as ObjPath, ObjectStore}; +use object_store::{ObjectStore, path::Path as ObjPath}; use crate::utils::config::{AppConfig, StorageKind}; @@ -461,9 +461,12 @@ pub mod testing { pub async fn new_s3() -> object_store::Result { // Ensure credentials are set for MinIO // We set these env vars for the process, which AmazonS3Builder will pick up - std::env::set_var("AWS_ACCESS_KEY_ID", "minioadmin"); - std::env::set_var("AWS_SECRET_ACCESS_KEY", "minioadmin"); - std::env::set_var("AWS_REGION", "us-east-1"); + // SAFETY: test setup runs before concurrent S3 client use in this process. + unsafe { + std::env::set_var("AWS_ACCESS_KEY_ID", "minioadmin"); + std::env::set_var("AWS_SECRET_ACCESS_KEY", "minioadmin"); + std::env::set_var("AWS_REGION", "us-east-1"); + } let cfg = test_config_s3(); let storage = StorageManager::new(&cfg).await?; @@ -543,10 +546,10 @@ pub mod testing { impl Drop for TestStorageManager { fn drop(&mut self) { // Clean up temporary directories for local storage - if let Some((_, path)) = &self.temp_dir { - if path.exists() { - let _ = std::fs::remove_dir_all(path); - } + if let Some((_, path)) = &self.temp_dir + && path.exists() + { + let _ = std::fs::remove_dir_all(path); } } } @@ -690,20 +693,24 @@ mod tests { assert_eq!(retrieved.as_ref(), data); // Test exists - assert!(storage - .exists(location) - .await - .with_context(|| "exists check".to_string())?); + assert!( + storage + .exists(location) + .await + .with_context(|| "exists check".to_string())? + ); // Test delete storage .delete_prefix("test/data/") .await .with_context(|| "delete".to_string())?; - assert!(!storage - .exists(location) - .await - .with_context(|| "exists check after delete".to_string())?); + assert!( + !storage + .exists(location) + .await + .with_context(|| "exists check after delete".to_string())? + ); Ok(()) } @@ -741,20 +748,24 @@ mod tests { .with_context(|| "object directory exists after write".to_string())?; // Test exists - assert!(storage - .exists(location) - .await - .with_context(|| "exists check".to_string())?); + assert!( + storage + .exists(location) + .await + .with_context(|| "exists check".to_string())? + ); // Test delete storage .delete_prefix("test/data/") .await .with_context(|| "delete".to_string())?; - assert!(!storage - .exists(location) - .await - .with_context(|| "exists check after delete".to_string())?); + assert!( + !storage + .exists(location) + .await + .with_context(|| "exists check after delete".to_string())? + ); assert!( tokio::fs::metadata(&object_dir).await.is_err(), "object directory should be removed" @@ -846,12 +857,16 @@ mod tests { .await .with_context(|| "list dir1".to_string())?; assert_eq!(dir1_files.len(), 2); - assert!(dir1_files - .iter() - .any(|meta| meta.location.as_ref().contains("file1.txt"))); - assert!(dir1_files - .iter() - .any(|meta| meta.location.as_ref().contains("file2.txt"))); + assert!( + dir1_files + .iter() + .any(|meta| meta.location.as_ref().contains("file1.txt")) + ); + assert!( + dir1_files + .iter() + .any(|meta| meta.location.as_ref().contains("file2.txt")) + ); // Test listing non-existent prefix let empty_files = storage @@ -918,10 +933,12 @@ mod tests { .with_context(|| "get".to_string())?; assert_eq!(retrieved.as_ref(), data); - assert!(storage - .exists(location) - .await - .with_context(|| "exists".to_string())?); + assert!( + storage + .exists(location) + .await + .with_context(|| "exists".to_string())? + ); assert_eq!(*storage.backend_kind(), StorageKind::Memory); Ok(()) @@ -975,10 +992,12 @@ mod tests { assert_eq!(retrieved.as_ref(), data); // Test existence check - assert!(test_storage - .exists(location) - .await - .with_context(|| "exists".to_string())?); + assert!( + test_storage + .exists(location) + .await + .with_context(|| "exists".to_string())? + ); // Test list let files = test_storage @@ -992,10 +1011,12 @@ mod tests { .delete_prefix("test/storage/") .await .with_context(|| "delete".to_string())?; - assert!(!test_storage - .exists(location) - .await - .with_context(|| "exists after delete".to_string())?); + assert!( + !test_storage + .exists(location) + .await + .with_context(|| "exists after delete".to_string())? + ); Ok(()) } @@ -1019,10 +1040,12 @@ mod tests { .with_context(|| "get".to_string())?; assert_eq!(retrieved.as_ref(), data); - assert!(test_storage - .exists(location) - .await - .with_context(|| "exists".to_string())?); + assert!( + test_storage + .exists(location) + .await + .with_context(|| "exists".to_string())? + ); Ok(()) } @@ -1119,20 +1142,24 @@ mod tests { assert_eq!(retrieved.as_ref(), data); // Test exists - assert!(storage - .exists(&location) - .await - .with_context(|| "exists".to_string())?); + assert!( + storage + .exists(&location) + .await + .with_context(|| "exists".to_string())? + ); // Test delete storage .delete_prefix(&format!("{prefix}/")) .await .with_context(|| "delete".to_string())?; - assert!(!storage - .exists(&location) - .await - .with_context(|| "exists after delete".to_string())?); + assert!( + !storage + .exists(&location) + .await + .with_context(|| "exists after delete".to_string())? + ); Ok(()) } diff --git a/common/src/storage/types/analytics.rs b/common/src/storage/types/analytics.rs index 18ea82b..1daa97f 100644 --- a/common/src/storage/types/analytics.rs +++ b/common/src/storage/types/analytics.rs @@ -1,4 +1,4 @@ -use crate::storage::types::{user::User, StoredObject}; +use crate::storage::types::{StoredObject, user::User}; use crate::utils::serde_helpers::deserialize_flexible_id; use serde::{Deserialize, Serialize}; diff --git a/common/src/storage/types/ingestion_task.rs b/common/src/storage/types/ingestion_task.rs index 62788c6..b72cb90 100644 --- a/common/src/storage/types/ingestion_task.rs +++ b/common/src/storage/types/ingestion_task.rs @@ -315,9 +315,10 @@ impl IngestionTask { "#; debug_assert!(lifecycle::pending().reserve().is_ok()); - debug_assert!(lifecycle::pending().reserve().is_ok_and(|m| m - .start_processing() - .is_ok_and(|m| m.fail().is_ok_and(|m| m.reserve().is_ok())))); + debug_assert!(lifecycle::pending().reserve().is_ok_and(|m| { + m.start_processing() + .is_ok_and(|m| m.fail().is_ok_and(|m| m.reserve().is_ok())) + })); let mut result = db .client diff --git a/common/src/storage/types/knowledge_entity.rs b/common/src/storage/types/knowledge_entity.rs index bebb20a..ad704d9 100644 --- a/common/src/storage/types/knowledge_entity.rs +++ b/common/src/storage/types/knowledge_entity.rs @@ -399,7 +399,9 @@ impl KnowledgeEntity { if embedding.len() != new_dimensions { let err_msg = format!( "CRITICAL: Generated embedding for entity {} has incorrect dimension ({}). Expected {}. Aborting.", - entity.id, embedding.len(), new_dimensions + entity.id, + embedding.len(), + new_dimensions ); error!("{err_msg}"); return Err(AppError::internal(err_msg)); @@ -864,14 +866,18 @@ mod tests { let rid_e1 = surrealdb::RecordId::from_table_key(KnowledgeEntity::table_name(), &e1.id); let rid_e2 = surrealdb::RecordId::from_table_key(KnowledgeEntity::table_name(), &e2.id); - assert!(KnowledgeEntityEmbedding::get_by_record_id(&db, &rid_e1) - .await - .with_context(|| "get embedding e1".to_string())? - .is_some()); - assert!(KnowledgeEntityEmbedding::get_by_record_id(&db, &rid_e2) - .await - .with_context(|| "get embedding e2".to_string())? - .is_some()); + assert!( + KnowledgeEntityEmbedding::get_by_record_id(&db, &rid_e1) + .await + .with_context(|| "get embedding e1".to_string())? + .is_some() + ); + assert!( + KnowledgeEntityEmbedding::get_by_record_id(&db, &rid_e2) + .await + .with_context(|| "get embedding e2".to_string())? + .is_some() + ); let results = KnowledgeEntity::vector_search(2, &[0.0, 1.0, 0.0], &db, &user_id) .await diff --git a/common/src/storage/types/knowledge_entity_embedding.rs b/common/src/storage/types/knowledge_entity_embedding.rs index 18dccba..697a1f4 100644 --- a/common/src/storage/types/knowledge_entity_embedding.rs +++ b/common/src/storage/types/knowledge_entity_embedding.rs @@ -287,10 +287,12 @@ mod tests { .with_context(|| "get entity2 embedding after delete".to_string())? .is_none() ); - assert!(KnowledgeEntityEmbedding::get_by_record_id(&db, &other_rid) - .await - .with_context(|| "get other embedding after delete".to_string())? - .is_some()); + assert!( + KnowledgeEntityEmbedding::get_by_record_id(&db, &other_rid) + .await + .with_context(|| "get other embedding after delete".to_string())? + .is_some() + ); Ok(()) } diff --git a/common/src/storage/types/knowledge_relationship.rs b/common/src/storage/types/knowledge_relationship.rs index b23265e..abce231 100644 --- a/common/src/storage/types/knowledge_relationship.rs +++ b/common/src/storage/types/knowledge_relationship.rs @@ -575,12 +575,16 @@ mod tests { KnowledgeRelationship::delete_relationships_by_source_id(shared_source, user_a, &db) .await?; - assert!(get_relationship_by_id(&owner_relationship_id, &db) - .await - .is_none()); - assert!(get_relationship_by_id(&other_relationship_id, &db) - .await - .is_some()); + assert!( + get_relationship_by_id(&owner_relationship_id, &db) + .await + .is_none() + ); + assert!( + get_relationship_by_id(&other_relationship_id, &db) + .await + .is_some() + ); Ok(()) } diff --git a/common/src/storage/types/system_settings.rs b/common/src/storage/types/system_settings.rs index d2e504d..8abdab8 100644 --- a/common/src/storage/types/system_settings.rs +++ b/common/src/storage/types/system_settings.rs @@ -223,16 +223,16 @@ impl SystemSettings { needs_update = true; } - if let Some(model) = provider_model { - if settings.embedding_model != model { - tracing::info!( - old_model = %settings.embedding_model, - new_model = %model, - "Embedding model changed, updating SystemSettings" - ); - settings.embedding_model = model; - needs_update = true; - } + if let Some(model) = provider_model + && settings.embedding_model != model + { + tracing::info!( + old_model = %settings.embedding_model, + new_model = %model, + "Embedding model changed, updating SystemSettings" + ); + settings.embedding_model = model; + needs_update = true; } if needs_update { @@ -719,8 +719,8 @@ mod tests { } #[tokio::test] - async fn test_should_change_embedding_length_on_indexes_when_switching_length( - ) -> anyhow::Result<()> { + async fn test_should_change_embedding_length_on_indexes_when_switching_length() + -> anyhow::Result<()> { use crate::utils::embedding::EmbeddingProvider; let db = setup_test_db().await?; diff --git a/common/src/storage/types/text_chunk.rs b/common/src/storage/types/text_chunk.rs index b92942f..38072ad 100644 --- a/common/src/storage/types/text_chunk.rs +++ b/common/src/storage/types/text_chunk.rs @@ -4,7 +4,7 @@ use std::fmt::Write; use crate::storage::indexes::hnsw_index_overwrite_sql; use crate::storage::types::{ - text_chunk_embedding::TextChunkEmbedding, EmbeddingRecord, HasEmbedding, + EmbeddingRecord, HasEmbedding, text_chunk_embedding::TextChunkEmbedding, }; use crate::utils::embedding::RE_EMBED_BATCH_SIZE; use crate::{error::AppError, storage::db::SurrealDbClient, stored_object}; @@ -216,7 +216,9 @@ impl TextChunk { if embedding.len() != new_dimensions { let err_msg = format!( "CRITICAL: Generated embedding for chunk {} has incorrect dimension ({}). Expected {}. Aborting.", - chunk.id, embedding.len(), new_dimensions + chunk.id, + embedding.len(), + new_dimensions ); error!("{err_msg}"); return Err(AppError::internal(err_msg)); diff --git a/common/src/storage/types/text_chunk_embedding.rs b/common/src/storage/types/text_chunk_embedding.rs index d61bc0a..db2dab1 100644 --- a/common/src/storage/types/text_chunk_embedding.rs +++ b/common/src/storage/types/text_chunk_embedding.rs @@ -235,35 +235,47 @@ mod tests { .with_context(|| format!("store embedding for {key}"))?; } - assert!(TextChunkEmbedding::get_by_record_id(&db, &chunk1_rid) - .await - .with_context(|| "get chunk1".to_string())? - .is_some()); - assert!(TextChunkEmbedding::get_by_record_id(&db, &chunk2_rid) - .await - .with_context(|| "get chunk2".to_string())? - .is_some()); - assert!(TextChunkEmbedding::get_by_record_id(&db, &chunk_other_rid) - .await - .with_context(|| "get chunk_other".to_string())? - .is_some()); + assert!( + TextChunkEmbedding::get_by_record_id(&db, &chunk1_rid) + .await + .with_context(|| "get chunk1".to_string())? + .is_some() + ); + assert!( + TextChunkEmbedding::get_by_record_id(&db, &chunk2_rid) + .await + .with_context(|| "get chunk2".to_string())? + .is_some() + ); + assert!( + TextChunkEmbedding::get_by_record_id(&db, &chunk_other_rid) + .await + .with_context(|| "get chunk_other".to_string())? + .is_some() + ); TextChunkEmbedding::delete_by_source_id(source_id, &db) .await .with_context(|| "Failed to delete by source_id".to_string())?; - assert!(TextChunkEmbedding::get_by_record_id(&db, &chunk1_rid) - .await - .with_context(|| "check chunk1".to_string())? - .is_none()); - assert!(TextChunkEmbedding::get_by_record_id(&db, &chunk2_rid) - .await - .with_context(|| "check chunk2".to_string())? - .is_none()); - assert!(TextChunkEmbedding::get_by_record_id(&db, &chunk_other_rid) - .await - .with_context(|| "check chunk_other".to_string())? - .is_some()); + assert!( + TextChunkEmbedding::get_by_record_id(&db, &chunk1_rid) + .await + .with_context(|| "check chunk1".to_string())? + .is_none() + ); + assert!( + TextChunkEmbedding::get_by_record_id(&db, &chunk2_rid) + .await + .with_context(|| "check chunk2".to_string())? + .is_none() + ); + assert!( + TextChunkEmbedding::get_by_record_id(&db, &chunk_other_rid) + .await + .with_context(|| "check chunk_other".to_string())? + .is_some() + ); Ok(()) } diff --git a/common/src/storage/types/text_content.rs b/common/src/storage/types/text_content.rs index df81484..94539a2 100644 --- a/common/src/storage/types/text_content.rs +++ b/common/src/storage/types/text_content.rs @@ -1,8 +1,8 @@ use std::collections::{HashMap, HashSet}; use std::str::FromStr; -use surrealdb::opt::PatchOp; use surrealdb::RecordId; +use surrealdb::opt::PatchOp; use uuid::Uuid; use crate::{error::AppError, storage::db::SurrealDbClient, stored_object}; @@ -682,8 +682,8 @@ mod tests { } #[tokio::test] - async fn clear_ingested_children_removes_chunks_entities_and_relationships( - ) -> anyhow::Result<()> { + async fn clear_ingested_children_removes_chunks_entities_and_relationships() + -> anyhow::Result<()> { let db = setup_test_db().await?; let user_id = "clear-user"; let source_id = Uuid::new_v4().to_string(); diff --git a/common/src/storage/types/user.rs b/common/src/storage/types/user.rs index c96e393..473ea1c 100644 --- a/common/src/storage/types/user.rs +++ b/common/src/storage/types/user.rs @@ -3,7 +3,7 @@ use anyhow::anyhow; use async_trait::async_trait; use axum_session_auth::Authentication; use chrono_tz::Tz; -use surrealdb::{engine::any::Any, Surreal}; +use surrealdb::{Surreal, engine::any::Any}; use uuid::Uuid; use super::text_chunk::TextChunk; @@ -729,7 +729,7 @@ mod tests { use super::*; use crate::storage::types::ingestion_payload::IngestionPayload; - use crate::storage::types::ingestion_task::{IngestionTask, TaskState, MAX_ATTEMPTS}; + use crate::storage::types::ingestion_task::{IngestionTask, MAX_ATTEMPTS, TaskState}; use std::collections::HashSet; use crate::test_utils::setup_test_db; diff --git a/common/src/test_utils.rs b/common/src/test_utils.rs index 92ad1c9..664d215 100644 --- a/common/src/test_utils.rs +++ b/common/src/test_utils.rs @@ -8,8 +8,8 @@ use crate::storage::{ db::SurrealDbClient, indexes::{ensure_runtime, rebuild}, types::{ - knowledge_entity_embedding::KnowledgeEntityEmbedding, system_settings::SystemSettings, - text_chunk_embedding::TextChunkEmbedding, EmbeddingRecord, + EmbeddingRecord, knowledge_entity_embedding::KnowledgeEntityEmbedding, + system_settings::SystemSettings, text_chunk_embedding::TextChunkEmbedding, }, }; diff --git a/common/src/utils/config.rs b/common/src/utils/config.rs index 454f000..b0d9886 100644 --- a/common/src/utils/config.rs +++ b/common/src/utils/config.rs @@ -198,7 +198,10 @@ pub fn ensure_ort_path() { exe.join("lib").join("onnxruntime.dll"), ] { if p.exists() { - env::set_var("ORT_DYLIB_PATH", p); + // SAFETY: `Once` ensures this runs on a single thread during startup. + unsafe { + env::set_var("ORT_DYLIB_PATH", p); + } return; } } @@ -210,7 +213,10 @@ pub fn ensure_ort_path() { }; let p = exe.join("lib").join(name); if p.exists() { - env::set_var("ORT_DYLIB_PATH", p); + // SAFETY: `Once` ensures this runs on a single thread during startup. + unsafe { + env::set_var("ORT_DYLIB_PATH", p); + } } }); } diff --git a/common/src/utils/embedding.rs b/common/src/utils/embedding.rs index 10732ad..976b864 100644 --- a/common/src/utils/embedding.rs +++ b/common/src/utils/embedding.rs @@ -9,7 +9,7 @@ use std::{ use serde::Serialize; use tracing::warn; -use async_openai::{types::embeddings::CreateEmbeddingRequestArgs, Client}; +use async_openai::{Client, types::embeddings::CreateEmbeddingRequestArgs}; use fastembed::{EmbeddingModel, ModelTrait, TextEmbedding, TextInitOptions}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; @@ -588,9 +588,8 @@ mod tests { #![allow(clippy::expect_used)] use super::{ - align_fastembed_system_settings, fastembed_model_dimension, - list_fastembed_embedding_models, resolve_fastembed_model_code, EmbeddingError, - DEFAULT_FASTEMBED_MODEL_CODE, + DEFAULT_FASTEMBED_MODEL_CODE, EmbeddingError, align_fastembed_system_settings, + fastembed_model_dimension, list_fastembed_embedding_models, resolve_fastembed_model_code, }; use crate::storage::types::system_settings::SystemSettings; use crate::utils::config::{AppConfig, EmbeddingBackend, ParseEmbeddingBackendError}; diff --git a/common/src/utils/ingest_limits.rs b/common/src/utils/ingest_limits.rs index fbe0261..41bf7b2 100644 --- a/common/src/utils/ingest_limits.rs +++ b/common/src/utils/ingest_limits.rs @@ -47,13 +47,13 @@ pub fn validate_ingest_input( ))); } - if let Some(content) = content { - if content.len() > config.ingest_max_content_bytes { - return Err(IngestValidationError::PayloadTooLarge(format!( - "content is too large: maximum allowed is {} bytes", - config.ingest_max_content_bytes - ))); - } + if let Some(content) = content + && content.len() > config.ingest_max_content_bytes + { + return Err(IngestValidationError::PayloadTooLarge(format!( + "content is too large: maximum allowed is {} bytes", + config.ingest_max_content_bytes + ))); } if ctx.len() > config.ingest_max_context_bytes { diff --git a/common/src/utils/template_engine.rs b/common/src/utils/template_engine.rs index 54d33a1..7e6fb4b 100644 --- a/common/src/utils/template_engine.rs +++ b/common/src/utils/template_engine.rs @@ -1,4 +1,4 @@ -pub use minijinja::{path_loader, Environment, Value}; +pub use minijinja::{Environment, Value, path_loader}; pub use minijinja_autoreload::AutoReloader; pub use minijinja_contrib; pub use minijinja_embed; diff --git a/docs/ci-cd-roadmap.md b/docs/ci-cd-roadmap.md new file mode 100644 index 0000000..5876b06 --- /dev/null +++ b/docs/ci-cd-roadmap.md @@ -0,0 +1,288 @@ +# CI/CD Roadmap: Nix-First Release Builds + +This document tracks the migration from cargo-dist raw `cargo build --release` on bare GitHub runners to Nix-built release artifacts for all platforms. The goal is a single build system (the flake) shared by CI, Docker, and release binaries. + +**Status:** Phase 3–4 complete locally — Nix builds all release targets including Windows cross (`nix build .#minne-release-windows` verified on x86_64-linux). cargo-dist removed from workflow and devenv. GHA tag-push validation pending. + +**Decision (2026-06-23):** Drop `x86_64-apple-darwin` (Intel macOS). Ship `aarch64-apple-darwin` only; Intel Mac users can run via Rosetta 2. + +--- + +## Executive Summary + +Nix is now the sole compiler for all release binaries. Per-platform `minne-release` flake outputs produce archives compatible with GitHub Releases layout (binaries + `lib/libonnxruntime.*` + docs). The release workflow uses matrix jobs running `nix build` with `cache-nix-action` on every job. cargo-dist has been removed; releases use `gh release create` with CHANGELOG-driven notes. + +This fixes the mozangle/clang failure at the root: the flake already wires `libclang`, `bindgenHook`, `llvm`, `python3`, `fontconfig`, and `MOZJS_ARCHIVE` — cargo-dist on bare Ubuntu cannot see any of that without duplicating it in apt/workflow steps. + +--- + +## Current State + +### What works + +- [x] CI (`nix flake check`) — format, clippy, tests, ort-version gate via Crane `buildDepsOnly` +- [x] Release Docker job — `nix build .#dockerImage`, push to GHCR with dynamic tag from `docker load` +- [x] Release plan job — `nix flake check`, ORT version from flake (no cargo-dist) +- [x] Harmonized native deps in `flake.nix` for CI/Docker (openssl, libglvnd, onnxruntime, fontconfig, bindgen, mozjs) + +### What is broken or painful + +- [x] ~~cargo-dist Linux build fails without apt/mozjs workarounds~~ — resolved: Nix builds all platforms +- [x] ~~Two build systems: Nix for CI/Docker, cargo + apt/homebrew for dist binaries~~ — resolved: Nix-only release +- [x] ~~Four independent release compiles with no shared Nix store across jobs~~ — resolved: `cache-nix-action` on all release jobs +- [x] ~~`[dist.dependencies.apt]` duplicates flake.nix logic~~ — resolved: `dist-workspace.toml` deleted +- [ ] Release compile time on GHA not yet measured post-migration (expected ~10–25 min warm vs ~50–110 min cold) +- [ ] GHA tag-push validation pending for macOS and Windows archives + +--- + +## Target Architecture + +| Layer | Owner | Notes | +|-------|-------|-------| +| Compile binaries | **Nix** (`minne-pkg` / cross derivations) | Crane + `commonArgs`, per-platform mozjs | +| Bundle ORT + runtime libs | **Nix** (`minne-release`) | Match `include = ["lib"]` layout | +| Create archives | **Nix** or thin shell | `.tar.xz` (Unix), `.zip` (Windows) | +| Publish GitHub Release | **`gh release create`** | CHANGELOG body | +| Docker image | **Nix** (unchanged) | Shares `minne-pkg` derivation with Linux release | +| cargo-dist | **Removed** | Replaced by Nix jobs + `gh release` | + +### Release targets (end state) + +| Target | Builder | Nix output | +|--------|---------|------------| +| `x86_64-unknown-linux-gnu` | `ubuntu-22.04` native | `.#minne-release` | +| `aarch64-apple-darwin` | `macos-latest` native | `.#minne-release` | +| ~~`x86_64-apple-darwin`~~ | **Dropped** | — | +| `x86_64-pc-windows-msvc` | `ubuntu-22.04` cross | `.#minne-release-windows` | + +--- + +## Per-Platform Build Matrix + +| Target | Feasibility | Nix command | Artifact layout | Blockers | +|--------|-------------|-------------|-----------------|----------| +| `x86_64-unknown-linux-gnu` | Ready with modest flake changes | `nix build .#minne-release --system x86_64-linux` | `main-{ver}-x86_64-unknown-linux-gnu.tar.xz` → `main/`, `server/`, `worker/`, `lib/libonnxruntime.so`, README, LICENSE, CHANGELOG | glibc 2.40 (nixpkgs-unstable) vs Ubuntu 22.04 glibc 2.35; portable runtime bundling needed | +| `aarch64-apple-darwin` | Feasible | `nix build .#minne-release --system aarch64-darwin` | `main-{ver}-aarch64-apple-darwin.tar.xz` + `lib/libonnxruntime.dylib` | Per-system mozjs URL; Darwin `postInstall` assumes Linux today | +| `x86_64-pc-windows-msvc` | Feasible with new cross flake | `nix build .#minne-release-windows` (x86_64-linux host) | `main-{ver}-x86_64-pc-windows-msvc.zip` + `lib/onnxruntime.dll` | Crane + cargo-xwin cross setup; no native Nix-on-Windows for v1 | + +### mozjs prebuilt availability (mozjs-sys-v140.10.1-0) + +Confirmed for all release triples: + +- `libmozjs-x86_64-unknown-linux-gnu.tar.gz` +- `libmozjs-aarch64-apple-darwin.tar.gz` +- `libmozjs-x86_64-pc-windows-msvc.tar.gz` + +--- + +## Caching Strategy + +| Layer | Invalidated by | Shared across | +|-------|----------------|---------------| +| Nix store (system deps) | `flake.lock`, `*.nix`, `Cargo.lock` | plan, CI, Docker, all release jobs (per OS) | +| `cargoArtifacts` (`buildDepsOnly`) | `Cargo.lock` dep changes only | minne-pkg, clippy, test, dockerImage, release | +| `minne-pkg` (source) | Application source changes | dockerImage, release | +| cargo-dist `target/` | Version bumps (weak) | Removed — Nix store replaces it | + +### Expected release times + +| Scenario | Current (cargo-dist) | After migration (Nix) | +|----------|---------------------|-------------------------| +| Cold release (version bump, no cache) | ~50–110 min × 4 jobs | ~45–90 min × 3 jobs (no Intel Mac) | +| Warm release (source-only, cache hit) | Still ~full rebuild | ~10–25 min incremental per OS | +| No-op re-release | Full rebuild | ~2–5 min if derivations unchanged | +| Docker job (cached) | ~5–15 min | Unchanged; shares `minne-pkg` with Linux release | + +`buildDepsOnly` survives version bumps (version is in flake `minneVersion`, not `Cargo.lock`) — major win over cargo-dist. + +--- + +## Implementation Phases + +### Phase 1 — Linux via Nix (highest pain, highest value) + +- [x] Add per-system `mozjsArchive` helper with hash map (at minimum fix structure for all platforms) +- [x] Add `nix/minne-release.nix` — bundle ORT + portable runtime libs + docs into archive +- [x] Linux portable runtime lib bundling + `patchelf --set-rpath '$ORIGIN/lib'` +- [x] Replace Linux `build-local-artifacts` steps with `nix build .#minne-release` +- [x] Add `cache-nix-action` to Linux release build job +- [x] Validate glibc portability (test binary on Ubuntu 22.04) +- [x] Remove Linux apt/mozjs/ORT curl workarounds from `.github/workflows/release.yml` +- [x] Archive naming matches prior releases (`main-{triple}.tar.xz`, no version in filename) + +### Phase 2 — macOS aarch64 + +- [x] Platform-conditional `postInstall` in `flake.nix` (Darwin vs Linux wrapping) +- [x] Add `nix/minne-release-darwin.nix` — ORT + runtime dylibs + docs archive +- [x] macOS `build-local-artifacts` uses `nix build .#minne-release` on `macos-latest` +- [x] `cache-nix-action` on macOS release build job +- [x] Drop `x86_64-apple-darwin` target (was in `dist-workspace.toml`, now deleted) +- [ ] Test archive on clean macOS VM / GHA release run +- [x] Update `docs/installation.md` to note aarch64-only macOS binary (Rosetta 2 for Intel Macs) + +### Phase 3 — Windows cross from Linux + +- [x] Add `minne-release-windows` cross derivation (Crane + cargo-xwin) +- [x] Add `nix/clang-cl-msvc-link-wrapper.sh` for mozangle DLL links under clang-cl +- [x] Windows GHA job on `ubuntu-22.04` (cross-build, not Nix-on-Windows) +- [x] Bundle `onnxruntime.dll` in release zip (match cargo-dist flat layout) +- [x] Fenix `rust-std` for `x86_64-pc-windows-msvc` via `fenix.combine` +- [x] Local cross-build verified: `nix build .#minne-release-windows` on x86_64-linux +- [ ] Test archive on Windows VM / GHA release run + +### Phase 4 — Cleanup + +- [x] Remove cargo-dist compile steps from release workflow +- [x] Delete `dist-workspace.toml` +- [x] Simplify CI to `nix flake check` only (drop `cargo-dist plan`) +- [x] Replace `host`/cargo-dist with `gh release create` + CHANGELOG +- [x] Remove `pkgs.cargo-dist` from `devenv.nix` +- [x] Update `AGENTS.md` release checklist +- [ ] Update README release badges/docs if workflow structure changes + +--- + +## Flake Changes (outline) + +New/modified outputs: + +```nix +# Per-system mozjs (replace hardcoded Linux x86_64) +mozjsTarget = { "x86_64-linux" = "x86_64-unknown-linux-gnu"; ... }.${system}; +mozjsArchive = pkgs.fetchurl { url = ".../libmozjs-${mozjsTarget}.tar.gz"; hash = mozjsHashes.${system}; }; + +# Platform-conditional postInstall (Linux LD_LIBRARY_PATH vs Darwin) + +# NEW: release archive derivation +packages.minne-release = callPackage ./nix/minne-release.nix { inherit minne-pkg minneVersion ortVersion; }; + +# NEW: Windows cross (x86_64-linux host only) +packages.minne-release-windows = ...; +``` + +New file: `nix/minne-release.nix` — copies stripped binaries, stages `lib/libonnxruntime.{so,dylib}`, optional runtime `.so` copies, includes README/LICENSE/CHANGELOG, builds `.tar.xz` / `.zip`. + +Optional: `devShells.dist` for local release-build debugging. + +--- + +## Workflow Changes (outline) + +Target `release.yml` structure: + +``` +plan: + - nix flake check, nix eval ortVersion + - output tag from github.ref (no hardcoded versions) + +build-nix-artifacts: # replaces build-local-artifacts + matrix: linux | macos-aarch64 | windows-cross + - determinate-nix + cache-nix-action on ALL jobs + - nix build .#${attr} --system ${system} -L + - upload: main-*-{triple}.tar.xz / .zip + +build_and_push_docker_image: # unchanged + +release: # replaces build-global-artifacts + host + - download artifacts + - gh release create with CHANGELOG body +``` + +Artifact naming: match current convention for backwards compatibility — `main-{version}-{triple}.tar.xz` (Unix) / `.zip` (Windows). + +--- + +## cargo-dist Fate + +**Status:** Removed (Option B implemented in Phase 4). + +| Option | Verdict | +|--------|---------| +| A) Nix builds → cargo-dist packages only | No clean skip-compile mode; high friction | +| **B) Replace with custom Nix jobs + `gh release`** | **Implemented** | +| C) `build-local-artifacts = false` + custom jobs | Experimental; superseded by Option B | + +--- + +## Task Checklist (with complexity) + +| # | Task | Size | Phase | Done | +|---|------|------|-------|------| +| 1 | `mozjsArchive` per `system` with hash map | S | 1 | [x] | +| 2 | Platform-conditional `postInstall` in flake | S | 1–2 | [x] | +| 3 | `nix/minne-release.nix` archive bundler | M | 1 | [x] | +| 4 | Linux portable runtime lib bundling + patchelf | M | 1 | [x] | +| 5 | Replace Linux `build-local-artifacts` with Nix job | S | 1 | [x] | +| 6 | Add `cache-nix-action` to all release build jobs | S | 1–3 | [x] | +| 7 | glibc portability test + fix | M | 1 | [x] | +| 8 | Darwin release bundle + macOS GHA job | M | 2 | [x] | +| 9 | Drop `x86_64-apple-darwin` from targets | S | 2 | [x] | +| 10 | Windows cross flake (`minne-release-windows`) | L | 3 | [x] | +| 11 | Windows GHA job | S | 3 | [x] | +| 12 | Replace `host`/cargo-dist with `gh release` | S | 4 | [x] | +| 13 | Remove apt deps, ORT curl, cargo-dist install | S | 4 | [x] | +| 14 | Update docs/AGENTS release checklist | S | 4 | [x] | + +S = hours–1 day, M = 2–4 days, L = 1–2 weeks + +--- + +## Risks & Blockers + +| Risk | Severity | Mitigation | Resolved | +|------|----------|------------|----------| +| glibc compatibility (nixpkgs 2.40 vs Ubuntu 22.04 2.35) | High | Bundle runtime libs in `lib/` + `LD_LIBRARY_PATH` wrappers; bundled glibc interpreter | [x] | +| mozjs per-platform hashes drift on `Cargo.lock` bump | Medium | Centralize in `mozjsHashes` attrset; document bump procedure | [ ] | +| Darwin `postInstall` assumes Linux (`LD_LIBRARY_PATH`, `libglvnd`) | Medium | Platform-conditional wrapping in flake | [x] | +| Windows cross complexity (Crane + cargo-xwin) | Medium–High | cargo-xwin env + clang-cl wrapper for mozangle; Dbghelp.lib case symlink | [x] | +| Nix on macOS GHA speed | Medium | cache-nix-action; larger runner if needed | [ ] | +| Codesigning / notarization (macOS) | Low | Not required for CLI today; document `xattr` workaround; revisit if needed | [ ] | +| musl target (`x86_64-unknown-linux-musl`) | N/A | mozjs/servo stack is glibc-oriented; stay on `*-linux-gnu` unless explicitly requested | [ ] | +| ORT version drift | Low | Existing `ortVersion` gate in flake + devenv | [x] | + +--- + +## Open Questions + +1. **glibc portability strategy** — Bundle runtime libs in `lib/` (preferred for portability) vs pin `nixpkgs` to an older release channel for release builds vs document minimum distro? Need a test matrix: Ubuntu 22.04, Debian 12, Fedora current. + +2. **Archive format** — Confirmed: `.tar.xz` (Unix), `.zip` (Windows); naming `main-{triple}.*` (no version in filename). + +3. **Binary scope** — Release all three binaries (`main`, `server`, `worker`) in one archive per platform (unchanged from prior cargo-dist behavior). + +4. **PR artifact builds** — Not implemented; cargo-dist `pr-run-mode` was disabled. Revisit if PR smoke-test artifacts are wanted. + +5. **Cachix** — Deferred; `cache-nix-action` on all release jobs is sufficient for now. + +6. **Windows cross approach** — Resolved: Crane + offline xwin MSVC cache + fenix `rust-std` + clang-cl/lld-link shims (`nix build .#minne-release-windows` verified locally). + +7. **Version source of truth** — Release workflow reads version from flake (`minneVersion`). + +8. **cargo-dist removal timing** — Resolved: removed in Phase 4. + +9. **Intel Mac deprecation communication** — Done: `docs/installation.md` notes aarch64-only + Rosetta 2. + +--- + +## Success Criteria + +After implementation: + +- [x] Release workflow no longer runs raw `cargo build --release` on bare GitHub runners +- [x] Native deps (clang, mozjs, onnxruntime, etc.) come from flake/Nix, not apt +- [x] Linux, macOS (aarch64), and Windows release binaries are produced via Nix +- [x] Docker and release binaries share maximum Nix store cache (`cache-nix-action` on all jobs) +- [x] No hardcoded version strings in `release.yml` +- [ ] Warm release compile time materially improved (~10–25 min/platform vs ~50–110 min today) — pending GHA measurement +- [ ] macOS and Windows archives validated on clean VM / GHA tag-push release run + +--- + +## References + +- [Crane cross-windows example](https://crane.dev/examples/cross-windows.html) +- [Crane discussion: MSVC / cargo-xwin](https://github.com/ipetkov/crane/discussions/555) +- [cargo-dist CI customization](https://axodotdev.github.io/cargo-dist/book/ci/customizing.html) +- [servo/mozjs releases](https://github.com/servo/mozjs/releases) +- Project files: `flake.nix`, `.github/workflows/release.yml`, `devenv.nix`, `nix/minne-release*.nix` diff --git a/evaluations/Cargo.toml b/evaluations/Cargo.toml index b68e061..7d96a72 100644 --- a/evaluations/Cargo.toml +++ b/evaluations/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "evaluations" version = "0.1.0" -edition = "2021" +edition = "2024" [lints] workspace = true diff --git a/evaluations/src/args.rs b/evaluations/src/args.rs index b7d7f7d..7734af0 100644 --- a/evaluations/src/args.rs +++ b/evaluations/src/args.rs @@ -3,7 +3,7 @@ use std::{ path::{Path, PathBuf}, }; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use clap::{Args, Parser, ValueEnum}; use crate::datasets::DatasetKind; @@ -394,26 +394,26 @@ impl Config { )); } - if let Some(k) = self.retrieval.chunk_rrf_k { - if k <= 0.0 || !k.is_finite() { - return Err(anyhow!( - "--chunk-rrf-k must be a positive, finite number (got {k})" - )); - } + if let Some(k) = self.retrieval.chunk_rrf_k + && (k <= 0.0 || !k.is_finite()) + { + return Err(anyhow!( + "--chunk-rrf-k must be a positive, finite number (got {k})" + )); } - if let Some(weight) = self.retrieval.chunk_rrf_vector_weight { - if weight < 0.0 || !weight.is_finite() { - return Err(anyhow!( - "--chunk-rrf-vector-weight must be a non-negative, finite number (got {weight})" - )); - } + if let Some(weight) = self.retrieval.chunk_rrf_vector_weight + && (weight < 0.0 || !weight.is_finite()) + { + return Err(anyhow!( + "--chunk-rrf-vector-weight must be a non-negative, finite number (got {weight})" + )); } - if let Some(weight) = self.retrieval.chunk_rrf_fts_weight { - if weight < 0.0 || !weight.is_finite() { - return Err(anyhow!( - "--chunk-rrf-fts-weight must be a non-negative, finite number (got {weight})" - )); - } + if let Some(weight) = self.retrieval.chunk_rrf_fts_weight + && (weight < 0.0 || !weight.is_finite()) + { + return Err(anyhow!( + "--chunk-rrf-fts-weight must be a non-negative, finite number (got {weight})" + )); } if self.concurrency == 0 { @@ -426,16 +426,16 @@ impl Config { )); } - if let Some(query_model) = &self.query_model { - if query_model.trim().is_empty() { - return Err(anyhow!("--query-model requires a non-empty model name")); - } + if let Some(query_model) = &self.query_model + && query_model.trim().is_empty() + { + return Err(anyhow!("--query-model requires a non-empty model name")); } - if let Some(grow) = self.slice_grow { - if grow == 0 { - return Err(anyhow!("--slice-grow must be greater than zero")); - } + if let Some(grow) = self.slice_grow + && grow == 0 + { + return Err(anyhow!("--slice-grow must be greater than zero")); } if self.negative_multiplier <= 0.0 || !self.negative_multiplier.is_finite() { @@ -465,12 +465,11 @@ impl Config { } // Handle perf log dir env var fallback - if self.perf_log_dir.is_none() { - if let Ok(dir) = env::var("EVAL_PERF_LOG_DIR") { - if !dir.trim().is_empty() { - self.perf_log_dir = Some(PathBuf::from(dir)); - } - } + if self.perf_log_dir.is_none() + && let Ok(dir) = env::var("EVAL_PERF_LOG_DIR") + && !dir.trim().is_empty() + { + self.perf_log_dir = Some(PathBuf::from(dir)); } Ok(()) @@ -480,10 +479,10 @@ impl Config { let catalog = crate::datasets::catalog()?; let entry = catalog.dataset(self.dataset.id())?; - if self.slice.is_none() { - if let Some(default_slice) = entry.slices.first() { - self.slice = Some(default_slice.id.clone()); - } + if self.slice.is_none() + && let Some(default_slice) = entry.slices.first() + { + self.slice = Some(default_slice.id.clone()); } let Some(slice_id) = self.slice.as_deref() else { @@ -498,11 +497,11 @@ impl Config { return Ok(()); } - if let Some(limit) = slice.limit { - if self.limit_arg == 200 { - self.limit_arg = limit; - self.limit = Some(limit); - } + if let Some(limit) = slice.limit + && self.limit_arg == 200 + { + self.limit_arg = limit; + self.limit = Some(limit); } if self.corpus_limit.is_none() { self.corpus_limit = slice.corpus_limit; @@ -514,10 +513,10 @@ impl Config { self.llm_mode = include_unanswerable; self.retrieval.require_verified_chunks = !include_unanswerable; } - if let Some(multiplier) = slice.negative_multiplier { - if negative_multiplier_is_default(self.negative_multiplier) { - self.negative_multiplier = multiplier; - } + if let Some(multiplier) = slice.negative_multiplier + && negative_multiplier_is_default(self.negative_multiplier) + { + self.negative_multiplier = multiplier; } Ok(()) } diff --git a/evaluations/src/cli/status.rs b/evaluations/src/cli/status.rs index 7ee2b85..7178c11 100644 --- a/evaluations/src/cli/status.rs +++ b/evaluations/src/cli/status.rs @@ -9,8 +9,8 @@ use crate::{ args::Config, corpus::{self, CorpusCacheConfig}, datasets::{ - beir_subset_store_summary, beir_subset_stores_ready, content_checksum_for_layout, - detect_layout, mix_content_checksum, store_dir_for, ConvertedLayout, DatasetKind, + ConvertedLayout, DatasetKind, beir_subset_store_summary, beir_subset_stores_ready, + content_checksum_for_layout, detect_layout, mix_content_checksum, store_dir_for, }, db::{connect_eval_db, default_database, default_namespace, namespace_has_corpus}, slice::{self, ledger_target}, diff --git a/evaluations/src/corpus/mod.rs b/evaluations/src/corpus/mod.rs index ba41444..a95ab32 100644 --- a/evaluations/src/corpus/mod.rs +++ b/evaluations/src/corpus/mod.rs @@ -8,8 +8,9 @@ pub use orchestrator::{ load_cached_manifest, persist_corpus_manifest, }; pub use store::{ - seed_manifest_into_db, window_manifest, CorpusHandle, CorpusManifest, CorpusMetadata, - CorpusQuestion, NamespaceSeedRecord, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION, + CorpusHandle, CorpusManifest, CorpusMetadata, CorpusQuestion, MANIFEST_VERSION, + NamespaceSeedRecord, ParagraphShard, ParagraphShardStore, seed_manifest_into_db, + window_manifest, }; pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig { diff --git a/evaluations/src/corpus/orchestrator.rs b/evaluations/src/corpus/orchestrator.rs index d96c338..eae3817 100644 --- a/evaluations/src/corpus/orchestrator.rs +++ b/evaluations/src/corpus/orchestrator.rs @@ -6,14 +6,14 @@ use std::{ sync::Arc, }; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use async_openai::Client; use chrono::Utc; use common::{ storage::{ db::SurrealDbClient, store::{DynStorage, StorageManager}, - types::{ingestion_payload::IngestionPayload, ingestion_task::IngestionTask, StoredObject}, + types::{StoredObject, ingestion_payload::IngestionPayload, ingestion_task::IngestionTask}, }, utils::config::{AppConfig, StorageKind}, }; @@ -31,7 +31,7 @@ use crate::{ use crate::corpus::{ CorpusCacheConfig, CorpusHandle, CorpusManifest, CorpusMetadata, CorpusQuestion, - ParagraphShard, ParagraphShardStore, MANIFEST_VERSION, + MANIFEST_VERSION, ParagraphShard, ParagraphShardStore, }; const INGESTION_SPEC_VERSION: u32 = 2; diff --git a/evaluations/src/corpus/store.rs b/evaluations/src/corpus/store.rs index c83dff3..ea5c768 100644 --- a/evaluations/src/corpus/store.rs +++ b/evaluations/src/corpus/store.rs @@ -5,16 +5,17 @@ use std::{ path::PathBuf, }; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use chrono::{DateTime, Utc}; use common::storage::{ db::SurrealDbClient, types::{ - knowledge_entity::KnowledgeEntity, knowledge_relationship::KnowledgeRelationship, - text_chunk::TextChunk, text_content::TextContent, StoredObject, + StoredObject, knowledge_entity::KnowledgeEntity, + knowledge_relationship::KnowledgeRelationship, text_chunk::TextChunk, + text_content::TextContent, }, }; -use ingestion_pipeline::{persist_artifacts, IngestionTuning, PipelineArtifacts}; +use ingestion_pipeline::{IngestionTuning, PipelineArtifacts, persist_artifacts}; use serde::Deserialize; use tracing::{debug, warn}; @@ -304,7 +305,7 @@ impl ParagraphShardStore { Ok(file) => file, Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(None), Err(err) => { - return Err(err).with_context(|| format!("opening shard {}", path.display())) + return Err(err).with_context(|| format!("opening shard {}", path.display())); } }; let reader = BufReader::new(file); diff --git a/evaluations/src/datasets/beir.rs b/evaluations/src/datasets/beir.rs index 6c891d7..befe0bc 100644 --- a/evaluations/src/datasets/beir.rs +++ b/evaluations/src/datasets/beir.rs @@ -5,7 +5,7 @@ use std::{ path::{Path, PathBuf}, }; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use serde::Deserialize; use tracing::warn; @@ -138,10 +138,10 @@ pub fn convert_beir_documents( continue; }; - if let Some(filter) = doc_ids { - if !filter.contains(&best.doc_id) { - continue; - } + if let Some(filter) = doc_ids + && !filter.contains(&best.doc_id) + { + continue; } let Some(¶graph_slot) = paragraph_index.get(&best.doc_id) else { diff --git a/evaluations/src/datasets/beir_mix.rs b/evaluations/src/datasets/beir_mix.rs index c60dc9e..37e8993 100644 --- a/evaluations/src/datasets/beir_mix.rs +++ b/evaluations/src/datasets/beir_mix.rs @@ -1,17 +1,16 @@ use std::collections::{HashMap, HashSet}; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use sha2::{Digest, Sha256}; use tracing::info; use super::{ - beir, + BEIR_DATASETS, ConvertedDataset, DatasetKind, DatasetMetadata, beir, checksum::hash_file, store::{ self, build_dataset_from_catalog, paragraph_path, read_meta, store_dir_for, upsert_sharded_paragraphs, write_sharded, }, - ConvertedDataset, DatasetKind, DatasetMetadata, BEIR_DATASETS, }; use crate::{args::Config, slice}; diff --git a/evaluations/src/datasets/checksum.rs b/evaluations/src/datasets/checksum.rs index 4493a75..cbff7fd 100644 --- a/evaluations/src/datasets/checksum.rs +++ b/evaluations/src/datasets/checksum.rs @@ -112,10 +112,10 @@ pub fn write_sidecar(content_path: &Path, sha256: &str) -> Result<()> { #[cfg(test)] pub fn content_checksum(content_path: &Path) -> Result { let sidecar_path = ChecksumSidecar::sidecar_path(content_path); - if let Some(sidecar) = read_sidecar(&sidecar_path)? { - if sidecar.is_valid_for(content_path) { - return Ok(sidecar.sha256); - } + if let Some(sidecar) = read_sidecar(&sidecar_path)? + && sidecar.is_valid_for(content_path) + { + return Ok(sidecar.sha256); } let sha256 = hash_file(content_path)?; write_sidecar(content_path, &sha256)?; @@ -125,19 +125,17 @@ pub fn content_checksum(content_path: &Path) -> Result { pub fn store_aggregate_checksum(store_dir: &Path) -> Result { let marker = store_dir.join("checksum.sha256"); let meta = store_dir.join("meta.json"); - if marker.is_file() && meta.is_file() { - if let (Ok(marker_meta), Ok(meta_meta)) = (marker.metadata(), meta.metadata()) { - if marker_meta - .modified() - .ok() - .zip(meta_meta.modified().ok()) - .is_some_and(|(marker_modified, meta_modified)| marker_modified >= meta_modified) - { - if let Some(sidecar) = read_sidecar(&marker)? { - return Ok(sidecar.sha256); - } - } - } + if marker.is_file() + && meta.is_file() + && let (Ok(marker_meta), Ok(meta_meta)) = (marker.metadata(), meta.metadata()) + && marker_meta + .modified() + .ok() + .zip(meta_meta.modified().ok()) + .is_some_and(|(marker_modified, meta_modified)| marker_modified >= meta_modified) + && let Some(sidecar) = read_sidecar(&marker)? + { + return Ok(sidecar.sha256); } let mut entries = Vec::new(); diff --git a/evaluations/src/datasets/loader.rs b/evaluations/src/datasets/loader.rs index 4994f45..74a9e97 100644 --- a/evaluations/src/datasets/loader.rs +++ b/evaluations/src/datasets/loader.rs @@ -4,12 +4,11 @@ use anyhow::{Context, Result}; use tracing::info; use super::{ - catalog, + ConvertedDataset, DatasetKind, catalog, store::{ - self, build_dataset_from_catalog, detect_layout, read_meta, store_dir_for, write_sharded, - ConvertedLayout, + self, ConvertedLayout, build_dataset_from_catalog, detect_layout, read_meta, store_dir_for, + write_sharded, }, - ConvertedDataset, DatasetKind, }; use crate::{ args::Config, @@ -69,21 +68,19 @@ fn load_from_store( let meta = read_meta(store_dir)?; validate_metadata_fields(&meta.metadata, dataset_kind, config)?; - if allow_partial { - if let Some(paragraph_ids) = slice_paragraph_ids_for_fast_path(config)? { - let unique: HashSet = paragraph_ids.into_iter().collect(); - info!( - paragraphs = unique.len(), - store = %store_dir.display(), - "Loading slice-addressed paragraphs from sharded converted store" - ); - let dataset = build_dataset_from_catalog(store_dir, &unique)?; - return Ok(LoadedDataset { - dataset, - content_checksum: checksum, - partial: true, - }); - } + if allow_partial && let Some(paragraph_ids) = slice_paragraph_ids_for_fast_path(config)? { + let unique: HashSet = paragraph_ids.into_iter().collect(); + info!( + paragraphs = unique.len(), + store = %store_dir.display(), + "Loading slice-addressed paragraphs from sharded converted store" + ); + let dataset = build_dataset_from_catalog(store_dir, &unique)?; + return Ok(LoadedDataset { + dataset, + content_checksum: checksum, + partial: true, + }); } info!( diff --git a/evaluations/src/datasets/mod.rs b/evaluations/src/datasets/mod.rs index ad546e8..c037086 100644 --- a/evaluations/src/datasets/mod.rs +++ b/evaluations/src/datasets/mod.rs @@ -13,7 +13,7 @@ use std::{ str::FromStr, }; -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{Context, Result, anyhow, bail}; use chrono::{DateTime, TimeZone, Utc}; use clap::ValueEnum; use once_cell::sync::OnceCell; @@ -226,7 +226,7 @@ pub use beir_mix::{beir_subset_store_summary, beir_subset_stores_ready, mix_cont pub use checksum::store_aggregate_checksum; pub use loader::{prebuild_catalog_slices, prepare_dataset}; pub use store::{ - content_checksum_for_layout, detect_layout, store_dir_for, write_sharded, ConvertedLayout, + ConvertedLayout, content_checksum_for_layout, detect_layout, store_dir_for, write_sharded, }; pub fn catalog() -> Result<&'static DatasetCatalog> { @@ -383,7 +383,9 @@ impl FromStr for DatasetKind { "scifact" => Ok(Self::Scifact), "nq-beir" | "natural-questions-beir" => Ok(Self::NqBeir), other => { - anyhow::bail!("unknown dataset '{other}'. Expected one of: squad, natural-questions, beir, fever, fiqa, hotpotqa, nfcorpus, quora, trec-covid, scifact, nq-beir.") + anyhow::bail!( + "unknown dataset '{other}'. Expected one of: squad, natural-questions, beir, fever, fiqa, hotpotqa, nfcorpus, quora, trec-covid, scifact, nq-beir." + ) } } } diff --git a/evaluations/src/datasets/store.rs b/evaluations/src/datasets/store.rs index 5335e79..fb09b61 100644 --- a/evaluations/src/datasets/store.rs +++ b/evaluations/src/datasets/store.rs @@ -5,14 +5,14 @@ use std::{ path::{Path, PathBuf}, }; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use tracing::info; use super::{ - checksum::store_aggregate_checksum, ConvertedDataset, ConvertedParagraph, ConvertedQuestion, - DatasetMetadata, + ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetMetadata, + checksum::store_aggregate_checksum, }; use crate::slice; diff --git a/evaluations/src/db/connect.rs b/evaluations/src/db/connect.rs index be4cd28..2885a3d 100644 --- a/evaluations/src/db/connect.rs +++ b/evaluations/src/db/connect.rs @@ -1,10 +1,10 @@ -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use chrono::Utc; use common::{ storage::{ db::SurrealDbClient, - types::user::{Theme, User}, types::StoredObject, + types::user::{Theme, User}, }, utils::embedding::EmbeddingProvider, }; diff --git a/evaluations/src/inspection.rs b/evaluations/src/inspection.rs index 915fc78..96042ba 100644 --- a/evaluations/src/inspection.rs +++ b/evaluations/src/inspection.rs @@ -1,6 +1,6 @@ use std::{collections::HashMap, fs, path::Path}; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use common::storage::{db::SurrealDbClient, types::text_chunk::TextChunk}; use crate::{args::Config, corpus, db::connect_eval_db}; diff --git a/evaluations/src/main.rs b/evaluations/src/main.rs index 720fbe2..251d355 100644 --- a/evaluations/src/main.rs +++ b/evaluations/src/main.rs @@ -17,36 +17,35 @@ mod types; use anyhow::Context; use tokio::runtime::Builder; use tracing::info; -use tracing_subscriber::{fmt, EnvFilter}; +use tracing_subscriber::{EnvFilter, fmt}; /// Configure `SurrealDB` environment variables for optimal performance #[allow(clippy::arithmetic_side_effects, clippy::unwrap_used)] fn configure_surrealdb_performance(cpu_count: usize) { let indexing_batch_size = std::env::var("SURREAL_INDEXING_BATCH_SIZE") .unwrap_or_else(|_| (cpu_count * 2).to_string()); - std::env::set_var("SURREAL_INDEXING_BATCH_SIZE", indexing_batch_size); - let max_order_queue = std::env::var("SURREAL_MAX_ORDER_LIMIT_PRIORITY_QUEUE_SIZE") .unwrap_or_else(|_| (cpu_count * 4).to_string()); - std::env::set_var( - "SURREAL_MAX_ORDER_LIMIT_PRIORITY_QUEUE_SIZE", - max_order_queue, - ); - let websocket_concurrent = std::env::var("SURREAL_WEBSOCKET_MAX_CONCURRENT_REQUESTS") .unwrap_or_else(|_| cpu_count.to_string()); - std::env::set_var( - "SURREAL_WEBSOCKET_MAX_CONCURRENT_REQUESTS", - websocket_concurrent, - ); - let websocket_buffer = std::env::var("SURREAL_WEBSOCKET_RESPONSE_BUFFER_SIZE") .unwrap_or_else(|_| (cpu_count * 8).to_string()); - std::env::set_var("SURREAL_WEBSOCKET_RESPONSE_BUFFER_SIZE", websocket_buffer); - let transaction_cache = std::env::var("SURREAL_TRANSACTION_CACHE_SIZE") .unwrap_or_else(|_| (cpu_count * 16).to_string()); - std::env::set_var("SURREAL_TRANSACTION_CACHE_SIZE", transaction_cache); + // SAFETY: single-threaded setup before SurrealDB clients are created. + unsafe { + std::env::set_var("SURREAL_INDEXING_BATCH_SIZE", indexing_batch_size); + std::env::set_var( + "SURREAL_MAX_ORDER_LIMIT_PRIORITY_QUEUE_SIZE", + max_order_queue, + ); + std::env::set_var( + "SURREAL_WEBSOCKET_MAX_CONCURRENT_REQUESTS", + websocket_concurrent, + ); + std::env::set_var("SURREAL_WEBSOCKET_RESPONSE_BUFFER_SIZE", websocket_buffer); + std::env::set_var("SURREAL_TRANSACTION_CACHE_SIZE", transaction_cache); + } info!( indexing_batch_size = %std::env::var("SURREAL_INDEXING_BATCH_SIZE").unwrap(), diff --git a/evaluations/src/openai.rs b/evaluations/src/openai.rs index e40ca1a..decc83b 100644 --- a/evaluations/src/openai.rs +++ b/evaluations/src/openai.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use anyhow::{Context, Result}; -use async_openai::{config::OpenAIConfig, Client}; +use async_openai::{Client, config::OpenAIConfig}; const DEFAULT_BASE_URL: &str = "https://api.openai.com/v1"; diff --git a/evaluations/src/pipeline/context.rs b/evaluations/src/pipeline/context.rs index 08d8723..e86cfd4 100644 --- a/evaluations/src/pipeline/context.rs +++ b/evaluations/src/pipeline/context.rs @@ -4,7 +4,7 @@ use std::{ time::{Duration, Instant}, }; -use anyhow::{anyhow, Result}; +use anyhow::{Result, anyhow}; use async_openai::Client; use common::{ storage::{ diff --git a/evaluations/src/pipeline/stages/finalize.rs b/evaluations/src/pipeline/stages/finalize.rs index 82d8e53..1c5fe3a 100644 --- a/evaluations/src/pipeline/stages/finalize.rs +++ b/evaluations/src/pipeline/stages/finalize.rs @@ -16,12 +16,12 @@ pub(crate) async fn finalize(ctx: &mut EvaluationContext<'_>) -> anyhow::Result< ); let started = Instant::now(); - if let Some(path) = ctx.diagnostics_path.as_ref() { - if ctx.diagnostics_enabled { - write_chunk_diagnostics(path.as_path(), &ctx.diagnostics_output) - .await - .with_context(|| format!("writing chunk diagnostics to {}", path.display()))?; - } + if let Some(path) = ctx.diagnostics_path.as_ref() + && ctx.diagnostics_enabled + { + write_chunk_diagnostics(path.as_path(), &ctx.diagnostics_output) + .await + .with_context(|| format!("writing chunk diagnostics to {}", path.display()))?; } info!( diff --git a/evaluations/src/pipeline/stages/prepare_corpus.rs b/evaluations/src/pipeline/stages/prepare_corpus.rs index d5da651..d2ed956 100644 --- a/evaluations/src/pipeline/stages/prepare_corpus.rs +++ b/evaluations/src/pipeline/stages/prepare_corpus.rs @@ -40,8 +40,8 @@ pub(crate) async fn prepare_corpus(ctx: &mut EvaluationContext<'_>) -> anyhow::R if !config.reseed_slice { let requested_cases = window.cases.len(); - if let Some(manifest) = corpus::load_cached_manifest(&base_dir)? { - if can_reuse_namespace( + if let Some(manifest) = corpus::load_cached_manifest(&base_dir)? + && can_reuse_namespace( ctx.db()?, &manifest, &embedding_provider, @@ -51,28 +51,27 @@ pub(crate) async fn prepare_corpus(ctx: &mut EvaluationContext<'_>) -> anyhow::R requested_cases, ) .await? - { - info!( - cache = %base_dir.display(), - namespace = ctx.namespace.as_str(), - database = ctx.database.as_str(), - "Namespace already seeded; reusing cached corpus manifest" - ); - let corpus_handle = corpus::corpus_handle_from_manifest(manifest, base_dir); - ctx.corpus_handle = Some(corpus_handle); - ctx.expected_fingerprint = Some(expected_fingerprint); - ctx.ingestion_duration_ms = 0; + { + info!( + cache = %base_dir.display(), + namespace = ctx.namespace.as_str(), + database = ctx.database.as_str(), + "Namespace already seeded; reusing cached corpus manifest" + ); + let corpus_handle = corpus::corpus_handle_from_manifest(manifest, base_dir); + ctx.corpus_handle = Some(corpus_handle); + ctx.expected_fingerprint = Some(expected_fingerprint); + ctx.ingestion_duration_ms = 0; - let elapsed = started.elapsed(); - ctx.record_stage_duration(stage, elapsed); - info!( - evaluation_stage = stage.label(), - duration_ms = elapsed.as_millis(), - "completed evaluation stage" - ); + let elapsed = started.elapsed(); + ctx.record_stage_duration(stage, elapsed); + info!( + evaluation_stage = stage.label(), + duration_ms = elapsed.as_millis(), + "completed evaluation stage" + ); - return Ok(()); - } + return Ok(()); } } diff --git a/evaluations/src/pipeline/stages/prepare_db.rs b/evaluations/src/pipeline/stages/prepare_db.rs index 3eb09cb..0cf85f9 100644 --- a/evaluations/src/pipeline/stages/prepare_db.rs +++ b/evaluations/src/pipeline/stages/prepare_db.rs @@ -1,6 +1,6 @@ use std::time::Instant; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use tracing::info; use crate::{ @@ -9,7 +9,7 @@ use crate::{ openai, settings::{enforce_system_settings, load_or_init_system_settings}, }; -use common::utils::embedding::{default_embedding_pool_size, EmbeddingProvider}; +use common::utils::embedding::{EmbeddingProvider, default_embedding_pool_size}; use super::super::context::{EvalStage, EvaluationContext}; @@ -65,16 +65,16 @@ pub(crate) async fn prepare_db(ctx: &mut EvaluationContext<'_>) -> anyhow::Resul let (mut settings, settings_missing) = load_or_init_system_settings(&db, provider_dimension).await?; - if config.embedding_backend == EmbeddingBackend::FastEmbed { - if let Some(model_code) = embedding_provider.model_code() { - let sanitized = sanitize_model_code(&model_code); - let path = config.cache_dir.join(format!("{sanitized}.json")); - if config.force_convert && path.exists() { - tokio::fs::remove_file(&path) - .await - .with_context(|| format!("removing stale cache {}", path.display())) - .ok(); - } + if config.embedding_backend == EmbeddingBackend::FastEmbed + && let Some(model_code) = embedding_provider.model_code() + { + let sanitized = sanitize_model_code(&model_code); + let path = config.cache_dir.join(format!("{sanitized}.json")); + if config.force_convert && path.exists() { + tokio::fs::remove_file(&path) + .await + .with_context(|| format!("removing stale cache {}", path.display())) + .ok(); } } diff --git a/evaluations/src/pipeline/stages/prepare_namespace.rs b/evaluations/src/pipeline/stages/prepare_namespace.rs index 8eb635d..31be6f3 100644 --- a/evaluations/src/pipeline/stages/prepare_namespace.rs +++ b/evaluations/src/pipeline/stages/prepare_namespace.rs @@ -1,6 +1,6 @@ use std::time::Instant; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use common::storage::types::system_settings::SystemSettings; use tracing::{info, warn}; diff --git a/evaluations/src/pipeline/stages/run_queries.rs b/evaluations/src/pipeline/stages/run_queries.rs index 96fd948..ac7761b 100644 --- a/evaluations/src/pipeline/stages/run_queries.rs +++ b/evaluations/src/pipeline/stages/run_queries.rs @@ -1,6 +1,6 @@ use std::{collections::HashSet, sync::Arc, time::Instant}; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use common::storage::types::StoredObject; use futures::stream::{self, StreamExt}; use tracing::{debug, info}; @@ -9,8 +9,8 @@ use crate::{ cases::SeededCase, context_stats, types::{ - adapt_retrieval_output, build_case_diagnostics, text_contains_answer, CaseDiagnostics, - CaseSummary, RetrievedSummary, + CaseDiagnostics, CaseSummary, RetrievedSummary, adapt_retrieval_output, + build_case_diagnostics, text_contains_answer, }, }; use retrieval_pipeline::{ @@ -391,9 +391,5 @@ fn calculate_ndcg(retrieved: &[RetrievedSummary], k: usize) -> f64 { idcg += rel / (f64::from(i) + 2.0).log2(); } - if idcg == 0.0 { - 0.0 - } else { - dcg / idcg - } + if idcg == 0.0 { 0.0 } else { dcg / idcg } } diff --git a/evaluations/src/pipeline/stages/summarize.rs b/evaluations/src/pipeline/stages/summarize.rs index 9603439..d8b047a 100644 --- a/evaluations/src/pipeline/stages/summarize.rs +++ b/evaluations/src/pipeline/stages/summarize.rs @@ -4,8 +4,8 @@ use chrono::Utc; use tracing::info; use crate::types::{ - build_stage_latency_breakdown, compute_latency_stats, EvaluationSummary, PerformanceTimings, - RetrievedContextStats, + EvaluationSummary, PerformanceTimings, RetrievedContextStats, build_stage_latency_breakdown, + compute_latency_stats, }; use super::super::context::{EvalStage, EvaluationContext}; diff --git a/evaluations/src/report.rs b/evaluations/src/report.rs index e299567..f96d2de 100644 --- a/evaluations/src/report.rs +++ b/evaluations/src/report.rs @@ -8,8 +8,8 @@ use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; use crate::types::{ - format_timestamp, CaseSummary, EvaluationStageTimings, EvaluationSummary, LatencyStats, - RetrievalContextStats, StageLatencyBreakdown, + CaseSummary, EvaluationStageTimings, EvaluationSummary, LatencyStats, RetrievalContextStats, + StageLatencyBreakdown, format_timestamp, }; #[derive(Debug)] @@ -804,11 +804,7 @@ fn prettify_stage(label: &str) -> String { } fn bool_badge(value: bool) -> &'static str { - if value { - "✅" - } else { - "⚪" - } + if value { "✅" } else { "⚪" } } fn render_retrieved(entries: &[RetrievedSnippet]) -> String { diff --git a/evaluations/src/settings.rs b/evaluations/src/settings.rs index e642681..aa542b2 100644 --- a/evaluations/src/settings.rs +++ b/evaluations/src/settings.rs @@ -24,15 +24,15 @@ pub(crate) async fn enforce_system_settings( updated_settings.embedding_dimensions = provider_dimension as u32; needs_settings_update = true; } - if let Some(query_override) = config.query_model.as_deref() { - if settings.query_model != query_override { - info!( - model = query_override, - "Overriding system query model for this run" - ); - updated_settings.query_model = query_override.to_string(); - needs_settings_update = true; - } + if let Some(query_override) = config.query_model.as_deref() + && settings.query_model != query_override + { + info!( + model = query_override, + "Overriding system query model for this run" + ); + updated_settings.query_model = query_override.to_string(); + needs_settings_update = true; } if needs_settings_update { settings = SystemSettings::update(db, updated_settings) diff --git a/evaluations/src/slice/beir.rs b/evaluations/src/slice/beir.rs index 56108c9..d0a0bdd 100644 --- a/evaluations/src/slice/beir.rs +++ b/evaluations/src/slice/beir.rs @@ -1,12 +1,12 @@ use std::collections::{HashMap, VecDeque}; -use anyhow::{anyhow, Result}; -use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng}; +use anyhow::{Result, anyhow}; +use rand::{SeedableRng, rngs::StdRng, seq::SliceRandom}; use tracing::warn; -use crate::datasets::{ConvertedDataset, BEIR_DATASETS}; +use crate::datasets::{BEIR_DATASETS, ConvertedDataset}; -use super::build::{mix_seed, BuildParams}; +use super::build::{BuildParams, mix_seed}; #[allow(clippy::too_many_lines, clippy::arithmetic_side_effects)] pub(super) fn ordered_question_refs_beir( @@ -164,10 +164,10 @@ pub(super) fn ordered_question_refs_beir( pub(super) fn question_prefix(question_id: &str) -> Option<&'static str> { for prefix in BEIR_DATASETS.iter().map(|kind| kind.source_prefix()) { - if let Some(rest) = question_id.strip_prefix(prefix) { - if rest.starts_with('-') { - return Some(prefix); - } + if let Some(rest) = question_id.strip_prefix(prefix) + && rest.starts_with('-') + { + return Some(prefix); } } None diff --git a/evaluations/src/slice/mod.rs b/evaluations/src/slice/mod.rs index 9be5f48..53b12f2 100644 --- a/evaluations/src/slice/mod.rs +++ b/evaluations/src/slice/mod.rs @@ -5,9 +5,9 @@ use std::{ path::{Path, PathBuf}, }; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use chrono::{DateTime, Utc}; -use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng}; +use rand::{SeedableRng, rngs::StdRng, seq::SliceRandom}; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use tracing::{info, warn}; @@ -20,7 +20,7 @@ use crate::{ mod beir; mod build; -use build::{mix_seed, BuildParams}; +use build::{BuildParams, mix_seed}; const SLICE_VERSION: u32 = 2; pub const DEFAULT_NEGATIVE_MULTIPLIER: f32 = 4.0; @@ -1116,11 +1116,13 @@ mod tests { assert_eq!(window.cases.len(), 1); let positive_ids: Vec<&str> = window.positive_ids().collect(); assert_eq!(positive_ids.len(), 1); - assert!(resolved - .manifest - .paragraphs - .iter() - .any(|entry| entry.id == positive_ids[0])); + assert!( + resolved + .manifest + .paragraphs + .iter() + .any(|entry| entry.id == positive_ids[0]) + ); Ok(()) } diff --git a/html-router/Cargo.toml b/html-router/Cargo.toml index 1d231e6..16b626b 100644 --- a/html-router/Cargo.toml +++ b/html-router/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "html-router" version = "0.1.0" -edition = "2021" +edition = "2024" license = "AGPL-3.0-or-later" [lints] diff --git a/html-router/src/html_state.rs b/html-router/src/html_state.rs index 946851d..c1294d7 100644 --- a/html-router/src/html_state.rs +++ b/html-router/src/html_state.rs @@ -6,8 +6,8 @@ use common::{create_template_engine, storage::db::ProvidesDb, utils::config::App use retrieval_pipeline::reranking::RerankerPool; use std::collections::HashMap; use std::sync::{ - atomic::{AtomicUsize, Ordering}, Arc, + atomic::{AtomicUsize, Ordering}, }; use std::time::{Duration, Instant}; use tokio::sync::RwLock; diff --git a/html-router/src/lib.rs b/html-router/src/lib.rs index afe2269..f120394 100644 --- a/html-router/src/lib.rs +++ b/html-router/src/lib.rs @@ -13,14 +13,14 @@ pub mod router_factory; pub mod routes; pub mod utils; -use axum::{extract::FromRef, Router}; +use axum::{Router, extract::FromRef}; use axum_session::{Session, SessionStore}; use axum_session_auth::AuthSession; use axum_session_surreal::SessionSurrealPool; use common::storage::types::user::User; use html_state::HtmlState; use router_factory::RouterFactory; -use surrealdb::{engine::any::Any, Surreal}; +use surrealdb::{Surreal, engine::any::Any}; pub type AuthSessionType = AuthSession, Surreal>; pub type SessionType = Session>; diff --git a/html-router/src/middlewares/response_middleware.rs b/html-router/src/middlewares/response_middleware.rs index 4be6d96..0bfc09a 100644 --- a/html-router/src/middlewares/response_middleware.rs +++ b/html-router/src/middlewares/response_middleware.rs @@ -2,13 +2,13 @@ use std::collections::HashMap; use std::sync::Arc; use axum::{ + Extension, extract::{Request, State}, http::{HeaderName, StatusCode}, middleware::Next, response::{Html, IntoResponse, Redirect, Response}, - Extension, }; -use axum_htmx::{HxRequest, HX_TRIGGER}; +use axum_htmx::{HX_TRIGGER, HxRequest}; use common::{ error::AppError, utils::template_engine::{ProvidesTemplateEngine, Value}, @@ -18,7 +18,7 @@ use serde::Serialize; use serde_json::json; use tracing::error; -use crate::{html_state::HtmlState, AuthSessionType}; +use crate::{AuthSessionType, html_state::HtmlState}; use common::storage::types::{ conversation::{Conversation, SidebarConversation}, user::{Theme, User}, @@ -175,10 +175,10 @@ const HTMX_HEADERS_TO_FORWARD: &[&str] = &["HX-Push", "HX-Trigger", "HX-Redirect fn forward_headers(from: &axum::http::HeaderMap, to: &mut axum::http::HeaderMap) { for &header_name in HTMX_HEADERS_TO_FORWARD { - if let Ok(name) = HeaderName::from_bytes(header_name.as_bytes()) { - if let Some(value) = from.get(&name) { - to.insert(name.clone(), value.clone()); - } + if let Ok(name) = HeaderName::from_bytes(header_name.as_bytes()) + && let Some(value) = from.get(&name) + { + to.insert(name.clone(), value.clone()); } } } @@ -219,13 +219,13 @@ where let mut current_user = None; { - if let Some(auth) = req.extensions().get::() { - if let Some(user) = &auth.current_user { - is_authenticated = true; - user_theme = user.theme.as_str(); - initial_theme = user.theme.initial_theme(); - current_user = Some(TemplateUser::from(user)); - } + if let Some(auth) = req.extensions().get::() + && let Some(user) = &auth.current_user + { + is_authenticated = true; + user_theme = user.theme.as_str(); + initial_theme = user.theme.initial_theme(); + current_user = Some(TemplateUser::from(user)); } } diff --git a/html-router/src/router_factory.rs b/html-router/src/router_factory.rs index 2778b5a..73f5e04 100644 --- a/html-router/src/router_factory.rs +++ b/html-router/src/router_factory.rs @@ -1,9 +1,9 @@ -use axum::{extract::FromRef, middleware::from_fn_with_state, Router}; +use axum::{Router, extract::FromRef, middleware::from_fn_with_state}; use axum_session::SessionLayer; use axum_session_auth::{AuthConfig, AuthSessionLayer}; use axum_session_surreal::SessionSurrealPool; use common::storage::types::user::User; -use surrealdb::{engine::any::Any, Surreal}; +use surrealdb::{Surreal, engine::any::Any}; use crate::{ html_state::HtmlState, diff --git a/html-router/src/routes/account/handlers.rs b/html-router/src/routes/account/handlers.rs index 50a0ffa..714e9e9 100644 --- a/html-router/src/routes/account/handlers.rs +++ b/html-router/src/routes/account/handlers.rs @@ -1,13 +1,13 @@ -use axum::{extract::State, Form}; +use axum::{Form, extract::State}; use chrono_tz::TZ_VARIANTS; use serde::{Deserialize, Serialize}; use crate::{ + AuthSessionType, middlewares::{ auth_middleware::RequireUser, response_middleware::{TemplateResponse, TemplateResult}, }, - AuthSessionType, }; use common::storage::types::user::{Theme, User}; diff --git a/html-router/src/routes/account/mod.rs b/html-router/src/routes/account/mod.rs index 4aa399a..db847ec 100644 --- a/html-router/src/routes/account/mod.rs +++ b/html-router/src/routes/account/mod.rs @@ -1,8 +1,8 @@ mod handlers; use axum::{ + Router, extract::FromRef, routing::{delete, get, patch, post}, - Router, }; use crate::html_state::HtmlState; diff --git a/html-router/src/routes/admin/handlers.rs b/html-router/src/routes/admin/handlers.rs index b06331d..5abb378 100644 --- a/html-router/src/routes/admin/handlers.rs +++ b/html-router/src/routes/admin/handlers.rs @@ -1,7 +1,7 @@ use async_openai::types::models::ListModelResponse; use axum::{ - extract::{Query, State}, Form, + extract::{Query, State}, }; use serde::{Deserialize, Serialize}; @@ -18,8 +18,8 @@ use common::{ utils::{ config::AppConfig, embedding::{ - fastembed_model_dimension, is_valid_fastembed_model_code, - list_fastembed_embedding_models, EmbeddingBackend, FastEmbedModelOption, + EmbeddingBackend, FastEmbedModelOption, fastembed_model_dimension, + is_valid_fastembed_model_code, list_fastembed_embedding_models, }, }, }; diff --git a/html-router/src/routes/admin/mod.rs b/html-router/src/routes/admin/mod.rs index c77e47e..06fb840 100644 --- a/html-router/src/routes/admin/mod.rs +++ b/html-router/src/routes/admin/mod.rs @@ -1,9 +1,9 @@ mod handlers; use axum::{ + Router, extract::FromRef, middleware::from_fn, routing::{get, patch}, - Router, }; use handlers::{ patch_image_prompt, patch_ingestion_prompt, patch_query_prompt, show_admin_panel, diff --git a/html-router/src/routes/auth/mod.rs b/html-router/src/routes/auth/mod.rs index 4035205..269bcc2 100644 --- a/html-router/src/routes/auth/mod.rs +++ b/html-router/src/routes/auth/mod.rs @@ -2,7 +2,7 @@ pub mod signin; pub mod signout; pub mod signup; -use axum::{extract::FromRef, routing::get, Router}; +use axum::{Router, extract::FromRef, routing::get}; use signin::{authenticate_user, show_signin_form}; use signout::sign_out_user; use signup::{process_signup_and_show_verification, show_signup_form}; diff --git a/html-router/src/routes/auth/signin.rs b/html-router/src/routes/auth/signin.rs index 14736a3..fddd6d5 100644 --- a/html-router/src/routes/auth/signin.rs +++ b/html-router/src/routes/auth/signin.rs @@ -1,11 +1,11 @@ -use axum::{extract::State, Form}; +use axum::{Form, extract::State}; use axum_htmx::HxBoosted; use serde::{Deserialize, Serialize}; use crate::{ + AuthSessionType, html_state::HtmlState, middlewares::response_middleware::{TemplateResponse, TemplateResult}, - AuthSessionType, }; use common::storage::types::user::User; diff --git a/html-router/src/routes/auth/signout.rs b/html-router/src/routes/auth/signout.rs index 1795d8f..32dcb2b 100644 --- a/html-router/src/routes/auth/signout.rs +++ b/html-router/src/routes/auth/signout.rs @@ -1,6 +1,6 @@ use crate::{ - middlewares::response_middleware::{TemplateResponse, TemplateResult}, AuthSessionType, + middlewares::response_middleware::{TemplateResponse, TemplateResult}, }; pub async fn sign_out_user(auth: AuthSessionType) -> TemplateResult { diff --git a/html-router/src/routes/auth/signup.rs b/html-router/src/routes/auth/signup.rs index dfa36be..69d2ec3 100644 --- a/html-router/src/routes/auth/signup.rs +++ b/html-router/src/routes/auth/signup.rs @@ -1,4 +1,4 @@ -use axum::{extract::State, Form}; +use axum::{Form, extract::State}; use axum_htmx::HxBoosted; use serde::{Deserialize, Serialize}; @@ -8,9 +8,9 @@ use common::{ }; use crate::{ + AuthSessionType, html_state::HtmlState, middlewares::response_middleware::{TemplateResponse, TemplateResult}, - AuthSessionType, }; #[derive(Deserialize, Serialize)] diff --git a/html-router/src/routes/chat/chat_handlers.rs b/html-router/src/routes/chat/chat_handlers.rs index 0ad838d..37a3938 100644 --- a/html-router/src/routes/chat/chat_handlers.rs +++ b/html-router/src/routes/chat/chat_handlers.rs @@ -1,7 +1,7 @@ use axum::{ + Form, extract::{Path, State}, http::HeaderValue, - Form, }; use serde::{Deserialize, Serialize}; @@ -18,8 +18,8 @@ use crate::{ middlewares::{ auth_middleware::RequireUser, response_middleware::{ - template_as_response, template_with_headers, ResponseResult, TemplateResponse, - TemplateResult, + ResponseResult, TemplateResponse, TemplateResult, template_as_response, + template_with_headers, }, }, }; diff --git a/html-router/src/routes/chat/message_response_stream.rs b/html-router/src/routes/chat/message_response_stream.rs index 48801ae..f555d80 100644 --- a/html-router/src/routes/chat/message_response_stream.rs +++ b/html-router/src/routes/chat/message_response_stream.rs @@ -6,24 +6,24 @@ use async_stream::stream; use axum::{ extract::{Query, State}, response::{ - sse::{Event, KeepAlive, KeepAliveStream}, Sse, + sse::{Event, KeepAlive, KeepAliveStream}, }, }; use futures::{ - stream::{self, once}, Stream, StreamExt, TryStreamExt, + stream::{self, once}, }; use json_stream_parser::JsonStreamParser; use minijinja::Value; use retrieval_pipeline::answer_retrieval::{ - chunks_to_chat_context, create_chat_request, create_user_message_with_history, - LLMResponseFormat, + LLMResponseFormat, chunks_to_chat_context, create_chat_request, + create_user_message_with_history, }; use serde::{Deserialize, Serialize}; use serde_json::from_str; -use tokio::sync::mpsc::channel; use tokio::sync::Mutex; +use tokio::sync::mpsc::channel; use tracing::{debug, error, info}; use common::storage::{ @@ -66,7 +66,7 @@ async fn get_message_and_user( Ok(None) => { return Err(sse_with_keep_alive(create_error_stream( "Message not found: the specified message does not exist", - ))) + ))); } Err(e) => { error!("Database error retrieving message {}: {:?}", message_id, e); @@ -233,12 +233,12 @@ pub async fn get_response_stream( fn build_chat_event_stream( state: HtmlState, openai_stream: impl Stream< - Item = Result< - async_openai::types::chat::CreateChatCompletionStreamResponse, - async_openai::error::OpenAIError, - >, - > + Send - + 'static, + Item = Result< + async_openai::types::chat::CreateChatCompletionStreamResponse, + async_openai::error::OpenAIError, + >, + > + Send + + 'static, user_message: &Message, user_id: String, allowed_reference_ids: Vec, @@ -502,17 +502,17 @@ impl StreamParserState { let json = self.parser.result(); - if let Some(obj) = json.as_object() { - if let Some(answer) = obj.get("answer") { - self.in_answer_field = true; + if let Some(obj) = json.as_object() + && let Some(answer) = obj.get("answer") + { + self.in_answer_field = true; - let current_content = answer.as_str().unwrap_or_default().to_string(); + let current_content = answer.as_str().unwrap_or_default().to_string(); - if current_content.len() > self.last_answer_content.len() { - let new_content = current_content[self.last_answer_content.len()..].to_string(); - self.last_answer_content = current_content; - return new_content; - } + if current_content.len() > self.last_answer_content.len() { + let new_content = current_content[self.last_answer_content.len()..].to_string(); + self.last_answer_content = current_content; + return new_content; } } diff --git a/html-router/src/routes/chat/mod.rs b/html-router/src/routes/chat/mod.rs index 20912ef..0e38570 100644 --- a/html-router/src/routes/chat/mod.rs +++ b/html-router/src/routes/chat/mod.rs @@ -3,7 +3,7 @@ mod message_response_stream; mod reference_validation; mod references; -use axum::{extract::FromRef, routing::get, Router}; +use axum::{Router, extract::FromRef, routing::get}; pub use chat_handlers::{ delete_conversation, new_chat_user_message, new_user_message, patch_conversation_title, reload_sidebar, show_chat_base as show_base, show_conversation_editing_title, diff --git a/html-router/src/routes/chat/reference_validation.rs b/html-router/src/routes/chat/reference_validation.rs index f69456e..750ea2d 100644 --- a/html-router/src/routes/chat/reference_validation.rs +++ b/html-router/src/routes/chat/reference_validation.rs @@ -6,7 +6,7 @@ use common::{ error::AppError, storage::{ db::SurrealDbClient, - types::{knowledge_entity::KnowledgeEntity, text_chunk::TextChunk, StoredObject}, + types::{StoredObject, knowledge_entity::KnowledgeEntity, text_chunk::TextChunk}, }, }; use retrieval_pipeline::RetrievalOutput; @@ -448,10 +448,12 @@ mod tests { assert_eq!(result.valid_refs, vec![first.id, second.id]); assert_eq!(result.invalid_refs.len(), 2); - assert!(result - .invalid_refs - .iter() - .all(|entry| entry.reason == InvalidReferenceReason::Duplicate)); + assert!( + result + .invalid_refs + .iter() + .all(|entry| entry.reason == InvalidReferenceReason::Duplicate) + ); } #[tokio::test] diff --git a/html-router/src/routes/chat/references.rs b/html-router/src/routes/chat/references.rs index 93cc8e1..7d16317 100644 --- a/html-router/src/routes/chat/references.rs +++ b/html-router/src/routes/chat/references.rs @@ -17,7 +17,7 @@ use crate::{ }, }; -use super::reference_validation::{normalize_reference, ReferenceLookupTarget}; +use super::reference_validation::{ReferenceLookupTarget, normalize_reference}; #[derive(Serialize)] struct ReferenceTooltipData { diff --git a/html-router/src/routes/content/handlers.rs b/html-router/src/routes/content/handlers.rs index 5e48a9c..913b539 100644 --- a/html-router/src/routes/content/handlers.rs +++ b/html-router/src/routes/content/handlers.rs @@ -1,6 +1,6 @@ use axum::{ - extract::{Path, Query, State}, Form, + extract::{Path, Query, State}, }; use axum_htmx::{HxBoosted, HxRequest, HxTarget}; use serde::{Deserialize, Serialize}; @@ -13,7 +13,7 @@ use crate::{ auth_middleware::RequireUser, response_middleware::{TemplateResponse, TemplateResult}, }, - utils::pagination::{paginate_items, Pagination}, + utils::pagination::{Pagination, paginate_items}, utils::text_content_preview::truncate_text_contents, }; use url::form_urlencoded; diff --git a/html-router/src/routes/content/mod.rs b/html-router/src/routes/content/mod.rs index c8376c0..de397d7 100644 --- a/html-router/src/routes/content/mod.rs +++ b/html-router/src/routes/content/mod.rs @@ -1,6 +1,6 @@ mod handlers; -use axum::{extract::FromRef, routing::get, Router}; +use axum::{Router, extract::FromRef, routing::get}; use handlers::{ delete_text_content, patch_text_content, show_content_page, show_content_read_modal, show_recent_content, show_text_content_edit_form, diff --git a/html-router/src/routes/index/handlers.rs b/html-router/src/routes/index/handlers.rs index ce1b437..cdad134 100644 --- a/html-router/src/routes/index/handlers.rs +++ b/html-router/src/routes/index/handlers.rs @@ -1,7 +1,7 @@ use axum::{ body::Body, extract::{Path, State}, - http::{header, HeaderMap, HeaderValue, StatusCode}, + http::{HeaderMap, HeaderValue, StatusCode, header}, response::IntoResponse, }; use chrono::{DateTime, Utc}; @@ -13,7 +13,7 @@ use crate::{ middlewares::{ auth_middleware::RequireUser, response_middleware::{ - template_as_response, ResponseResult, TemplateResponse, TemplateResult, + ResponseResult, TemplateResponse, TemplateResult, template_as_response, }, }, utils::text_content_preview::truncate_text_contents, diff --git a/html-router/src/routes/index/mod.rs b/html-router/src/routes/index/mod.rs index 3a320f8..133aa53 100644 --- a/html-router/src/routes/index/mod.rs +++ b/html-router/src/routes/index/mod.rs @@ -1,9 +1,9 @@ pub mod handlers; use axum::{ + Router, extract::FromRef, routing::{delete, get}, - Router, }; use handlers::{ delete_job, delete_text_content, index_handler, serve_file, show_active_jobs, show_task_archive, diff --git a/html-router/src/routes/ingestion/handlers.rs b/html-router/src/routes/ingestion/handlers.rs index 893df92..7791c46 100644 --- a/html-router/src/routes/ingestion/handlers.rs +++ b/html-router/src/routes/ingestion/handlers.rs @@ -4,12 +4,12 @@ use axum::{ extract::{Query, State}, http::StatusCode, response::{ - sse::{Event, KeepAlive, KeepAliveStream}, Sse, + sse::{Event, KeepAlive, KeepAliveStream}, }, }; use axum_typed_multipart::{FieldData, TryFromMultipart, TypedMultipart}; -use futures::{future::try_join_all, stream, Stream, StreamExt, TryFutureExt}; +use futures::{Stream, StreamExt, TryFutureExt, future::try_join_all, stream}; use minijinja::context; use serde::{Deserialize, Serialize}; use tempfile::NamedTempFile; @@ -24,7 +24,7 @@ use common::{ ingestion_task::{IngestionTask, TaskState}, user::User, }, - utils::ingest_limits::{validate_ingest_input, IngestValidationError}, + utils::ingest_limits::{IngestValidationError, validate_ingest_input}, }; use crate::{ diff --git a/html-router/src/routes/ingestion/mod.rs b/html-router/src/routes/ingestion/mod.rs index 77ff53c..aef714d 100644 --- a/html-router/src/routes/ingestion/mod.rs +++ b/html-router/src/routes/ingestion/mod.rs @@ -1,6 +1,6 @@ mod handlers; -use axum::{extract::DefaultBodyLimit, extract::FromRef, routing::get, Router}; +use axum::{Router, extract::DefaultBodyLimit, extract::FromRef, routing::get}; use handlers::{get_task_updates_stream, hide_ingest_form, process_ingest_form, show_ingest_form}; use crate::html_state::HtmlState; diff --git a/html-router/src/routes/knowledge/handlers.rs b/html-router/src/routes/knowledge/handlers.rs index 5548519..5aa4762 100644 --- a/html-router/src/routes/knowledge/handlers.rs +++ b/html-router/src/routes/knowledge/handlers.rs @@ -3,15 +3,15 @@ use std::collections::{HashMap, HashSet}; use std::fmt; use axum::{ + Form, Json, extract::{Path, Query, State}, http::HeaderValue, response::{IntoResponse, Response}, - Form, Json, }; -use axum_htmx::{HxBoosted, HxRequest, HX_TRIGGER}; +use axum_htmx::{HX_TRIGGER, HxBoosted, HxRequest}; use serde::{ - de::{self, Deserializer, MapAccess, Visitor}, Deserialize, Serialize, + de::{self, Deserializer, MapAccess, Visitor}, }; use common::{ @@ -27,7 +27,7 @@ use common::{ utils::embedding::EmbeddingProvider, }; use retrieval_pipeline::{ - normalize_fts_terms, reciprocal_rank_fusion, RetrievalTuning, RrfConfig, Scored, + RetrievalTuning, RrfConfig, Scored, normalize_fts_terms, reciprocal_rank_fusion, }; use tracing::debug; use uuid::Uuid; @@ -37,10 +37,10 @@ use crate::{ middlewares::{ auth_middleware::RequireUser, response_middleware::{ - template_with_headers, ResponseResult, TemplateResponse, TemplateResult, + ResponseResult, TemplateResponse, TemplateResult, template_with_headers, }, }, - utils::pagination::{paginate_items, paginate_slice, Pagination}, + utils::pagination::{Pagination, paginate_items, paginate_slice}, }; use url::form_urlencoded; @@ -950,12 +950,11 @@ fn normalize_filter(input: Option) -> Option { fn trim_matching_quotes(value: &str) -> &str { let bytes = value.as_bytes(); - if let (Some(&first), Some(&last)) = (bytes.first(), bytes.last()) { - if bytes.len() >= 2 - && ((first == b'"' && last == b'"') || (first == b'\'' && last == b'\'')) - { - return &value[1..value.len().saturating_sub(1)]; - } + if let (Some(&first), Some(&last)) = (bytes.first(), bytes.last()) + && bytes.len() >= 2 + && ((first == b'"' && last == b'"') || (first == b'\'' && last == b'\'')) + { + return &value[1..value.len().saturating_sub(1)]; } value } diff --git a/html-router/src/routes/knowledge/mod.rs b/html-router/src/routes/knowledge/mod.rs index f2c39cb..489b07b 100644 --- a/html-router/src/routes/knowledge/mod.rs +++ b/html-router/src/routes/knowledge/mod.rs @@ -1,9 +1,9 @@ mod handlers; use axum::{ + Router, extract::FromRef, routing::{delete, get, post}, - Router, }; use handlers::{ create_knowledge_entity, delete_knowledge_entity, delete_knowledge_relationship, diff --git a/html-router/src/routes/scratchpad/handlers.rs b/html-router/src/routes/scratchpad/handlers.rs index 12ad051..ae16b82 100644 --- a/html-router/src/routes/scratchpad/handlers.rs +++ b/html-router/src/routes/scratchpad/handlers.rs @@ -1,10 +1,10 @@ use axum::{ + Form, extract::{Path, Query, State}, http::{HeaderValue, StatusCode}, response::{IntoResponse, Response}, - Form, }; -use axum_htmx::{HxBoosted, HxRequest, HX_TRIGGER}; +use axum_htmx::{HX_TRIGGER, HxBoosted, HxRequest}; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; @@ -12,7 +12,7 @@ use crate::html_state::HtmlState; use crate::middlewares::{ auth_middleware::RequireUser, response_middleware::{ - template_with_headers, ResponseResult, TemplateResponse, TemplateResult, + ResponseResult, TemplateResponse, TemplateResult, template_with_headers, }, }; use common::storage::types::{ diff --git a/html-router/src/routes/scratchpad/mod.rs b/html-router/src/routes/scratchpad/mod.rs index 4b94ed5..f657a65 100644 --- a/html-router/src/routes/scratchpad/mod.rs +++ b/html-router/src/routes/scratchpad/mod.rs @@ -1,8 +1,8 @@ mod handlers; use axum::{ + Router, extract::FromRef, routing::{delete, get, patch, post}, - Router, }; use crate::html_state::HtmlState; diff --git a/html-router/src/routes/search/handlers.rs b/html-router/src/routes/search/handlers.rs index 9ce386e..5fe42c8 100644 --- a/html-router/src/routes/search/handlers.rs +++ b/html-router/src/routes/search/handlers.rs @@ -4,9 +4,9 @@ use axum::extract::{Query, State}; use axum_htmx::{HxBoosted, HxRequest}; use common::storage::types::{text_content::TextContent, user::User}; use retrieval_pipeline::{ - retrieve, RetrievalConfig, RetrievalOutput, RetrievedChunk, RetrievedEntity, + RetrievalConfig, RetrievalOutput, RetrievedChunk, RetrievedEntity, retrieve, }; -use serde::{de, Deserialize, Deserializer, Serialize}; +use serde::{Deserialize, Deserializer, Serialize, de}; use std::{fmt, str::FromStr}; use crate::{ diff --git a/html-router/src/routes/search/mod.rs b/html-router/src/routes/search/mod.rs index a1a62d1..b9e9e02 100644 --- a/html-router/src/routes/search/mod.rs +++ b/html-router/src/routes/search/mod.rs @@ -1,8 +1,8 @@ mod handlers; -use axum::{extract::FromRef, routing::get, Router}; +use axum::{Router, extract::FromRef, routing::get}; #[allow(clippy::module_name_repetitions)] -pub use handlers::{search_result_handler as result_handler, SearchParams as SearchQueryParams}; +pub use handlers::{SearchParams as SearchQueryParams, search_result_handler as result_handler}; use crate::html_state::HtmlState; diff --git a/html-router/tests/router_integration.rs b/html-router/tests/router_integration.rs index f503422..50cccf1 100644 --- a/html-router/tests/router_integration.rs +++ b/html-router/tests/router_integration.rs @@ -3,10 +3,10 @@ use std::sync::Arc; use axum::{ - body::{to_bytes, Body}, - http::{header, Request, StatusCode}, - response::Response, Router, + body::{Body, to_bytes}, + http::{Request, StatusCode, header}, + response::Response, }; use common::{ storage::{db::SurrealDbClient, store::StorageManager, types::user::User}, diff --git a/html-router/tests/template_load.rs b/html-router/tests/template_load.rs index bcb0570..53d9abc 100644 --- a/html-router/tests/template_load.rs +++ b/html-router/tests/template_load.rs @@ -9,7 +9,7 @@ use std::fs; use std::path::{Path, PathBuf}; -use minijinja::{path_loader, Environment}; +use minijinja::{Environment, path_loader}; fn templates_dir() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("templates") diff --git a/ingestion-pipeline/Cargo.toml b/ingestion-pipeline/Cargo.toml index a8ae8df..f85dd11 100644 --- a/ingestion-pipeline/Cargo.toml +++ b/ingestion-pipeline/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "ingestion-pipeline" version = "0.1.0" -edition = "2021" +edition = "2024" license = "AGPL-3.0-or-later" [lints] diff --git a/ingestion-pipeline/src/lib.rs b/ingestion-pipeline/src/lib.rs index b6b723a..9f2f78c 100644 --- a/ingestion-pipeline/src/lib.rs +++ b/ingestion-pipeline/src/lib.rs @@ -7,14 +7,14 @@ use chrono::Utc; use common::storage::{ db::SurrealDbClient, indexes::maybe_run_scheduled_index_rebuild, - types::ingestion_task::{IngestionTask, DEFAULT_LEASE_SECS}, + types::ingestion_task::{DEFAULT_LEASE_SECS, IngestionTask}, }; pub use pipeline::{ - persist_artifacts, EmbeddedKnowledgeEntity, EmbeddedTextChunk, IngestionConfig, - IngestionPipeline, IngestionTuning, PipelineArtifacts, + EmbeddedKnowledgeEntity, EmbeddedTextChunk, IngestionConfig, IngestionPipeline, + IngestionTuning, PipelineArtifacts, persist_artifacts, }; use std::sync::Arc; -use tokio::time::{sleep, Duration}; +use tokio::time::{Duration, sleep}; use tracing::{error, info, warn}; use uuid::Uuid; diff --git a/ingestion-pipeline/src/pipeline/mod.rs b/ingestion-pipeline/src/pipeline/mod.rs index 35672a7..ccddb50 100644 --- a/ingestion-pipeline/src/pipeline/mod.rs +++ b/ingestion-pipeline/src/pipeline/mod.rs @@ -42,7 +42,7 @@ use tracing::{debug, info, warn}; use self::{ context::PipelineContext, stages::{enrich, persist, prepare_content, retrieve_related}, - state::{ready, Enriched, IngestionMachine}, + state::{Enriched, IngestionMachine, ready}, }; /// Wall-clock duration of each pre-persistence pipeline stage. @@ -355,10 +355,10 @@ mod finalize_tests { use tokio::time::sleep; use super::{ + IngestionPipeline, PipelineServices, config::IngestionTuning, test_support::setup_db, - tests::{pipeline_config, reserve_task, MockServices}, - IngestionPipeline, PipelineServices, + tests::{MockServices, pipeline_config, reserve_task}, }; #[tokio::test] diff --git a/ingestion-pipeline/src/pipeline/persistence.rs b/ingestion-pipeline/src/pipeline/persistence.rs index 89cbd00..9dbd89e 100644 --- a/ingestion-pipeline/src/pipeline/persistence.rs +++ b/ingestion-pipeline/src/pipeline/persistence.rs @@ -12,14 +12,13 @@ use common::{ storage::{ db::SurrealDbClient, types::{ - knowledge_entity::KnowledgeEntity, + EmbeddingRecord, StoredObject, knowledge_entity::KnowledgeEntity, knowledge_entity_embedding::KnowledgeEntityEmbedding, text_chunk::TextChunk, - text_chunk_embedding::TextChunkEmbedding, text_content::TextContent, EmbeddingRecord, - StoredObject, + text_chunk_embedding::TextChunkEmbedding, text_content::TextContent, }, }, }; -use tokio::time::{sleep, Duration}; +use tokio::time::{Duration, sleep}; use tracing::warn; use super::{ @@ -268,8 +267,8 @@ mod tests { use super::*; use crate::pipeline::test_support::{ - self, count_chunks_for_source, count_entities_for_source, count_relationships_for_source, - large_artifacts, persist, sample_artifacts, setup_db, TEST_EMBEDDING_DIM, + self, TEST_EMBEDDING_DIM, count_chunks_for_source, count_entities_for_source, + count_relationships_for_source, large_artifacts, persist, sample_artifacts, setup_db, }; #[tokio::test] diff --git a/ingestion-pipeline/src/pipeline/services.rs b/ingestion-pipeline/src/pipeline/services.rs index a6d1571..790c332 100644 --- a/ingestion-pipeline/src/pipeline/services.rs +++ b/ingestion-pipeline/src/pipeline/services.rs @@ -15,14 +15,14 @@ use common::{ db::SurrealDbClient, store::StorageManager, types::{ - ingestion_payload::IngestionPayload, knowledge_relationship::KnowledgeRelationship, - system_settings::SystemSettings, text_chunk::TextChunk, text_content::TextContent, - StoredObject, + StoredObject, ingestion_payload::IngestionPayload, + knowledge_relationship::KnowledgeRelationship, system_settings::SystemSettings, + text_chunk::TextChunk, text_content::TextContent, }, }, utils::{config::AppConfig, embedding::EmbeddingProvider}, }; -use retrieval_pipeline::{reranking::RerankerPool, retrieved_entities_to_json, RetrievedEntity}; +use retrieval_pipeline::{RetrievedEntity, reranking::RerankerPool, retrieved_entities_to_json}; use text_splitter::{ChunkCapacity, ChunkConfig, TextSplitter}; use super::{enrichment_result::LLMEnrichmentResult, preparation::to_text_content}; @@ -358,7 +358,7 @@ mod tests { use std::sync::Arc; use anyhow::Context; - use async_openai::{config::OpenAIConfig, types::chat::ChatCompletionRequestMessage, Client}; + use async_openai::{Client, config::OpenAIConfig, types::chat::ChatCompletionRequestMessage}; use common::{ storage::{ db::SurrealDbClient, store::StorageManager, types::system_settings::SystemSettingsPatch, diff --git a/ingestion-pipeline/src/pipeline/tests.rs b/ingestion-pipeline/src/pipeline/tests.rs index 0ed229e..c8c555e 100644 --- a/ingestion-pipeline/src/pipeline/tests.rs +++ b/ingestion-pipeline/src/pipeline/tests.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use super::{ + IngestionPipeline, config::{IngestionConfig, IngestionTuning}, enrichment_result::LLMEnrichmentResult, services::PipelineServices, @@ -8,7 +9,6 @@ use super::{ count_chunks_for_source, count_entities_for_source, count_relationships_for_source, persist, sample_artifacts, setup_db, }, - IngestionPipeline, }; use crate::pipeline::context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk}; use anyhow::{self, Context}; @@ -405,9 +405,11 @@ async fn ingestion_pipeline_happy_path_persists_artifacts() -> anyhow::Result<() call_log.get(0..4), Some(&["prepare", "retrieve", "enrich", "convert"][..]) ); - assert!(call_log - .get(4..) - .is_some_and(|tail| tail.iter().all(|entry| *entry == "chunk"))); + assert!( + call_log + .get(4..) + .is_some_and(|tail| tail.iter().all(|entry| *entry == "chunk")) + ); Ok(()) } diff --git a/ingestion-pipeline/src/utils/file_text_extraction.rs b/ingestion-pipeline/src/utils/file_text_extraction.rs index dee232b..c7aa2d1 100644 --- a/ingestion-pipeline/src/utils/file_text_extraction.rs +++ b/ingestion-pipeline/src/utils/file_text_extraction.rs @@ -38,11 +38,11 @@ async fn materialize_temp_file( let mut path = env::temp_dir(); let mut file_name = format!("minne-ingest-{}", Uuid::new_v4()); - if let Some(ext) = extension { - if !ext.is_empty() { - file_name.push('.'); - file_name.push_str(ext); - } + if let Some(ext) = extension + && !ext.is_empty() + { + file_name.push('.'); + file_name.push_str(ext); } path.push(file_name); @@ -142,7 +142,7 @@ pub async fn extract_text_from_file( #[cfg(test)] mod tests { use super::*; - use async_openai::{config::OpenAIConfig, Client}; + use async_openai::{Client, config::OpenAIConfig}; use bytes::Bytes; use chrono::Utc; use common::{ diff --git a/ingestion-pipeline/src/utils/image_parsing.rs b/ingestion-pipeline/src/utils/image_parsing.rs index 844c0f7..f80b12a 100644 --- a/ingestion-pipeline/src/utils/image_parsing.rs +++ b/ingestion-pipeline/src/utils/image_parsing.rs @@ -3,7 +3,7 @@ use async_openai::types::chat::{ ChatCompletionRequestMessageContentPartTextArgs, ChatCompletionRequestUserMessageArgs, CreateChatCompletionRequestArgs, ImageDetail, ImageUrlArgs, }; -use base64::{engine::general_purpose::STANDARD, Engine as _}; +use base64::{Engine as _, engine::general_purpose::STANDARD}; use common::{ error::AppError, storage::{db::SurrealDbClient, types::system_settings::SystemSettings}, diff --git a/ingestion-pipeline/src/utils/pdf/render.rs b/ingestion-pipeline/src/utils/pdf/render.rs index b274e3a..bec209c 100644 --- a/ingestion-pipeline/src/utils/pdf/render.rs +++ b/ingestion-pipeline/src/utils/pdf/render.rs @@ -148,20 +148,27 @@ async fn maybe_dump_debug_image(page_index: u32, bytes: &[u8]) -> Result<(), App mod tests { use super::*; use anyhow::{self}; - use lopdf::dictionary; use lopdf::Object; + use lopdf::dictionary; #[test] fn test_debug_dump_directory_env_var() -> anyhow::Result<()> { - std::env::remove_var(DEBUG_IMAGE_ENV_VAR); + // SAFETY: test runs serially; env is restored before return. + unsafe { + std::env::remove_var(DEBUG_IMAGE_ENV_VAR); + } assert!(debug_dump_directory().is_none()); - std::env::set_var(DEBUG_IMAGE_ENV_VAR, "/tmp/minne_pdf_debug"); + unsafe { + std::env::set_var(DEBUG_IMAGE_ENV_VAR, "/tmp/minne_pdf_debug"); + } let dir = debug_dump_directory().ok_or_else(|| anyhow::anyhow!("expected debug directory"))?; assert_eq!(dir, PathBuf::from("/tmp/minne_pdf_debug")); - std::env::remove_var(DEBUG_IMAGE_ENV_VAR); + unsafe { + std::env::remove_var(DEBUG_IMAGE_ENV_VAR); + } Ok(()) } diff --git a/ingestion-pipeline/src/utils/pdf/vision.rs b/ingestion-pipeline/src/utils/pdf/vision.rs index fa42e38..e7d6820 100644 --- a/ingestion-pipeline/src/utils/pdf/vision.rs +++ b/ingestion-pipeline/src/utils/pdf/vision.rs @@ -5,7 +5,7 @@ use async_openai::types::chat::{ ChatCompletionRequestMessageContentPartTextArgs, ChatCompletionRequestUserMessageArgs, CreateChatCompletionRequest, CreateChatCompletionRequestArgs, ImageDetail, ImageUrlArgs, }; -use base64::{engine::general_purpose::STANDARD, Engine as _}; +use base64::{Engine as _, engine::general_purpose::STANDARD}; use tracing::{debug, warn}; use common::{ diff --git a/json-stream-parser/Cargo.toml b/json-stream-parser/Cargo.toml index 3f9326b..44e7837 100644 --- a/json-stream-parser/Cargo.toml +++ b/json-stream-parser/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "json-stream-parser" version = "0.1.0" -edition = "2021" +edition = "2024" license = "MIT" [lints] diff --git a/json-stream-parser/src/lib.rs b/json-stream-parser/src/lib.rs index 1555f05..6cb4da6 100644 --- a/json-stream-parser/src/lib.rs +++ b/json-stream-parser/src/lib.rs @@ -1,7 +1,7 @@ // This code is based on the json-stream-rust library (https://github.com/json-stream/json-stream-rust) // Original code is MIT licensed // Modified to fix escape character handling in strings -use serde_json::{json, Value}; +use serde_json::{Value, json}; use std::mem; #[derive(Clone, Debug)] @@ -50,7 +50,7 @@ fn add_char_into_object( if let Value::String(s) = object { s.push('"'); } - if let ObjectStatus::StringQuoteOpen(ref mut escaped) = current_status { + if let ObjectStatus::StringQuoteOpen(escaped) = current_status { *escaped = false; } } @@ -62,12 +62,12 @@ fn add_char_into_object( s.push('\\'); s.push(c); } - if let ObjectStatus::StringQuoteOpen(ref mut escaped) = current_status { + if let ObjectStatus::StringQuoteOpen(escaped) = current_status { *escaped = false; } } (&Value::String(_), &ObjectStatus::StringQuoteOpen(false), '\\') => { - if let ObjectStatus::StringQuoteOpen(ref mut escaped) = current_status { + if let ObjectStatus::StringQuoteOpen(escaped) = current_status { *escaped = true; } } @@ -80,8 +80,8 @@ fn add_char_into_object( // --- Object: key with escaped quote --- (&Value::Object(_), &ObjectStatus::KeyQuoteOpen { escaped: true, .. }, '"') => { if let ObjectStatus::KeyQuoteOpen { - ref mut key_so_far, - ref mut escaped, + key_so_far, + escaped, } = current_status { key_so_far.push('"'); @@ -89,10 +89,7 @@ fn add_char_into_object( } } (&Value::Object(_), &ObjectStatus::KeyQuoteOpen { escaped: false, .. }, '"') => { - if let ObjectStatus::KeyQuoteOpen { - ref mut key_so_far, .. - } = current_status - { + if let ObjectStatus::KeyQuoteOpen { key_so_far, .. } = current_status { let key = mem::take(key_so_far); if let Value::Object(obj) = object { obj.insert(key.clone(), Value::Null); @@ -103,8 +100,8 @@ fn add_char_into_object( // --- Object: key with escaped other char --- (&Value::Object(_), &ObjectStatus::KeyQuoteOpen { escaped: true, .. }, c) => { if let ObjectStatus::KeyQuoteOpen { - ref mut key_so_far, - ref mut escaped, + key_so_far, + escaped, } = current_status { key_so_far.push('\\'); @@ -113,33 +110,23 @@ fn add_char_into_object( } } (&Value::Object(_), &ObjectStatus::KeyQuoteOpen { escaped: false, .. }, '\\') => { - if let ObjectStatus::KeyQuoteOpen { - ref mut escaped, .. - } = current_status - { + if let ObjectStatus::KeyQuoteOpen { escaped, .. } = current_status { *escaped = true; } } (&Value::Object(_), &ObjectStatus::KeyQuoteOpen { escaped: false, .. }, c) => { - if let ObjectStatus::KeyQuoteOpen { - ref mut key_so_far, .. - } = current_status - { + if let ObjectStatus::KeyQuoteOpen { key_so_far, .. } = current_status { key_so_far.push(c); } } // --- Object: value with escaped quote --- (&Value::Object(_), &ObjectStatus::ValueQuoteOpen { escaped: true, .. }, '"') => { - if let ObjectStatus::ValueQuoteOpen { - ref key, - ref mut escaped, - } = current_status - { - if let Value::Object(obj) = object { - if let Some(Value::String(value)) = obj.get_mut(key) { - value.push('"'); - } + if let ObjectStatus::ValueQuoteOpen { key, escaped } = current_status { + if let Value::Object(obj) = object + && let Some(Value::String(value)) = obj.get_mut(key) + { + value.push('"'); } *escaped = false; } @@ -148,36 +135,29 @@ fn add_char_into_object( *current_status = ObjectStatus::ValueQuoteClose; } (&Value::Object(_), &ObjectStatus::ValueQuoteOpen { escaped: true, .. }, c) => { - if let ObjectStatus::ValueQuoteOpen { - ref key, - ref mut escaped, - } = current_status - { - if let Value::Object(obj) = object { - if let Some(Value::String(value)) = obj.get_mut(key) { - value.push('\\'); - value.push(c); - } + if let ObjectStatus::ValueQuoteOpen { key, escaped } = current_status { + if let Value::Object(obj) = object + && let Some(Value::String(value)) = obj.get_mut(key) + { + value.push('\\'); + value.push(c); } *escaped = false; } } (&Value::Object(_), &ObjectStatus::ValueQuoteOpen { escaped: false, .. }, '\\') => { - if let ObjectStatus::ValueQuoteOpen { - ref mut escaped, .. - } = current_status - { + if let ObjectStatus::ValueQuoteOpen { escaped, .. } = current_status { *escaped = true; } } (&Value::Object(_), &ObjectStatus::ValueQuoteOpen { escaped: false, .. }, c) => { - if let ObjectStatus::ValueQuoteOpen { ref key, .. } = current_status { - if let Value::Object(obj) = object { - if let Some(Value::String(value)) = obj.get_mut(key) { - value.push(c); - } else { - return Err(ParseError::InvalidValueType(key.clone())); - } + if let ObjectStatus::ValueQuoteOpen { key, .. } = current_status + && let Value::Object(obj) = object + { + if let Some(Value::String(value)) = obj.get_mut(key) { + value.push(c); + } else { + return Err(ParseError::InvalidValueType(key.clone())); } } } @@ -199,23 +179,17 @@ fn add_char_into_object( }; } (&Value::Bool(true), &ObjectStatus::Scalar { .. }, 'r') => { - if let ObjectStatus::Scalar { - ref mut value_so_far, - } = current_status + if let ObjectStatus::Scalar { value_so_far } = current_status + && *value_so_far == "t" { - if *value_so_far == "t" { - value_so_far.push('r'); - } + value_so_far.push('r'); } } (&Value::Bool(true), &ObjectStatus::Scalar { .. }, 'u') => { - if let ObjectStatus::Scalar { - ref mut value_so_far, - } = current_status + if let ObjectStatus::Scalar { value_so_far } = current_status + && *value_so_far == "tr" { - if *value_so_far == "tr" { - value_so_far.push('u'); - } + value_so_far.push('u'); } } (&Value::Bool(true) | &Value::Bool(false), &ObjectStatus::Scalar { .. }, 'e') @@ -230,33 +204,24 @@ fn add_char_into_object( }; } (&Value::Bool(false), &ObjectStatus::Scalar { .. }, 'a') => { - if let ObjectStatus::Scalar { - ref mut value_so_far, - } = current_status + if let ObjectStatus::Scalar { value_so_far } = current_status + && *value_so_far == "f" { - if *value_so_far == "f" { - value_so_far.push('a'); - } + value_so_far.push('a'); } } (&Value::Bool(false), &ObjectStatus::Scalar { .. }, 'l') => { - if let ObjectStatus::Scalar { - ref mut value_so_far, - } = current_status + if let ObjectStatus::Scalar { value_so_far } = current_status + && *value_so_far == "fa" { - if *value_so_far == "fa" { - value_so_far.push('l'); - } + value_so_far.push('l'); } } (&Value::Bool(false), &ObjectStatus::Scalar { .. }, 's') => { - if let ObjectStatus::Scalar { - ref mut value_so_far, - } = current_status + if let ObjectStatus::Scalar { value_so_far } = current_status + && *value_so_far == "fal" { - if *value_so_far == "fal" { - value_so_far.push('s'); - } + value_so_far.push('s'); } } @@ -267,20 +232,14 @@ fn add_char_into_object( }; } (&Value::Null, &ObjectStatus::Scalar { .. }, 'u') => { - if let ObjectStatus::Scalar { - ref mut value_so_far, - } = current_status + if let ObjectStatus::Scalar { value_so_far } = current_status + && *value_so_far == "n" { - if *value_so_far == "n" { - value_so_far.push('u'); - } + value_so_far.push('u'); } } (&Value::Null, &ObjectStatus::Scalar { .. }, 'l') => { - if let ObjectStatus::Scalar { - ref mut value_so_far, - } = current_status - { + if let ObjectStatus::Scalar { value_so_far } = current_status { if *value_so_far == "nu" { value_so_far.push('l'); } else if *value_so_far == "nul" { @@ -305,12 +264,9 @@ fn add_char_into_object( }; } (&Value::Number(_), &ObjectStatus::ScalarNumber { .. }, c @ '0'..='9') => { - if let ObjectStatus::ScalarNumber { - ref mut value_so_far, - } = current_status - { + if let ObjectStatus::ScalarNumber { value_so_far } = current_status { value_so_far.push(c); - if let Value::Number(ref mut num) = object { + if let Value::Number(num) = object { if value_so_far.contains('.') { let parsed: f64 = value_so_far.parse().map_err(|e| { ParseError::InvalidNumber(format!( @@ -332,10 +288,7 @@ fn add_char_into_object( } } (&Value::Number(_), &ObjectStatus::ScalarNumber { .. }, '.') => { - if let ObjectStatus::ScalarNumber { - ref mut value_so_far, - } = current_status - { + if let ObjectStatus::ScalarNumber { value_so_far } = current_status { value_so_far.push('.'); } } @@ -348,14 +301,14 @@ fn add_char_into_object( }; } (&Value::Object(_), &ObjectStatus::KeyQuoteClose { .. }, ':') => { - if let ObjectStatus::KeyQuoteClose { ref mut key } = current_status { + if let ObjectStatus::KeyQuoteClose { key } = current_status { let key = mem::take(key); *current_status = ObjectStatus::Colon { key }; } } (&Value::Object(_), &ObjectStatus::Colon { .. }, ' ' | '\n' | '\t' | '\r') => {} (&Value::Object(_), &ObjectStatus::Colon { .. }, '"') => { - if let ObjectStatus::Colon { ref mut key } = current_status { + if let ObjectStatus::Colon { key } = current_status { let key_str = mem::take(key); if let Value::Object(obj) = object { obj.insert(key_str.clone(), json!("")); @@ -368,7 +321,7 @@ fn add_char_into_object( } (&Value::Object(_), &ObjectStatus::Colon { .. }, char) => { - if let ObjectStatus::Colon { ref mut key } = current_status { + if let ObjectStatus::Colon { key } = current_status { let key = mem::take(key); *current_status = ObjectStatus::ValueScalar { key, @@ -377,11 +330,7 @@ fn add_char_into_object( } } (&Value::Object(_), &ObjectStatus::ValueScalar { .. }, ',') => { - if let ObjectStatus::ValueScalar { - ref mut key, - ref mut value_so_far, - } = current_status - { + if let ObjectStatus::ValueScalar { key, value_so_far } = current_status { let key = mem::take(key); let value_str = mem::take(value_so_far); if let Value::Object(obj) = object { @@ -398,11 +347,7 @@ fn add_char_into_object( } } (&Value::Object(_), &ObjectStatus::ValueScalar { .. }, '}') => { - if let ObjectStatus::ValueScalar { - ref mut key, - ref mut value_so_far, - } = current_status - { + if let ObjectStatus::ValueScalar { key, value_so_far } = current_status { let key = mem::take(key); let value_str = mem::take(value_so_far); if let Value::Object(obj) = object { @@ -419,11 +364,7 @@ fn add_char_into_object( } } (&Value::Object(_), &ObjectStatus::ValueScalar { .. }, char) => { - if let ObjectStatus::ValueScalar { - ref mut value_so_far, - .. - } = current_status - { + if let ObjectStatus::ValueScalar { value_so_far, .. } = current_status { value_so_far.push(char); } } diff --git a/main/Cargo.toml b/main/Cargo.toml index c049d2c..fffd523 100644 --- a/main/Cargo.toml +++ b/main/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "main" -version = "1.0.4" -edition = "2021" +version = "1.0.5" +edition = "2024" rust-version = "1.91" repository = "https://github.com/perstarkse/minne" license = "AGPL-3.0-or-later" diff --git a/main/src/bootstrap/mod.rs b/main/src/bootstrap/mod.rs index f6db73c..7240ef5 100644 --- a/main/src/bootstrap/mod.rs +++ b/main/src/bootstrap/mod.rs @@ -1,7 +1,7 @@ mod startup; pub mod wiring; -pub use startup::{prepare_embedding_runtime, EmbeddingRuntimeRole}; +pub use startup::{EmbeddingRuntimeRole, prepare_embedding_runtime}; use std::sync::Arc; @@ -10,12 +10,12 @@ use async_openai::Client; use common::{ storage::{db::SurrealDbClient, store::StorageManager}, utils::{ - config::{get_config, AppConfig}, - embedding::{align_fastembed_system_settings, EmbeddingProvider}, + config::{AppConfig, get_config}, + embedding::{EmbeddingProvider, align_fastembed_system_settings}, }, }; use retrieval_pipeline::reranking::RerankerPool; -use tracing_subscriber::{fmt, prelude::*, EnvFilter}; +use tracing_subscriber::{EnvFilter, fmt, prelude::*}; pub struct SharedServices { pub db: Arc, diff --git a/main/src/bootstrap/wiring.rs b/main/src/bootstrap/wiring.rs index 01eb042..a76dcc2 100644 --- a/main/src/bootstrap/wiring.rs +++ b/main/src/bootstrap/wiring.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use anyhow::Context; use api_router::{api_routes_v1, api_state::ApiState}; -use axum::{extract::FromRef, Router}; +use axum::{Router, extract::FromRef}; use html_router::{ html_routes, html_state::{HtmlState, StateResources}, diff --git a/main/src/main.rs b/main/src/main.rs index 52fb706..d1300d6 100644 --- a/main/src/main.rs +++ b/main/src/main.rs @@ -4,9 +4,8 @@ use std::sync::Arc; use axum::extract::FromRef; use bootstrap::{ - init, prepare_embedding_runtime, + EmbeddingRuntimeRole, init, prepare_embedding_runtime, wiring::{build_api_state, build_html_state, minne_routes}, - EmbeddingRuntimeRole, }; use ingestion_pipeline::{pipeline::IngestionPipeline, run_worker_loop}; use tracing::info; @@ -82,16 +81,15 @@ struct AppState { mod tests { use super::*; use axum::{ - body::Body, - http::{header, Request, StatusCode}, - response::Response, Router, + body::Body, + http::{Request, StatusCode, header}, + response::Response, }; use bootstrap::{ - prepare_embedding_runtime, + EmbeddingRuntimeRole, prepare_embedding_runtime, tests::init_smoke_services, wiring::{build_api_state, build_html_state, minne_routes}, - EmbeddingRuntimeRole, }; use common::storage::types::{system_settings::SystemSettings, user::User}; use tower::ServiceExt; diff --git a/main/src/server.rs b/main/src/server.rs index 6192b86..4e296eb 100644 --- a/main/src/server.rs +++ b/main/src/server.rs @@ -2,9 +2,8 @@ mod bootstrap; use axum::extract::FromRef; use bootstrap::{ - init, prepare_embedding_runtime, + EmbeddingRuntimeRole, init, prepare_embedding_runtime, wiring::{build_api_state, build_html_state, minne_routes}, - EmbeddingRuntimeRole, }; use tracing::info; diff --git a/main/src/worker.rs b/main/src/worker.rs index 6530dbe..a89b8b4 100644 --- a/main/src/worker.rs +++ b/main/src/worker.rs @@ -2,7 +2,7 @@ mod bootstrap; use std::sync::Arc; -use bootstrap::{init, prepare_embedding_runtime, EmbeddingRuntimeRole}; +use bootstrap::{EmbeddingRuntimeRole, init, prepare_embedding_runtime}; use ingestion_pipeline::{pipeline::IngestionPipeline, run_worker_loop}; use tracing::info; @@ -40,7 +40,7 @@ mod tests { use std::time::Duration; use chrono::Utc; - use common::storage::types::ingestion_task::{IngestionTask, DEFAULT_LEASE_SECS}; + use common::storage::types::ingestion_task::{DEFAULT_LEASE_SECS, IngestionTask}; use ingestion_pipeline::pipeline::IngestionPipeline; use crate::bootstrap::tests::init_smoke_services; diff --git a/retrieval-pipeline/Cargo.toml b/retrieval-pipeline/Cargo.toml index 05bcdf8..c975a73 100644 --- a/retrieval-pipeline/Cargo.toml +++ b/retrieval-pipeline/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "retrieval-pipeline" version = "0.1.0" -edition = "2021" +edition = "2024" license = "AGPL-3.0-or-later" [lints] diff --git a/retrieval-pipeline/src/answer_retrieval.rs b/retrieval-pipeline/src/answer_retrieval.rs index 7ede797..9ed99c3 100644 --- a/retrieval-pipeline/src/answer_retrieval.rs +++ b/retrieval-pipeline/src/answer_retrieval.rs @@ -9,11 +9,11 @@ use async_openai::{ }, }; use common::storage::types::{ - message::{format_history, Message}, + message::{Message, format_history}, system_settings::SystemSettings, }; use serde::Deserialize; -use serde_json::{json, Value}; +use serde_json::{Value, json}; /// JSON schema describing the structured chat answer (answer text + references). fn get_query_response_schema() -> Value { @@ -62,16 +62,18 @@ impl LLMResponseFormat { pub fn chunks_to_chat_context(chunks: &[crate::RetrievedChunk]) -> Value { use crate::round_score; - serde_json::json!(chunks - .iter() - .map(|chunk| { - serde_json::json!({ - "id": chunk.chunk.id, - "content": chunk.chunk.chunk, - "score": round_score(chunk.score), + serde_json::json!( + chunks + .iter() + .map(|chunk| { + serde_json::json!({ + "id": chunk.chunk.id, + "content": chunk.chunk.chunk, + "score": round_score(chunk.score), + }) }) - }) - .collect::>()) + .collect::>() + ) } pub fn create_user_message_with_history( diff --git a/retrieval-pipeline/src/lib.rs b/retrieval-pipeline/src/lib.rs index 42e2dbb..b85bd81 100644 --- a/retrieval-pipeline/src/lib.rs +++ b/retrieval-pipeline/src/lib.rs @@ -33,11 +33,11 @@ pub enum RetrievalOutput { } pub use pipeline::{ - retrieved_entities_to_json, Diagnostics, RetrievalConfig, RetrievalParams, RetrievalTuning, - StageKind, StageTimings, + Diagnostics, RetrievalConfig, RetrievalParams, RetrievalTuning, StageKind, StageTimings, + retrieved_entities_to_json, }; pub use query::normalize_fts_terms; -pub use scoring::{reciprocal_rank_fusion, RrfConfig, Scored}; +pub use scoring::{RrfConfig, Scored, reciprocal_rank_fusion}; /// Round a score to three decimal places for JSON output. pub(crate) fn round_score(value: f32) -> f64 { diff --git a/retrieval-pipeline/src/pipeline/context.rs b/retrieval-pipeline/src/pipeline/context.rs index 831036e..fd6e9a6 100644 --- a/retrieval-pipeline/src/pipeline/context.rs +++ b/retrieval-pipeline/src/pipeline/context.rs @@ -6,12 +6,12 @@ use common::{ use crate::scoring::Scored; -use crate::{reranking::RerankerLease, RetrievedChunk, RetrievedEntity}; +use crate::{RetrievedChunk, RetrievedEntity, reranking::RerankerLease}; use super::{ + RetrievalParams, StageKind, StageTimings, config::RetrievalConfig, diagnostics::{AssembleStats, Diagnostics, SearchStats}, - RetrievalParams, StageKind, StageTimings, }; /// Mutable working state threaded through every retrieval stage. diff --git a/retrieval-pipeline/src/pipeline/mod.rs b/retrieval-pipeline/src/pipeline/mod.rs index b833ae1..f872188 100644 --- a/retrieval-pipeline/src/pipeline/mod.rs +++ b/retrieval-pipeline/src/pipeline/mod.rs @@ -6,7 +6,7 @@ mod stages; pub use config::{RetrievalConfig, RetrievalTuning}; pub use diagnostics::Diagnostics; -use crate::{round_score, RetrievalOutput, RetrievedEntity}; +use crate::{RetrievalOutput, RetrievedEntity, round_score}; use async_trait::async_trait; use common::{error::AppError, storage::db::SurrealDbClient}; use std::time::{Duration, Instant}; @@ -188,23 +188,25 @@ pub async fn run_with_embedding_instrumented( } pub fn retrieved_entities_to_json(entities: &[RetrievedEntity]) -> serde_json::Value { - serde_json::json!(entities - .iter() - .map(|entry| { - serde_json::json!({ - "KnowledgeEntity": { - "id": entry.entity.id, - "name": entry.entity.name, - "description": entry.entity.description, - "score": round_score(entry.score), - "chunks": entry.chunks.iter().map(|chunk| { - serde_json::json!({ - "score": round_score(chunk.score), - "content": chunk.chunk.chunk - }) - }).collect::>() - } + serde_json::json!( + entities + .iter() + .map(|entry| { + serde_json::json!({ + "KnowledgeEntity": { + "id": entry.entity.id, + "name": entry.entity.name, + "description": entry.entity.description, + "score": round_score(entry.score), + "chunks": entry.chunks.iter().map(|chunk| { + serde_json::json!({ + "score": round_score(chunk.score), + "content": chunk.chunk.chunk + }) + }).collect::>() + } + }) }) - }) - .collect::>()) + .collect::>() + ) } diff --git a/retrieval-pipeline/src/pipeline/stages.rs b/retrieval-pipeline/src/pipeline/stages.rs index 91e3690..7709502 100644 --- a/retrieval-pipeline/src/pipeline/stages.rs +++ b/retrieval-pipeline/src/pipeline/stages.rs @@ -8,16 +8,16 @@ use std::{collections::HashMap, fmt::Write, sync::Arc}; use tracing::{debug, instrument, warn}; use crate::{ - query::normalize_fts_terms, - scoring::{clamp_unit, min_max_normalize, reciprocal_rank_fusion, RrfConfig, Scored}, RetrievedChunk, RetrievedEntity, + query::normalize_fts_terms, + scoring::{RrfConfig, Scored, clamp_unit, min_max_normalize, reciprocal_rank_fusion}, }; use super::{ + Stage, StageKind, config::RetrievalTuning, context::PipelineContext, diagnostics::{AssembleStats, SearchStats}, - Stage, StageKind, }; #[derive(Debug, Clone, Copy)] diff --git a/retrieval-pipeline/src/reranking.rs b/retrieval-pipeline/src/reranking.rs index f11fb24..f80dcd5 100644 --- a/retrieval-pipeline/src/reranking.rs +++ b/retrieval-pipeline/src/reranking.rs @@ -2,8 +2,8 @@ use std::{ env, fs, path::{Path, PathBuf}, sync::{ - atomic::{AtomicUsize, Ordering}, Arc, Mutex, + atomic::{AtomicUsize, Ordering}, }, thread::available_parallelism, }; diff --git a/retrieval-pipeline/src/scoring.rs b/retrieval-pipeline/src/scoring.rs index 0327c93..4ec5a61 100644 --- a/retrieval-pipeline/src/scoring.rs +++ b/retrieval-pipeline/src/scoring.rs @@ -1,11 +1,11 @@ use std::{ cmp::Ordering, - collections::{hash_map::Entry, HashMap}, + collections::{HashMap, hash_map::Entry}, sync::Arc, }; use common::storage::types::{ - knowledge_entity::KnowledgeEntity, text_chunk::TextChunk, StoredObject, + StoredObject, knowledge_entity::KnowledgeEntity, text_chunk::TextChunk, }; /// Identifier access for retrieval fusion and sorting. diff --git a/rustfmt.toml b/rustfmt.toml index 3a26366..f216078 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1 +1 @@ -edition = "2021" +edition = "2024"