retrieval simplfied

2026-04-11 03:36:53 +02:00 · 2025-12-09 20:35:42 +01:00
parent a8d10f265c
commit a090a8c76e
55 changed files with 469 additions and 1208 deletions
--- a/common/src/utils/config.rs
+++ b/common/src/utils/config.rs
@@ -9,6 +9,7 @@ pub enum StorageKind {
    Memory,
 }

+/// Default storage backend when none is configured.
 fn default_storage_kind() -> StorageKind {
    StorageKind::Local
 }
@@ -23,10 +24,13 @@ pub enum PdfIngestMode {
    LlmFirst,
 }

+/// Default PDF ingestion mode when unset.
 fn default_pdf_ingest_mode() -> PdfIngestMode {
    PdfIngestMode::LlmFirst
 }

+/// Application configuration loaded from files and environment variables.
+#[allow(clippy::module_name_repetitions)]
 #[derive(Clone, Deserialize, Debug)]
 pub struct AppConfig {
    pub openai_api_key: String,
@@ -58,14 +62,17 @@ pub struct AppConfig {
    pub retrieval_strategy: Option<String>,
 }

+/// Default data directory for persisted assets.
 fn default_data_dir() -> String {
    "./data".to_string()
 }

+/// Default base URL used for OpenAI-compatible APIs.
 fn default_base_url() -> String {
    "https://api.openai.com/v1".to_string()
 }

+/// Whether reranking is enabled by default.
 fn default_reranking_enabled() -> bool {
    false
 }
@@ -124,6 +131,8 @@ impl Default for AppConfig {
    }
 }

+/// Loads the application configuration from the environment and optional config file.
+#[allow(clippy::module_name_repetitions)]
 pub fn get_config() -> Result<AppConfig, ConfigError> {
    ensure_ort_path();

--- a/common/src/utils/embedding.rs
+++ b/common/src/utils/embedding.rs
@@ -16,19 +16,16 @@ use crate::{
    storage::{db::SurrealDbClient, types::system_settings::SystemSettings},
 };

-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+/// Supported embedding backends.
+#[allow(clippy::module_name_repetitions)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
 pub enum EmbeddingBackend {
    OpenAI,
+    #[default]
    FastEmbed,
    Hashed,
 }

-impl Default for EmbeddingBackend {
-    fn default() -> Self {
-        Self::FastEmbed
-    }
-}
-
 impl std::str::FromStr for EmbeddingBackend {
    type Err = anyhow::Error;

@@ -44,24 +41,38 @@ impl std::str::FromStr for EmbeddingBackend {
    }
 }

+/// Wrapper around the chosen embedding backend.
+#[allow(clippy::module_name_repetitions)]
 #[derive(Clone)]
 pub struct EmbeddingProvider {
+    /// Concrete backend implementation.
    inner: EmbeddingInner,
 }

+/// Concrete embedding implementations.
 #[derive(Clone)]
 enum EmbeddingInner {
+    /// Uses an `OpenAI`-compatible API.
    OpenAI {
+        /// Client used to issue embedding requests.
        client: Arc<Client<async_openai::config::OpenAIConfig>>,
+        /// Model identifier for the API.
        model: String,
+        /// Expected output dimensions.
        dimensions: u32,
    },
+    /// Generates deterministic hashed embeddings without external calls.
    Hashed {
+        /// Output vector length.
        dimension: usize,
    },
+    /// Uses `FastEmbed` running locally.
    FastEmbed {
+        /// Shared `FastEmbed` model.
        model: Arc<Mutex<TextEmbedding>>,
+        /// Model metadata used for info logging.
        model_name: EmbeddingModel,
+        /// Output vector length.
        dimension: usize,
    },
 }
@@ -77,8 +88,9 @@ impl EmbeddingProvider {

    pub fn dimension(&self) -> usize {
        match &self.inner {
-            EmbeddingInner::Hashed { dimension } => *dimension,
-            EmbeddingInner::FastEmbed { dimension, .. } => *dimension,
+            EmbeddingInner::Hashed { dimension } | EmbeddingInner::FastEmbed { dimension, .. } => {
+                *dimension
+            }
            EmbeddingInner::OpenAI { dimensions, .. } => *dimensions as usize,
        }
    }
@@ -172,12 +184,12 @@ impl EmbeddingProvider {
        }
    }

-    pub async fn new_openai(
+    pub fn new_openai(
        client: Arc<Client<async_openai::config::OpenAIConfig>>,
        model: String,
        dimensions: u32,
    ) -> Result<Self> {
-        Ok(EmbeddingProvider {
+        Ok(Self {
            inner: EmbeddingInner::OpenAI {
                client,
                model,
@@ -226,6 +238,7 @@ impl EmbeddingProvider {
 }

 // Helper functions for hashed embeddings
+/// Generates a hashed embedding vector without external dependencies.
 fn hashed_embedding(text: &str, dimension: usize) -> Vec<f32> {
    let dim = dimension.max(1);
    let mut vector = vec![0.0f32; dim];
@@ -233,15 +246,11 @@ fn hashed_embedding(text: &str, dimension: usize) -> Vec<f32> {
        return vector;
    }

-    let mut token_count = 0f32;
    for token in tokens(text) {
-        token_count += 1.0;
        let idx = bucket(&token, dim);
-        vector[idx] += 1.0;
-    }
-
-    if token_count == 0.0 {
-        return vector;
+        if let Some(slot) = vector.get_mut(idx) {
+            *slot += 1.0;
+        }
    }

    let norm = vector.iter().map(|v| v * v).sum::<f32>().sqrt();
@@ -254,16 +263,22 @@ fn hashed_embedding(text: &str, dimension: usize) -> Vec<f32> {
    vector
 }

+/// Tokenizes the text into alphanumeric lowercase tokens.
 fn tokens(text: &str) -> impl Iterator<Item = String> + '_ {
    text.split(|c: char| !c.is_ascii_alphanumeric())
        .filter(|token| !token.is_empty())
-        .map(|token| token.to_ascii_lowercase())
+        .map(str::to_ascii_lowercase)
 }

+/// Buckets a token into the hashed embedding vector.
+#[allow(clippy::arithmetic_side_effects)]
 fn bucket(token: &str, dimension: usize) -> usize {
+    let safe_dimension = dimension.max(1);
    let mut hasher = DefaultHasher::new();
    token.hash(&mut hasher);
-    (hasher.finish() as usize) % dimension
+    usize::try_from(hasher.finish())
+        .unwrap_or_default()
+        % safe_dimension
 }

 // Backward compatibility function
@@ -274,15 +289,15 @@ pub async fn generate_embedding_with_provider(
    provider.embed(input).await.map_err(AppError::from)
 }

-/// Generates an embedding vector for the given input text using OpenAI's embedding model.
+/// Generates an embedding vector for the given input text using `OpenAI`'s embedding model.
 ///
 /// This function takes a text input and converts it into a numerical vector representation (embedding)
-/// using OpenAI's text-embedding-3-small model. These embeddings can be used for semantic similarity
+/// using `OpenAI`'s text-embedding-3-small model. These embeddings can be used for semantic similarity
 /// comparisons, vector search, and other natural language processing tasks.
 ///
 /// # Arguments
 ///
-/// * `client`: The OpenAI client instance used to make API requests.
+/// * `client`: The `OpenAI` client instance used to make API requests.
 /// * `input`: The text string to generate embeddings for.
 ///
 /// # Returns
@@ -294,9 +309,10 @@ pub async fn generate_embedding_with_provider(
 /// # Errors
 ///
 /// This function can return a `AppError` in the following cases:
-/// * If the OpenAI API request fails
+/// * If the `OpenAI` API request fails
 /// * If the request building fails
 /// * If no embedding data is received in the response
+#[allow(clippy::module_name_repetitions)]
 pub async fn generate_embedding(
    client: &async_openai::Client<async_openai::config::OpenAIConfig>,
    input: &str,
--- a/common/src/utils/template_engine.rs
+++ b/common/src/utils/template_engine.rs
@@ -4,6 +4,7 @@ pub use minijinja_contrib;
 pub use minijinja_embed;
 use std::sync::Arc;

+#[allow(clippy::module_name_repetitions)]
 pub trait ProvidesTemplateEngine {
    fn template_engine(&self) -> &Arc<TemplateEngine>;
 }