Tidy up embedding backend configuration and re-embedding paths; add DTO for search

This commit is contained in:
Per Stark
2025-12-20 22:30:31 +01:00
parent a5bc72aedf
commit 79ea007b0a
23 changed files with 936 additions and 73 deletions

View File

@@ -0,0 +1,8 @@
-- Migration: surface the active embedding backend on system_settings.
-- Add embedding_backend field to system_settings for visibility of active backend
-- (optional string; expected values per config: 'openai', 'fastembed', 'hashed').
DEFINE FIELD IF NOT EXISTS embedding_backend ON system_settings TYPE option<string>;
-- Set default to 'openai' for existing installs to preserve backward compatibility
-- (pre-migration installs could only have used the OpenAI backend).
-- Only fires when the field is still unset (NONE), so re-running is harmless.
UPDATE system_settings:current SET
embedding_backend = 'openai'
WHERE embedding_backend == NONE;

View File

@@ -428,6 +428,103 @@ impl KnowledgeEntity {
info!("Re-embedding process for knowledge entities completed successfully.");
Ok(())
}
/// Regenerates embeddings for every knowledge entity via the configured
/// `EmbeddingProvider` (FastEmbed, OpenAI, hashed, ...), then redefines the
/// HNSW index for the provider's dimensionality.
///
/// Backend-agnostic counterpart of the OpenAI-specific variant above; invoked
/// during startup when the embedding configuration changes.
///
/// # Errors
/// Returns `AppError::InternalError` if embedding generation fails, a vector
/// comes back with the wrong dimension, or the final transaction cannot run.
pub async fn update_all_embeddings_with_provider(
    db: &SurrealDbClient,
    provider: &crate::utils::embedding::EmbeddingProvider,
) -> Result<(), AppError> {
    let target_dims = provider.dimension();
    info!(
        dimensions = target_dims,
        backend = provider.backend_label(),
        "Starting re-embedding process for all knowledge entities"
    );

    // Load every entity up front so vector generation runs before any write.
    let entities: Vec<KnowledgeEntity> = db.select(Self::table_name()).await?;
    let total = entities.len();
    if entities.is_empty() {
        info!("No knowledge entities to update. Just updating the index.");
        KnowledgeEntityEmbedding::redefine_hnsw_index(db, target_dims).await?;
        return Ok(());
    }
    info!(entities = total, "Found entities to process");

    // Phase 1: compute all vectors in memory. Nothing has been written yet,
    // so a mid-run failure leaves the database untouched.
    let mut computed: HashMap<String, (Vec<f32>, String)> = HashMap::new();
    info!("Generating new embeddings for all entities...");
    for (index, entity) in entities.iter().enumerate() {
        if index > 0 && index % 100 == 0 {
            info!(progress = index, total = total, "Re-embedding progress");
        }
        let input = format!(
            "name: {}, description: {}, type: {:?}",
            entity.name, entity.description, entity.entity_type
        );
        let vector = provider
            .embed(&input)
            .await
            .map_err(|e| AppError::InternalError(format!("Embedding failed: {e}")))?;
        // Guard: a mismatched vector would not fit the index being defined below.
        if vector.len() != target_dims {
            let err_msg = format!(
                "CRITICAL: Generated embedding for entity {} has incorrect dimension ({}). Expected {}. Aborting.",
                entity.id, vector.len(), target_dims
            );
            error!("{}", err_msg);
            return Err(AppError::InternalError(err_msg));
        }
        computed.insert(entity.id.clone(), (vector, entity.user_id.clone()));
    }
    info!("Successfully generated all new embeddings.");

    // Phase 2: a single transaction upserts every row and swaps the index.
    info!("Applying embedding updates in a transaction...");
    let mut tx = String::from("BEGIN TRANSACTION;");
    for (id, (vector, user_id)) in computed {
        let parts: Vec<String> = vector.iter().map(|f| f.to_string()).collect();
        let embedding = format!("[{}]", parts.join(","));
        tx.push_str(&format!(
            "UPSERT type::thing('knowledge_entity_embedding', '{id}') SET \
            entity_id = type::thing('knowledge_entity', '{id}'), \
            embedding = {embedding}, \
            user_id = '{user_id}', \
            created_at = IF created_at != NONE THEN created_at ELSE time::now() END, \
            updated_at = time::now();"
        ));
    }
    tx.push_str(&format!(
        "DEFINE INDEX OVERWRITE idx_embedding_knowledge_entity_embedding ON TABLE knowledge_entity_embedding FIELDS embedding HNSW DIMENSION {target_dims};"
    ));
    tx.push_str("COMMIT TRANSACTION;");

    // One round-trip: either every upsert plus the index change lands, or none.
    db.query(tx).await?;
    info!("Re-embedding process for knowledge entities completed successfully.");
    Ok(())
}
}
#[cfg(test)]

View File

@@ -13,6 +13,9 @@ pub struct SystemSettings {
pub processing_model: String,
pub embedding_model: String,
pub embedding_dimensions: u32,
/// Active embedding backend ("openai", "fastembed", "hashed"). Read-only, synced from config.
#[serde(default)]
pub embedding_backend: Option<String>,
pub query_system_prompt: String,
pub ingestion_system_prompt: String,
pub image_processing_model: String,
@@ -49,6 +52,57 @@ impl SystemSettings {
"Something went wrong updating the settings".into(),
))
}
/// Syncs SystemSettings with the active embedding provider's properties.
/// Updates embedding_backend, embedding_model, and embedding_dimensions if they differ.
/// Returns true if any settings were changed.
pub async fn sync_from_embedding_provider(
db: &SurrealDbClient,
provider: &crate::utils::embedding::EmbeddingProvider,
) -> Result<(Self, bool), AppError> {
let mut settings = Self::get_current(db).await?;
let mut needs_update = false;
let backend_label = provider.backend_label().to_string();
let provider_dimensions = provider.dimension() as u32;
let provider_model = provider.model_code();
// Sync backend label
if settings.embedding_backend.as_deref() != Some(&backend_label) {
settings.embedding_backend = Some(backend_label);
needs_update = true;
}
// Sync dimensions
if settings.embedding_dimensions != provider_dimensions {
tracing::info!(
old_dimensions = settings.embedding_dimensions,
new_dimensions = provider_dimensions,
"Embedding dimensions changed, updating SystemSettings"
);
settings.embedding_dimensions = provider_dimensions;
needs_update = true;
}
// Sync model if provider has one
if let Some(model) = provider_model {
if settings.embedding_model != model {
tracing::info!(
old_model = %settings.embedding_model,
new_model = %model,
"Embedding model changed, updating SystemSettings"
);
settings.embedding_model = model;
needs_update = true;
}
}
if needs_update {
settings = Self::update(db, settings).await?;
}
Ok((settings, needs_update))
}
}
#[cfg(test)]

View File

@@ -323,6 +323,106 @@ impl TextChunk {
info!("Re-embedding process for text chunks completed successfully.");
Ok(())
}
/// Regenerates embeddings for every text chunk via the configured
/// `EmbeddingProvider` (FastEmbed, OpenAI, hashed, ...), then redefines the
/// HNSW index for the provider's dimensionality.
///
/// Backend-agnostic counterpart of the OpenAI-specific variant above; invoked
/// during startup when the embedding configuration changes.
///
/// # Errors
/// Returns `AppError::InternalError` if embedding generation fails, a vector
/// has the wrong dimension, or the transaction cannot be built or executed.
pub async fn update_all_embeddings_with_provider(
    db: &SurrealDbClient,
    provider: &crate::utils::embedding::EmbeddingProvider,
) -> Result<(), AppError> {
    let target_dims = provider.dimension();
    info!(
        dimensions = target_dims,
        backend = provider.backend_label(),
        "Starting re-embedding process for all text chunks"
    );

    // Read everything first; all vectors are generated before any write.
    let stored_chunks: Vec<TextChunk> = db.select(Self::table_name()).await?;
    let total = stored_chunks.len();
    if stored_chunks.is_empty() {
        info!("No text chunks to update. Just updating the index.");
        TextChunkEmbedding::redefine_hnsw_index(db, target_dims).await?;
        return Ok(());
    }
    info!(chunks = total, "Found chunks to process");

    // Phase 1: compute every vector in memory, keyed by chunk id.
    let mut prepared: HashMap<String, (Vec<f32>, String, String)> = HashMap::new();
    info!("Generating new embeddings for all chunks...");
    for (index, chunk) in stored_chunks.iter().enumerate() {
        if index > 0 && index % 100 == 0 {
            info!(progress = index, total = total, "Re-embedding progress");
        }
        let vector = provider
            .embed(&chunk.chunk)
            .await
            .map_err(|e| AppError::InternalError(format!("Embedding failed: {e}")))?;
        // Guard: a mismatched vector would not fit the index being defined below.
        if vector.len() != target_dims {
            let err_msg = format!(
                "CRITICAL: Generated embedding for chunk {} has incorrect dimension ({}). Expected {}. Aborting.",
                chunk.id, vector.len(), target_dims
            );
            error!("{}", err_msg);
            return Err(AppError::InternalError(err_msg));
        }
        prepared.insert(
            chunk.id.clone(),
            (vector, chunk.user_id.clone(), chunk.source_id.clone()),
        );
    }
    info!("Successfully generated all new embeddings.");

    // Phase 2: a single transaction upserts every row and swaps the index.
    info!("Applying embedding updates in a transaction...");
    let mut query_buf = String::from("BEGIN TRANSACTION;");
    for (id, (vector, user_id, source_id)) in prepared {
        let parts: Vec<String> = vector.iter().map(ToString::to_string).collect();
        let embedding = format!("[{}]", parts.join(","));
        write!(
            &mut query_buf,
            "UPSERT type::thing('text_chunk_embedding', '{id}') SET \
            chunk_id = type::thing('text_chunk', '{id}'), \
            source_id = '{source_id}', \
            embedding = {embedding}, \
            user_id = '{user_id}', \
            created_at = IF created_at != NONE THEN created_at ELSE time::now() END, \
            updated_at = time::now();"
        )
        .map_err(|e| AppError::InternalError(e.to_string()))?;
    }
    write!(
        &mut query_buf,
        "DEFINE INDEX OVERWRITE idx_embedding_text_chunk_embedding ON TABLE text_chunk_embedding FIELDS embedding HNSW DIMENSION {target_dims};"
    )
    .map_err(|e| AppError::InternalError(e.to_string()))?;
    query_buf.push_str("COMMIT TRANSACTION;");

    // One round-trip: either every upsert plus the index change lands, or none.
    db.query(query_buf).await?;
    info!("Re-embedding process for text chunks completed successfully.");
    Ok(())
}
}
#[cfg(test)]

View File

@@ -2,6 +2,19 @@ use config::{Config, ConfigError, Environment, File};
use serde::Deserialize;
use std::env;
/// Selects the embedding backend for vector generation.
///
/// Deserialized from application config; `rename_all = "lowercase"` means the
/// accepted config values are "openai", "fastembed", and "hashed". When the
/// config key is absent, `FastEmbed` is used (via `Default`).
#[derive(Clone, Deserialize, Debug, Default, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum EmbeddingBackend {
    /// Use OpenAI-compatible API for embeddings.
    OpenAI,
    /// Use FastEmbed local embeddings (default).
    #[default]
    FastEmbed,
    /// Use deterministic hashed embeddings (for testing).
    Hashed,
}
#[derive(Clone, Deserialize, Debug, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum StorageKind {
@@ -60,6 +73,8 @@ pub struct AppConfig {
pub fastembed_max_length: Option<usize>,
#[serde(default)]
pub retrieval_strategy: Option<String>,
#[serde(default)]
pub embedding_backend: EmbeddingBackend,
}
/// Default data directory for persisted assets.
@@ -127,6 +142,7 @@ impl Default for AppConfig {
fastembed_show_download_progress: None,
fastembed_max_length: None,
retrieval_strategy: None,
embedding_backend: EmbeddingBackend::default(),
}
}
}

View File

@@ -235,6 +235,34 @@ impl EmbeddingProvider {
},
})
}
/// Creates an embedding provider based on application configuration.
///
/// Dispatches on `config.embedding_backend`:
/// - `OpenAI`: requires a valid OpenAI client (errors otherwise)
/// - `FastEmbed`: uses the local embedding model
/// - `Hashed`: uses deterministic hashed embeddings (for testing)
pub async fn from_config(
    config: &crate::utils::config::AppConfig,
    openai_client: Option<Arc<Client<async_openai::config::OpenAIConfig>>>,
) -> Result<Self> {
    use crate::utils::config::EmbeddingBackend;

    match config.embedding_backend {
        EmbeddingBackend::OpenAI => match openai_client {
            // Defaults mirror the initial SystemSettings values.
            Some(client) => {
                Self::new_openai(client, String::from("text-embedding-3-small"), 1536)
            }
            None => Err(anyhow!("OpenAI embedding backend requires an OpenAI client")),
        },
        // nomic-embed-text-v1.5 is the default FastEmbed model.
        EmbeddingBackend::FastEmbed => {
            Self::new_fastembed(Some(String::from("nomic-ai/nomic-embed-text-v1.5"))).await
        }
        EmbeddingBackend::Hashed => Self::new_hashed(384),
    }
}
}
// Helper functions for hashed embeddings