Tidy up embedding backend configuration and re-embedding paths; add DTO for search

This commit is contained in:
Per Stark
2025-12-20 22:30:31 +01:00
parent a5bc72aedf
commit 79ea007b0a
23 changed files with 936 additions and 73 deletions

View File

@@ -0,0 +1,8 @@
-- Migration: surface the active embedding backend on system_settings.
-- Add embedding_backend field to system_settings for visibility of active backend
-- (optional string; expected values per config: 'openai', 'fastembed', 'hashed').
DEFINE FIELD IF NOT EXISTS embedding_backend ON system_settings TYPE option<string>;
-- Set default to 'openai' for existing installs to preserve backward compatibility
-- (pre-migration installs could only have used the OpenAI backend).
-- Only fires when the field is still unset (NONE), so re-running is harmless.
UPDATE system_settings:current SET
embedding_backend = 'openai'
WHERE embedding_backend == NONE;

View File

@@ -428,6 +428,103 @@ impl KnowledgeEntity {
info!("Re-embedding process for knowledge entities completed successfully.");
Ok(())
}
/// Regenerates embeddings for every knowledge entity via the configured
/// `EmbeddingProvider` (FastEmbed, OpenAI, hashed, ...), then redefines the
/// HNSW index for the provider's dimensionality.
///
/// Backend-agnostic counterpart of the OpenAI-specific variant above; invoked
/// during startup when the embedding configuration changes.
///
/// # Errors
/// Returns `AppError::InternalError` if embedding generation fails, a vector
/// comes back with the wrong dimension, or the final transaction cannot run.
pub async fn update_all_embeddings_with_provider(
    db: &SurrealDbClient,
    provider: &crate::utils::embedding::EmbeddingProvider,
) -> Result<(), AppError> {
    let target_dims = provider.dimension();
    info!(
        dimensions = target_dims,
        backend = provider.backend_label(),
        "Starting re-embedding process for all knowledge entities"
    );

    // Load every entity up front so vector generation runs before any write.
    let entities: Vec<KnowledgeEntity> = db.select(Self::table_name()).await?;
    let total = entities.len();
    if entities.is_empty() {
        info!("No knowledge entities to update. Just updating the index.");
        KnowledgeEntityEmbedding::redefine_hnsw_index(db, target_dims).await?;
        return Ok(());
    }
    info!(entities = total, "Found entities to process");

    // Phase 1: compute all vectors in memory. Nothing has been written yet,
    // so a mid-run failure leaves the database untouched.
    let mut computed: HashMap<String, (Vec<f32>, String)> = HashMap::new();
    info!("Generating new embeddings for all entities...");
    for (index, entity) in entities.iter().enumerate() {
        if index > 0 && index % 100 == 0 {
            info!(progress = index, total = total, "Re-embedding progress");
        }
        let input = format!(
            "name: {}, description: {}, type: {:?}",
            entity.name, entity.description, entity.entity_type
        );
        let vector = provider
            .embed(&input)
            .await
            .map_err(|e| AppError::InternalError(format!("Embedding failed: {e}")))?;
        // Guard: a mismatched vector would not fit the index being defined below.
        if vector.len() != target_dims {
            let err_msg = format!(
                "CRITICAL: Generated embedding for entity {} has incorrect dimension ({}). Expected {}. Aborting.",
                entity.id, vector.len(), target_dims
            );
            error!("{}", err_msg);
            return Err(AppError::InternalError(err_msg));
        }
        computed.insert(entity.id.clone(), (vector, entity.user_id.clone()));
    }
    info!("Successfully generated all new embeddings.");

    // Phase 2: a single transaction upserts every row and swaps the index.
    info!("Applying embedding updates in a transaction...");
    let mut tx = String::from("BEGIN TRANSACTION;");
    for (id, (vector, user_id)) in computed {
        let parts: Vec<String> = vector.iter().map(|f| f.to_string()).collect();
        let embedding = format!("[{}]", parts.join(","));
        tx.push_str(&format!(
            "UPSERT type::thing('knowledge_entity_embedding', '{id}') SET \
            entity_id = type::thing('knowledge_entity', '{id}'), \
            embedding = {embedding}, \
            user_id = '{user_id}', \
            created_at = IF created_at != NONE THEN created_at ELSE time::now() END, \
            updated_at = time::now();"
        ));
    }
    tx.push_str(&format!(
        "DEFINE INDEX OVERWRITE idx_embedding_knowledge_entity_embedding ON TABLE knowledge_entity_embedding FIELDS embedding HNSW DIMENSION {target_dims};"
    ));
    tx.push_str("COMMIT TRANSACTION;");

    // One round-trip: either every upsert plus the index change lands, or none.
    db.query(tx).await?;
    info!("Re-embedding process for knowledge entities completed successfully.");
    Ok(())
}
}
#[cfg(test)]

View File

@@ -13,6 +13,9 @@ pub struct SystemSettings {
pub processing_model: String,
pub embedding_model: String,
pub embedding_dimensions: u32,
/// Active embedding backend ("openai", "fastembed", "hashed"). Read-only, synced from config.
#[serde(default)]
pub embedding_backend: Option<String>,
pub query_system_prompt: String,
pub ingestion_system_prompt: String,
pub image_processing_model: String,
@@ -49,6 +52,57 @@ impl SystemSettings {
"Something went wrong updating the settings".into(),
))
}
/// Syncs SystemSettings with the active embedding provider's properties.
/// Updates embedding_backend, embedding_model, and embedding_dimensions if they differ.
/// Returns true if any settings were changed.
pub async fn sync_from_embedding_provider(
db: &SurrealDbClient,
provider: &crate::utils::embedding::EmbeddingProvider,
) -> Result<(Self, bool), AppError> {
let mut settings = Self::get_current(db).await?;
let mut needs_update = false;
let backend_label = provider.backend_label().to_string();
let provider_dimensions = provider.dimension() as u32;
let provider_model = provider.model_code();
// Sync backend label
if settings.embedding_backend.as_deref() != Some(&backend_label) {
settings.embedding_backend = Some(backend_label);
needs_update = true;
}
// Sync dimensions
if settings.embedding_dimensions != provider_dimensions {
tracing::info!(
old_dimensions = settings.embedding_dimensions,
new_dimensions = provider_dimensions,
"Embedding dimensions changed, updating SystemSettings"
);
settings.embedding_dimensions = provider_dimensions;
needs_update = true;
}
// Sync model if provider has one
if let Some(model) = provider_model {
if settings.embedding_model != model {
tracing::info!(
old_model = %settings.embedding_model,
new_model = %model,
"Embedding model changed, updating SystemSettings"
);
settings.embedding_model = model;
needs_update = true;
}
}
if needs_update {
settings = Self::update(db, settings).await?;
}
Ok((settings, needs_update))
}
}
#[cfg(test)]

View File

@@ -323,6 +323,106 @@ impl TextChunk {
info!("Re-embedding process for text chunks completed successfully.");
Ok(())
}
/// Regenerates embeddings for every text chunk via the configured
/// `EmbeddingProvider` (FastEmbed, OpenAI, hashed, ...), then redefines the
/// HNSW index for the provider's dimensionality.
///
/// Backend-agnostic counterpart of the OpenAI-specific variant above; invoked
/// during startup when the embedding configuration changes.
///
/// # Errors
/// Returns `AppError::InternalError` if embedding generation fails, a vector
/// has the wrong dimension, or the transaction cannot be built or executed.
pub async fn update_all_embeddings_with_provider(
    db: &SurrealDbClient,
    provider: &crate::utils::embedding::EmbeddingProvider,
) -> Result<(), AppError> {
    let target_dims = provider.dimension();
    info!(
        dimensions = target_dims,
        backend = provider.backend_label(),
        "Starting re-embedding process for all text chunks"
    );

    // Read everything first; all vectors are generated before any write.
    let stored_chunks: Vec<TextChunk> = db.select(Self::table_name()).await?;
    let total = stored_chunks.len();
    if stored_chunks.is_empty() {
        info!("No text chunks to update. Just updating the index.");
        TextChunkEmbedding::redefine_hnsw_index(db, target_dims).await?;
        return Ok(());
    }
    info!(chunks = total, "Found chunks to process");

    // Phase 1: compute every vector in memory, keyed by chunk id.
    let mut prepared: HashMap<String, (Vec<f32>, String, String)> = HashMap::new();
    info!("Generating new embeddings for all chunks...");
    for (index, chunk) in stored_chunks.iter().enumerate() {
        if index > 0 && index % 100 == 0 {
            info!(progress = index, total = total, "Re-embedding progress");
        }
        let vector = provider
            .embed(&chunk.chunk)
            .await
            .map_err(|e| AppError::InternalError(format!("Embedding failed: {e}")))?;
        // Guard: a mismatched vector would not fit the index being defined below.
        if vector.len() != target_dims {
            let err_msg = format!(
                "CRITICAL: Generated embedding for chunk {} has incorrect dimension ({}). Expected {}. Aborting.",
                chunk.id, vector.len(), target_dims
            );
            error!("{}", err_msg);
            return Err(AppError::InternalError(err_msg));
        }
        prepared.insert(
            chunk.id.clone(),
            (vector, chunk.user_id.clone(), chunk.source_id.clone()),
        );
    }
    info!("Successfully generated all new embeddings.");

    // Phase 2: a single transaction upserts every row and swaps the index.
    info!("Applying embedding updates in a transaction...");
    let mut query_buf = String::from("BEGIN TRANSACTION;");
    for (id, (vector, user_id, source_id)) in prepared {
        let parts: Vec<String> = vector.iter().map(ToString::to_string).collect();
        let embedding = format!("[{}]", parts.join(","));
        write!(
            &mut query_buf,
            "UPSERT type::thing('text_chunk_embedding', '{id}') SET \
            chunk_id = type::thing('text_chunk', '{id}'), \
            source_id = '{source_id}', \
            embedding = {embedding}, \
            user_id = '{user_id}', \
            created_at = IF created_at != NONE THEN created_at ELSE time::now() END, \
            updated_at = time::now();"
        )
        .map_err(|e| AppError::InternalError(e.to_string()))?;
    }
    write!(
        &mut query_buf,
        "DEFINE INDEX OVERWRITE idx_embedding_text_chunk_embedding ON TABLE text_chunk_embedding FIELDS embedding HNSW DIMENSION {target_dims};"
    )
    .map_err(|e| AppError::InternalError(e.to_string()))?;
    query_buf.push_str("COMMIT TRANSACTION;");

    // One round-trip: either every upsert plus the index change lands, or none.
    db.query(query_buf).await?;
    info!("Re-embedding process for text chunks completed successfully.");
    Ok(())
}
}
#[cfg(test)]

View File

@@ -2,6 +2,19 @@ use config::{Config, ConfigError, Environment, File};
use serde::Deserialize;
use std::env;
/// Selects the embedding backend for vector generation.
///
/// Deserialized from application config; `rename_all = "lowercase"` means the
/// accepted config values are "openai", "fastembed", and "hashed". When the
/// config key is absent, `FastEmbed` is used (via `Default`).
#[derive(Clone, Deserialize, Debug, Default, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum EmbeddingBackend {
    /// Use OpenAI-compatible API for embeddings.
    OpenAI,
    /// Use FastEmbed local embeddings (default).
    #[default]
    FastEmbed,
    /// Use deterministic hashed embeddings (for testing).
    Hashed,
}
#[derive(Clone, Deserialize, Debug, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum StorageKind {
@@ -60,6 +73,8 @@ pub struct AppConfig {
pub fastembed_max_length: Option<usize>,
#[serde(default)]
pub retrieval_strategy: Option<String>,
#[serde(default)]
pub embedding_backend: EmbeddingBackend,
}
/// Default data directory for persisted assets.
@@ -127,6 +142,7 @@ impl Default for AppConfig {
fastembed_show_download_progress: None,
fastembed_max_length: None,
retrieval_strategy: None,
embedding_backend: EmbeddingBackend::default(),
}
}
}

View File

@@ -235,6 +235,34 @@ impl EmbeddingProvider {
},
})
}
/// Creates an embedding provider based on application configuration.
///
/// Dispatches on `config.embedding_backend`:
/// - `OpenAI`: requires a valid OpenAI client (errors otherwise)
/// - `FastEmbed`: uses the local embedding model
/// - `Hashed`: uses deterministic hashed embeddings (for testing)
pub async fn from_config(
    config: &crate::utils::config::AppConfig,
    openai_client: Option<Arc<Client<async_openai::config::OpenAIConfig>>>,
) -> Result<Self> {
    use crate::utils::config::EmbeddingBackend;

    match config.embedding_backend {
        EmbeddingBackend::OpenAI => match openai_client {
            // Defaults mirror the initial SystemSettings values.
            Some(client) => {
                Self::new_openai(client, String::from("text-embedding-3-small"), 1536)
            }
            None => Err(anyhow!("OpenAI embedding backend requires an OpenAI client")),
        },
        // nomic-embed-text-v1.5 is the default FastEmbed model.
        EmbeddingBackend::FastEmbed => {
            Self::new_fastembed(Some(String::from("nomic-ai/nomic-embed-text-v1.5"))).await
        }
        EmbeddingBackend::Hashed => Self::new_hashed(384),
    }
}
}
// Helper functions for hashed embeddings