mirror of
https://github.com/perstarkse/minne.git
synced 2026-05-28 10:29:30 +02:00
tidying stuff up, dto for search
This commit is contained in:
@@ -428,6 +428,103 @@ impl KnowledgeEntity {
|
||||
info!("Re-embedding process for knowledge entities completed successfully.");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Re-creates embeddings for all knowledge entities using an `EmbeddingProvider`.
|
||||
///
|
||||
/// This variant uses the application's configured embedding provider (FastEmbed, OpenAI, etc.)
|
||||
/// instead of directly calling OpenAI. Used during startup when embedding configuration changes.
|
||||
pub async fn update_all_embeddings_with_provider(
|
||||
db: &SurrealDbClient,
|
||||
provider: &crate::utils::embedding::EmbeddingProvider,
|
||||
) -> Result<(), AppError> {
|
||||
let new_dimensions = provider.dimension();
|
||||
info!(
|
||||
dimensions = new_dimensions,
|
||||
backend = provider.backend_label(),
|
||||
"Starting re-embedding process for all knowledge entities"
|
||||
);
|
||||
|
||||
// Fetch all entities first
|
||||
let all_entities: Vec<KnowledgeEntity> = db.select(Self::table_name()).await?;
|
||||
let total_entities = all_entities.len();
|
||||
if total_entities == 0 {
|
||||
info!("No knowledge entities to update. Just updating the index.");
|
||||
KnowledgeEntityEmbedding::redefine_hnsw_index(db, new_dimensions).await?;
|
||||
return Ok(());
|
||||
}
|
||||
info!(entities = total_entities, "Found entities to process");
|
||||
|
||||
// Generate all new embeddings in memory
|
||||
let mut new_embeddings: HashMap<String, (Vec<f32>, String)> = HashMap::new();
|
||||
info!("Generating new embeddings for all entities...");
|
||||
|
||||
for (i, entity) in all_entities.iter().enumerate() {
|
||||
if i > 0 && i % 100 == 0 {
|
||||
info!(progress = i, total = total_entities, "Re-embedding progress");
|
||||
}
|
||||
|
||||
let embedding_input = format!(
|
||||
"name: {}, description: {}, type: {:?}",
|
||||
entity.name, entity.description, entity.entity_type
|
||||
);
|
||||
|
||||
let embedding = provider
|
||||
.embed(&embedding_input)
|
||||
.await
|
||||
.map_err(|e| AppError::InternalError(format!("Embedding failed: {e}")))?;
|
||||
|
||||
// Safety check: ensure the generated embedding has the correct dimension.
|
||||
if embedding.len() != new_dimensions {
|
||||
let err_msg = format!(
|
||||
"CRITICAL: Generated embedding for entity {} has incorrect dimension ({}). Expected {}. Aborting.",
|
||||
entity.id, embedding.len(), new_dimensions
|
||||
);
|
||||
error!("{}", err_msg);
|
||||
return Err(AppError::InternalError(err_msg));
|
||||
}
|
||||
new_embeddings.insert(entity.id.clone(), (embedding, entity.user_id.clone()));
|
||||
}
|
||||
info!("Successfully generated all new embeddings.");
|
||||
|
||||
// Perform DB updates in a single transaction
|
||||
info!("Applying embedding updates in a transaction...");
|
||||
let mut transaction_query = String::from("BEGIN TRANSACTION;");
|
||||
|
||||
for (id, (embedding, user_id)) in new_embeddings {
|
||||
let embedding_str = format!(
|
||||
"[{}]",
|
||||
embedding
|
||||
.iter()
|
||||
.map(|f| f.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
transaction_query.push_str(&format!(
|
||||
"UPSERT type::thing('knowledge_entity_embedding', '{id}') SET \
|
||||
entity_id = type::thing('knowledge_entity', '{id}'), \
|
||||
embedding = {embedding}, \
|
||||
user_id = '{user_id}', \
|
||||
created_at = IF created_at != NONE THEN created_at ELSE time::now() END, \
|
||||
updated_at = time::now();",
|
||||
id = id,
|
||||
embedding = embedding_str,
|
||||
user_id = user_id
|
||||
));
|
||||
}
|
||||
|
||||
transaction_query.push_str(&format!(
|
||||
"DEFINE INDEX OVERWRITE idx_embedding_knowledge_entity_embedding ON TABLE knowledge_entity_embedding FIELDS embedding HNSW DIMENSION {};",
|
||||
new_dimensions
|
||||
));
|
||||
|
||||
transaction_query.push_str("COMMIT TRANSACTION;");
|
||||
|
||||
// Execute the entire atomic operation
|
||||
db.query(transaction_query).await?;
|
||||
|
||||
info!("Re-embedding process for knowledge entities completed successfully.");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -13,6 +13,9 @@ pub struct SystemSettings {
|
||||
pub processing_model: String,
|
||||
pub embedding_model: String,
|
||||
pub embedding_dimensions: u32,
|
||||
/// Active embedding backend ("openai", "fastembed", "hashed"). Read-only, synced from config.
|
||||
#[serde(default)]
|
||||
pub embedding_backend: Option<String>,
|
||||
pub query_system_prompt: String,
|
||||
pub ingestion_system_prompt: String,
|
||||
pub image_processing_model: String,
|
||||
@@ -49,6 +52,57 @@ impl SystemSettings {
|
||||
"Something went wrong updating the settings".into(),
|
||||
))
|
||||
}
|
||||
|
||||
/// Syncs SystemSettings with the active embedding provider's properties.
|
||||
/// Updates embedding_backend, embedding_model, and embedding_dimensions if they differ.
|
||||
/// Returns true if any settings were changed.
|
||||
pub async fn sync_from_embedding_provider(
|
||||
db: &SurrealDbClient,
|
||||
provider: &crate::utils::embedding::EmbeddingProvider,
|
||||
) -> Result<(Self, bool), AppError> {
|
||||
let mut settings = Self::get_current(db).await?;
|
||||
let mut needs_update = false;
|
||||
|
||||
let backend_label = provider.backend_label().to_string();
|
||||
let provider_dimensions = provider.dimension() as u32;
|
||||
let provider_model = provider.model_code();
|
||||
|
||||
// Sync backend label
|
||||
if settings.embedding_backend.as_deref() != Some(&backend_label) {
|
||||
settings.embedding_backend = Some(backend_label);
|
||||
needs_update = true;
|
||||
}
|
||||
|
||||
// Sync dimensions
|
||||
if settings.embedding_dimensions != provider_dimensions {
|
||||
tracing::info!(
|
||||
old_dimensions = settings.embedding_dimensions,
|
||||
new_dimensions = provider_dimensions,
|
||||
"Embedding dimensions changed, updating SystemSettings"
|
||||
);
|
||||
settings.embedding_dimensions = provider_dimensions;
|
||||
needs_update = true;
|
||||
}
|
||||
|
||||
// Sync model if provider has one
|
||||
if let Some(model) = provider_model {
|
||||
if settings.embedding_model != model {
|
||||
tracing::info!(
|
||||
old_model = %settings.embedding_model,
|
||||
new_model = %model,
|
||||
"Embedding model changed, updating SystemSettings"
|
||||
);
|
||||
settings.embedding_model = model;
|
||||
needs_update = true;
|
||||
}
|
||||
}
|
||||
|
||||
if needs_update {
|
||||
settings = Self::update(db, settings).await?;
|
||||
}
|
||||
|
||||
Ok((settings, needs_update))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -323,6 +323,106 @@ impl TextChunk {
|
||||
info!("Re-embedding process for text chunks completed successfully.");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Re-creates embeddings for all text chunks using an `EmbeddingProvider`.
|
||||
///
|
||||
/// This variant uses the application's configured embedding provider (FastEmbed, OpenAI, etc.)
|
||||
/// instead of directly calling OpenAI. Used during startup when embedding configuration changes.
|
||||
pub async fn update_all_embeddings_with_provider(
|
||||
db: &SurrealDbClient,
|
||||
provider: &crate::utils::embedding::EmbeddingProvider,
|
||||
) -> Result<(), AppError> {
|
||||
let new_dimensions = provider.dimension();
|
||||
info!(
|
||||
dimensions = new_dimensions,
|
||||
backend = provider.backend_label(),
|
||||
"Starting re-embedding process for all text chunks"
|
||||
);
|
||||
|
||||
// Fetch all chunks first
|
||||
let all_chunks: Vec<TextChunk> = db.select(Self::table_name()).await?;
|
||||
let total_chunks = all_chunks.len();
|
||||
if total_chunks == 0 {
|
||||
info!("No text chunks to update. Just updating the index.");
|
||||
TextChunkEmbedding::redefine_hnsw_index(db, new_dimensions).await?;
|
||||
return Ok(());
|
||||
}
|
||||
info!(chunks = total_chunks, "Found chunks to process");
|
||||
|
||||
// Generate all new embeddings in memory
|
||||
let mut new_embeddings: HashMap<String, (Vec<f32>, String, String)> = HashMap::new();
|
||||
info!("Generating new embeddings for all chunks...");
|
||||
|
||||
for (i, chunk) in all_chunks.iter().enumerate() {
|
||||
if i > 0 && i % 100 == 0 {
|
||||
info!(progress = i, total = total_chunks, "Re-embedding progress");
|
||||
}
|
||||
|
||||
let embedding = provider
|
||||
.embed(&chunk.chunk)
|
||||
.await
|
||||
.map_err(|e| AppError::InternalError(format!("Embedding failed: {e}")))?;
|
||||
|
||||
// Safety check: ensure the generated embedding has the correct dimension.
|
||||
if embedding.len() != new_dimensions {
|
||||
let err_msg = format!(
|
||||
"CRITICAL: Generated embedding for chunk {} has incorrect dimension ({}). Expected {}. Aborting.",
|
||||
chunk.id, embedding.len(), new_dimensions
|
||||
);
|
||||
error!("{}", err_msg);
|
||||
return Err(AppError::InternalError(err_msg));
|
||||
}
|
||||
new_embeddings.insert(
|
||||
chunk.id.clone(),
|
||||
(embedding, chunk.user_id.clone(), chunk.source_id.clone()),
|
||||
);
|
||||
}
|
||||
info!("Successfully generated all new embeddings.");
|
||||
|
||||
// Perform DB updates in a single transaction against the embedding table
|
||||
info!("Applying embedding updates in a transaction...");
|
||||
let mut transaction_query = String::from("BEGIN TRANSACTION;");
|
||||
|
||||
for (id, (embedding, user_id, source_id)) in new_embeddings {
|
||||
let embedding_str = format!(
|
||||
"[{}]",
|
||||
embedding
|
||||
.iter()
|
||||
.map(ToString::to_string)
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
write!(
|
||||
&mut transaction_query,
|
||||
"UPSERT type::thing('text_chunk_embedding', '{id}') SET \
|
||||
chunk_id = type::thing('text_chunk', '{id}'), \
|
||||
source_id = '{source_id}', \
|
||||
embedding = {embedding}, \
|
||||
user_id = '{user_id}', \
|
||||
created_at = IF created_at != NONE THEN created_at ELSE time::now() END, \
|
||||
updated_at = time::now();",
|
||||
id = id,
|
||||
embedding = embedding_str,
|
||||
user_id = user_id,
|
||||
source_id = source_id
|
||||
)
|
||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
||||
}
|
||||
|
||||
write!(
|
||||
&mut transaction_query,
|
||||
"DEFINE INDEX OVERWRITE idx_embedding_text_chunk_embedding ON TABLE text_chunk_embedding FIELDS embedding HNSW DIMENSION {};",
|
||||
new_dimensions
|
||||
)
|
||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
||||
|
||||
transaction_query.push_str("COMMIT TRANSACTION;");
|
||||
|
||||
db.query(transaction_query).await?;
|
||||
|
||||
info!("Re-embedding process for text chunks completed successfully.");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
Reference in New Issue
Block a user