mirror of
https://github.com/perstarkse/minne.git
synced 2026-06-30 10:01:40 +02:00
fix: arc-share retrieved chunks, centralize entity embeddings, and trim hot-path clones.
This commit is contained in:
@@ -3,9 +3,12 @@ use std::collections::HashMap;
|
||||
use std::fmt::Write;
|
||||
|
||||
use crate::{
|
||||
error::AppError, storage::db::SurrealDbClient, storage::indexes::hnsw_index_overwrite_sql,
|
||||
error::AppError,
|
||||
storage::db::SurrealDbClient,
|
||||
storage::indexes::hnsw_index_overwrite_sql,
|
||||
storage::types::knowledge_entity_embedding::KnowledgeEntityEmbedding,
|
||||
storage::types::system_settings::SystemSettings, stored_object,
|
||||
storage::types::system_settings::SystemSettings,
|
||||
stored_object,
|
||||
utils::embedding::{EmbeddingProvider, RE_EMBED_BATCH_SIZE},
|
||||
};
|
||||
use tracing::{error, info};
|
||||
@@ -25,6 +28,17 @@ impl KnowledgeEntityType {
|
||||
pub fn variants() -> &'static [&'static str] {
|
||||
&["Idea", "Project", "Document", "Page", "TextSnippet"]
|
||||
}
|
||||
|
||||
/// Returns the static string label for this entity type.
///
/// Each variant maps to a label spelled exactly like the variant name
/// (`Idea`, `Project`, `Document`, `Page`, `TextSnippet`), the same strings
/// listed by `variants()`. The match is exhaustive with no catch-all arm.
#[must_use]
|
||||
pub const fn as_str(self) -> &'static str {
|
||||
match self {
|
||||
Self::Idea => "Idea",
|
||||
Self::Project => "Project",
|
||||
Self::Document => "Document",
|
||||
Self::Page => "Page",
|
||||
Self::TextSnippet => "TextSnippet",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<String> for KnowledgeEntityType {
|
||||
@@ -80,6 +94,27 @@ impl KnowledgeEntity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Canonical text fed to the embedding provider for a knowledge entity.
|
||||
#[must_use]
|
||||
pub fn embedding_input_text(
|
||||
name: &str,
|
||||
description: &str,
|
||||
entity_type: KnowledgeEntityType,
|
||||
) -> String {
|
||||
let mut out = String::with_capacity(
|
||||
name.len()
|
||||
.saturating_add(description.len())
|
||||
.saturating_add(entity_type.as_str().len())
|
||||
.saturating_add(32),
|
||||
);
|
||||
let _ = write!(
|
||||
out,
|
||||
"name: {name}, description: {description}, type: {}",
|
||||
entity_type.as_str()
|
||||
);
|
||||
out
|
||||
}
|
||||
|
||||
/// Full-text search over knowledge entities using the BM25 FTS index.
|
||||
pub async fn fts_search(
|
||||
take: usize,
|
||||
@@ -314,8 +349,7 @@ impl KnowledgeEntity {
|
||||
db_client: &SurrealDbClient,
|
||||
embedding_provider: &EmbeddingProvider,
|
||||
) -> Result<(), AppError> {
|
||||
let embedding_input =
|
||||
format!("name: {name}, description: {description}, type: {entity_type:?}",);
|
||||
let embedding_input = Self::embedding_input_text(name, description, *entity_type);
|
||||
let embedding = embedding_provider.embed(&embedding_input).await?;
|
||||
|
||||
let entity: KnowledgeEntity = db_client
|
||||
@@ -402,9 +436,10 @@ impl KnowledgeEntity {
|
||||
let inputs: Vec<String> = batch
|
||||
.iter()
|
||||
.map(|entity| {
|
||||
format!(
|
||||
"name: {}, description: {}, type: {:?}",
|
||||
entity.name, entity.description, entity.entity_type
|
||||
Self::embedding_input_text(
|
||||
&entity.name,
|
||||
&entity.description,
|
||||
entity.entity_type,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
@@ -523,6 +558,16 @@ mod tests {
|
||||
use anyhow::{self, Context};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// The embedding input must end with the entity type's canonical label.
#[test]
fn embedding_input_text_uses_canonical_type_label() {
    let expected = "name: Alpha, description: Beta, type: TextSnippet";

    let actual =
        KnowledgeEntity::embedding_input_text("Alpha", "Beta", KnowledgeEntityType::TextSnippet);

    assert_eq!(actual, expected);
}
|
||||
|
||||
async fn ensure_entity_fts_indexes(db: &SurrealDbClient) -> anyhow::Result<()> {
|
||||
let snowball_sql = r#"
|
||||
DEFINE ANALYZER IF NOT EXISTS app_en_fts_analyzer TOKENIZERS class, punct FILTERS lowercase, ascii, snowball(english);
|
||||
|
||||
@@ -122,8 +122,7 @@ impl KnowledgeRelationship {
|
||||
.bind(("user_id", user_id.to_owned()))
|
||||
.await
|
||||
.map_err(AppError::from)?;
|
||||
let deleted: Vec<KnowledgeRelationship> =
|
||||
delete_result.take(0).map_err(AppError::from)?;
|
||||
let deleted: Vec<KnowledgeRelationship> = delete_result.take(0).map_err(AppError::from)?;
|
||||
|
||||
if !deleted.is_empty() {
|
||||
return Ok(());
|
||||
@@ -567,8 +566,8 @@ mod tests {
|
||||
shared_source.to_string(),
|
||||
"references".to_string(),
|
||||
);
|
||||
let rel_a_id = rel_a.id.clone();
|
||||
let rel_b_id = rel_b.id.clone();
|
||||
let owner_relationship_id = rel_a.id.clone();
|
||||
let other_relationship_id = rel_b.id.clone();
|
||||
|
||||
rel_a.store_relationship(&db).await?;
|
||||
rel_b.store_relationship(&db).await?;
|
||||
@@ -576,8 +575,12 @@ mod tests {
|
||||
KnowledgeRelationship::delete_relationships_by_source_id(shared_source, user_a, &db)
|
||||
.await?;
|
||||
|
||||
assert!(get_relationship_by_id(&rel_a_id, &db).await.is_none());
|
||||
assert!(get_relationship_by_id(&rel_b_id, &db).await.is_some());
|
||||
assert!(get_relationship_by_id(&owner_relationship_id, &db)
|
||||
.await
|
||||
.is_none());
|
||||
assert!(get_relationship_by_id(&other_relationship_id, &db)
|
||||
.await
|
||||
.is_some());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -299,7 +299,11 @@ impl TextChunk {
|
||||
}
|
||||
|
||||
processed = processed.saturating_add(batch.len());
|
||||
info!(progress = processed, total = total_chunks, "Re-embedding progress");
|
||||
info!(
|
||||
progress = processed,
|
||||
total = total_chunks,
|
||||
"Re-embedding progress"
|
||||
);
|
||||
}
|
||||
info!("Successfully generated all new embeddings.");
|
||||
|
||||
|
||||
@@ -140,8 +140,7 @@ impl TextContent {
|
||||
.await
|
||||
.map_err(AppError::from)?;
|
||||
|
||||
let existing: Option<surrealdb::sql::Thing> =
|
||||
response.take(0).map_err(AppError::from)?;
|
||||
let existing: Option<surrealdb::sql::Thing> = response.take(0).map_err(AppError::from)?;
|
||||
|
||||
Ok(existing.is_some())
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@ enum EmbeddingInner {
|
||||
/// Client used to issue embedding requests.
|
||||
client: Arc<Client<async_openai::config::OpenAIConfig>>,
|
||||
/// Model identifier for the API.
|
||||
model: String,
|
||||
model: Arc<str>,
|
||||
/// Expected output dimensions.
|
||||
dimensions: u32,
|
||||
},
|
||||
@@ -272,8 +272,9 @@ struct FastEmbedLease {
|
||||
}
|
||||
|
||||
impl FastEmbedLease {
|
||||
async fn embed(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, EmbeddingError> {
|
||||
async fn embed(&self, texts: &[String]) -> Result<Vec<Vec<f32>>, EmbeddingError> {
|
||||
let engine = Arc::clone(&self.engine);
|
||||
let texts = texts.to_vec();
|
||||
tokio::task::spawn_blocking(move || -> Result<Vec<Vec<f32>>, EmbeddingError> {
|
||||
let mut guard = engine.lock().map_err(EmbeddingError::mutex_poisoned)?;
|
||||
guard.embed(texts, None).map_err(EmbeddingError::fastembed)
|
||||
@@ -293,7 +294,7 @@ impl Drop for FastEmbedLease {
|
||||
|
||||
async fn run_fastembed(
|
||||
pool: &Arc<FastEmbedPool>,
|
||||
texts: Vec<String>,
|
||||
texts: &[String],
|
||||
) -> Result<Vec<Vec<f32>>, EmbeddingError> {
|
||||
let lease = pool.checkout().await?;
|
||||
lease.embed(texts).await
|
||||
@@ -323,7 +324,7 @@ impl EmbeddingProvider {
|
||||
pub fn model_code(&self) -> Option<String> {
|
||||
match &self.inner {
|
||||
EmbeddingInner::FastEmbed { model_name, .. } => Some(model_name.to_string()),
|
||||
EmbeddingInner::OpenAI { model, .. } => Some(model.clone()),
|
||||
EmbeddingInner::OpenAI { model, .. } => Some(model.as_ref().to_owned()),
|
||||
EmbeddingInner::Hashed { .. } => None,
|
||||
}
|
||||
}
|
||||
@@ -338,7 +339,8 @@ impl EmbeddingProvider {
|
||||
match &self.inner {
|
||||
EmbeddingInner::Hashed { dimension } => Ok(hashed_embedding(text, *dimension)),
|
||||
EmbeddingInner::FastEmbed { pool, .. } => {
|
||||
let embeddings = run_fastembed(pool, vec![text.to_owned()]).await?;
|
||||
let text = text.to_owned();
|
||||
let embeddings = run_fastembed(pool, std::slice::from_ref(&text)).await?;
|
||||
embeddings.into_iter().next().ok_or(EmbeddingError::NoData)
|
||||
}
|
||||
EmbeddingInner::OpenAI {
|
||||
@@ -347,7 +349,7 @@ impl EmbeddingProvider {
|
||||
dimensions,
|
||||
} => {
|
||||
let request = CreateEmbeddingRequestArgs::default()
|
||||
.model(model.clone())
|
||||
.model(model.as_ref())
|
||||
.input([text])
|
||||
.dimensions(*dimensions)
|
||||
.build()?;
|
||||
@@ -382,7 +384,7 @@ impl EmbeddingProvider {
|
||||
if texts.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
run_fastembed(pool, texts.to_vec()).await
|
||||
run_fastembed(pool, texts).await
|
||||
}
|
||||
EmbeddingInner::OpenAI {
|
||||
client,
|
||||
@@ -394,7 +396,7 @@ impl EmbeddingProvider {
|
||||
}
|
||||
|
||||
let request = CreateEmbeddingRequestArgs::default()
|
||||
.model(model.clone())
|
||||
.model(model.as_ref())
|
||||
.input(texts.to_vec())
|
||||
.dimensions(*dimensions)
|
||||
.build()?;
|
||||
@@ -417,13 +419,13 @@ impl EmbeddingProvider {
|
||||
/// Currently infallible; reserved for future validation.
|
||||
pub fn new_openai(
|
||||
client: Arc<Client<async_openai::config::OpenAIConfig>>,
|
||||
model: String,
|
||||
model: impl AsRef<str>,
|
||||
dimensions: u32,
|
||||
) -> Result<Self, EmbeddingError> {
|
||||
Ok(Self {
|
||||
inner: EmbeddingInner::OpenAI {
|
||||
client,
|
||||
model,
|
||||
model: Arc::from(model.as_ref()),
|
||||
dimensions,
|
||||
},
|
||||
})
|
||||
@@ -520,7 +522,7 @@ impl EmbeddingProvider {
|
||||
"openai embedding backend requires an openai client".into(),
|
||||
)
|
||||
})?;
|
||||
Self::new_openai(client, settings.embedding_model.clone(), dimensions)
|
||||
Self::new_openai(client, settings.embedding_model.as_str(), dimensions)
|
||||
}
|
||||
EmbeddingBackend::FastEmbed => {
|
||||
let pool_size = config
|
||||
@@ -586,11 +588,12 @@ mod tests {
|
||||
#![allow(clippy::expect_used)]
|
||||
|
||||
use super::{
|
||||
align_fastembed_system_settings, fastembed_model_dimension, list_fastembed_embedding_models,
|
||||
resolve_fastembed_model_code, DEFAULT_FASTEMBED_MODEL_CODE, EmbeddingError,
|
||||
align_fastembed_system_settings, fastembed_model_dimension,
|
||||
list_fastembed_embedding_models, resolve_fastembed_model_code, EmbeddingError,
|
||||
DEFAULT_FASTEMBED_MODEL_CODE,
|
||||
};
|
||||
use crate::utils::config::{AppConfig, EmbeddingBackend, ParseEmbeddingBackendError};
|
||||
use crate::storage::types::system_settings::SystemSettings;
|
||||
use crate::utils::config::{AppConfig, EmbeddingBackend, ParseEmbeddingBackendError};
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
|
||||
@@ -656,16 +659,16 @@ mod tests {
|
||||
fastembed_model: Some("Xenova/bge-base-en-v1.5".into()),
|
||||
..AppConfig::default()
|
||||
};
|
||||
let resolved = resolve_fastembed_model_code(&config, "text-embedding-3-small")
|
||||
.expect("config model");
|
||||
let resolved =
|
||||
resolve_fastembed_model_code(&config, "text-embedding-3-small").expect("config model");
|
||||
assert_eq!(resolved, "Xenova/bge-base-en-v1.5");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn resolve_fastembed_model_falls_back_from_openai_default() {
|
||||
let config = AppConfig::default();
|
||||
let resolved = resolve_fastembed_model_code(&config, "text-embedding-3-small")
|
||||
.expect("default model");
|
||||
let resolved =
|
||||
resolve_fastembed_model_code(&config, "text-embedding-3-small").expect("default model");
|
||||
assert_eq!(resolved, DEFAULT_FASTEMBED_MODEL_CODE);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user