fix: arc-share retrieved chunks, centralize entity embeddings, and trim hot-path clones.

This commit is contained in:
Per Stark
2026-06-06 23:05:53 +02:00
parent 676fdbc132
commit 4559ee0aa8
41 changed files with 368 additions and 289 deletions
+52 -7
View File
@@ -3,9 +3,12 @@ use std::collections::HashMap;
use std::fmt::Write;
use crate::{
error::AppError, storage::db::SurrealDbClient, storage::indexes::hnsw_index_overwrite_sql,
error::AppError,
storage::db::SurrealDbClient,
storage::indexes::hnsw_index_overwrite_sql,
storage::types::knowledge_entity_embedding::KnowledgeEntityEmbedding,
storage::types::system_settings::SystemSettings, stored_object,
storage::types::system_settings::SystemSettings,
stored_object,
utils::embedding::{EmbeddingProvider, RE_EMBED_BATCH_SIZE},
};
use tracing::{error, info};
@@ -25,6 +28,17 @@ impl KnowledgeEntityType {
/// Returns the static list of entity type names:
/// `Idea`, `Project`, `Document`, `Page` and `TextSnippet`.
pub fn variants() -> &'static [&'static str] {
    const VARIANT_NAMES: &[&str] = &["Idea", "Project", "Document", "Page", "TextSnippet"];
    VARIANT_NAMES
}
#[must_use]
pub const fn as_str(self) -> &'static str {
match self {
Self::Idea => "Idea",
Self::Project => "Project",
Self::Document => "Document",
Self::Page => "Page",
Self::TextSnippet => "TextSnippet",
}
}
}
impl From<String> for KnowledgeEntityType {
@@ -80,6 +94,27 @@ impl KnowledgeEntity {
}
}
/// Canonical text fed to the embedding provider for a knowledge entity.
#[must_use]
pub fn embedding_input_text(
name: &str,
description: &str,
entity_type: KnowledgeEntityType,
) -> String {
let mut out = String::with_capacity(
name.len()
.saturating_add(description.len())
.saturating_add(entity_type.as_str().len())
.saturating_add(32),
);
let _ = write!(
out,
"name: {name}, description: {description}, type: {}",
entity_type.as_str()
);
out
}
/// Full-text search over knowledge entities using the BM25 FTS index.
pub async fn fts_search(
take: usize,
@@ -314,8 +349,7 @@ impl KnowledgeEntity {
db_client: &SurrealDbClient,
embedding_provider: &EmbeddingProvider,
) -> Result<(), AppError> {
let embedding_input =
format!("name: {name}, description: {description}, type: {entity_type:?}",);
let embedding_input = Self::embedding_input_text(name, description, *entity_type);
let embedding = embedding_provider.embed(&embedding_input).await?;
let entity: KnowledgeEntity = db_client
@@ -402,9 +436,10 @@ impl KnowledgeEntity {
let inputs: Vec<String> = batch
.iter()
.map(|entity| {
format!(
"name: {}, description: {}, type: {:?}",
entity.name, entity.description, entity.entity_type
Self::embedding_input_text(
&entity.name,
&entity.description,
entity.entity_type,
)
})
.collect();
@@ -523,6 +558,16 @@ mod tests {
use anyhow::{self, Context};
use uuid::Uuid;
#[test]
fn embedding_input_text_uses_canonical_type_label() {
    // The type must appear as the plain label "TextSnippet" at the end of the text.
    let expected = "name: Alpha, description: Beta, type: TextSnippet";
    let actual =
        KnowledgeEntity::embedding_input_text("Alpha", "Beta", KnowledgeEntityType::TextSnippet);
    assert_eq!(actual, expected);
}
async fn ensure_entity_fts_indexes(db: &SurrealDbClient) -> anyhow::Result<()> {
let snowball_sql = r#"
DEFINE ANALYZER IF NOT EXISTS app_en_fts_analyzer TOKENIZERS class, punct FILTERS lowercase, ascii, snowball(english);
@@ -122,8 +122,7 @@ impl KnowledgeRelationship {
.bind(("user_id", user_id.to_owned()))
.await
.map_err(AppError::from)?;
let deleted: Vec<KnowledgeRelationship> =
delete_result.take(0).map_err(AppError::from)?;
let deleted: Vec<KnowledgeRelationship> = delete_result.take(0).map_err(AppError::from)?;
if !deleted.is_empty() {
return Ok(());
@@ -567,8 +566,8 @@ mod tests {
shared_source.to_string(),
"references".to_string(),
);
let rel_a_id = rel_a.id.clone();
let rel_b_id = rel_b.id.clone();
let owner_relationship_id = rel_a.id.clone();
let other_relationship_id = rel_b.id.clone();
rel_a.store_relationship(&db).await?;
rel_b.store_relationship(&db).await?;
@@ -576,8 +575,12 @@ mod tests {
KnowledgeRelationship::delete_relationships_by_source_id(shared_source, user_a, &db)
.await?;
assert!(get_relationship_by_id(&rel_a_id, &db).await.is_none());
assert!(get_relationship_by_id(&rel_b_id, &db).await.is_some());
assert!(get_relationship_by_id(&owner_relationship_id, &db)
.await
.is_none());
assert!(get_relationship_by_id(&other_relationship_id, &db)
.await
.is_some());
Ok(())
}
+5 -1
View File
@@ -299,7 +299,11 @@ impl TextChunk {
}
processed = processed.saturating_add(batch.len());
info!(progress = processed, total = total_chunks, "Re-embedding progress");
info!(
progress = processed,
total = total_chunks,
"Re-embedding progress"
);
}
info!("Successfully generated all new embeddings.");
+1 -2
View File
@@ -140,8 +140,7 @@ impl TextContent {
.await
.map_err(AppError::from)?;
let existing: Option<surrealdb::sql::Thing> =
response.take(0).map_err(AppError::from)?;
let existing: Option<surrealdb::sql::Thing> = response.take(0).map_err(AppError::from)?;
Ok(existing.is_some())
}
+21 -18
View File
@@ -38,7 +38,7 @@ enum EmbeddingInner {
/// Client used to issue embedding requests.
client: Arc<Client<async_openai::config::OpenAIConfig>>,
/// Model identifier for the API.
model: String,
model: Arc<str>,
/// Expected output dimensions.
dimensions: u32,
},
@@ -272,8 +272,9 @@ struct FastEmbedLease {
}
impl FastEmbedLease {
async fn embed(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, EmbeddingError> {
async fn embed(&self, texts: &[String]) -> Result<Vec<Vec<f32>>, EmbeddingError> {
let engine = Arc::clone(&self.engine);
let texts = texts.to_vec();
tokio::task::spawn_blocking(move || -> Result<Vec<Vec<f32>>, EmbeddingError> {
let mut guard = engine.lock().map_err(EmbeddingError::mutex_poisoned)?;
guard.embed(texts, None).map_err(EmbeddingError::fastembed)
@@ -293,7 +294,7 @@ impl Drop for FastEmbedLease {
async fn run_fastembed(
pool: &Arc<FastEmbedPool>,
texts: Vec<String>,
texts: &[String],
) -> Result<Vec<Vec<f32>>, EmbeddingError> {
let lease = pool.checkout().await?;
lease.embed(texts).await
@@ -323,7 +324,7 @@ impl EmbeddingProvider {
/// Returns the model identifier held by the provider as an owned `String`.
///
/// The `FastEmbed` and `OpenAI` variants yield `Some(..)`; the `Hashed`
/// variant carries no model and yields `None`.
pub fn model_code(&self) -> Option<String> {
match &self.inner {
EmbeddingInner::FastEmbed { model_name, .. } => Some(model_name.to_string()),
// NOTE(review): the next two arms match the same `OpenAI` pattern. They look
// like the before/after forms of one change (cloning a `String` field vs.
// copying an `Arc<str>` field into a new `String`); only the one matching the
// current field type should remain — confirm against `EmbeddingInner`.
EmbeddingInner::OpenAI { model, .. } => Some(model.clone()),
EmbeddingInner::OpenAI { model, .. } => Some(model.as_ref().to_owned()),
EmbeddingInner::Hashed { .. } => None,
}
}
@@ -338,7 +339,8 @@ impl EmbeddingProvider {
match &self.inner {
EmbeddingInner::Hashed { dimension } => Ok(hashed_embedding(text, *dimension)),
EmbeddingInner::FastEmbed { pool, .. } => {
let embeddings = run_fastembed(pool, vec![text.to_owned()]).await?;
let text = text.to_owned();
let embeddings = run_fastembed(pool, std::slice::from_ref(&text)).await?;
embeddings.into_iter().next().ok_or(EmbeddingError::NoData)
}
EmbeddingInner::OpenAI {
@@ -347,7 +349,7 @@ impl EmbeddingProvider {
dimensions,
} => {
let request = CreateEmbeddingRequestArgs::default()
.model(model.clone())
.model(model.as_ref())
.input([text])
.dimensions(*dimensions)
.build()?;
@@ -382,7 +384,7 @@ impl EmbeddingProvider {
if texts.is_empty() {
return Ok(Vec::new());
}
run_fastembed(pool, texts.to_vec()).await
run_fastembed(pool, texts).await
}
EmbeddingInner::OpenAI {
client,
@@ -394,7 +396,7 @@ impl EmbeddingProvider {
}
let request = CreateEmbeddingRequestArgs::default()
.model(model.clone())
.model(model.as_ref())
.input(texts.to_vec())
.dimensions(*dimensions)
.build()?;
@@ -417,13 +419,13 @@ impl EmbeddingProvider {
/// Currently infallible; reserved for future validation.
pub fn new_openai(
client: Arc<Client<async_openai::config::OpenAIConfig>>,
model: String,
model: impl AsRef<str>,
dimensions: u32,
) -> Result<Self, EmbeddingError> {
Ok(Self {
inner: EmbeddingInner::OpenAI {
client,
model,
model: Arc::from(model.as_ref()),
dimensions,
},
})
@@ -520,7 +522,7 @@ impl EmbeddingProvider {
"openai embedding backend requires an openai client".into(),
)
})?;
Self::new_openai(client, settings.embedding_model.clone(), dimensions)
Self::new_openai(client, settings.embedding_model.as_str(), dimensions)
}
EmbeddingBackend::FastEmbed => {
let pool_size = config
@@ -586,11 +588,12 @@ mod tests {
#![allow(clippy::expect_used)]
use super::{
align_fastembed_system_settings, fastembed_model_dimension, list_fastembed_embedding_models,
resolve_fastembed_model_code, DEFAULT_FASTEMBED_MODEL_CODE, EmbeddingError,
align_fastembed_system_settings, fastembed_model_dimension,
list_fastembed_embedding_models, resolve_fastembed_model_code, EmbeddingError,
DEFAULT_FASTEMBED_MODEL_CODE,
};
use crate::utils::config::{AppConfig, EmbeddingBackend, ParseEmbeddingBackendError};
use crate::storage::types::system_settings::SystemSettings;
use crate::utils::config::{AppConfig, EmbeddingBackend, ParseEmbeddingBackendError};
use serde_json::json;
#[test]
@@ -656,16 +659,16 @@ mod tests {
fastembed_model: Some("Xenova/bge-base-en-v1.5".into()),
..AppConfig::default()
};
let resolved = resolve_fastembed_model_code(&config, "text-embedding-3-small")
.expect("config model");
let resolved =
resolve_fastembed_model_code(&config, "text-embedding-3-small").expect("config model");
assert_eq!(resolved, "Xenova/bge-base-en-v1.5");
}
// With a default `AppConfig`, resolving against "text-embedding-3-small" is
// expected to yield `DEFAULT_FASTEMBED_MODEL_CODE`.
#[test]
fn resolve_fastembed_model_falls_back_from_openai_default() {
let config = AppConfig::default();
// NOTE(review): the two `let resolved` statements below make the same call
// with the same arguments and differ only in line wrapping; the second
// shadows the first. This looks like the old and reformatted versions of one
// statement — confirm and keep a single binding.
let resolved = resolve_fastembed_model_code(&config, "text-embedding-3-small")
.expect("default model");
let resolved =
resolve_fastembed_model_code(&config, "text-embedding-3-small").expect("default model");
assert_eq!(resolved, DEFAULT_FASTEMBED_MODEL_CODE);
}