evals: v3, embeddings at the side

additional indexes
This commit is contained in:
Per Stark
2025-11-26 15:00:55 +01:00
parent 226b2db43a
commit 030f0fc17d
63 changed files with 3859 additions and 1124 deletions

View File

@@ -18,6 +18,18 @@ use super::enrichment_result::LLMEnrichmentResult;
use super::{config::IngestionConfig, services::PipelineServices};
#[derive(Debug, Clone)]
pub struct EmbeddedKnowledgeEntity {
pub entity: KnowledgeEntity,
pub embedding: Vec<f32>,
}
#[derive(Debug, Clone)]
pub struct EmbeddedTextChunk {
pub chunk: TextChunk,
pub embedding: Vec<f32>,
}
pub struct PipelineContext<'a> {
pub task: &'a IngestionTask,
pub task_id: String,
@@ -33,9 +45,9 @@ pub struct PipelineContext<'a> {
#[derive(Debug)]
pub struct PipelineArtifacts {
pub text_content: TextContent,
pub entities: Vec<KnowledgeEntity>,
pub entities: Vec<EmbeddedKnowledgeEntity>,
pub relationships: Vec<KnowledgeRelationship>,
pub chunks: Vec<TextChunk>,
pub chunks: Vec<EmbeddedTextChunk>,
}
impl<'a> PipelineContext<'a> {

View File

@@ -4,6 +4,7 @@ use chrono::Utc;
use futures::stream::{self, StreamExt, TryStreamExt};
use serde::{Deserialize, Serialize};
use anyhow::Context;
use common::{
error::AppError,
storage::{
@@ -13,9 +14,10 @@ use common::{
knowledge_relationship::KnowledgeRelationship,
},
},
utils::embedding::generate_embedding,
utils::{embedding::generate_embedding, embedding::EmbeddingProvider},
};
use crate::pipeline::context::EmbeddedKnowledgeEntity;
use crate::utils::graph_mapper::GraphMapper;
#[derive(Debug, Serialize, Deserialize, Clone)]
@@ -48,7 +50,8 @@ impl LLMEnrichmentResult {
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
db_client: &SurrealDbClient,
entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
embedding_provider: Option<&EmbeddingProvider>,
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
let mapper = Arc::new(self.create_mapper()?);
let entities = self
@@ -59,6 +62,7 @@ impl LLMEnrichmentResult {
openai_client,
db_client,
entity_concurrency,
embedding_provider,
)
.await?;
@@ -85,7 +89,8 @@ impl LLMEnrichmentResult {
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
db_client: &SurrealDbClient,
entity_concurrency: usize,
) -> Result<Vec<KnowledgeEntity>, AppError> {
embedding_provider: Option<&EmbeddingProvider>,
) -> Result<Vec<EmbeddedKnowledgeEntity>, AppError> {
stream::iter(self.knowledge_entities.iter().cloned().map(|entity| {
let mapper = Arc::clone(&mapper);
let openai_client = openai_client.clone();
@@ -101,6 +106,7 @@ impl LLMEnrichmentResult {
mapper,
&openai_client,
&db_client,
embedding_provider,
)
.await
}
@@ -141,7 +147,8 @@ async fn create_single_entity(
mapper: Arc<GraphMapper>,
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
db_client: &SurrealDbClient,
) -> Result<KnowledgeEntity, AppError> {
embedding_provider: Option<&EmbeddingProvider>,
) -> Result<EmbeddedKnowledgeEntity, AppError> {
let assigned_id = mapper.get_id(&llm_entity.key)?.to_string();
let embedding_input = format!(
@@ -149,10 +156,17 @@ async fn create_single_entity(
llm_entity.name, llm_entity.description, llm_entity.entity_type
);
let embedding = generate_embedding(openai_client, &embedding_input, db_client).await?;
let embedding = if let Some(provider) = embedding_provider {
provider
.embed(&embedding_input)
.await
.context("generating FastEmbed embedding for entity")?
} else {
generate_embedding(openai_client, &embedding_input, db_client).await?
};
let now = Utc::now();
Ok(KnowledgeEntity {
let entity = KnowledgeEntity {
id: assigned_id,
created_at: now,
updated_at: now,
@@ -161,7 +175,8 @@ async fn create_single_entity(
entity_type: KnowledgeEntityType::from(llm_entity.entity_type.to_string()),
source_id: source_id.to_string(),
metadata: None,
embedding,
user_id: user_id.into(),
})
};
Ok(EmbeddedKnowledgeEntity { entity, embedding })
}

View File

@@ -50,6 +50,7 @@ impl IngestionPipeline {
config: AppConfig,
reranker_pool: Option<Arc<RerankerPool>>,
storage: StorageManager,
embedding_provider: Arc<common::utils::embedding::EmbeddingProvider>,
) -> Result<Self, AppError> {
let services = DefaultPipelineServices::new(
db.clone(),
@@ -57,6 +58,7 @@ impl IngestionPipeline {
config.clone(),
reranker_pool,
storage,
embedding_provider,
);
Self::with_services(db, IngestionConfig::default(), Arc::new(services))

View File

@@ -1,5 +1,6 @@
use std::{ops::Range, sync::Arc};
use anyhow::Context;
use async_openai::types::{
ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage,
CreateChatCompletionRequest, CreateChatCompletionRequestArgs, ResponseFormat,
@@ -12,19 +13,18 @@ use common::{
db::SurrealDbClient,
store::StorageManager,
types::{
ingestion_payload::IngestionPayload, knowledge_entity::KnowledgeEntity,
knowledge_relationship::KnowledgeRelationship, system_settings::SystemSettings,
text_chunk::TextChunk, text_content::TextContent,
ingestion_payload::IngestionPayload, knowledge_relationship::KnowledgeRelationship,
system_settings::SystemSettings, text_chunk::TextChunk, text_content::TextContent,
StoredObject,
},
},
utils::{config::AppConfig, embedding::generate_embedding},
};
use retrieval_pipeline::{
reranking::RerankerPool, retrieved_entities_to_json, RetrievedEntity,
utils::{config::AppConfig, embedding::EmbeddingProvider},
};
use retrieval_pipeline::{reranking::RerankerPool, retrieved_entities_to_json, RetrievedEntity};
use text_splitter::TextSplitter;
use super::{enrichment_result::LLMEnrichmentResult, preparation::to_text_content};
use crate::pipeline::context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
use crate::utils::llm_instructions::{
get_ingress_analysis_schema, INGRESS_ANALYSIS_SYSTEM_MESSAGE,
};
@@ -54,13 +54,13 @@ pub trait PipelineServices: Send + Sync {
content: &TextContent,
analysis: &LLMEnrichmentResult,
entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError>;
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError>;
async fn prepare_chunks(
&self,
content: &TextContent,
range: Range<usize>,
) -> Result<Vec<TextChunk>, AppError>;
) -> Result<Vec<EmbeddedTextChunk>, AppError>;
}
pub struct DefaultPipelineServices {
@@ -69,6 +69,7 @@ pub struct DefaultPipelineServices {
config: AppConfig,
reranker_pool: Option<Arc<RerankerPool>>,
storage: StorageManager,
embedding_provider: Arc<EmbeddingProvider>,
}
impl DefaultPipelineServices {
@@ -78,6 +79,7 @@ impl DefaultPipelineServices {
config: AppConfig,
reranker_pool: Option<Arc<RerankerPool>>,
storage: StorageManager,
embedding_provider: Arc<EmbeddingProvider>,
) -> Self {
Self {
db,
@@ -85,6 +87,7 @@ impl DefaultPipelineServices {
config,
reranker_pool,
storage,
embedding_provider,
}
}
@@ -182,6 +185,7 @@ impl PipelineServices for DefaultPipelineServices {
match retrieval_pipeline::retrieve_entities(
&self.db,
&self.openai_client,
// embedding_provider_ref,
&input_text,
&content.user_id,
config,
@@ -218,14 +222,15 @@ impl PipelineServices for DefaultPipelineServices {
content: &TextContent,
analysis: &LLMEnrichmentResult,
entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
analysis
.to_database_entities(
&content.id,
&content.get_id(),
&content.user_id,
&self.openai_client,
&self.db,
entity_concurrency,
Some(&*self.embedding_provider),
)
.await
}
@@ -234,7 +239,7 @@ impl PipelineServices for DefaultPipelineServices {
&self,
content: &TextContent,
range: Range<usize>,
) -> Result<Vec<TextChunk>, AppError> {
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
let splitter = TextSplitter::new(range.clone());
let chunk_texts: Vec<String> = splitter
.chunks(&content.text)
@@ -243,13 +248,17 @@ impl PipelineServices for DefaultPipelineServices {
let mut chunks = Vec::with_capacity(chunk_texts.len());
for chunk in chunk_texts {
let embedding = generate_embedding(&self.openai_client, &chunk, &self.db).await?;
chunks.push(TextChunk::new(
content.id.clone(),
chunk,
let embedding = self
.embedding_provider
.embed(&chunk)
.await
.context("generating FastEmbed embedding for chunk")?;
let chunk_struct =
TextChunk::new(content.get_id().to_string(), chunk, content.user_id.clone());
chunks.push(EmbeddedTextChunk {
chunk: chunk_struct,
embedding,
content.user_id.clone(),
));
});
}
Ok(chunks)
}

View File

@@ -15,7 +15,7 @@ use tokio::time::{sleep, Duration};
use tracing::{debug, instrument, warn};
use super::{
context::{PipelineArtifacts, PipelineContext},
context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk, PipelineArtifacts, PipelineContext},
state::{ContentPrepared, Enriched, IngestionMachine, Persisted, Ready, Retrieved},
};
@@ -177,17 +177,21 @@ fn map_guard_error(event: &str, guard: GuardError) -> AppError {
async fn store_graph_entities(
db: &SurrealDbClient,
tuning: &super::config::IngestionTuning,
entities: Vec<KnowledgeEntity>,
entities: Vec<EmbeddedKnowledgeEntity>,
relationships: Vec<KnowledgeRelationship>,
) -> Result<(), AppError> {
const STORE_GRAPH_MUTATION: &str = r"
BEGIN TRANSACTION;
LET $entities = $entities;
LET $relationships = $relationships;
// Persist entities with embeddings first.
for embedded in entities {
KnowledgeEntity::store_with_embedding(embedded.entity, embedded.embedding, db).await?;
}
FOR $entity IN $entities {
CREATE type::thing('knowledge_entity', $entity.id) CONTENT $entity;
};
if relationships.is_empty() {
return Ok(());
}
const STORE_RELATIONSHIPS: &str = r"
BEGIN TRANSACTION;
LET $relationships = $relationships;
FOR $relationship IN $relationships {
LET $in_node = type::thing('knowledge_entity', $relationship.in);
@@ -201,7 +205,6 @@ async fn store_graph_entities(
COMMIT TRANSACTION;
";
let entities = Arc::new(entities);
let relationships = Arc::new(relationships);
let mut backoff_ms = tuning.graph_initial_backoff_ms;
@@ -209,8 +212,7 @@ async fn store_graph_entities(
for attempt in 0..tuning.graph_store_attempts {
let result = db
.client
.query(STORE_GRAPH_MUTATION)
.bind(("entities", entities.clone()))
.query(STORE_RELATIONSHIPS)
.bind(("relationships", relationships.clone()))
.await;
@@ -240,17 +242,17 @@ async fn store_graph_entities(
async fn store_vector_chunks(
db: &SurrealDbClient,
task_id: &str,
chunks: &[TextChunk],
chunks: &[EmbeddedTextChunk],
tuning: &super::config::IngestionTuning,
) -> Result<usize, AppError> {
let chunk_count = chunks.len();
let batch_size = tuning.chunk_insert_concurrency.max(1);
for chunk in chunks {
for embedded in chunks {
debug!(
task_id = %task_id,
chunk_id = %chunk.id,
chunk_len = chunk.chunk.chars().count(),
chunk_id = %embedded.chunk.id,
chunk_len = embedded.chunk.chunk.chars().count(),
"chunk persisted"
);
}
@@ -270,53 +272,17 @@ fn is_retryable_conflict(error: &surrealdb::Error) -> bool {
async fn store_chunk_batch(
db: &SurrealDbClient,
batch: &[TextChunk],
tuning: &super::config::IngestionTuning,
batch: &[EmbeddedTextChunk],
_tuning: &super::config::IngestionTuning,
) -> Result<(), AppError> {
if batch.is_empty() {
return Ok(());
}
const STORE_CHUNKS_MUTATION: &str = r"
BEGIN TRANSACTION;
LET $chunks = $chunks;
FOR $chunk IN $chunks {
CREATE type::thing('text_chunk', $chunk.id) CONTENT $chunk;
};
COMMIT TRANSACTION;
";
let chunks = Arc::new(batch.to_vec());
let mut backoff_ms = tuning.graph_initial_backoff_ms;
for attempt in 0..tuning.graph_store_attempts {
let result = db
.client
.query(STORE_CHUNKS_MUTATION)
.bind(("chunks", chunks.clone()))
.await;
match result {
Ok(_) => return Ok(()),
Err(err) => {
if is_retryable_conflict(&err) && attempt + 1 < tuning.graph_store_attempts {
warn!(
attempt = attempt + 1,
"Transient SurrealDB conflict while storing chunks; retrying"
);
sleep(Duration::from_millis(backoff_ms)).await;
backoff_ms = (backoff_ms * 2).min(tuning.graph_max_backoff_ms);
continue;
}
return Err(AppError::from(err));
}
}
for embedded in batch {
TextChunk::store_with_embedding(embedded.chunk.clone(), embedded.embedding.clone(), db)
.await?;
}
Err(AppError::InternalError(
"Failed to store text chunks after retries".to_string(),
))
Ok(())
}

View File

@@ -1,5 +1,6 @@
use std::sync::Arc;
use crate::pipeline::context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
use async_trait::async_trait;
use chrono::{Duration as ChronoDuration, Utc};
use common::{
@@ -32,7 +33,7 @@ struct MockServices {
similar_entities: Vec<RetrievedEntity>,
analysis: LLMEnrichmentResult,
chunk_embedding: Vec<f32>,
graph_entities: Vec<KnowledgeEntity>,
graph_entities: Vec<EmbeddedKnowledgeEntity>,
graph_relationships: Vec<KnowledgeRelationship>,
calls: Mutex<Vec<&'static str>>,
}
@@ -54,14 +55,12 @@ impl MockServices {
"Previously known context".into(),
KnowledgeEntityType::Document,
None,
vec![0.1; TEST_EMBEDDING_DIM],
user_id.into(),
);
let retrieved_chunk = TextChunk::new(
retrieved_entity.source_id.clone(),
"existing chunk".into(),
vec![0.1; TEST_EMBEDDING_DIM],
user_id.into(),
);
@@ -76,7 +75,6 @@ impl MockServices {
"Entity from enrichment".into(),
KnowledgeEntityType::Idea,
None,
vec![0.2; TEST_EMBEDDING_DIM],
user_id.into(),
);
let graph_relationship = KnowledgeRelationship::new(
@@ -99,7 +97,10 @@ impl MockServices {
}],
analysis,
chunk_embedding: vec![0.3; TEST_EMBEDDING_DIM],
graph_entities: vec![graph_entity],
graph_entities: vec![EmbeddedKnowledgeEntity {
entity: graph_entity,
embedding: vec![0.2; TEST_EMBEDDING_DIM],
}],
graph_relationships: vec![graph_relationship],
calls: Mutex::new(Vec::new()),
}
@@ -142,7 +143,7 @@ impl PipelineServices for MockServices {
_content: &TextContent,
_analysis: &LLMEnrichmentResult,
_entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
self.record("convert").await;
Ok((
self.graph_entities.clone(),
@@ -154,14 +155,16 @@ impl PipelineServices for MockServices {
&self,
content: &TextContent,
_range: std::ops::Range<usize>,
) -> Result<Vec<TextChunk>, AppError> {
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
self.record("chunk").await;
Ok(vec![TextChunk::new(
content.id.clone(),
"chunk from mock services".into(),
self.chunk_embedding.clone(),
content.user_id.clone(),
)])
Ok(vec![EmbeddedTextChunk {
chunk: TextChunk::new(
content.id.clone(),
"chunk from mock services".into(),
content.user_id.clone(),
),
embedding: self.chunk_embedding.clone(),
}])
}
}
@@ -200,7 +203,7 @@ impl PipelineServices for FailingServices {
content: &TextContent,
analysis: &LLMEnrichmentResult,
entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
self.inner
.convert_analysis(content, analysis, entity_concurrency)
.await
@@ -210,7 +213,7 @@ impl PipelineServices for FailingServices {
&self,
content: &TextContent,
range: std::ops::Range<usize>,
) -> Result<Vec<TextChunk>, AppError> {
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
self.inner.prepare_chunks(content, range).await
}
}
@@ -244,7 +247,7 @@ impl PipelineServices for ValidationServices {
_content: &TextContent,
_analysis: &LLMEnrichmentResult,
_entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
unreachable!("convert_analysis should not be called after validation failure")
}
@@ -252,7 +255,7 @@ impl PipelineServices for ValidationServices {
&self,
_content: &TextContent,
_range: std::ops::Range<usize>,
) -> Result<Vec<TextChunk>, AppError> {
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
unreachable!("prepare_chunks should not be called after validation failure")
}
}