evals: v3, embeddings at the side

additional indexes
This commit is contained in:
Per Stark
2025-11-26 15:00:55 +01:00
parent 226b2db43a
commit 030f0fc17d
63 changed files with 3859 additions and 1124 deletions

View File

@@ -18,6 +18,18 @@ use super::enrichment_result::LLMEnrichmentResult;
use super::{config::IngestionConfig, services::PipelineServices};
#[derive(Debug, Clone)]
pub struct EmbeddedKnowledgeEntity {
pub entity: KnowledgeEntity,
pub embedding: Vec<f32>,
}
#[derive(Debug, Clone)]
pub struct EmbeddedTextChunk {
pub chunk: TextChunk,
pub embedding: Vec<f32>,
}
pub struct PipelineContext<'a> {
pub task: &'a IngestionTask,
pub task_id: String,
@@ -33,9 +45,9 @@ pub struct PipelineContext<'a> {
#[derive(Debug)]
pub struct PipelineArtifacts {
pub text_content: TextContent,
pub entities: Vec<KnowledgeEntity>,
pub entities: Vec<EmbeddedKnowledgeEntity>,
pub relationships: Vec<KnowledgeRelationship>,
pub chunks: Vec<TextChunk>,
pub chunks: Vec<EmbeddedTextChunk>,
}
impl<'a> PipelineContext<'a> {

View File

@@ -4,6 +4,7 @@ use chrono::Utc;
use futures::stream::{self, StreamExt, TryStreamExt};
use serde::{Deserialize, Serialize};
use anyhow::Context;
use common::{
error::AppError,
storage::{
@@ -13,9 +14,10 @@ use common::{
knowledge_relationship::KnowledgeRelationship,
},
},
utils::embedding::generate_embedding,
utils::{embedding::generate_embedding, embedding::EmbeddingProvider},
};
use crate::pipeline::context::EmbeddedKnowledgeEntity;
use crate::utils::graph_mapper::GraphMapper;
#[derive(Debug, Serialize, Deserialize, Clone)]
@@ -48,7 +50,8 @@ impl LLMEnrichmentResult {
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
db_client: &SurrealDbClient,
entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
embedding_provider: Option<&EmbeddingProvider>,
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
let mapper = Arc::new(self.create_mapper()?);
let entities = self
@@ -59,6 +62,7 @@ impl LLMEnrichmentResult {
openai_client,
db_client,
entity_concurrency,
embedding_provider,
)
.await?;
@@ -85,7 +89,8 @@ impl LLMEnrichmentResult {
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
db_client: &SurrealDbClient,
entity_concurrency: usize,
) -> Result<Vec<KnowledgeEntity>, AppError> {
embedding_provider: Option<&EmbeddingProvider>,
) -> Result<Vec<EmbeddedKnowledgeEntity>, AppError> {
stream::iter(self.knowledge_entities.iter().cloned().map(|entity| {
let mapper = Arc::clone(&mapper);
let openai_client = openai_client.clone();
@@ -101,6 +106,7 @@ impl LLMEnrichmentResult {
mapper,
&openai_client,
&db_client,
embedding_provider,
)
.await
}
@@ -141,7 +147,8 @@ async fn create_single_entity(
mapper: Arc<GraphMapper>,
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
db_client: &SurrealDbClient,
) -> Result<KnowledgeEntity, AppError> {
embedding_provider: Option<&EmbeddingProvider>,
) -> Result<EmbeddedKnowledgeEntity, AppError> {
let assigned_id = mapper.get_id(&llm_entity.key)?.to_string();
let embedding_input = format!(
@@ -149,10 +156,17 @@ async fn create_single_entity(
llm_entity.name, llm_entity.description, llm_entity.entity_type
);
let embedding = generate_embedding(openai_client, &embedding_input, db_client).await?;
let embedding = if let Some(provider) = embedding_provider {
provider
.embed(&embedding_input)
.await
.context("generating FastEmbed embedding for entity")?
} else {
generate_embedding(openai_client, &embedding_input, db_client).await?
};
let now = Utc::now();
Ok(KnowledgeEntity {
let entity = KnowledgeEntity {
id: assigned_id,
created_at: now,
updated_at: now,
@@ -161,7 +175,8 @@ async fn create_single_entity(
entity_type: KnowledgeEntityType::from(llm_entity.entity_type.to_string()),
source_id: source_id.to_string(),
metadata: None,
embedding,
user_id: user_id.into(),
})
};
Ok(EmbeddedKnowledgeEntity { entity, embedding })
}

View File

@@ -50,6 +50,7 @@ impl IngestionPipeline {
config: AppConfig,
reranker_pool: Option<Arc<RerankerPool>>,
storage: StorageManager,
embedding_provider: Arc<common::utils::embedding::EmbeddingProvider>,
) -> Result<Self, AppError> {
let services = DefaultPipelineServices::new(
db.clone(),
@@ -57,6 +58,7 @@ impl IngestionPipeline {
config.clone(),
reranker_pool,
storage,
embedding_provider,
);
Self::with_services(db, IngestionConfig::default(), Arc::new(services))

View File

@@ -1,5 +1,6 @@
use std::{ops::Range, sync::Arc};
use anyhow::Context;
use async_openai::types::{
ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage,
CreateChatCompletionRequest, CreateChatCompletionRequestArgs, ResponseFormat,
@@ -12,19 +13,18 @@ use common::{
db::SurrealDbClient,
store::StorageManager,
types::{
ingestion_payload::IngestionPayload, knowledge_entity::KnowledgeEntity,
knowledge_relationship::KnowledgeRelationship, system_settings::SystemSettings,
text_chunk::TextChunk, text_content::TextContent,
ingestion_payload::IngestionPayload, knowledge_relationship::KnowledgeRelationship,
system_settings::SystemSettings, text_chunk::TextChunk, text_content::TextContent,
StoredObject,
},
},
utils::{config::AppConfig, embedding::generate_embedding},
};
use retrieval_pipeline::{
reranking::RerankerPool, retrieved_entities_to_json, RetrievedEntity,
utils::{config::AppConfig, embedding::EmbeddingProvider},
};
use retrieval_pipeline::{reranking::RerankerPool, retrieved_entities_to_json, RetrievedEntity};
use text_splitter::TextSplitter;
use super::{enrichment_result::LLMEnrichmentResult, preparation::to_text_content};
use crate::pipeline::context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
use crate::utils::llm_instructions::{
get_ingress_analysis_schema, INGRESS_ANALYSIS_SYSTEM_MESSAGE,
};
@@ -54,13 +54,13 @@ pub trait PipelineServices: Send + Sync {
content: &TextContent,
analysis: &LLMEnrichmentResult,
entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError>;
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError>;
async fn prepare_chunks(
&self,
content: &TextContent,
range: Range<usize>,
) -> Result<Vec<TextChunk>, AppError>;
) -> Result<Vec<EmbeddedTextChunk>, AppError>;
}
pub struct DefaultPipelineServices {
@@ -69,6 +69,7 @@ pub struct DefaultPipelineServices {
config: AppConfig,
reranker_pool: Option<Arc<RerankerPool>>,
storage: StorageManager,
embedding_provider: Arc<EmbeddingProvider>,
}
impl DefaultPipelineServices {
@@ -78,6 +79,7 @@ impl DefaultPipelineServices {
config: AppConfig,
reranker_pool: Option<Arc<RerankerPool>>,
storage: StorageManager,
embedding_provider: Arc<EmbeddingProvider>,
) -> Self {
Self {
db,
@@ -85,6 +87,7 @@ impl DefaultPipelineServices {
config,
reranker_pool,
storage,
embedding_provider,
}
}
@@ -182,6 +185,7 @@ impl PipelineServices for DefaultPipelineServices {
match retrieval_pipeline::retrieve_entities(
&self.db,
&self.openai_client,
// embedding_provider_ref,
&input_text,
&content.user_id,
config,
@@ -218,14 +222,15 @@ impl PipelineServices for DefaultPipelineServices {
content: &TextContent,
analysis: &LLMEnrichmentResult,
entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
analysis
.to_database_entities(
&content.id,
&content.get_id(),
&content.user_id,
&self.openai_client,
&self.db,
entity_concurrency,
Some(&*self.embedding_provider),
)
.await
}
@@ -234,7 +239,7 @@ impl PipelineServices for DefaultPipelineServices {
&self,
content: &TextContent,
range: Range<usize>,
) -> Result<Vec<TextChunk>, AppError> {
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
let splitter = TextSplitter::new(range.clone());
let chunk_texts: Vec<String> = splitter
.chunks(&content.text)
@@ -243,13 +248,17 @@ impl PipelineServices for DefaultPipelineServices {
let mut chunks = Vec::with_capacity(chunk_texts.len());
for chunk in chunk_texts {
let embedding = generate_embedding(&self.openai_client, &chunk, &self.db).await?;
chunks.push(TextChunk::new(
content.id.clone(),
chunk,
let embedding = self
.embedding_provider
.embed(&chunk)
.await
.context("generating FastEmbed embedding for chunk")?;
let chunk_struct =
TextChunk::new(content.get_id().to_string(), chunk, content.user_id.clone());
chunks.push(EmbeddedTextChunk {
chunk: chunk_struct,
embedding,
content.user_id.clone(),
));
});
}
Ok(chunks)
}

View File

@@ -15,7 +15,7 @@ use tokio::time::{sleep, Duration};
use tracing::{debug, instrument, warn};
use super::{
context::{PipelineArtifacts, PipelineContext},
context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk, PipelineArtifacts, PipelineContext},
state::{ContentPrepared, Enriched, IngestionMachine, Persisted, Ready, Retrieved},
};
@@ -177,17 +177,21 @@ fn map_guard_error(event: &str, guard: GuardError) -> AppError {
async fn store_graph_entities(
db: &SurrealDbClient,
tuning: &super::config::IngestionTuning,
entities: Vec<KnowledgeEntity>,
entities: Vec<EmbeddedKnowledgeEntity>,
relationships: Vec<KnowledgeRelationship>,
) -> Result<(), AppError> {
const STORE_GRAPH_MUTATION: &str = r"
BEGIN TRANSACTION;
LET $entities = $entities;
LET $relationships = $relationships;
// Persist entities with embeddings first.
for embedded in entities {
KnowledgeEntity::store_with_embedding(embedded.entity, embedded.embedding, db).await?;
}
FOR $entity IN $entities {
CREATE type::thing('knowledge_entity', $entity.id) CONTENT $entity;
};
if relationships.is_empty() {
return Ok(());
}
const STORE_RELATIONSHIPS: &str = r"
BEGIN TRANSACTION;
LET $relationships = $relationships;
FOR $relationship IN $relationships {
LET $in_node = type::thing('knowledge_entity', $relationship.in);
@@ -201,7 +205,6 @@ async fn store_graph_entities(
COMMIT TRANSACTION;
";
let entities = Arc::new(entities);
let relationships = Arc::new(relationships);
let mut backoff_ms = tuning.graph_initial_backoff_ms;
@@ -209,8 +212,7 @@ async fn store_graph_entities(
for attempt in 0..tuning.graph_store_attempts {
let result = db
.client
.query(STORE_GRAPH_MUTATION)
.bind(("entities", entities.clone()))
.query(STORE_RELATIONSHIPS)
.bind(("relationships", relationships.clone()))
.await;
@@ -240,17 +242,17 @@ async fn store_graph_entities(
async fn store_vector_chunks(
db: &SurrealDbClient,
task_id: &str,
chunks: &[TextChunk],
chunks: &[EmbeddedTextChunk],
tuning: &super::config::IngestionTuning,
) -> Result<usize, AppError> {
let chunk_count = chunks.len();
let batch_size = tuning.chunk_insert_concurrency.max(1);
for chunk in chunks {
for embedded in chunks {
debug!(
task_id = %task_id,
chunk_id = %chunk.id,
chunk_len = chunk.chunk.chars().count(),
chunk_id = %embedded.chunk.id,
chunk_len = embedded.chunk.chunk.chars().count(),
"chunk persisted"
);
}
@@ -270,53 +272,17 @@ fn is_retryable_conflict(error: &surrealdb::Error) -> bool {
async fn store_chunk_batch(
db: &SurrealDbClient,
batch: &[TextChunk],
tuning: &super::config::IngestionTuning,
batch: &[EmbeddedTextChunk],
_tuning: &super::config::IngestionTuning,
) -> Result<(), AppError> {
if batch.is_empty() {
return Ok(());
}
const STORE_CHUNKS_MUTATION: &str = r"
BEGIN TRANSACTION;
LET $chunks = $chunks;
FOR $chunk IN $chunks {
CREATE type::thing('text_chunk', $chunk.id) CONTENT $chunk;
};
COMMIT TRANSACTION;
";
let chunks = Arc::new(batch.to_vec());
let mut backoff_ms = tuning.graph_initial_backoff_ms;
for attempt in 0..tuning.graph_store_attempts {
let result = db
.client
.query(STORE_CHUNKS_MUTATION)
.bind(("chunks", chunks.clone()))
.await;
match result {
Ok(_) => return Ok(()),
Err(err) => {
if is_retryable_conflict(&err) && attempt + 1 < tuning.graph_store_attempts {
warn!(
attempt = attempt + 1,
"Transient SurrealDB conflict while storing chunks; retrying"
);
sleep(Duration::from_millis(backoff_ms)).await;
backoff_ms = (backoff_ms * 2).min(tuning.graph_max_backoff_ms);
continue;
}
return Err(AppError::from(err));
}
}
for embedded in batch {
TextChunk::store_with_embedding(embedded.chunk.clone(), embedded.embedding.clone(), db)
.await?;
}
Err(AppError::InternalError(
"Failed to store text chunks after retries".to_string(),
))
Ok(())
}

View File

@@ -1,5 +1,6 @@
use std::sync::Arc;
use crate::pipeline::context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
use async_trait::async_trait;
use chrono::{Duration as ChronoDuration, Utc};
use common::{
@@ -32,7 +33,7 @@ struct MockServices {
similar_entities: Vec<RetrievedEntity>,
analysis: LLMEnrichmentResult,
chunk_embedding: Vec<f32>,
graph_entities: Vec<KnowledgeEntity>,
graph_entities: Vec<EmbeddedKnowledgeEntity>,
graph_relationships: Vec<KnowledgeRelationship>,
calls: Mutex<Vec<&'static str>>,
}
@@ -54,14 +55,12 @@ impl MockServices {
"Previously known context".into(),
KnowledgeEntityType::Document,
None,
vec![0.1; TEST_EMBEDDING_DIM],
user_id.into(),
);
let retrieved_chunk = TextChunk::new(
retrieved_entity.source_id.clone(),
"existing chunk".into(),
vec![0.1; TEST_EMBEDDING_DIM],
user_id.into(),
);
@@ -76,7 +75,6 @@ impl MockServices {
"Entity from enrichment".into(),
KnowledgeEntityType::Idea,
None,
vec![0.2; TEST_EMBEDDING_DIM],
user_id.into(),
);
let graph_relationship = KnowledgeRelationship::new(
@@ -99,7 +97,10 @@ impl MockServices {
}],
analysis,
chunk_embedding: vec![0.3; TEST_EMBEDDING_DIM],
graph_entities: vec![graph_entity],
graph_entities: vec![EmbeddedKnowledgeEntity {
entity: graph_entity,
embedding: vec![0.2; TEST_EMBEDDING_DIM],
}],
graph_relationships: vec![graph_relationship],
calls: Mutex::new(Vec::new()),
}
@@ -142,7 +143,7 @@ impl PipelineServices for MockServices {
_content: &TextContent,
_analysis: &LLMEnrichmentResult,
_entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
self.record("convert").await;
Ok((
self.graph_entities.clone(),
@@ -154,14 +155,16 @@ impl PipelineServices for MockServices {
&self,
content: &TextContent,
_range: std::ops::Range<usize>,
) -> Result<Vec<TextChunk>, AppError> {
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
self.record("chunk").await;
Ok(vec![TextChunk::new(
content.id.clone(),
"chunk from mock services".into(),
self.chunk_embedding.clone(),
content.user_id.clone(),
)])
Ok(vec![EmbeddedTextChunk {
chunk: TextChunk::new(
content.id.clone(),
"chunk from mock services".into(),
content.user_id.clone(),
),
embedding: self.chunk_embedding.clone(),
}])
}
}
@@ -200,7 +203,7 @@ impl PipelineServices for FailingServices {
content: &TextContent,
analysis: &LLMEnrichmentResult,
entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
self.inner
.convert_analysis(content, analysis, entity_concurrency)
.await
@@ -210,7 +213,7 @@ impl PipelineServices for FailingServices {
&self,
content: &TextContent,
range: std::ops::Range<usize>,
) -> Result<Vec<TextChunk>, AppError> {
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
self.inner.prepare_chunks(content, range).await
}
}
@@ -244,7 +247,7 @@ impl PipelineServices for ValidationServices {
_content: &TextContent,
_analysis: &LLMEnrichmentResult,
_entity_concurrency: usize,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
unreachable!("convert_analysis should not be called after validation failure")
}
@@ -252,7 +255,7 @@ impl PipelineServices for ValidationServices {
&self,
_content: &TextContent,
_range: std::ops::Range<usize>,
) -> Result<Vec<TextChunk>, AppError> {
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
unreachable!("prepare_chunks should not be called after validation failure")
}
}