benchmarks: ready for revised hybrid

This commit is contained in:
Per Stark
2025-12-03 11:38:07 +01:00
parent 2939e4c2a4
commit dd881efbf9
22 changed files with 760 additions and 476 deletions

View File

@@ -8,6 +8,7 @@ pub struct IngestionTuning {
pub graph_max_backoff_ms: u64,
pub chunk_min_tokens: usize,
pub chunk_max_tokens: usize,
pub chunk_overlap_tokens: usize,
pub chunk_insert_concurrency: usize,
pub entity_embedding_concurrency: usize,
}
@@ -21,8 +22,9 @@ impl Default for IngestionTuning {
graph_store_attempts: 3,
graph_initial_backoff_ms: 50,
graph_max_backoff_ms: 800,
chunk_min_tokens: 500,
chunk_max_tokens: 2_000,
chunk_min_tokens: 256,
chunk_max_tokens: 512,
chunk_overlap_tokens: 50,
chunk_insert_concurrency: 8,
entity_embedding_concurrency: 4,
}

View File

@@ -118,8 +118,12 @@ impl<'a> PipelineContext<'a> {
.await?;
let chunk_range = self.chunk_token_range();
let chunk_overlap = self.chunk_overlap_tokens();
let chunks = self.services.prepare_chunks(&content, chunk_range).await?;
let chunks = self
.services
.prepare_chunks(&content, chunk_range, chunk_overlap)
.await?;
Ok(PipelineArtifacts {
text_content: content,
@@ -132,8 +136,12 @@ impl<'a> PipelineContext<'a> {
pub async fn build_chunk_only_artifacts(&mut self) -> Result<PipelineArtifacts, AppError> {
let content = self.take_text_content()?;
let chunk_range = self.chunk_token_range();
let chunk_overlap = self.chunk_overlap_tokens();
let chunks = self.services.prepare_chunks(&content, chunk_range).await?;
let chunks = self
.services
.prepare_chunks(&content, chunk_range, chunk_overlap)
.await?;
Ok(PipelineArtifacts {
text_content: content,
@@ -146,4 +154,8 @@ impl<'a> PipelineContext<'a> {
fn chunk_token_range(&self) -> Range<usize> {
self.pipeline_config.tuning.chunk_min_tokens..self.pipeline_config.tuning.chunk_max_tokens
}
fn chunk_overlap_tokens(&self) -> usize {
self.pipeline_config.tuning.chunk_overlap_tokens
}
}

View File

@@ -1,4 +1,7 @@
use std::{ops::Range, sync::Arc};
use std::{
ops::Range,
sync::{Arc, OnceLock},
};
use anyhow::Context;
use async_openai::types::{
@@ -21,6 +24,7 @@ use common::{
utils::{config::AppConfig, embedding::EmbeddingProvider},
};
use retrieval_pipeline::{reranking::RerankerPool, retrieved_entities_to_json, RetrievedEntity};
use text_splitter::{ChunkCapacity, ChunkConfig, TextSplitter};
use super::{enrichment_result::LLMEnrichmentResult, preparation::to_text_content};
use crate::pipeline::context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
@@ -29,7 +33,6 @@ use crate::utils::llm_instructions::{
};
const EMBEDDING_QUERY_CHAR_LIMIT: usize = 12_000;
#[async_trait]
pub trait PipelineServices: Send + Sync {
async fn prepare_text_content(
@@ -59,6 +62,7 @@ pub trait PipelineServices: Send + Sync {
&self,
content: &TextContent,
token_range: Range<usize>,
overlap_tokens: usize,
) -> Result<Vec<EmbeddedTextChunk>, AppError>;
}
@@ -238,9 +242,14 @@ impl PipelineServices for DefaultPipelineServices {
&self,
content: &TextContent,
token_range: Range<usize>,
overlap_tokens: usize,
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
let chunk_candidates =
split_by_token_bounds(&content.text, token_range.start, token_range.end)?;
let chunk_candidates = prepare_chunks(
&content.text,
token_range.start,
token_range.end,
overlap_tokens,
)?;
let mut chunks = Vec::with_capacity(chunk_candidates.len());
for chunk_text in chunk_candidates {
@@ -249,8 +258,11 @@ impl PipelineServices for DefaultPipelineServices {
.embed(&chunk_text)
.await
.context("generating FastEmbed embedding for chunk")?;
let chunk_struct =
TextChunk::new(content.get_id().to_string(), chunk_text, content.user_id.clone());
let chunk_struct = TextChunk::new(
content.get_id().to_string(),
chunk_text,
content.user_id.clone(),
);
chunks.push(EmbeddedTextChunk {
chunk: chunk_struct,
embedding,
@@ -260,10 +272,11 @@ impl PipelineServices for DefaultPipelineServices {
}
}
fn split_by_token_bounds(
fn prepare_chunks(
text: &str,
min_tokens: usize,
max_tokens: usize,
overlap_tokens: usize,
) -> Result<Vec<String>, AppError> {
if min_tokens == 0 || max_tokens == 0 || min_tokens > max_tokens {
return Err(AppError::Validation(
@@ -271,34 +284,44 @@ fn split_by_token_bounds(
));
}
let tokens: Vec<&str> = text.split_whitespace().collect();
if tokens.is_empty() {
return Ok(vec![String::new()]);
if overlap_tokens >= min_tokens {
return Err(AppError::Validation(format!(
"chunk_min_tokens must be greater than the configured overlap of {overlap_tokens}"
)));
}
let mut chunks = Vec::new();
let mut buffer: Vec<&str> = Vec::new();
for (idx, token) in tokens.iter().enumerate() {
buffer.push(token);
let remaining = tokens.len().saturating_sub(idx + 1);
let at_max = buffer.len() >= max_tokens;
let at_min_and_boundary =
buffer.len() >= min_tokens && (remaining == 0 || buffer.len() + 1 > max_tokens);
if at_max || at_min_and_boundary {
let chunk_text = buffer.join(" ");
chunks.push(chunk_text);
buffer.clear();
}
}
let tokenizer = get_tokenizer()?;
if !buffer.is_empty() {
let chunk_text = buffer.join(" ");
chunks.push(chunk_text);
let chunk_capacity = ChunkCapacity::new(min_tokens)
.with_max(max_tokens)
.map_err(|e| AppError::Validation(format!("invalid chunk token bounds: {e}")))?;
let chunk_config = ChunkConfig::new(chunk_capacity)
.with_overlap(overlap_tokens)
.map_err(|e| AppError::Validation(format!("invalid chunk overlap: {e}")))?
.with_sizer(tokenizer);
let splitter = TextSplitter::new(chunk_config);
let mut chunks: Vec<String> = splitter.chunks(text).map(str::to_owned).collect();
if chunks.is_empty() {
chunks.push(String::new());
}
Ok(chunks)
}
fn get_tokenizer() -> Result<&'static tokenizers::Tokenizer, AppError> {
static TOKENIZER: OnceLock<Result<tokenizers::Tokenizer, String>> = OnceLock::new();
match TOKENIZER.get_or_init(|| {
tokenizers::Tokenizer::from_pretrained("bert-base-cased", None)
.map_err(|e| format!("failed to initialize tokenizer: {e}"))
}) {
Ok(tokenizer) => Ok(tokenizer),
Err(err) => Err(AppError::InternalError(err.clone())),
}
}
fn truncate_for_embedding(text: &str, max_chars: usize) -> String {
if text.chars().count() <= max_chars {
return text.to_string();

View File

@@ -155,6 +155,7 @@ impl PipelineServices for MockServices {
&self,
content: &TextContent,
_range: std::ops::Range<usize>,
_overlap_tokens: usize,
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
self.record("chunk").await;
Ok(vec![EmbeddedTextChunk {
@@ -213,8 +214,11 @@ impl PipelineServices for FailingServices {
&self,
content: &TextContent,
token_range: std::ops::Range<usize>,
overlap_tokens: usize,
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
self.inner.prepare_chunks(content, token_range).await
self.inner
.prepare_chunks(content, token_range, overlap_tokens)
.await
}
}
@@ -255,6 +259,7 @@ impl PipelineServices for ValidationServices {
&self,
_content: &TextContent,
_token_range: std::ops::Range<usize>,
_overlap_tokens: usize,
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
unreachable!("prepare_chunks should not be called after validation failure")
}