benchmarks: ready for hybrid revised

This commit is contained in:
Per Stark
2025-12-03 11:38:07 +01:00
parent 2939e4c2a4
commit dd881efbf9
22 changed files with 760 additions and 476 deletions
+50 -27
View File
@@ -1,4 +1,7 @@
use std::{ops::Range, sync::Arc};
use std::{
ops::Range,
sync::{Arc, OnceLock},
};
use anyhow::Context;
use async_openai::types::{
@@ -21,6 +24,7 @@ use common::{
utils::{config::AppConfig, embedding::EmbeddingProvider},
};
use retrieval_pipeline::{reranking::RerankerPool, retrieved_entities_to_json, RetrievedEntity};
use text_splitter::{ChunkCapacity, ChunkConfig, TextSplitter};
use super::{enrichment_result::LLMEnrichmentResult, preparation::to_text_content};
use crate::pipeline::context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
@@ -29,7 +33,6 @@ use crate::utils::llm_instructions::{
};
const EMBEDDING_QUERY_CHAR_LIMIT: usize = 12_000;
#[async_trait]
pub trait PipelineServices: Send + Sync {
async fn prepare_text_content(
@@ -59,6 +62,7 @@ pub trait PipelineServices: Send + Sync {
&self,
content: &TextContent,
token_range: Range<usize>,
overlap_tokens: usize,
) -> Result<Vec<EmbeddedTextChunk>, AppError>;
}
@@ -238,9 +242,14 @@ impl PipelineServices for DefaultPipelineServices {
&self,
content: &TextContent,
token_range: Range<usize>,
overlap_tokens: usize,
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
let chunk_candidates =
split_by_token_bounds(&content.text, token_range.start, token_range.end)?;
let chunk_candidates = prepare_chunks(
&content.text,
token_range.start,
token_range.end,
overlap_tokens,
)?;
let mut chunks = Vec::with_capacity(chunk_candidates.len());
for chunk_text in chunk_candidates {
@@ -249,8 +258,11 @@ impl PipelineServices for DefaultPipelineServices {
.embed(&chunk_text)
.await
.context("generating FastEmbed embedding for chunk")?;
let chunk_struct =
TextChunk::new(content.get_id().to_string(), chunk_text, content.user_id.clone());
let chunk_struct = TextChunk::new(
content.get_id().to_string(),
chunk_text,
content.user_id.clone(),
);
chunks.push(EmbeddedTextChunk {
chunk: chunk_struct,
embedding,
@@ -260,10 +272,11 @@ impl PipelineServices for DefaultPipelineServices {
}
}
fn split_by_token_bounds(
fn prepare_chunks(
text: &str,
min_tokens: usize,
max_tokens: usize,
overlap_tokens: usize,
) -> Result<Vec<String>, AppError> {
if min_tokens == 0 || max_tokens == 0 || min_tokens > max_tokens {
return Err(AppError::Validation(
@@ -271,34 +284,44 @@ fn split_by_token_bounds(
));
}
let tokens: Vec<&str> = text.split_whitespace().collect();
if tokens.is_empty() {
return Ok(vec![String::new()]);
if overlap_tokens >= min_tokens {
return Err(AppError::Validation(format!(
"chunk_min_tokens must be greater than the configured overlap of {overlap_tokens}"
)));
}
let mut chunks = Vec::new();
let mut buffer: Vec<&str> = Vec::new();
for (idx, token) in tokens.iter().enumerate() {
buffer.push(token);
let remaining = tokens.len().saturating_sub(idx + 1);
let at_max = buffer.len() >= max_tokens;
let at_min_and_boundary =
buffer.len() >= min_tokens && (remaining == 0 || buffer.len() + 1 > max_tokens);
if at_max || at_min_and_boundary {
let chunk_text = buffer.join(" ");
chunks.push(chunk_text);
buffer.clear();
}
}
let tokenizer = get_tokenizer()?;
if !buffer.is_empty() {
let chunk_text = buffer.join(" ");
chunks.push(chunk_text);
let chunk_capacity = ChunkCapacity::new(min_tokens)
.with_max(max_tokens)
.map_err(|e| AppError::Validation(format!("invalid chunk token bounds: {e}")))?;
let chunk_config = ChunkConfig::new(chunk_capacity)
.with_overlap(overlap_tokens)
.map_err(|e| AppError::Validation(format!("invalid chunk overlap: {e}")))?
.with_sizer(tokenizer);
let splitter = TextSplitter::new(chunk_config);
let mut chunks: Vec<String> = splitter.chunks(text).map(str::to_owned).collect();
if chunks.is_empty() {
chunks.push(String::new());
}
Ok(chunks)
}
fn get_tokenizer() -> Result<&'static tokenizers::Tokenizer, AppError> {
static TOKENIZER: OnceLock<Result<tokenizers::Tokenizer, String>> = OnceLock::new();
match TOKENIZER.get_or_init(|| {
tokenizers::Tokenizer::from_pretrained("bert-base-cased", None)
.map_err(|e| format!("failed to initialize tokenizer: {e}"))
}) {
Ok(tokenizer) => Ok(tokenizer),
Err(err) => Err(AppError::InternalError(err.clone())),
}
}
fn truncate_for_embedding(text: &str, max_chars: usize) -> String {
if text.chars().count() <= max_chars {
return text.to_string();