mirror of
https://github.com/perstarkse/minne.git
synced 2026-04-25 02:08:30 +02:00
benchmarks: ready for hybrid revised
This commit is contained in:
@@ -8,6 +8,7 @@ pub struct IngestionTuning {
|
||||
pub graph_max_backoff_ms: u64,
|
||||
pub chunk_min_tokens: usize,
|
||||
pub chunk_max_tokens: usize,
|
||||
pub chunk_overlap_tokens: usize,
|
||||
pub chunk_insert_concurrency: usize,
|
||||
pub entity_embedding_concurrency: usize,
|
||||
}
|
||||
@@ -21,8 +22,9 @@ impl Default for IngestionTuning {
|
||||
graph_store_attempts: 3,
|
||||
graph_initial_backoff_ms: 50,
|
||||
graph_max_backoff_ms: 800,
|
||||
chunk_min_tokens: 500,
|
||||
chunk_max_tokens: 2_000,
|
||||
chunk_min_tokens: 256,
|
||||
chunk_max_tokens: 512,
|
||||
chunk_overlap_tokens: 50,
|
||||
chunk_insert_concurrency: 8,
|
||||
entity_embedding_concurrency: 4,
|
||||
}
|
||||
|
||||
@@ -118,8 +118,12 @@ impl<'a> PipelineContext<'a> {
|
||||
.await?;
|
||||
|
||||
let chunk_range = self.chunk_token_range();
|
||||
let chunk_overlap = self.chunk_overlap_tokens();
|
||||
|
||||
let chunks = self.services.prepare_chunks(&content, chunk_range).await?;
|
||||
let chunks = self
|
||||
.services
|
||||
.prepare_chunks(&content, chunk_range, chunk_overlap)
|
||||
.await?;
|
||||
|
||||
Ok(PipelineArtifacts {
|
||||
text_content: content,
|
||||
@@ -132,8 +136,12 @@ impl<'a> PipelineContext<'a> {
|
||||
pub async fn build_chunk_only_artifacts(&mut self) -> Result<PipelineArtifacts, AppError> {
|
||||
let content = self.take_text_content()?;
|
||||
let chunk_range = self.chunk_token_range();
|
||||
let chunk_overlap = self.chunk_overlap_tokens();
|
||||
|
||||
let chunks = self.services.prepare_chunks(&content, chunk_range).await?;
|
||||
let chunks = self
|
||||
.services
|
||||
.prepare_chunks(&content, chunk_range, chunk_overlap)
|
||||
.await?;
|
||||
|
||||
Ok(PipelineArtifacts {
|
||||
text_content: content,
|
||||
@@ -146,4 +154,8 @@ impl<'a> PipelineContext<'a> {
|
||||
/// Returns the configured chunk size bounds as `chunk_min_tokens..chunk_max_tokens`.
///
/// Both values are read from `self.pipeline_config.tuning`; the range's `start` is the
/// configured minimum and its `end` is the configured maximum.
fn chunk_token_range(&self) -> Range<usize> {
|
||||
self.pipeline_config.tuning.chunk_min_tokens..self.pipeline_config.tuning.chunk_max_tokens
|
||||
}
|
||||
|
||||
/// Returns the configured overlap between consecutive chunks.
///
/// The value is read unchanged from `self.pipeline_config.tuning.chunk_overlap_tokens`;
/// no validation against the chunk size bounds happens here.
fn chunk_overlap_tokens(&self) -> usize {
|
||||
self.pipeline_config.tuning.chunk_overlap_tokens
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
use std::{ops::Range, sync::Arc};
|
||||
use std::{
|
||||
ops::Range,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use async_openai::types::{
|
||||
@@ -21,6 +24,7 @@ use common::{
|
||||
utils::{config::AppConfig, embedding::EmbeddingProvider},
|
||||
};
|
||||
use retrieval_pipeline::{reranking::RerankerPool, retrieved_entities_to_json, RetrievedEntity};
|
||||
use text_splitter::{ChunkCapacity, ChunkConfig, TextSplitter};
|
||||
|
||||
use super::{enrichment_result::LLMEnrichmentResult, preparation::to_text_content};
|
||||
use crate::pipeline::context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
|
||||
@@ -29,7 +33,6 @@ use crate::utils::llm_instructions::{
|
||||
};
|
||||
|
||||
const EMBEDDING_QUERY_CHAR_LIMIT: usize = 12_000;
|
||||
|
||||
#[async_trait]
|
||||
pub trait PipelineServices: Send + Sync {
|
||||
async fn prepare_text_content(
|
||||
@@ -59,6 +62,7 @@ pub trait PipelineServices: Send + Sync {
|
||||
&self,
|
||||
content: &TextContent,
|
||||
token_range: Range<usize>,
|
||||
overlap_tokens: usize,
|
||||
) -> Result<Vec<EmbeddedTextChunk>, AppError>;
|
||||
}
|
||||
|
||||
@@ -238,9 +242,14 @@ impl PipelineServices for DefaultPipelineServices {
|
||||
&self,
|
||||
content: &TextContent,
|
||||
token_range: Range<usize>,
|
||||
overlap_tokens: usize,
|
||||
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
|
||||
let chunk_candidates =
|
||||
split_by_token_bounds(&content.text, token_range.start, token_range.end)?;
|
||||
let chunk_candidates = prepare_chunks(
|
||||
&content.text,
|
||||
token_range.start,
|
||||
token_range.end,
|
||||
overlap_tokens,
|
||||
)?;
|
||||
|
||||
let mut chunks = Vec::with_capacity(chunk_candidates.len());
|
||||
for chunk_text in chunk_candidates {
|
||||
@@ -249,8 +258,11 @@ impl PipelineServices for DefaultPipelineServices {
|
||||
.embed(&chunk_text)
|
||||
.await
|
||||
.context("generating FastEmbed embedding for chunk")?;
|
||||
let chunk_struct =
|
||||
TextChunk::new(content.get_id().to_string(), chunk_text, content.user_id.clone());
|
||||
let chunk_struct = TextChunk::new(
|
||||
content.get_id().to_string(),
|
||||
chunk_text,
|
||||
content.user_id.clone(),
|
||||
);
|
||||
chunks.push(EmbeddedTextChunk {
|
||||
chunk: chunk_struct,
|
||||
embedding,
|
||||
@@ -260,10 +272,11 @@ impl PipelineServices for DefaultPipelineServices {
|
||||
}
|
||||
}
|
||||
|
||||
fn split_by_token_bounds(
|
||||
fn prepare_chunks(
|
||||
text: &str,
|
||||
min_tokens: usize,
|
||||
max_tokens: usize,
|
||||
overlap_tokens: usize,
|
||||
) -> Result<Vec<String>, AppError> {
|
||||
if min_tokens == 0 || max_tokens == 0 || min_tokens > max_tokens {
|
||||
return Err(AppError::Validation(
|
||||
@@ -271,34 +284,44 @@ fn split_by_token_bounds(
|
||||
));
|
||||
}
|
||||
|
||||
let tokens: Vec<&str> = text.split_whitespace().collect();
|
||||
if tokens.is_empty() {
|
||||
return Ok(vec![String::new()]);
|
||||
if overlap_tokens >= min_tokens {
|
||||
return Err(AppError::Validation(format!(
|
||||
"chunk_min_tokens must be greater than the configured overlap of {overlap_tokens}"
|
||||
)));
|
||||
}
|
||||
|
||||
let mut chunks = Vec::new();
|
||||
let mut buffer: Vec<&str> = Vec::new();
|
||||
for (idx, token) in tokens.iter().enumerate() {
|
||||
buffer.push(token);
|
||||
let remaining = tokens.len().saturating_sub(idx + 1);
|
||||
let at_max = buffer.len() >= max_tokens;
|
||||
let at_min_and_boundary =
|
||||
buffer.len() >= min_tokens && (remaining == 0 || buffer.len() + 1 > max_tokens);
|
||||
if at_max || at_min_and_boundary {
|
||||
let chunk_text = buffer.join(" ");
|
||||
chunks.push(chunk_text);
|
||||
buffer.clear();
|
||||
}
|
||||
}
|
||||
let tokenizer = get_tokenizer()?;
|
||||
|
||||
if !buffer.is_empty() {
|
||||
let chunk_text = buffer.join(" ");
|
||||
chunks.push(chunk_text);
|
||||
let chunk_capacity = ChunkCapacity::new(min_tokens)
|
||||
.with_max(max_tokens)
|
||||
.map_err(|e| AppError::Validation(format!("invalid chunk token bounds: {e}")))?;
|
||||
let chunk_config = ChunkConfig::new(chunk_capacity)
|
||||
.with_overlap(overlap_tokens)
|
||||
.map_err(|e| AppError::Validation(format!("invalid chunk overlap: {e}")))?
|
||||
.with_sizer(tokenizer);
|
||||
let splitter = TextSplitter::new(chunk_config);
|
||||
|
||||
let mut chunks: Vec<String> = splitter.chunks(text).map(str::to_owned).collect();
|
||||
|
||||
if chunks.is_empty() {
|
||||
chunks.push(String::new());
|
||||
}
|
||||
|
||||
Ok(chunks)
|
||||
}
|
||||
|
||||
/// Returns a reference to a shared, lazily initialised tokenizer.
///
/// The tokenizer is created with
/// `tokenizers::Tokenizer::from_pretrained("bert-base-cased", None)` and kept in a
/// function-local `static OnceLock`, so the stored value is set once and reused by
/// every later call.
///
/// # Errors
///
/// Returns `AppError::InternalError` carrying the stored message when initialisation
/// failed. The `OnceLock` holds the whole `Result`, so a failure is cached as well:
/// later calls return the same error and do not retry.
///
/// NOTE(review): `from_pretrained` presumably fetches the tokenizer definition from a
/// remote hub or a local cache — confirm. If so, a transient failure on the first call
/// stays in effect for the lifetime of the process.
fn get_tokenizer() -> Result<&'static tokenizers::Tokenizer, AppError> {
|
||||
static TOKENIZER: OnceLock<Result<tokenizers::Tokenizer, String>> = OnceLock::new();
|
||||
|
||||
// The error is flattened to a `String` so the success-or-failure outcome can be stored as-is.
match TOKENIZER.get_or_init(|| {
|
||||
tokenizers::Tokenizer::from_pretrained("bert-base-cased", None)
|
||||
.map_err(|e| format!("failed to initialize tokenizer: {e}"))
|
||||
}) {
|
||||
Ok(tokenizer) => Ok(tokenizer),
|
||||
// The static keeps ownership of the cached message, so hand out a clone.
Err(err) => Err(AppError::InternalError(err.clone())),
|
||||
}
|
||||
}
|
||||
|
||||
fn truncate_for_embedding(text: &str, max_chars: usize) -> String {
|
||||
if text.chars().count() <= max_chars {
|
||||
return text.to_string();
|
||||
|
||||
@@ -155,6 +155,7 @@ impl PipelineServices for MockServices {
|
||||
&self,
|
||||
content: &TextContent,
|
||||
_range: std::ops::Range<usize>,
|
||||
_overlap_tokens: usize,
|
||||
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
|
||||
self.record("chunk").await;
|
||||
Ok(vec![EmbeddedTextChunk {
|
||||
@@ -213,8 +214,11 @@ impl PipelineServices for FailingServices {
|
||||
&self,
|
||||
content: &TextContent,
|
||||
token_range: std::ops::Range<usize>,
|
||||
overlap_tokens: usize,
|
||||
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
|
||||
self.inner.prepare_chunks(content, token_range).await
|
||||
self.inner
|
||||
.prepare_chunks(content, token_range, overlap_tokens)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -255,6 +259,7 @@ impl PipelineServices for ValidationServices {
|
||||
&self,
|
||||
_content: &TextContent,
|
||||
_token_range: std::ops::Range<usize>,
|
||||
_overlap_tokens: usize,
|
||||
) -> Result<Vec<EmbeddedTextChunk>, AppError> {
|
||||
unreachable!("prepare_chunks should not be called after validation failure")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user