mirror of
https://github.com/perstarkse/minne.git
synced 2026-07-05 20:41:41 +02:00
chore: refactor retrieval pipeline to chunk-first RRF with derived entities and slimmer eval surface.
Collapse the multi-strategy entity engine into a single benchmarked chunk-retrieval path, derive entities from the retrieved chunks, update consumers and docs, and apply clippy fixes across the workspace.
This commit is contained in:
+63
-115
@@ -1,10 +1,9 @@
|
||||
pub mod answer_retrieval;
|
||||
pub mod answer_retrieval_helper;
|
||||
|
||||
pub mod graph;
|
||||
pub mod pipeline;
|
||||
pub mod reranking;
|
||||
pub mod scoring;
|
||||
|
||||
pub(crate) mod scoring;
|
||||
|
||||
use common::{
|
||||
error::AppError,
|
||||
@@ -16,39 +15,28 @@ use common::{
|
||||
use reranking::RerankerLease;
|
||||
use tracing::instrument;
|
||||
|
||||
// Strategy output variants - defined before pipeline module
|
||||
/// Result of a retrieval run.
|
||||
///
|
||||
/// Chunk retrieval is always performed; entities are only present when the caller
|
||||
/// requested entity resolution via [`RetrievalConfig::with_entities`].
|
||||
#[derive(Debug)]
|
||||
pub enum StrategyOutput {
|
||||
Entities(Vec<RetrievedEntity>),
|
||||
pub enum RetrievalOutput {
|
||||
Chunks(Vec<RetrievedChunk>),
|
||||
Search(SearchResult),
|
||||
}
|
||||
|
||||
/// Unified search result containing both chunks and entities
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct SearchResult {
|
||||
pub chunks: Vec<RetrievedChunk>,
|
||||
pub entities: Vec<RetrievedEntity>,
|
||||
}
|
||||
|
||||
impl SearchResult {
|
||||
pub fn new(chunks: Vec<RetrievedChunk>, entities: Vec<RetrievedEntity>) -> Self {
|
||||
Self { chunks, entities }
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.chunks.is_empty() && self.entities.is_empty()
|
||||
}
|
||||
WithEntities {
|
||||
chunks: Vec<RetrievedChunk>,
|
||||
entities: Vec<RetrievedEntity>,
|
||||
},
|
||||
}
|
||||
|
||||
pub use pipeline::{
|
||||
retrieved_entities_to_json, Diagnostics, StageTimings, RetrievalConfig,
|
||||
RetrievalStrategy, RetrievalTuning, RetrievalTuningFlags, SearchTarget,
|
||||
retrieved_entities_to_json, Diagnostics, RetrievalConfig, RetrievalParams, StageKind,
|
||||
StageTimings,
|
||||
};
|
||||
|
||||
// Backward-compatible type aliases for external consumers
|
||||
pub type PipelineDiagnostics = Diagnostics;
|
||||
pub type PipelineStageTimings = StageTimings;
|
||||
/// Round a score to three decimal places for JSON output.
///
/// The `f32` score is widened to `f64` first (a lossless conversion), then
/// scaled, rounded to the nearest integer (ties away from zero, per
/// [`f64::round`]) and scaled back. Non-finite inputs (`NaN`, `±inf`) pass
/// through unchanged, as they do in ordinary floating-point arithmetic.
pub(crate) fn round_score(value: f32) -> f64 {
    // Three decimal places => scale by 10^3.
    const SCALE: f64 = 1000.0;

    (f64::from(value) * SCALE).round() / SCALE
}
|
||||
|
||||
// Captures a supporting chunk plus its fused retrieval score for downstream prompts.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -57,7 +45,7 @@ pub struct RetrievedChunk {
|
||||
pub score: f32,
|
||||
}
|
||||
|
||||
// Final entity representation returned to callers, enriched with ranked chunks.
|
||||
// Knowledge entity resolved from retrieved chunks, enriched with its contributing chunks.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RetrievedEntity {
|
||||
pub entity: KnowledgeEntity,
|
||||
@@ -65,9 +53,9 @@ pub struct RetrievedEntity {
|
||||
pub chunks: Vec<RetrievedChunk>,
|
||||
}
|
||||
|
||||
/// Primary orchestrator for the process of retrieving `KnowledgeEntity` values related to an `input_text`
|
||||
/// Run chunk-first hybrid retrieval for `input_text`, optionally resolving owning entities.
|
||||
#[instrument(skip_all, fields(user_id))]
|
||||
pub async fn retrieve_entities(
|
||||
pub async fn retrieve(
|
||||
db_client: &SurrealDbClient,
|
||||
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||
embedding_provider: Option<&common::utils::embedding::EmbeddingProvider>,
|
||||
@@ -75,8 +63,8 @@ pub async fn retrieve_entities(
|
||||
user_id: &str,
|
||||
config: RetrievalConfig,
|
||||
reranker: Option<RerankerLease>,
|
||||
) -> Result<StrategyOutput, AppError> {
|
||||
let params = pipeline::StrategyParams {
|
||||
) -> Result<RetrievalOutput, AppError> {
|
||||
let params = pipeline::RetrievalParams {
|
||||
db_client,
|
||||
openai_client,
|
||||
embedding_provider,
|
||||
@@ -94,6 +82,7 @@ mod tests {
|
||||
use anyhow::{self};
|
||||
use async_openai::Client;
|
||||
use common::storage::indexes::ensure_runtime;
|
||||
use common::storage::types::knowledge_entity::{KnowledgeEntity, KnowledgeEntityType};
|
||||
use common::storage::types::system_settings::SystemSettings;
|
||||
use uuid::Uuid;
|
||||
|
||||
@@ -133,7 +122,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_default_strategy_retrieves_chunks() -> anyhow::Result<()> {
|
||||
async fn test_chunk_retrieval_returns_chunks() -> anyhow::Result<()> {
|
||||
let db = setup_test_db().await?;
|
||||
let user_id = "test_user";
|
||||
let chunk = TextChunk::new(
|
||||
@@ -145,7 +134,7 @@ mod tests {
|
||||
TextChunk::store_with_embedding(chunk.clone(), chunk_embedding_primary(), &db).await?;
|
||||
|
||||
let openai_client = Client::new();
|
||||
let params = pipeline::StrategyParams {
|
||||
let params = pipeline::RetrievalParams {
|
||||
db_client: &db,
|
||||
openai_client: &openai_client,
|
||||
embedding_provider: None,
|
||||
@@ -154,12 +143,13 @@ mod tests {
|
||||
config: RetrievalConfig::default(),
|
||||
reranker: None,
|
||||
};
|
||||
let results = pipeline::run_pipeline_with_embedding(params, test_embedding())
|
||||
.await?;
|
||||
let results = pipeline::run_with_embedding(params, test_embedding()).await?;
|
||||
|
||||
let chunks = match results {
|
||||
StrategyOutput::Chunks(items) => items,
|
||||
other => anyhow::bail!("expected chunk results, got {other:?}"),
|
||||
RetrievalOutput::Chunks(items) => items,
|
||||
RetrievalOutput::WithEntities { .. } => {
|
||||
anyhow::bail!("expected chunk results, got entities")
|
||||
}
|
||||
};
|
||||
|
||||
assert!(!chunks.is_empty(), "Expected at least one retrieval result");
|
||||
@@ -171,8 +161,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_default_strategy_returns_chunks_from_multiple_sources(
|
||||
) -> anyhow::Result<()> {
|
||||
async fn test_chunk_retrieval_returns_chunks_from_multiple_sources() -> anyhow::Result<()> {
|
||||
let db = setup_test_db().await?;
|
||||
let user_id = "multi_source_user";
|
||||
|
||||
@@ -191,7 +180,7 @@ mod tests {
|
||||
TextChunk::store_with_embedding(secondary_chunk, chunk_embedding_secondary(), &db).await?;
|
||||
|
||||
let openai_client = Client::new();
|
||||
let params = pipeline::StrategyParams {
|
||||
let params = pipeline::RetrievalParams {
|
||||
db_client: &db,
|
||||
openai_client: &openai_client,
|
||||
embedding_provider: None,
|
||||
@@ -200,12 +189,13 @@ mod tests {
|
||||
config: RetrievalConfig::default(),
|
||||
reranker: None,
|
||||
};
|
||||
let results = pipeline::run_pipeline_with_embedding(params, test_embedding())
|
||||
.await?;
|
||||
let results = pipeline::run_with_embedding(params, test_embedding()).await?;
|
||||
|
||||
let chunks = match results {
|
||||
StrategyOutput::Chunks(items) => items,
|
||||
other => anyhow::bail!("expected chunk results, got {other:?}"),
|
||||
RetrievalOutput::Chunks(items) => items,
|
||||
RetrievalOutput::WithEntities { .. } => {
|
||||
anyhow::bail!("expected chunk results, got entities")
|
||||
}
|
||||
};
|
||||
|
||||
assert!(chunks.len() >= 2, "Expected chunks from multiple sources");
|
||||
@@ -223,96 +213,54 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_revised_strategy_returns_chunks() -> anyhow::Result<()> {
|
||||
async fn test_with_entities_resolves_owning_entities() -> anyhow::Result<()> {
|
||||
let db = setup_test_db().await?;
|
||||
let user_id = "chunk_user";
|
||||
let chunk_one = TextChunk::new(
|
||||
"src_alpha".into(),
|
||||
"Tokio tasks execute on worker threads managed by the runtime.".into(),
|
||||
user_id.into(),
|
||||
);
|
||||
let chunk_two = TextChunk::new(
|
||||
"src_beta".into(),
|
||||
"Hyper utilizes Tokio to drive HTTP state machines efficiently.".into(),
|
||||
user_id.into(),
|
||||
);
|
||||
let user_id = "entity_user";
|
||||
|
||||
TextChunk::store_with_embedding(chunk_one.clone(), chunk_embedding_primary(), &db).await?;
|
||||
TextChunk::store_with_embedding(chunk_two.clone(), chunk_embedding_secondary(), &db).await?;
|
||||
|
||||
let config = RetrievalConfig::with_strategy(RetrievalStrategy::Default);
|
||||
let openai_client = Client::new();
|
||||
let params = pipeline::StrategyParams {
|
||||
db_client: &db,
|
||||
openai_client: &openai_client,
|
||||
embedding_provider: None,
|
||||
input_text: "tokio runtime worker behavior",
|
||||
user_id,
|
||||
config,
|
||||
reranker: None,
|
||||
};
|
||||
let results = pipeline::run_pipeline_with_embedding(params, test_embedding())
|
||||
.await?;
|
||||
|
||||
let chunks = match results {
|
||||
StrategyOutput::Chunks(items) => items,
|
||||
other => anyhow::bail!("expected chunk results, got {other:?}"),
|
||||
};
|
||||
|
||||
assert!(
|
||||
!chunks.is_empty(),
|
||||
"Revised strategy should return chunk-only responses"
|
||||
);
|
||||
assert!(
|
||||
chunks
|
||||
.iter()
|
||||
.any(|entry| entry.chunk.chunk.contains("Tokio")),
|
||||
"Chunk results should contain relevant snippets"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_search_strategy_returns_search_result() -> anyhow::Result<()> {
|
||||
let db = setup_test_db().await?;
|
||||
let user_id = "search_user";
|
||||
let chunk = TextChunk::new(
|
||||
"search_src".into(),
|
||||
"Async Rust programming uses Tokio runtime for concurrent tasks.".into(),
|
||||
"entity_source".into(),
|
||||
"Async Rust programming uses the Tokio runtime for concurrent tasks.".into(),
|
||||
user_id.into(),
|
||||
);
|
||||
|
||||
TextChunk::store_with_embedding(chunk.clone(), chunk_embedding_primary(), &db).await?;
|
||||
|
||||
let config = RetrievalConfig::for_search(pipeline::SearchTarget::Both);
|
||||
let entity = KnowledgeEntity::new(
|
||||
"entity_source".into(),
|
||||
"Tokio Runtime".into(),
|
||||
"Async runtime for Rust".into(),
|
||||
KnowledgeEntityType::Document,
|
||||
None,
|
||||
user_id.into(),
|
||||
);
|
||||
db.store_item(entity).await?;
|
||||
|
||||
let openai_client = Client::new();
|
||||
let params = pipeline::StrategyParams {
|
||||
let params = pipeline::RetrievalParams {
|
||||
db_client: &db,
|
||||
openai_client: &openai_client,
|
||||
embedding_provider: None,
|
||||
input_text: "async rust programming",
|
||||
user_id,
|
||||
config,
|
||||
config: RetrievalConfig::with_entities(),
|
||||
reranker: None,
|
||||
};
|
||||
let results = pipeline::run_pipeline_with_embedding(params, test_embedding())
|
||||
.await?;
|
||||
let results = pipeline::run_with_embedding(params, test_embedding()).await?;
|
||||
|
||||
let StrategyOutput::Search(search_result) = results else {
|
||||
anyhow::bail!("expected Search output");
|
||||
let RetrievalOutput::WithEntities { chunks, entities } = results else {
|
||||
anyhow::bail!("expected WithEntities output");
|
||||
};
|
||||
|
||||
// Should return chunks (entities may be empty if none stored)
|
||||
assert!(!chunks.is_empty(), "Should return chunks");
|
||||
assert!(
|
||||
!search_result.chunks.is_empty(),
|
||||
"Search strategy should return chunks"
|
||||
entities.iter().any(|e| e.entity.name == "Tokio Runtime"),
|
||||
"Should resolve the entity owning the retrieved chunk"
|
||||
);
|
||||
assert!(
|
||||
search_result
|
||||
.chunks
|
||||
entities
|
||||
.iter()
|
||||
.any(|c| c.chunk.chunk.contains("Tokio")),
|
||||
"Search results should contain relevant chunks"
|
||||
.find(|e| e.entity.name == "Tokio Runtime")
|
||||
.is_some_and(|e| !e.chunks.is_empty()),
|
||||
"Resolved entity should carry its contributing chunks"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user