mirror of
https://github.com/perstarkse/minne.git
synced 2026-04-25 02:08:30 +02:00
retrieval simplified
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
pub mod answer_retrieval;
|
||||
pub mod answer_retrieval_helper;
|
||||
pub mod fts;
|
||||
|
||||
pub mod graph;
|
||||
pub mod pipeline;
|
||||
pub mod reranking;
|
||||
@@ -70,11 +70,7 @@ mod tests {
|
||||
use super::*;
|
||||
use async_openai::Client;
|
||||
use common::storage::indexes::ensure_runtime_indexes;
|
||||
use common::storage::types::{
|
||||
knowledge_entity::{KnowledgeEntity, KnowledgeEntityType},
|
||||
knowledge_relationship::KnowledgeRelationship,
|
||||
text_chunk::TextChunk,
|
||||
};
|
||||
use common::storage::types::text_chunk::TextChunk;
|
||||
use pipeline::{RetrievalConfig, RetrievalStrategy};
|
||||
use uuid::Uuid;
|
||||
|
||||
@@ -82,14 +78,6 @@ mod tests {
|
||||
vec![0.9, 0.1, 0.0]
|
||||
}
|
||||
|
||||
fn entity_embedding_high() -> Vec<f32> {
|
||||
vec![0.8, 0.2, 0.0]
|
||||
}
|
||||
|
||||
fn entity_embedding_low() -> Vec<f32> {
|
||||
vec![0.1, 0.9, 0.0]
|
||||
}
|
||||
|
||||
fn chunk_embedding_primary() -> Vec<f32> {
|
||||
vec![0.85, 0.15, 0.0]
|
||||
}
|
||||
@@ -113,41 +101,19 @@ mod tests {
|
||||
.await
|
||||
.expect("failed to build runtime indexes");
|
||||
|
||||
db.query(
|
||||
"BEGIN TRANSACTION;
|
||||
REMOVE INDEX IF EXISTS idx_embedding_text_chunk_embedding ON TABLE text_chunk_embedding;
|
||||
DEFINE INDEX idx_embedding_text_chunk_embedding ON TABLE text_chunk_embedding FIELDS embedding HNSW DIMENSION 3;
|
||||
REMOVE INDEX IF EXISTS idx_embedding_knowledge_entity_embedding ON TABLE knowledge_entity_embedding;
|
||||
DEFINE INDEX idx_embedding_knowledge_entity_embedding ON TABLE knowledge_entity_embedding FIELDS embedding HNSW DIMENSION 3;
|
||||
COMMIT TRANSACTION;",
|
||||
)
|
||||
.await
|
||||
.expect("Failed to configure indices");
|
||||
|
||||
db
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_retrieve_entities_with_embedding_basic_flow() {
|
||||
async fn test_default_strategy_retrieves_chunks() {
|
||||
let db = setup_test_db().await;
|
||||
let user_id = "test_user";
|
||||
let entity = KnowledgeEntity::new(
|
||||
"source_1".into(),
|
||||
"Rust async guide".into(),
|
||||
"Detailed notes about async runtimes".into(),
|
||||
KnowledgeEntityType::Document,
|
||||
None,
|
||||
user_id.into(),
|
||||
);
|
||||
let chunk = TextChunk::new(
|
||||
entity.source_id.clone(),
|
||||
"source_1".into(),
|
||||
"Tokio uses cooperative scheduling for fairness.".into(),
|
||||
user_id.into(),
|
||||
);
|
||||
|
||||
KnowledgeEntity::store_with_embedding(entity.clone(), entity_embedding_high(), &db)
|
||||
.await
|
||||
.expect("Failed to store entity");
|
||||
TextChunk::store_with_embedding(chunk.clone(), chunk_embedding_primary(), &db)
|
||||
.await
|
||||
.expect("Failed to store chunk");
|
||||
@@ -164,64 +130,32 @@ mod tests {
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.expect("Hybrid retrieval failed");
|
||||
.expect("Default strategy retrieval failed");
|
||||
|
||||
let entities = match results {
|
||||
StrategyOutput::Entities(items) => items,
|
||||
other => panic!("expected entity results, got {:?}", other),
|
||||
let chunks = match results {
|
||||
StrategyOutput::Chunks(items) => items,
|
||||
other => panic!("expected chunk results, got {:?}", other),
|
||||
};
|
||||
|
||||
assert!(!chunks.is_empty(), "Expected at least one retrieval result");
|
||||
assert!(
|
||||
!entities.is_empty(),
|
||||
"Expected at least one retrieval result"
|
||||
);
|
||||
let top = &entities[0];
|
||||
assert!(
|
||||
top.entity.name.contains("Rust"),
|
||||
"Expected Rust entity to be ranked first"
|
||||
);
|
||||
assert!(
|
||||
!top.chunks.is_empty(),
|
||||
"Expected Rust entity to include supporting chunks"
|
||||
chunks[0].chunk.chunk.contains("Tokio"),
|
||||
"Expected chunk about Tokio"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_graph_relationship_enriches_results() {
|
||||
async fn test_default_strategy_returns_chunks_from_multiple_sources() {
|
||||
let db = setup_test_db().await;
|
||||
let user_id = "graph_user";
|
||||
|
||||
let primary = KnowledgeEntity::new(
|
||||
"primary_source".into(),
|
||||
"Async Rust patterns".into(),
|
||||
"Explores async runtimes and scheduling strategies.".into(),
|
||||
KnowledgeEntityType::Document,
|
||||
None,
|
||||
user_id.into(),
|
||||
);
|
||||
let neighbor = KnowledgeEntity::new(
|
||||
"neighbor_source".into(),
|
||||
"Tokio Scheduler Deep Dive".into(),
|
||||
"Details on Tokio's cooperative scheduler.".into(),
|
||||
KnowledgeEntityType::Document,
|
||||
None,
|
||||
user_id.into(),
|
||||
);
|
||||
|
||||
KnowledgeEntity::store_with_embedding(primary.clone(), entity_embedding_high(), &db)
|
||||
.await
|
||||
.expect("Failed to store primary entity");
|
||||
KnowledgeEntity::store_with_embedding(neighbor.clone(), entity_embedding_low(), &db)
|
||||
.await
|
||||
.expect("Failed to store neighbor entity");
|
||||
let user_id = "multi_source_user";
|
||||
|
||||
let primary_chunk = TextChunk::new(
|
||||
primary.source_id.clone(),
|
||||
"primary_source".into(),
|
||||
"Rust async tasks use Tokio's cooperative scheduler.".into(),
|
||||
user_id.into(),
|
||||
);
|
||||
let neighbor_chunk = TextChunk::new(
|
||||
neighbor.source_id.clone(),
|
||||
let secondary_chunk = TextChunk::new(
|
||||
"secondary_source".into(),
|
||||
"Tokio's scheduler manages task fairness across executors.".into(),
|
||||
user_id.into(),
|
||||
);
|
||||
@@ -229,23 +163,11 @@ mod tests {
|
||||
TextChunk::store_with_embedding(primary_chunk, chunk_embedding_primary(), &db)
|
||||
.await
|
||||
.expect("Failed to store primary chunk");
|
||||
TextChunk::store_with_embedding(neighbor_chunk, chunk_embedding_secondary(), &db)
|
||||
TextChunk::store_with_embedding(secondary_chunk, chunk_embedding_secondary(), &db)
|
||||
.await
|
||||
.expect("Failed to store neighbor chunk");
|
||||
.expect("Failed to store secondary chunk");
|
||||
|
||||
let openai_client = Client::new();
|
||||
let relationship = KnowledgeRelationship::new(
|
||||
primary.id.clone(),
|
||||
neighbor.id.clone(),
|
||||
user_id.into(),
|
||||
"relationship_source".into(),
|
||||
"references".into(),
|
||||
);
|
||||
relationship
|
||||
.store_relationship(&db)
|
||||
.await
|
||||
.expect("Failed to store relationship");
|
||||
|
||||
let results = pipeline::run_pipeline_with_embedding(
|
||||
&db,
|
||||
&openai_client,
|
||||
@@ -257,35 +179,23 @@ mod tests {
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.expect("Hybrid retrieval failed");
|
||||
.expect("Default strategy retrieval failed");
|
||||
|
||||
let entities = match results {
|
||||
StrategyOutput::Entities(items) => items,
|
||||
other => panic!("expected entity results, got {:?}", other),
|
||||
let chunks = match results {
|
||||
StrategyOutput::Chunks(items) => items,
|
||||
other => panic!("expected chunk results, got {:?}", other),
|
||||
};
|
||||
|
||||
let mut neighbor_entry = None;
|
||||
for entity in &entities {
|
||||
if entity.entity.id == neighbor.id {
|
||||
neighbor_entry = Some(entity.clone());
|
||||
}
|
||||
}
|
||||
|
||||
println!("{:?}", entities);
|
||||
|
||||
let neighbor_entry =
|
||||
neighbor_entry.expect("Graph-enriched neighbor should appear in results");
|
||||
|
||||
assert!(chunks.len() >= 2, "Expected chunks from multiple sources");
|
||||
assert!(
|
||||
neighbor_entry.score > 0.2,
|
||||
"Graph-enriched entity should have a meaningful fused score"
|
||||
chunks.iter().any(|c| c.chunk.source_id == "primary_source"),
|
||||
"Should include primary source chunk"
|
||||
);
|
||||
assert!(
|
||||
neighbor_entry
|
||||
.chunks
|
||||
chunks
|
||||
.iter()
|
||||
.all(|chunk| chunk.chunk.source_id == neighbor.source_id),
|
||||
"Neighbor entity should surface its own supporting chunks"
|
||||
.any(|c| c.chunk.source_id == "secondary_source"),
|
||||
"Should include secondary source chunk"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -311,7 +221,7 @@ mod tests {
|
||||
.await
|
||||
.expect("Failed to store chunk two");
|
||||
|
||||
let config = RetrievalConfig::with_strategy(RetrievalStrategy::Revised);
|
||||
let config = RetrievalConfig::with_strategy(RetrievalStrategy::Default);
|
||||
let openai_client = Client::new();
|
||||
let results = pipeline::run_pipeline_with_embedding(
|
||||
&db,
|
||||
|
||||
Reference in New Issue
Block a user