tidying stuff up, dto for search

This commit is contained in:
Per Stark
2025-12-20 22:30:31 +01:00
parent a5bc72aedf
commit 79ea007b0a
23 changed files with 936 additions and 73 deletions

View File

@@ -21,11 +21,29 @@ use tracing::instrument;
/// Result of a retrieval pipeline run; the variant indicates which kind of
/// results the selected strategy produced.
pub enum StrategyOutput {
/// Retrieved knowledge entities.
Entities(Vec<RetrievedEntity>),
/// Retrieved text chunks.
Chunks(Vec<RetrievedChunk>),
/// Unified search output carrying chunks and entities together in one `SearchResult`.
Search(SearchResult),
}
/// Combined output of a unified search: retrieved chunks alongside retrieved entities.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Text chunks retrieved for the query.
    pub chunks: Vec<RetrievedChunk>,
    /// Knowledge entities retrieved for the query.
    pub entities: Vec<RetrievedEntity>,
}

impl SearchResult {
    /// Bundles already-retrieved chunks and entities into a single result.
    pub fn new(chunks: Vec<RetrievedChunk>, entities: Vec<RetrievedEntity>) -> Self {
        SearchResult { chunks, entities }
    }

    /// Returns `true` only when the result holds neither chunks nor entities.
    pub fn is_empty(&self) -> bool {
        if !self.chunks.is_empty() {
            return false;
        }
        self.entities.is_empty()
    }
}
pub use pipeline::{
retrieved_entities_to_json, PipelineDiagnostics, PipelineStageTimings, RetrievalConfig,
RetrievalStrategy, RetrievalTuning,
RetrievalStrategy, RetrievalTuning, SearchTarget,
};
// Captures a supporting chunk plus its fused retrieval score for downstream prompts.
@@ -48,6 +66,7 @@ pub struct RetrievedEntity {
pub async fn retrieve_entities(
db_client: &SurrealDbClient,
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
embedding_provider: Option<&common::utils::embedding::EmbeddingProvider>,
input_text: &str,
user_id: &str,
config: RetrievalConfig,
@@ -56,7 +75,7 @@ pub async fn retrieve_entities(
pipeline::run_pipeline(
db_client,
openai_client,
None,
embedding_provider,
input_text,
user_id,
config,
@@ -252,4 +271,49 @@ mod tests {
"Chunk results should contain relevant snippets"
);
}
/// Stores one chunk, runs the `Search` strategy with `SearchTarget::Both`, and
/// checks that the pipeline yields `StrategyOutput::Search` containing that chunk.
#[tokio::test]
async fn test_search_strategy_returns_search_result() {
    let db = setup_test_db().await;
    let user_id = "search_user";

    let chunk = TextChunk::new(
        "search_src".into(),
        "Async Rust programming uses Tokio runtime for concurrent tasks.".into(),
        user_id.into(),
    );

    // `chunk` is not needed after storing, so it is moved instead of cloned.
    TextChunk::store_with_embedding(chunk, chunk_embedding_primary(), &db)
        .await
        .expect("Failed to store chunk");

    let config = RetrievalConfig::for_search(pipeline::SearchTarget::Both);
    let openai_client = Client::new();

    // A precomputed query embedding is supplied via `test_embedding()`.
    let results = pipeline::run_pipeline_with_embedding(
        &db,
        &openai_client,
        None,
        test_embedding(),
        "async rust programming",
        user_id,
        config,
        None,
    )
    .await
    .expect("Search strategy retrieval failed");

    let search_result = match results {
        StrategyOutput::Search(sr) => sr,
        other => panic!("expected Search output, got {:?}", other),
    };

    // Should return chunks (entities may be empty if none stored)
    assert!(
        !search_result.chunks.is_empty(),
        "Search strategy should return chunks"
    );
    assert!(
        search_result
            .chunks
            .iter()
            .any(|c| c.chunk.chunk.contains("Tokio")),
        "Search results should contain relevant chunks"
    );
}
}

View File

@@ -12,6 +12,21 @@ pub enum RetrievalStrategy {
RelationshipSuggestion,
/// Entity retrieval for context during content ingestion
Ingestion,
/// Unified search returning both chunks and entities
Search,
}
/// Configures which result types to include in Search strategy.
///
/// Serialized through serde with snake_case variant names
/// (`chunks_only`, `entities_only`, `both`). `Both` is the `Default` value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum SearchTarget {
/// Return only text chunks; no entity results are requested.
ChunksOnly,
/// Return only knowledge entities; no chunk results are requested.
EntitiesOnly,
/// Return both chunks and entities (default)
#[default]
Both,
}
impl Default for RetrievalStrategy {
@@ -37,6 +52,7 @@ impl std::str::FromStr for RetrievalStrategy {
}
"relationship_suggestion" => Ok(Self::RelationshipSuggestion),
"ingestion" => Ok(Self::Ingestion),
"search" => Ok(Self::Search),
other => Err(format!("unknown retrieval strategy '{other}'")),
}
}
@@ -48,6 +64,7 @@ impl fmt::Display for RetrievalStrategy {
RetrievalStrategy::Default => "default",
RetrievalStrategy::RelationshipSuggestion => "relationship_suggestion",
RetrievalStrategy::Ingestion => "ingestion",
RetrievalStrategy::Search => "search",
};
f.write_str(label)
}
@@ -140,6 +157,8 @@ impl Default for RetrievalTuning {
/// Configuration handed to the retrieval pipeline: which strategy to run,
/// its tuning values, and the result target used by the Search strategy.
pub struct RetrievalConfig {
/// Retrieval strategy to run.
pub strategy: RetrievalStrategy,
/// Tuning values for the pipeline (see `RetrievalTuning`).
pub tuning: RetrievalTuning,
/// Target for Search strategy (chunks, entities, or both)
pub search_target: SearchTarget,
}
impl RetrievalConfig {
@@ -147,6 +166,7 @@ impl RetrievalConfig {
Self {
strategy: RetrievalStrategy::Default,
tuning,
search_target: SearchTarget::default(),
}
}
@@ -154,11 +174,16 @@ impl RetrievalConfig {
Self {
strategy,
tuning: RetrievalTuning::default(),
search_target: SearchTarget::default(),
}
}
/// Creates a config from an explicit strategy and tuning.
///
/// `search_target` is initialised to `SearchTarget::default()`; callers that
/// need a different target can set the public field afterwards.
pub fn with_tuning(strategy: RetrievalStrategy, tuning: RetrievalTuning) -> Self {
    Self {
        strategy,
        tuning,
        search_target: SearchTarget::default(),
    }
}
/// Create config for chat retrieval with strategy selection support
@@ -175,6 +200,15 @@ impl RetrievalConfig {
pub fn for_ingestion() -> Self {
Self::with_strategy(RetrievalStrategy::Ingestion)
}
/// Create config for unified search (chunks and/or entities)
pub fn for_search(target: SearchTarget) -> Self {
Self {
strategy: RetrievalStrategy::Search,
tuning: RetrievalTuning::default(),
search_target: target,
}
}
}
impl Default for RetrievalConfig {
@@ -182,6 +216,7 @@ impl Default for RetrievalConfig {
Self {
strategy: RetrievalStrategy::default(),
tuning: RetrievalTuning::default(),
search_target: SearchTarget::default(),
}
}
}

View File

@@ -3,7 +3,7 @@ mod diagnostics;
mod stages;
mod strategies;
pub use config::{RetrievalConfig, RetrievalStrategy, RetrievalTuning};
pub use config::{RetrievalConfig, RetrievalStrategy, RetrievalTuning, SearchTarget};
pub use diagnostics::{
AssembleStats, ChunkEnrichmentStats, CollectCandidatesStats, EntityAssemblyTrace,
PipelineDiagnostics,
@@ -17,7 +17,7 @@ use std::time::{Duration, Instant};
use tracing::info;
use stages::PipelineContext;
use strategies::{DefaultStrategyDriver, IngestionDriver, RelationshipSuggestionDriver};
use strategies::{DefaultStrategyDriver, IngestionDriver, RelationshipSuggestionDriver, SearchStrategyDriver};
// Export StrategyOutput publicly from this module
// (it's defined in lib.rs but we re-export it here)
@@ -181,6 +181,24 @@ pub async fn run_pipeline(
.await?;
Ok(StrategyOutput::Entities(run.results))
}
RetrievalStrategy::Search => {
let search_target = config.search_target;
let driver = SearchStrategyDriver::new(search_target);
let run = execute_strategy(
driver,
db_client,
openai_client,
embedding_provider,
None,
input_text,
user_id,
config,
reranker,
false,
)
.await?;
Ok(StrategyOutput::Search(run.results))
}
}
}
@@ -246,6 +264,24 @@ pub async fn run_pipeline_with_embedding(
.await?;
Ok(StrategyOutput::Entities(run.results))
}
RetrievalStrategy::Search => {
let search_target = config.search_target;
let driver = SearchStrategyDriver::new(search_target);
let run = execute_strategy(
driver,
db_client,
openai_client,
embedding_provider,
Some(query_embedding),
input_text,
user_id,
config,
reranker,
false,
)
.await?;
Ok(StrategyOutput::Search(run.results))
}
}
}

View File

@@ -88,3 +88,63 @@ impl StrategyDriver for IngestionDriver {
Ok(ctx.take_entity_results())
}
}
use crate::SearchResult;
use super::config::SearchTarget;
/// Strategy driver for unified search; the configured `SearchTarget` decides
/// whether chunks, entities, or both are retrieved.
pub struct SearchStrategyDriver {
    /// Result types this driver is asked to produce.
    target: SearchTarget,
}

impl SearchStrategyDriver {
    /// Creates a driver for the given search target.
    pub fn new(target: SearchTarget) -> Self {
        SearchStrategyDriver { target }
    }
}
impl StrategyDriver for SearchStrategyDriver {
    type Output = SearchResult;

    /// Builds the ordered stage list for the configured target.
    ///
    /// `EmbedStage` always comes first. The chunk stages are added when chunks
    /// are requested and the entity stages when entities are requested; for
    /// `SearchTarget::Both` the chunk stages precede the entity stages.
    fn stages(&self) -> Vec<BoxedStage> {
        // Exhaustive on purpose: adding a `SearchTarget` variant must force a
        // decision here instead of silently falling into a default.
        let (want_chunks, want_entities) = match self.target {
            SearchTarget::ChunksOnly => (true, false),
            SearchTarget::EntitiesOnly => (false, true),
            SearchTarget::Both => (true, true),
        };

        // At most 1 embed + 3 chunk + 4 entity stages.
        let mut stages: Vec<BoxedStage> = Vec::with_capacity(8);
        stages.push(Box::new(EmbedStage));

        if want_chunks {
            // Chunk retrieval path
            stages.push(Box::new(ChunkVectorStage));
            stages.push(Box::new(ChunkRerankStage));
            stages.push(Box::new(ChunkAssembleStage));
        }

        if want_entities {
            // Entity retrieval path (runs after chunk stages when both are requested)
            stages.push(Box::new(CollectCandidatesStage));
            stages.push(Box::new(GraphExpansionStage));
            stages.push(Box::new(RerankStage));
            stages.push(Box::new(AssembleEntitiesStage));
        }

        stages
    }

    /// Collects the results for the requested target from the pipeline context.
    ///
    /// Result kinds that were not requested are returned as empty vectors and
    /// are not taken from `ctx`.
    fn finalize(&self, ctx: &mut PipelineContext<'_>) -> Result<Self::Output, AppError> {
        let chunks = match self.target {
            SearchTarget::ChunksOnly | SearchTarget::Both => ctx.take_chunk_results(),
            SearchTarget::EntitiesOnly => Vec::new(),
        };
        let entities = match self.target {
            SearchTarget::EntitiesOnly | SearchTarget::Both => ctx.take_entity_results(),
            SearchTarget::ChunksOnly => Vec::new(),
        };
        Ok(SearchResult::new(chunks, entities))
    }
}