fix: knowledge entity suggestions simplification

2026-06-30 18:11:34 +02:00 · 2026-05-31 20:23:40 +02:00
parent 6c3475ca0e
commit 744482f2c8
9 changed files with 394 additions and 122 deletions
@@ -1,9 +1,9 @@
 pub mod answer_retrieval;

 pub mod pipeline;
+pub mod query;
 pub mod reranking;
-
-pub(crate) mod scoring;
+pub mod scoring;

 use common::{
    error::AppError,
@@ -29,9 +29,11 @@ pub enum RetrievalOutput {
 }

 pub use pipeline::{
-    retrieved_entities_to_json, Diagnostics, RetrievalConfig, RetrievalParams, StageKind,
-    StageTimings,
+    retrieved_entities_to_json, Diagnostics, RetrievalConfig, RetrievalParams, RetrievalTuning,
+    StageKind, StageTimings,
 };
+pub use query::normalize_fts_terms;
+pub use scoring::{reciprocal_rank_fusion, RrfConfig, Scored};

 /// Round a score to three decimal places for JSON output.
 pub(crate) fn round_score(value: f32) -> f64 {
@@ -117,8 +117,8 @@ impl Default for RetrievalTuning {
 /// Per-request retrieval configuration.
 ///
 /// The pipeline always performs chunk-first hybrid retrieval. Set `resolve_entities`
-/// when a caller additionally needs the `KnowledgeEntity` rows that own the retrieved
-/// chunks (search, ingestion linking, relationship suggestion).
+/// when a caller additionally needs the `KnowledgeEntity` rows that own retrieved
+/// chunks (search, ingestion linking).
 #[derive(Debug, Clone, Default)]
 pub struct RetrievalConfig {
    pub tuning: RetrievalTuning,
@@ -5,7 +5,9 @@ use common::{
    utils::embedding::EmbeddingProvider,
 };

-use crate::{reranking::RerankerLease, scoring::Scored, RetrievedChunk, RetrievedEntity};
+use crate::scoring::Scored;
+
+use crate::{reranking::RerankerLease, RetrievedChunk, RetrievedEntity};

 use super::{
    config::RetrievalConfig,
@@ -3,7 +3,7 @@ mod context;
 mod diagnostics;
 mod stages;

-pub use config::RetrievalConfig;
+pub use config::{RetrievalConfig, RetrievalTuning};
 pub use diagnostics::Diagnostics;

 use crate::{round_score, RetrievalOutput, RetrievedEntity};
@@ -9,9 +9,8 @@ use std::collections::HashMap;
 use tracing::{debug, instrument, warn};

 use crate::{
-    scoring::{
-        clamp_unit, min_max_normalize, reciprocal_rank_fusion, RrfConfig, Scored,
-    },
+    query::normalize_fts_terms,
+    scoring::{clamp_unit, min_max_normalize, reciprocal_rank_fusion, RrfConfig, Scored},
    RetrievedChunk, RetrievedEntity,
 };

@@ -115,7 +114,7 @@ pub async fn search_chunks(ctx: &mut PipelineContext<'_>) -> Result<(), AppError
    let embedding = ctx.ensure_embedding().map_err(|e| *e)?.clone();
    let tuning = &ctx.config.tuning;
    let fts_take = tuning.chunk_fts_take;
-    let (fts_query, fts_token_count) = normalize_fts_query(&ctx.input_text);
+    let (fts_query, fts_token_count) = normalize_fts_terms(&ctx.input_text);
    let fts_enabled = tuning.flags.chunk_rrf_use_fts() && fts_take > 0 && !fts_query.is_empty();

    let (vector_rows, fts_rows) = tokio::try_join!(
@@ -333,26 +332,6 @@ where
    items.iter().take(SCORE_SAMPLE_LIMIT).map(extractor).collect()
 }

-fn normalize_fts_query(input: &str) -> (String, usize) {
-    const STOPWORDS: &[&str] = &["the", "a", "an", "of", "in", "on", "and", "or", "to", "for"];
-    let mut cleaned = String::with_capacity(input.len());
-    for ch in input.chars() {
-        if ch.is_alphanumeric() {
-            cleaned.extend(ch.to_lowercase());
-        } else if ch.is_whitespace() {
-            cleaned.push(' ');
-        }
-    }
-    let mut tokens = Vec::with_capacity(cleaned.len().div_ceil(3));
-    for token in cleaned.split_whitespace() {
-        if !STOPWORDS.contains(&token) && !token.is_empty() {
-            tokens.push(token.to_string());
-        }
-    }
-    let normalized = tokens.join(" ");
-    (normalized, tokens.len())
-}
-
 fn build_chunk_rerank_documents(chunks: &[Scored<TextChunk>], max_chunks: usize) -> Vec<String> {
    chunks
        .iter()
@@ -0,0 +1,39 @@
+/// Normalize raw input into FTS-friendly terms and return the token count.
+///
+/// Returns the surviving terms joined by single spaces, together with how many
+/// terms remain once stopwords have been removed.
+pub fn normalize_fts_terms(input: &str) -> (String, usize) {
+    const STOPWORDS: &[&str] = &["the", "a", "an", "of", "in", "on", "and", "or", "to", "for"];
+    // Characters that are neither alphanumeric nor whitespace are dropped with
+    // no separator, so "foo-bar" collapses into the single term "foobar".
+    let lowered: String = input
+        .chars()
+        .filter(|ch| ch.is_alphanumeric() || ch.is_whitespace())
+        .flat_map(char::to_lowercase)
+        .collect();
+    // `split_whitespace` never yields empty tokens, so only stopwords need filtering.
+    let terms: Vec<&str> = lowered
+        .split_whitespace()
+        .filter(|term| !STOPWORDS.contains(term))
+        .collect();
+    (terms.join(" "), terms.len())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::normalize_fts_terms;
+
+    /// Mixed-case input loses its stopwords and comes back lowercased.
+    #[test]
+    fn strips_stopwords_and_lowercases() {
+        let normalized = normalize_fts_terms("The Cucumber and Tomatoes");
+        assert_eq!(normalized, ("cucumber tomatoes".to_string(), 2));
+    }
+
+    /// Input made only of stopwords yields an empty query and a zero count.
+    #[test]
+    fn returns_empty_for_stopwords_only() {
+        let normalized = normalize_fts_terms("the and or");
+        assert_eq!(normalized, (String::new(), 0));
+    }
+}