fix: knowledge entity suggestions simplification

This commit is contained in:
Per Stark
2026-05-31 20:23:40 +02:00
parent 6c3475ca0e
commit 744482f2c8
9 changed files with 394 additions and 122 deletions
+6 -4
View File
@@ -1,9 +1,9 @@
pub mod answer_retrieval;
pub mod pipeline;
pub mod query;
pub mod reranking;
pub(crate) mod scoring;
pub mod scoring;
use common::{
error::AppError,
@@ -29,9 +29,11 @@ pub enum RetrievalOutput {
}
pub use pipeline::{
retrieved_entities_to_json, Diagnostics, RetrievalConfig, RetrievalParams, StageKind,
StageTimings,
retrieved_entities_to_json, Diagnostics, RetrievalConfig, RetrievalParams, RetrievalTuning,
StageKind, StageTimings,
};
pub use query::normalize_fts_terms;
pub use scoring::{reciprocal_rank_fusion, RrfConfig, Scored};
/// Round a score to three decimal places for JSON output.
pub(crate) fn round_score(value: f32) -> f64 {
+2 -2
View File
@@ -117,8 +117,8 @@ impl Default for RetrievalTuning {
/// Per-request retrieval configuration.
///
/// The pipeline always performs chunk-first hybrid retrieval. Set `resolve_entities`
/// when a caller additionally needs the `KnowledgeEntity` rows that own the retrieved
/// chunks (search, ingestion linking, relationship suggestion).
/// when a caller additionally needs the `KnowledgeEntity` rows that own retrieved
/// chunks (search, ingestion linking).
#[derive(Debug, Clone, Default)]
pub struct RetrievalConfig {
pub tuning: RetrievalTuning,
+3 -1
View File
@@ -5,7 +5,9 @@ use common::{
utils::embedding::EmbeddingProvider,
};
use crate::{reranking::RerankerLease, scoring::Scored, RetrievedChunk, RetrievedEntity};
use crate::scoring::Scored;
use crate::{reranking::RerankerLease, RetrievedChunk, RetrievedEntity};
use super::{
config::RetrievalConfig,
+1 -1
View File
@@ -3,7 +3,7 @@ mod context;
mod diagnostics;
mod stages;
pub use config::RetrievalConfig;
pub use config::{RetrievalConfig, RetrievalTuning};
pub use diagnostics::Diagnostics;
use crate::{round_score, RetrievalOutput, RetrievedEntity};
+3 -24
View File
@@ -9,9 +9,8 @@ use std::collections::HashMap;
use tracing::{debug, instrument, warn};
use crate::{
scoring::{
clamp_unit, min_max_normalize, reciprocal_rank_fusion, RrfConfig, Scored,
},
query::normalize_fts_terms,
scoring::{clamp_unit, min_max_normalize, reciprocal_rank_fusion, RrfConfig, Scored},
RetrievedChunk, RetrievedEntity,
};
@@ -115,7 +114,7 @@ pub async fn search_chunks(ctx: &mut PipelineContext<'_>) -> Result<(), AppError
let embedding = ctx.ensure_embedding().map_err(|e| *e)?.clone();
let tuning = &ctx.config.tuning;
let fts_take = tuning.chunk_fts_take;
let (fts_query, fts_token_count) = normalize_fts_query(&ctx.input_text);
let (fts_query, fts_token_count) = normalize_fts_terms(&ctx.input_text);
let fts_enabled = tuning.flags.chunk_rrf_use_fts() && fts_take > 0 && !fts_query.is_empty();
let (vector_rows, fts_rows) = tokio::try_join!(
@@ -333,26 +332,6 @@ where
items.iter().take(SCORE_SAMPLE_LIMIT).map(extractor).collect()
}
/// Turn free-form input into a space-separated list of lowercase search terms.
///
/// Alphanumeric characters are kept (lowercased), whitespace acts as a term
/// separator, and every other character is removed without leaving a separator
/// behind. Terms listed in `STOPWORDS` are then discarded.
///
/// Returns the remaining terms joined by single spaces, plus how many remained.
fn normalize_fts_query(input: &str) -> (String, usize) {
    const STOPWORDS: &[&str] = &["the", "a", "an", "of", "in", "on", "and", "or", "to", "for"];

    let mut lowered = String::with_capacity(input.len());
    input.chars().for_each(|ch| match ch {
        c if c.is_alphanumeric() => lowered.extend(c.to_lowercase()),
        c if c.is_whitespace() => lowered.push(' '),
        // Punctuation and symbols are dropped entirely.
        _ => {}
    });

    // `split_whitespace` never yields empty pieces, so only stopwords need filtering.
    let terms: Vec<&str> = lowered
        .split_whitespace()
        .filter(|term| !STOPWORDS.contains(term))
        .collect();

    (terms.join(" "), terms.len())
}
fn build_chunk_rerank_documents(chunks: &[Scored<TextChunk>], max_chunks: usize) -> Vec<String> {
chunks
.iter()
+39
View File
@@ -0,0 +1,39 @@
/// Normalize raw input into FTS-friendly terms and return the token count.
///
/// The input is lowercased, whitespace is treated as the only term separator,
/// and any character that is neither alphanumeric nor whitespace is removed
/// outright (so `"foo-bar"` becomes the single term `"foobar"`). Terms found in
/// the built-in stopword list are dropped.
///
/// Returns `(terms, count)`, where `terms` is the surviving terms joined by a
/// single space and `count` is the number of surviving terms.
pub fn normalize_fts_terms(input: &str) -> (String, usize) {
    const STOPWORDS: &[&str] = &["the", "a", "an", "of", "in", "on", "and", "or", "to", "for"];

    let mut scrubbed = String::with_capacity(input.len());
    for ch in input.chars() {
        match (ch.is_alphanumeric(), ch.is_whitespace()) {
            (true, _) => scrubbed.extend(ch.to_lowercase()),
            (false, true) => scrubbed.push(' '),
            // Neither a letter/digit nor whitespace: discard.
            (false, false) => {}
        }
    }

    // Build the joined output and the count in one pass over the kept terms.
    let mut normalized = String::with_capacity(scrubbed.len());
    let mut kept = 0usize;
    for term in scrubbed
        .split_whitespace()
        .filter(|term| !STOPWORDS.contains(term))
    {
        if kept > 0 {
            normalized.push(' ');
        }
        normalized.push_str(term);
        kept += 1;
    }

    (normalized, kept)
}
#[cfg(test)]
mod tests {
    use super::normalize_fts_terms;

    /// Mixed-case input with stopwords yields only the lowercased content terms.
    #[test]
    fn strips_stopwords_and_lowercases() {
        let normalized = normalize_fts_terms("The Cucumber and Tomatoes");
        assert_eq!(normalized, ("cucumber tomatoes".to_string(), 2));
    }

    /// Input made up solely of stopwords produces an empty query and a zero count.
    #[test]
    fn returns_empty_for_stopwords_only() {
        let (terms, kept) = normalize_fts_terms("the and or");
        assert_eq!(terms, "");
        assert_eq!(kept, 0);
    }
}