mirror of
https://github.com/perstarkse/minne.git
synced 2026-06-30 18:11:34 +02:00
fix: knowledge entity suggestions simplification
This commit is contained in:
@@ -1,9 +1,9 @@
|
||||
pub mod answer_retrieval;
|
||||
|
||||
pub mod pipeline;
|
||||
pub mod query;
|
||||
pub mod reranking;
|
||||
|
||||
pub(crate) mod scoring;
|
||||
pub mod scoring;
|
||||
|
||||
use common::{
|
||||
error::AppError,
|
||||
@@ -29,9 +29,11 @@ pub enum RetrievalOutput {
|
||||
}
|
||||
|
||||
pub use pipeline::{
|
||||
retrieved_entities_to_json, Diagnostics, RetrievalConfig, RetrievalParams, StageKind,
|
||||
StageTimings,
|
||||
retrieved_entities_to_json, Diagnostics, RetrievalConfig, RetrievalParams, RetrievalTuning,
|
||||
StageKind, StageTimings,
|
||||
};
|
||||
pub use query::normalize_fts_terms;
|
||||
pub use scoring::{reciprocal_rank_fusion, RrfConfig, Scored};
|
||||
|
||||
/// Round a score to three decimal places for JSON output.
|
||||
pub(crate) fn round_score(value: f32) -> f64 {
|
||||
|
||||
@@ -117,8 +117,8 @@ impl Default for RetrievalTuning {
|
||||
/// Per-request retrieval configuration.
|
||||
///
|
||||
/// The pipeline always performs chunk-first hybrid retrieval. Set `resolve_entities`
|
||||
/// when a caller additionally needs the `KnowledgeEntity` rows that own the retrieved
|
||||
/// chunks (search, ingestion linking, relationship suggestion).
|
||||
/// when a caller additionally needs the `KnowledgeEntity` rows that own retrieved
|
||||
/// chunks (search, ingestion linking).
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct RetrievalConfig {
|
||||
pub tuning: RetrievalTuning,
|
||||
|
||||
@@ -5,7 +5,9 @@ use common::{
|
||||
utils::embedding::EmbeddingProvider,
|
||||
};
|
||||
|
||||
use crate::{reranking::RerankerLease, scoring::Scored, RetrievedChunk, RetrievedEntity};
|
||||
use crate::scoring::Scored;
|
||||
|
||||
use crate::{reranking::RerankerLease, RetrievedChunk, RetrievedEntity};
|
||||
|
||||
use super::{
|
||||
config::RetrievalConfig,
|
||||
|
||||
@@ -3,7 +3,7 @@ mod context;
|
||||
mod diagnostics;
|
||||
mod stages;
|
||||
|
||||
pub use config::RetrievalConfig;
|
||||
pub use config::{RetrievalConfig, RetrievalTuning};
|
||||
pub use diagnostics::Diagnostics;
|
||||
|
||||
use crate::{round_score, RetrievalOutput, RetrievedEntity};
|
||||
|
||||
@@ -9,9 +9,8 @@ use std::collections::HashMap;
|
||||
use tracing::{debug, instrument, warn};
|
||||
|
||||
use crate::{
|
||||
scoring::{
|
||||
clamp_unit, min_max_normalize, reciprocal_rank_fusion, RrfConfig, Scored,
|
||||
},
|
||||
query::normalize_fts_terms,
|
||||
scoring::{clamp_unit, min_max_normalize, reciprocal_rank_fusion, RrfConfig, Scored},
|
||||
RetrievedChunk, RetrievedEntity,
|
||||
};
|
||||
|
||||
@@ -115,7 +114,7 @@ pub async fn search_chunks(ctx: &mut PipelineContext<'_>) -> Result<(), AppError
|
||||
let embedding = ctx.ensure_embedding().map_err(|e| *e)?.clone();
|
||||
let tuning = &ctx.config.tuning;
|
||||
let fts_take = tuning.chunk_fts_take;
|
||||
let (fts_query, fts_token_count) = normalize_fts_query(&ctx.input_text);
|
||||
let (fts_query, fts_token_count) = normalize_fts_terms(&ctx.input_text);
|
||||
let fts_enabled = tuning.flags.chunk_rrf_use_fts() && fts_take > 0 && !fts_query.is_empty();
|
||||
|
||||
let (vector_rows, fts_rows) = tokio::try_join!(
|
||||
@@ -333,26 +332,6 @@ where
|
||||
items.iter().take(SCORE_SAMPLE_LIMIT).map(extractor).collect()
|
||||
}
|
||||
|
||||
/// Normalize raw input into an FTS-friendly query string and count its terms.
///
/// Alphanumeric characters are kept and lowercased, every whitespace character
/// becomes a single-space separator, and all other characters are dropped.
/// The remaining whitespace-separated tokens are filtered against a small
/// English stopword list and re-joined with single spaces.
///
/// Returns `(normalized_query, token_count)`; both are empty/zero when nothing
/// survives normalization.
///
/// NOTE(review): punctuation is removed rather than treated as a separator, so
/// `"foo-bar"` normalizes to the single token `"foobar"` — confirm this is the
/// intended behaviour for the FTS analyzer in use.
fn normalize_fts_query(input: &str) -> (String, usize) {
    const STOPWORDS: &[&str] = &["the", "a", "an", "of", "in", "on", "and", "or", "to", "for"];

    let mut cleaned = String::with_capacity(input.len());
    for ch in input.chars() {
        if ch.is_alphanumeric() {
            // `to_lowercase` can expand to several chars, hence `extend`.
            cleaned.extend(ch.to_lowercase());
        } else if ch.is_whitespace() {
            cleaned.push(' ');
        }
    }

    // `split_whitespace` never yields empty items, so only the stopword
    // filter is needed; borrowing the tokens avoids one allocation per term.
    let tokens: Vec<&str> = cleaned
        .split_whitespace()
        .filter(|token| !STOPWORDS.contains(token))
        .collect();

    (tokens.join(" "), tokens.len())
}
|
||||
|
||||
fn build_chunk_rerank_documents(chunks: &[Scored<TextChunk>], max_chunks: usize) -> Vec<String> {
|
||||
chunks
|
||||
.iter()
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
/// Normalize raw input into FTS-friendly terms and return the token count.
///
/// Alphanumeric characters are kept and lowercased, every whitespace character
/// becomes a single-space separator, and all other characters are dropped.
/// The remaining whitespace-separated tokens are filtered against a small
/// English stopword list and re-joined with single spaces.
///
/// Returns `(normalized_terms, token_count)`; both are empty/zero when nothing
/// survives normalization (for example, input made only of stopwords).
///
/// NOTE(review): punctuation is removed rather than treated as a separator, so
/// `"foo-bar"` normalizes to the single token `"foobar"` — confirm this is the
/// intended behaviour for the FTS analyzer in use.
pub fn normalize_fts_terms(input: &str) -> (String, usize) {
    const STOPWORDS: &[&str] = &["the", "a", "an", "of", "in", "on", "and", "or", "to", "for"];

    let mut cleaned = String::with_capacity(input.len());
    for ch in input.chars() {
        if ch.is_alphanumeric() {
            // `to_lowercase` can expand to several chars, hence `extend`.
            cleaned.extend(ch.to_lowercase());
        } else if ch.is_whitespace() {
            cleaned.push(' ');
        }
    }

    // `split_whitespace` never yields empty items, so only the stopword
    // filter is needed; borrowing the tokens avoids one allocation per term.
    let tokens: Vec<&str> = cleaned
        .split_whitespace()
        .filter(|token| !STOPWORDS.contains(token))
        .collect();

    (tokens.join(" "), tokens.len())
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::normalize_fts_terms;

    /// Mixed-case input is lowercased and stopwords are removed.
    #[test]
    fn strips_stopwords_and_lowercases() {
        let (query, count) = normalize_fts_terms("The Cucumber and Tomatoes");
        assert_eq!(query, "cucumber tomatoes");
        assert_eq!(count, 2);
    }

    /// Input consisting solely of stopwords yields an empty query.
    #[test]
    fn returns_empty_for_stopwords_only() {
        let (query, count) = normalize_fts_terms("the and or");
        assert!(query.is_empty());
        assert_eq!(count, 0);
    }

    /// Empty input yields an empty query and a zero count.
    #[test]
    fn returns_empty_for_empty_input() {
        let (query, count) = normalize_fts_terms("");
        assert!(query.is_empty());
        assert_eq!(count, 0);
    }

    /// Runs of mixed whitespace collapse to single-space separators.
    #[test]
    fn collapses_whitespace() {
        let (query, count) = normalize_fts_terms("  Rust\t\nASYNC  ");
        assert_eq!(query, "rust async");
        assert_eq!(count, 2);
    }

    /// Punctuation is dropped without splitting the surrounding characters.
    #[test]
    fn drops_punctuation_without_splitting() {
        let (query, count) = normalize_fts_terms("foo-bar, baz!");
        assert_eq!(query, "foobar baz");
        assert_eq!(count, 2);
    }
}
|
||||
Reference in New Issue
Block a user