mirror of
https://github.com/perstarkse/minne.git
synced 2026-05-28 10:29:30 +02:00
perf: pre-allocate collections with known capacity in hot paths
- Use with_capacity for chunk_by_source, results, per_entity_traces, and selected_chunks in assemble() where the bound is known
- Pre-allocate tokens/terms vectors in normalize_fts_query and extract_keywords based on input length
- Pre-allocate neighbor_ids, seen, and ordered in graph expansion based on relationship count
This commit is contained in:
@@ -44,8 +44,8 @@ pub async fn find_entities_by_relationship_by_id(
|
|||||||
return Ok(Vec::new());
|
return Ok(Vec::new());
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut neighbor_ids: Vec<String> = Vec::new();
|
let mut neighbor_ids: Vec<String> = Vec::with_capacity(relationships.len());
|
||||||
let mut seen: HashSet<String> = HashSet::new();
|
let mut seen: HashSet<String> = HashSet::with_capacity(relationships.len());
|
||||||
for rel in relationships {
|
for rel in relationships {
|
||||||
if rel.in_ == entity_id {
|
if rel.in_ == entity_id {
|
||||||
if seen.insert(rel.out.clone()) {
|
if seen.insert(rel.out.clone()) {
|
||||||
@@ -97,7 +97,7 @@ pub async fn find_entities_by_relationship_by_id(
|
|||||||
.map(|entity| (entity.id.clone(), entity))
|
.map(|entity| (entity.id.clone(), entity))
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let mut ordered = Vec::new();
|
let mut ordered = Vec::with_capacity(neighbor_ids.len());
|
||||||
for id in neighbor_ids {
|
for id in neighbor_ids {
|
||||||
if let Some(entity) = neighbor_map.remove(&id) {
|
if let Some(entity) = neighbor_map.remove(&id) {
|
||||||
ordered.push(entity);
|
ordered.push(entity);
|
||||||
|
|||||||
@@ -647,7 +647,8 @@ pub fn assemble(ctx: &mut PipelineContext<'_>) -> Result<(), AppError> {
|
|||||||
let tuning = &ctx.config.tuning;
|
let tuning = &ctx.config.tuning;
|
||||||
let question_terms = extract_keywords(&ctx.input_text);
|
let question_terms = extract_keywords(&ctx.input_text);
|
||||||
|
|
||||||
let mut chunk_by_source: HashMap<String, Vec<Scored<TextChunk>>> = HashMap::new();
|
let mut chunk_by_source: HashMap<String, Vec<Scored<TextChunk>>> =
|
||||||
|
HashMap::with_capacity(ctx.chunk_values.len());
|
||||||
for chunk in ctx.chunk_values.drain(..) {
|
for chunk in ctx.chunk_values.drain(..) {
|
||||||
chunk_by_source
|
chunk_by_source
|
||||||
.entry(chunk.item.source_id.clone())
|
.entry(chunk.item.source_id.clone())
|
||||||
@@ -663,15 +664,19 @@ pub fn assemble(ctx: &mut PipelineContext<'_>) -> Result<(), AppError> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let mut token_budget_remaining = tuning.token_budget_estimate;
|
let mut token_budget_remaining = tuning.token_budget_estimate;
|
||||||
let mut results = Vec::new();
|
let mut results = Vec::with_capacity(ctx.filtered_entities.len());
|
||||||
let diagnostics_enabled = ctx.diagnostics_enabled();
|
let diagnostics_enabled = ctx.diagnostics_enabled();
|
||||||
let mut per_entity_traces = Vec::new();
|
let mut per_entity_traces = if diagnostics_enabled {
|
||||||
|
Vec::with_capacity(ctx.filtered_entities.len())
|
||||||
|
} else {
|
||||||
|
Vec::new()
|
||||||
|
};
|
||||||
let mut chunks_skipped_due_budget = 0usize;
|
let mut chunks_skipped_due_budget = 0usize;
|
||||||
let mut chunks_selected = 0usize;
|
let mut chunks_selected = 0usize;
|
||||||
let mut tokens_spent = 0usize;
|
let mut tokens_spent = 0usize;
|
||||||
|
|
||||||
for entity in &ctx.filtered_entities {
|
for entity in &ctx.filtered_entities {
|
||||||
let mut selected_chunks = Vec::new();
|
let mut selected_chunks = Vec::with_capacity(tuning.max_chunks_per_entity);
|
||||||
let mut entity_trace = if diagnostics_enabled {
|
let mut entity_trace = if diagnostics_enabled {
|
||||||
Some(EntityAssemblyTrace {
|
Some(EntityAssemblyTrace {
|
||||||
entity_id: entity.item.id.clone(),
|
entity_id: entity.item.id.clone(),
|
||||||
@@ -788,7 +793,7 @@ fn normalize_fts_query(input: &str) -> (String, usize) {
|
|||||||
cleaned.push(' ');
|
cleaned.push(' ');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let mut tokens = Vec::new();
|
let mut tokens = Vec::with_capacity(cleaned.len() / 3 + 1);
|
||||||
for token in cleaned.split_whitespace() {
|
for token in cleaned.split_whitespace() {
|
||||||
if !STOPWORDS.contains(&token) && !token.is_empty() {
|
if !STOPWORDS.contains(&token) && !token.is_empty() {
|
||||||
tokens.push(token.to_string());
|
tokens.push(token.to_string());
|
||||||
@@ -813,7 +818,8 @@ fn build_rerank_documents(ctx: &PipelineContext<'_>, max_chunks_per_entity: usiz
|
|||||||
return Vec::new();
|
return Vec::new();
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut chunk_by_source: HashMap<&str, Vec<&Scored<TextChunk>>> = HashMap::new();
|
let mut chunk_by_source: HashMap<&str, Vec<&Scored<TextChunk>>> =
|
||||||
|
HashMap::with_capacity(ctx.chunk_values.len());
|
||||||
for chunk in &ctx.chunk_values {
|
for chunk in &ctx.chunk_values {
|
||||||
chunk_by_source
|
chunk_by_source
|
||||||
.entry(chunk.item.source_id.as_str())
|
.entry(chunk.item.source_id.as_str())
|
||||||
@@ -1002,7 +1008,7 @@ fn rank_chunks_by_combined_score(
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn extract_keywords(text: &str) -> Vec<String> {
|
fn extract_keywords(text: &str) -> Vec<String> {
|
||||||
let mut terms = Vec::new();
|
let mut terms = Vec::with_capacity((text.len() / 3).max(4));
|
||||||
for raw in text.split(|c: char| !c.is_alphanumeric()) {
|
for raw in text.split(|c: char| !c.is_alphanumeric()) {
|
||||||
let term = raw.trim().to_ascii_lowercase();
|
let term = raw.trim().to_ascii_lowercase();
|
||||||
if term.len() >= 3 {
|
if term.len() >= 3 {
|
||||||
|
|||||||
Reference in New Issue
Block a user