diff --git a/retrieval-pipeline/src/graph.rs b/retrieval-pipeline/src/graph.rs index 6b210f7..a326c38 100644 --- a/retrieval-pipeline/src/graph.rs +++ b/retrieval-pipeline/src/graph.rs @@ -44,8 +44,8 @@ pub async fn find_entities_by_relationship_by_id( return Ok(Vec::new()); } - let mut neighbor_ids: Vec = Vec::new(); - let mut seen: HashSet = HashSet::new(); + let mut neighbor_ids: Vec = Vec::with_capacity(relationships.len()); + let mut seen: HashSet = HashSet::with_capacity(relationships.len()); for rel in relationships { if rel.in_ == entity_id { if seen.insert(rel.out.clone()) { @@ -97,7 +97,7 @@ pub async fn find_entities_by_relationship_by_id( .map(|entity| (entity.id.clone(), entity)) .collect(); - let mut ordered = Vec::new(); + let mut ordered = Vec::with_capacity(neighbor_ids.len()); for id in neighbor_ids { if let Some(entity) = neighbor_map.remove(&id) { ordered.push(entity); diff --git a/retrieval-pipeline/src/pipeline/stages/mod.rs b/retrieval-pipeline/src/pipeline/stages/mod.rs index 6926666..60ab164 100644 --- a/retrieval-pipeline/src/pipeline/stages/mod.rs +++ b/retrieval-pipeline/src/pipeline/stages/mod.rs @@ -647,7 +647,8 @@ pub fn assemble(ctx: &mut PipelineContext<'_>) -> Result<(), AppError> { let tuning = &ctx.config.tuning; let question_terms = extract_keywords(&ctx.input_text); - let mut chunk_by_source: HashMap>> = HashMap::new(); + let mut chunk_by_source: HashMap>> = + HashMap::with_capacity(ctx.chunk_values.len()); for chunk in ctx.chunk_values.drain(..) { chunk_by_source .entry(chunk.item.source_id.clone()) @@ -663,15 +664,19 @@ pub fn assemble(ctx: &mut PipelineContext<'_>) -> Result<(), AppError> { } let mut token_budget_remaining = tuning.token_budget_estimate; - let mut results = Vec::new(); + let mut results = Vec::with_capacity(ctx.filtered_entities.len()); let diagnostics_enabled = ctx.diagnostics_enabled(); - let mut per_entity_traces = Vec::new(); + let mut per_entity_traces = if diagnostics_enabled { + Vec::with_capacity(ctx.filtered_entities.len()) + } else { + Vec::new() + }; let mut chunks_skipped_due_budget = 0usize; let mut chunks_selected = 0usize; let mut tokens_spent = 0usize; for entity in &ctx.filtered_entities { - let mut selected_chunks = Vec::new(); + let mut selected_chunks = Vec::with_capacity(tuning.max_chunks_per_entity); let mut entity_trace = if diagnostics_enabled { Some(EntityAssemblyTrace { entity_id: entity.item.id.clone(), @@ -788,7 +793,7 @@ fn normalize_fts_query(input: &str) -> (String, usize) { cleaned.push(' '); } } - let mut tokens = Vec::new(); + let mut tokens = Vec::with_capacity(cleaned.len() / 3 + 1); for token in cleaned.split_whitespace() { if !STOPWORDS.contains(&token) && !token.is_empty() { tokens.push(token.to_string()); @@ -813,7 +818,8 @@ fn build_rerank_documents(ctx: &PipelineContext<'_>, max_chunks_per_entity: usiz return Vec::new(); } - let mut chunk_by_source: HashMap<&str, Vec<&Scored>> = HashMap::new(); + let mut chunk_by_source: HashMap<&str, Vec<&Scored>> = + HashMap::with_capacity(ctx.chunk_values.len()); for chunk in &ctx.chunk_values { chunk_by_source .entry(chunk.item.source_id.as_str()) @@ -1002,7 +1008,7 @@ fn rank_chunks_by_combined_score( } fn extract_keywords(text: &str) -> Vec { - let mut terms = Vec::new(); + let mut terms = Vec::with_capacity((text.len() / 3).max(4)); for raw in text.split(|c: char| !c.is_alphanumeric()) { let term = raw.trim().to_ascii_lowercase(); if term.len() >= 3 {