perf: pre-allocate collections with known capacity in hot paths

- Use `with_capacity` for chunk_by_source, results, per_entity_traces, and selected_chunks in assemble() where the bound is known
- Pre-allocate tokens/terms vectors in normalize_fts_query and extract_keywords based on input length
- Pre-allocate neighbor_ids, seen, and ordered in graph expansion based on relationship count
2026-07-15 09:12:42 +02:00 · 2026-05-26 15:49:01 +02:00
parent 6c7b586fc5
commit b4383bb227
2 changed files with 16 additions and 10 deletions
@@ -44,8 +44,8 @@ pub async fn find_entities_by_relationship_by_id(
        return Ok(Vec::new());
    }

-    let mut neighbor_ids: Vec<String> = Vec::new();
-    let mut seen: HashSet<String> = HashSet::new();
+    let mut neighbor_ids: Vec<String> = Vec::with_capacity(relationships.len());
+    let mut seen: HashSet<String> = HashSet::with_capacity(relationships.len());
    for rel in relationships {
        if rel.in_ == entity_id {
            if seen.insert(rel.out.clone()) {
@@ -97,7 +97,7 @@ pub async fn find_entities_by_relationship_by_id(
        .map(|entity| (entity.id.clone(), entity))
        .collect();

-    let mut ordered = Vec::new();
+    let mut ordered = Vec::with_capacity(neighbor_ids.len());
    for id in neighbor_ids {
        if let Some(entity) = neighbor_map.remove(&id) {
            ordered.push(entity);
@@ -647,7 +647,8 @@ pub fn assemble(ctx: &mut PipelineContext<'_>) -> Result<(), AppError> {
    let tuning = &ctx.config.tuning;
    let question_terms = extract_keywords(&ctx.input_text);

-    let mut chunk_by_source: HashMap<String, Vec<Scored<TextChunk>>> = HashMap::new();
+    let mut chunk_by_source: HashMap<String, Vec<Scored<TextChunk>>> =
+        HashMap::with_capacity(ctx.chunk_values.len());
    for chunk in ctx.chunk_values.drain(..) {
        chunk_by_source
            .entry(chunk.item.source_id.clone())
@@ -663,15 +664,19 @@ pub fn assemble(ctx: &mut PipelineContext<'_>) -> Result<(), AppError> {
    }

    let mut token_budget_remaining = tuning.token_budget_estimate;
-    let mut results = Vec::new();
+    let mut results = Vec::with_capacity(ctx.filtered_entities.len());
    let diagnostics_enabled = ctx.diagnostics_enabled();
-    let mut per_entity_traces = Vec::new();
+    let mut per_entity_traces = if diagnostics_enabled {
+        Vec::with_capacity(ctx.filtered_entities.len())
+    } else {
+        Vec::new()
+    };
    let mut chunks_skipped_due_budget = 0usize;
    let mut chunks_selected = 0usize;
    let mut tokens_spent = 0usize;

    for entity in &ctx.filtered_entities {
-        let mut selected_chunks = Vec::new();
+        let mut selected_chunks = Vec::with_capacity(tuning.max_chunks_per_entity);
        let mut entity_trace = if diagnostics_enabled {
            Some(EntityAssemblyTrace {
                entity_id: entity.item.id.clone(),
@@ -788,7 +793,7 @@ fn normalize_fts_query(input: &str) -> (String, usize) {
            cleaned.push(' ');
        }
    }
-    let mut tokens = Vec::new();
+    let mut tokens = Vec::with_capacity(cleaned.len() / 3 + 1);
    for token in cleaned.split_whitespace() {
        if !STOPWORDS.contains(&token) && !token.is_empty() {
            tokens.push(token.to_string());
@@ -813,7 +818,8 @@ fn build_rerank_documents(ctx: &PipelineContext<'_>, max_chunks_per_entity: usiz
        return Vec::new();
    }

-    let mut chunk_by_source: HashMap<&str, Vec<&Scored<TextChunk>>> = HashMap::new();
+    let mut chunk_by_source: HashMap<&str, Vec<&Scored<TextChunk>>> =
+        HashMap::with_capacity(ctx.chunk_values.len());
    for chunk in &ctx.chunk_values {
        chunk_by_source
            .entry(chunk.item.source_id.as_str())
@@ -1002,7 +1008,7 @@ fn rank_chunks_by_combined_score(
 }

 fn extract_keywords(text: &str) -> Vec<String> {
-    let mut terms = Vec::new();
+    let mut terms = Vec::with_capacity((text.len() / 3).max(4));
    for raw in text.split(|c: char| !c.is_alphanumeric()) {
        let term = raw.trim().to_ascii_lowercase();
        if term.len() >= 3 {