perf: pre-allocate collections with known capacity in hot paths

- Use with_capacity for chunk_by_source, results, per_entity_traces,
  and selected_chunks in assemble() where the bound is known, and for
  chunk_by_source in build_rerank_documents()
- Pre-allocate tokens/terms vectors in normalize_fts_query and
  extract_keywords based on input length
- Pre-allocate neighbor_ids and seen in graph expansion based on
  relationship count, and ordered based on the neighbor count
This commit is contained in:
Per Stark
2026-05-26 15:49:01 +02:00
parent 6c7b586fc5
commit b4383bb227
2 changed files with 16 additions and 10 deletions
+3 -3
View File
@@ -44,8 +44,8 @@ pub async fn find_entities_by_relationship_by_id(
return Ok(Vec::new());
}
let mut neighbor_ids: Vec<String> = Vec::new();
let mut seen: HashSet<String> = HashSet::new();
let mut neighbor_ids: Vec<String> = Vec::with_capacity(relationships.len());
let mut seen: HashSet<String> = HashSet::with_capacity(relationships.len());
for rel in relationships {
if rel.in_ == entity_id {
if seen.insert(rel.out.clone()) {
@@ -97,7 +97,7 @@ pub async fn find_entities_by_relationship_by_id(
.map(|entity| (entity.id.clone(), entity))
.collect();
let mut ordered = Vec::new();
let mut ordered = Vec::with_capacity(neighbor_ids.len());
for id in neighbor_ids {
if let Some(entity) = neighbor_map.remove(&id) {
ordered.push(entity);
+13 -7
View File
@@ -647,7 +647,8 @@ pub fn assemble(ctx: &mut PipelineContext<'_>) -> Result<(), AppError> {
let tuning = &ctx.config.tuning;
let question_terms = extract_keywords(&ctx.input_text);
let mut chunk_by_source: HashMap<String, Vec<Scored<TextChunk>>> = HashMap::new();
let mut chunk_by_source: HashMap<String, Vec<Scored<TextChunk>>> =
HashMap::with_capacity(ctx.chunk_values.len());
for chunk in ctx.chunk_values.drain(..) {
chunk_by_source
.entry(chunk.item.source_id.clone())
@@ -663,15 +664,19 @@ pub fn assemble(ctx: &mut PipelineContext<'_>) -> Result<(), AppError> {
}
let mut token_budget_remaining = tuning.token_budget_estimate;
let mut results = Vec::new();
let mut results = Vec::with_capacity(ctx.filtered_entities.len());
let diagnostics_enabled = ctx.diagnostics_enabled();
let mut per_entity_traces = Vec::new();
let mut per_entity_traces = if diagnostics_enabled {
Vec::with_capacity(ctx.filtered_entities.len())
} else {
Vec::new()
};
let mut chunks_skipped_due_budget = 0usize;
let mut chunks_selected = 0usize;
let mut tokens_spent = 0usize;
for entity in &ctx.filtered_entities {
let mut selected_chunks = Vec::new();
let mut selected_chunks = Vec::with_capacity(tuning.max_chunks_per_entity);
let mut entity_trace = if diagnostics_enabled {
Some(EntityAssemblyTrace {
entity_id: entity.item.id.clone(),
@@ -788,7 +793,7 @@ fn normalize_fts_query(input: &str) -> (String, usize) {
cleaned.push(' ');
}
}
let mut tokens = Vec::new();
let mut tokens = Vec::with_capacity(cleaned.len() / 3 + 1);
for token in cleaned.split_whitespace() {
if !STOPWORDS.contains(&token) && !token.is_empty() {
tokens.push(token.to_string());
@@ -813,7 +818,8 @@ fn build_rerank_documents(ctx: &PipelineContext<'_>, max_chunks_per_entity: usiz
return Vec::new();
}
let mut chunk_by_source: HashMap<&str, Vec<&Scored<TextChunk>>> = HashMap::new();
let mut chunk_by_source: HashMap<&str, Vec<&Scored<TextChunk>>> =
HashMap::with_capacity(ctx.chunk_values.len());
for chunk in &ctx.chunk_values {
chunk_by_source
.entry(chunk.item.source_id.as_str())
@@ -1002,7 +1008,7 @@ fn rank_chunks_by_combined_score(
}
fn extract_keywords(text: &str) -> Vec<String> {
let mut terms = Vec::new();
let mut terms = Vec::with_capacity((text.len() / 3).max(4));
for raw in text.split(|c: char| !c.is_alphanumeric()) {
let term = raw.trim().to_ascii_lowercase();
if term.len() >= 3 {