chore: ingestion-pipeline refactor, sort out technical debt, rustfmt

2026-06-21 14:09:33 +02:00 · 2026-05-31 19:37:34 +02:00
parent 5c2d2e24d3
commit 3897345ab3
47 changed files with 1729 additions and 1343 deletions
@@ -9,8 +9,7 @@ pub use orchestrator::{
 };
 pub use store::{
    seed_manifest_into_db, window_manifest, CorpusHandle, CorpusManifest, CorpusMetadata,
-    CorpusQuestion, EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard,
-    ParagraphShardStore, MANIFEST_VERSION,
+    CorpusQuestion, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
 };

 pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig {
@@ -33,8 +33,7 @@ use crate::{

 use crate::corpus::{
    CorpusCacheConfig, CorpusHandle, CorpusManifest, CorpusMetadata, CorpusQuestion,
-    EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard, ParagraphShardStore,
-    MANIFEST_VERSION,
+    ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
 };

 const INGESTION_SPEC_VERSION: u32 = 2;
@@ -273,10 +272,19 @@ pub async fn ensure_corpus(
            .context("shard record missing after ingestion run")?;
        if cache.refresh_embeddings_only || shard_record.needs_reembed {
            // Embeddings are now generated by the pipeline using FastEmbed - no need to re-embed
-            shard_record.shard.ingestion_fingerprint.clone_from(&ingestion_fingerprint);
+            shard_record
+                .shard
+                .ingestion_fingerprint
+                .clone_from(&ingestion_fingerprint);
            shard_record.shard.ingested_at = Utc::now();
-            shard_record.shard.embedding_backend.clone_from(&embedding_backend_label);
-            shard_record.shard.embedding_model.clone_from(&embedding_model_code);
+            shard_record
+                .shard
+                .embedding_backend
+                .clone_from(&embedding_backend_label);
+            shard_record
+                .shard
+                .embedding_model
+                .clone_from(&embedding_model_code);
            shard_record.shard.embedding_dimension = embedding_dimension;
            shard_record.dirty = true;
            shard_record.needs_reembed = false;
@@ -543,31 +551,16 @@ async fn ingest_single_paragraph(
        let task = IngestionTask::new(payload, user_id.to_string());
        match pipeline.produce_artifacts(&task).await {
            Ok(artifacts) => {
-                let entities: Vec<EmbeddedKnowledgeEntity> = artifacts
-                    .entities
-                    .into_iter()
-                    .map(|e| EmbeddedKnowledgeEntity {
-                        entity: e.entity,
-                        embedding: e.embedding,
-                    })
-                    .collect();
-                let chunks: Vec<EmbeddedTextChunk> = artifacts
-                    .chunks
-                    .into_iter()
-                    .map(|c| EmbeddedTextChunk {
-                        chunk: c.chunk,
-                        embedding: c.embedding,
-                    })
-                    .collect();
-                // No need to reembed - pipeline now uses FastEmbed internally
+                // Artifacts already carry the shared `Embedded*` types and FastEmbed
+                // embeddings, so they can be persisted to the shard without re-mapping.
                let mut shard = ParagraphShard::new(
                    paragraph,
                    request.shard_path,
                    ingestion_fingerprint,
                    artifacts.text_content,
-                    entities,
+                    artifacts.entities,
                    artifacts.relationships,
-                    chunks,
+                    artifacts.chunks,
                    &embedding_backend,
                    embedding_model.clone(),
                    embedding_dimension,
@@ -54,17 +54,9 @@ fn default_chunk_only() -> bool {
    false
 }

-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
-pub struct EmbeddedKnowledgeEntity {
-    pub entity: KnowledgeEntity,
-    pub embedding: Vec<f32>,
-}
-
-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
-pub struct EmbeddedTextChunk {
-    pub chunk: TextChunk,
-    pub embedding: Vec<f32>,
-}
+// Reuse the pipeline's canonical embedded-artifact types so the on-disk corpus
+// format and the ingestion output never drift apart.
+pub use ingestion_pipeline::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};

 #[derive(Debug, Clone, serde::Deserialize)]
 struct LegacyKnowledgeEntity {
@@ -11,7 +11,11 @@ use tracing::warn;

 use super::{ConvertedParagraph, ConvertedQuestion};

-#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects, clippy::cast_sign_loss)]
+#[allow(
+    clippy::too_many_lines,
+    clippy::arithmetic_side_effects,
+    clippy::cast_sign_loss
+)]
 pub fn convert_nq(
    raw_path: &Path,
    include_unanswerable: bool,
@@ -166,8 +166,7 @@ async fn async_main() -> anyhow::Result<()> {
    );

    if parsed.config.slice_grow.is_some() {
-        eval::grow_slice(&dataset, &parsed.config)
-            .context("growing slice ledger")?;
+        eval::grow_slice(&dataset, &parsed.config).context("growing slice ledger")?;
        return Ok(());
    }

@@ -14,7 +14,7 @@ use common::{
    utils::embedding::EmbeddingProvider,
 };
 use retrieval_pipeline::{
-    pipeline::{StageTimings, RetrievalConfig},
+    pipeline::{RetrievalConfig, StageTimings},
    reranking::RerankerPool,
 };

@@ -122,11 +122,15 @@ impl<'a> EvaluationContext<'a> {
    }

    pub fn slice(&self) -> Result<&slice::ResolvedSlice<'a>> {
-        self.slice.as_ref().ok_or_else(|| anyhow!("slice has not been prepared"))
+        self.slice
+            .as_ref()
+            .ok_or_else(|| anyhow!("slice has not been prepared"))
    }

    pub fn db(&self) -> Result<&SurrealDbClient> {
-        self.db.as_ref().ok_or_else(|| anyhow!("database connection missing"))
+        self.db
+            .as_ref()
+            .ok_or_else(|| anyhow!("database connection missing"))
    }

    pub fn descriptor(&self) -> Result<&snapshot::Descriptor> {
@@ -142,15 +146,23 @@ impl<'a> EvaluationContext<'a> {
    }

    pub fn openai_client(&self) -> Result<Arc<Client<async_openai::config::OpenAIConfig>>> {
-        Ok(Arc::clone(self.openai_client.as_ref().ok_or_else(|| anyhow!("openai client missing"))?))
+        Ok(Arc::clone(
+            self.openai_client
+                .as_ref()
+                .ok_or_else(|| anyhow!("openai client missing"))?,
+        ))
    }

    pub fn corpus_handle(&self) -> Result<&corpus::CorpusHandle> {
-        self.corpus_handle.as_ref().ok_or_else(|| anyhow!("corpus handle missing"))
+        self.corpus_handle
+            .as_ref()
+            .ok_or_else(|| anyhow!("corpus handle missing"))
    }

    pub fn evaluation_user(&self) -> Result<&User> {
-        self.eval_user.as_ref().ok_or_else(|| anyhow!("evaluation user missing"))
+        self.eval_user
+            .as_ref()
+            .ok_or_else(|| anyhow!("evaluation user missing"))
    }

    #[allow(clippy::arithmetic_side_effects)]
@@ -168,7 +180,8 @@ impl<'a> EvaluationContext<'a> {
    }

    pub fn into_summary(self) -> Result<EvaluationSummary> {
-        self.summary.ok_or_else(|| anyhow!("evaluation summary missing"))
+        self.summary
+            .ok_or_else(|| anyhow!("evaluation summary missing"))
    }
 }

@@ -10,7 +10,7 @@ use crate::eval::{
    CaseSummary, RetrievedSummary,
 };
 use retrieval_pipeline::{
-    pipeline::{self, StageTimings, RetrievalConfig},
+    pipeline::{self, RetrievalConfig, StageTimings},
    reranking::RerankerPool,
 };
 use tokio::sync::Semaphore;
@@ -169,10 +169,10 @@ pub(crate) async fn run_queries(
                let query_start = Instant::now();

                debug!(question_id = %question_id, "Evaluating query");
-                let query_embedding =
-                    embedding_provider.embed(&question).await.with_context(|| {
-                        format!("generating embedding for question {question_id}")
-                    })?;
+                let query_embedding = embedding_provider
+                    .embed(&question)
+                    .await
+                    .with_context(|| format!("generating embedding for question {question_id}"))?;
                let reranker = match rerank_pool.as_ref() {
                    Some(pool) => pool.checkout().await,
                    None => None,
@@ -204,8 +204,10 @@ pub(crate) async fn run_queries(
                let mut match_rank = None;
                let answers_lower: Vec<String> =
                    answers.iter().map(|ans| ans.to_ascii_lowercase()).collect();
-                let expected_chunk_ids_set: HashSet<&str> =
-                    expected_chunk_ids.iter().map(std::string::String::as_str).collect();
+                let expected_chunk_ids_set: HashSet<&str> = expected_chunk_ids
+                    .iter()
+                    .map(std::string::String::as_str)
+                    .collect();
                let chunk_id_required = has_verified_chunks;
                let mut entity_hit = false;
                let mut chunk_text_hit = false;
@@ -304,15 +306,12 @@ pub(crate) async fn run_queries(
                    None
                };

-                Ok::<
-                    (
-                        usize,
-                        CaseSummary,
-                        Option<CaseDiagnostics>,
-                        StageTimings,
-                    ),
-                    anyhow::Error,
-                >((idx, summary, diagnostics, stage_timings))
+                Ok::<(usize, CaseSummary, Option<CaseDiagnostics>, StageTimings), anyhow::Error>((
+                    idx,
+                    summary,
+                    diagnostics,
+                    stage_timings,
+                ))
            }
        })
        .buffer_unordered(concurrency)
@@ -13,7 +13,11 @@ use super::super::{
 };
 use super::{map_guard_error, StageResult};

-#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects, clippy::cast_precision_loss)]
+#[allow(
+    clippy::too_many_lines,
+    clippy::arithmetic_side_effects,
+    clippy::cast_precision_loss
+)]
 pub(crate) async fn summarize(
    machine: EvaluationMachine<(), QueriesFinished>,
    ctx: &mut EvaluationContext<'_>,
@@ -403,11 +403,20 @@ pub fn write_reports(
    })
 }

-#[allow(clippy::too_many_lines, clippy::write_with_newline, clippy::unwrap_used)]
+#[allow(
+    clippy::too_many_lines,
+    clippy::write_with_newline,
+    clippy::unwrap_used
+)]
 fn render_markdown(report: &EvaluationReport) -> String {
    let mut md = String::new();

-    write!(md, "# Retrieval Evaluation (k={})\\n\\n", report.retrieval.k).unwrap();
+    write!(
+        md,
+        "# Retrieval Evaluation (k={})\\n\\n",
+        report.retrieval.k
+    )
+    .unwrap();

    md.push_str("## Overview\\n\\n");
    md.push_str("| Metric | Value |\\n| --- | --- |\\n");
@@ -424,34 +433,94 @@ fn render_markdown(report: &EvaluationReport) -> String {
    )
    .unwrap();
    write!(md, "| Total Cases | {} |\\n", report.overview.total_cases).unwrap();
-    write!(md, "| Filtered Questions | {} |\\n", report.overview.filtered_questions).unwrap();
+    write!(
+        md,
+        "| Filtered Questions | {} |\\n",
+        report.overview.filtered_questions
+    )
+    .unwrap();

    md.push_str("\\n## Dataset & Slice\\n\\n");
    md.push_str("| Metric | Value |\\n| --- | --- |\\n");
-    write!(md, "| Dataset | {} (`{}`) |\\n", report.dataset.label, report.dataset.id).unwrap();
+    write!(
+        md,
+        "| Dataset | {} (`{}`) |\\n",
+        report.dataset.label, report.dataset.id
+    )
+    .unwrap();
    write!(md, "| Dataset Source | {} |\\n", report.dataset.source).unwrap();
-    write!(md, "| Includes Unanswerable | {} |\\n", bool_badge(report.dataset.includes_unanswerable)).unwrap();
-    write!(md, "| Require Verified Chunks | {} |\\n", bool_badge(report.dataset.require_verified_chunks)).unwrap();
+    write!(
+        md,
+        "| Includes Unanswerable | {} |\\n",
+        bool_badge(report.dataset.includes_unanswerable)
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Require Verified Chunks | {} |\\n",
+        bool_badge(report.dataset.require_verified_chunks)
+    )
+    .unwrap();
    let embedding_label = if let Some(model) = report.dataset.embedding_model.as_ref() {
        format!("{} ({model})", report.dataset.embedding_backend)
    } else {
        report.dataset.embedding_backend.clone()
    };
    write!(md, "| Embedding | {embedding_label} |\\n").unwrap();
-    write!(md, "| Embedding Dim | {} |\\n", report.dataset.embedding_dimension).unwrap();
+    write!(
+        md,
+        "| Embedding Dim | {} |\\n",
+        report.dataset.embedding_dimension
+    )
+    .unwrap();
    write!(md, "| Slice ID | `{}` |\\n", report.slice.id).unwrap();
    write!(md, "| Slice Seed | {} |\\n", report.slice.seed).unwrap();
-    write!(md, "| Slice Window (offset/length) | {}/{} |\\n", report.slice.window_offset, report.slice.window_length).unwrap();
-    write!(md, "| Slice Questions (window/ledger) | {}/{} |\\n", report.slice.slice_cases, report.slice.ledger_total_cases).unwrap();
-    write!(md, "| Slice Positives / Negatives | {}/{} |\\n", report.slice.positives, report.slice.negatives).unwrap();
-    write!(md, "| Slice Paragraphs | {} |\\n", report.slice.total_paragraphs).unwrap();
-    write!(md, "| Negative Multiplier | {:.2} |\\n", report.slice.negative_multiplier).unwrap();
+    write!(
+        md,
+        "| Slice Window (offset/length) | {}/{} |\\n",
+        report.slice.window_offset, report.slice.window_length
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Slice Questions (window/ledger) | {}/{} |\\n",
+        report.slice.slice_cases, report.slice.ledger_total_cases
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Slice Positives / Negatives | {}/{} |\\n",
+        report.slice.positives, report.slice.negatives
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Slice Paragraphs | {} |\\n",
+        report.slice.total_paragraphs
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Negative Multiplier | {:.2} |\\n",
+        report.slice.negative_multiplier
+    )
+    .unwrap();

    md.push_str("\\n## Retrieval Metrics\\n\\n");
    md.push_str("| Metric | Value |\\n| --- | --- |\\n");
    write!(md, "| Cases | {} |\\n", report.retrieval.cases).unwrap();
-    write!(md, "| Correct@{} | {}/{} |\\n", report.retrieval.k, report.retrieval.correct, report.retrieval.cases).unwrap();
-    write!(md, "| Precision@{} | {:.3} |\\n", report.retrieval.k, report.retrieval.precision).unwrap();
+    write!(
+        md,
+        "| Correct@{} | {}/{} |\\n",
+        report.retrieval.k, report.retrieval.correct, report.retrieval.cases
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Precision@{} | {:.3} |\\n",
+        report.retrieval.k, report.retrieval.precision
+    )
+    .unwrap();
    write!(
        md,
        "| Precision@1/2/3 | {:.3} / {:.3} / {:.3} |\\n",
@@ -462,7 +531,12 @@ fn render_markdown(report: &EvaluationReport) -> String {
    .unwrap();
    write!(md, "| MRR | {:.3} |\\n", report.retrieval.mrr).unwrap();
    write!(md, "| NDCG | {:.3} |\\n", report.retrieval.average_ndcg).unwrap();
-    write!(md, "| Latency Avg / P50 / P95 (ms) | {:.1} / {} / {} |\\n", report.retrieval.latency.avg, report.retrieval.latency.p50, report.retrieval.latency.p95).unwrap();
+    write!(
+        md,
+        "| Latency Avg / P50 / P95 (ms) | {:.1} / {} / {} |\\n",
+        report.retrieval.latency.avg, report.retrieval.latency.p50, report.retrieval.latency.p95
+    )
+    .unwrap();
    write!(
        md,
        "| Resolve entities | {} |\\n",
@@ -473,8 +547,14 @@ fn render_markdown(report: &EvaluationReport) -> String {
    if report.retrieval.rerank_enabled {
        let pool = report
            .retrieval
-            .rerank_pool_size.map_or_else(|| "?".into(), |size| size.to_string());
-        write!(md, "| Rerank | enabled (pool {pool}, keep top {}) |\\n", report.retrieval.rerank_keep_top).unwrap();
+            .rerank_pool_size
+            .map_or_else(|| "?".into(), |size| size.to_string());
+        write!(
+            md,
+            "| Rerank | enabled (pool {pool}, keep top {}) |\\n",
+            report.retrieval.rerank_keep_top
+        )
+        .unwrap();
    } else {
        md.push_str("| Rerank | disabled |\\n");
    }
@@ -489,8 +569,18 @@ fn render_markdown(report: &EvaluationReport) -> String {

    md.push_str("\\n## Performance\\n\\n");
    md.push_str("| Metric | Value |\\n| --- | --- |\\n");
-    write!(md, "| OpenAI Base URL | {} |\\n", report.performance.openai_base_url).unwrap();
-    write!(md, "| Ingestion Duration | {} ms |\\n", report.performance.ingestion_ms).unwrap();
+    write!(
+        md,
+        "| OpenAI Base URL | {} |\\n",
+        report.performance.openai_base_url
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Ingestion Duration | {} ms |\\n",
+        report.performance.ingestion_ms
+    )
+    .unwrap();
    if let Some(seed) = report.performance.namespace_seed_ms {
        write!(md, "| Namespace Seed | {seed} ms |\\n").unwrap();
    }
@@ -504,14 +594,44 @@ fn render_markdown(report: &EvaluationReport) -> String {
        }
    )
    .unwrap();
-    write!(md, "| Corpus Paragraphs | {} |\\n", report.performance.corpus_paragraphs).unwrap();
+    write!(
+        md,
+        "| Corpus Paragraphs | {} |\\n",
+        report.performance.corpus_paragraphs
+    )
+    .unwrap();
    if report.detailed_report {
-        write!(md, "| Ingestion Cache | `{}` |\\n", report.performance.ingestion_cache_path).unwrap();
-        write!(md, "| Ingestion Reused | {} |\\n", bool_badge(report.performance.ingestion_reused)).unwrap();
-        write!(md, "| Embeddings Reused | {} |\\n", bool_badge(report.performance.embeddings_reused)).unwrap();
+        write!(
+            md,
+            "| Ingestion Cache | `{}` |\\n",
+            report.performance.ingestion_cache_path
+        )
+        .unwrap();
+        write!(
+            md,
+            "| Ingestion Reused | {} |\\n",
+            bool_badge(report.performance.ingestion_reused)
+        )
+        .unwrap();
+        write!(
+            md,
+            "| Embeddings Reused | {} |\\n",
+            bool_badge(report.performance.embeddings_reused)
+        )
+        .unwrap();
    }
-    write!(md, "| Positives Cached | {} |\\n", report.performance.positive_paragraphs_reused).unwrap();
-    write!(md, "| Negatives Cached | {} |\\n", report.performance.negative_paragraphs_reused).unwrap();
+    write!(
+        md,
+        "| Positives Cached | {} |\\n",
+        report.performance.positive_paragraphs_reused
+    )
+    .unwrap();
+    write!(
+        md,
+        "| Negatives Cached | {} |\\n",
+        report.performance.negative_paragraphs_reused
+    )
+    .unwrap();

    md.push_str("\\n## Retrieval Stage Timings\\n\\n");
    md.push_str("| Stage | Avg (ms) | P50 (ms) | P95 (ms) |\\n| --- | --- | --- | --- |\\n");
@@ -583,7 +703,8 @@ fn render_markdown(report: &EvaluationReport) -> String {
            for case in &report.llm_cases {
                let retrieved = render_retrieved(&case.retrieved);
                let rank = case
-                    .match_rank.map_or_else(|| "-".into(), |rank| rank.to_string());
+                    .match_rank
+                    .map_or_else(|| "-".into(), |rank| rank.to_string());
                write!(
                    md,
                    "| `{}` | {} | {} | {} |\\n",
@@ -99,10 +99,13 @@ fn sanitize_identifier(input: &str) -> String {
        let mut hasher = Sha256::new();
        hasher.update(input.as_bytes());
        let digest = hasher.finalize();
-        digest.iter().take(6).fold(String::with_capacity(12), |mut s, b| {
-            let _ = write!(s, "{b:02x}");
-            s
-        })
+        digest
+            .iter()
+            .take(6)
+            .fold(String::with_capacity(12), |mut s, b| {
+                let _ = write!(s, "{b:02x}");
+                s
+            })
    } else {
        trimmed
    }
@@ -127,7 +130,9 @@ pub struct SliceWindow<'a> {

 impl SliceWindow<'_> {
    pub fn positive_ids(&self) -> impl Iterator<Item = &str> {
-        self.positive_paragraph_ids.iter().map(std::string::String::as_str)
+        self.positive_paragraph_ids
+            .iter()
+            .map(std::string::String::as_str)
    }
 }

@@ -169,7 +174,10 @@ impl DatasetIndex {
            .paragraph_by_id
            .get(id)
            .ok_or_else(|| anyhow!("slice references unknown paragraph '{id}'"))?;
-        dataset.paragraphs.get(*idx).ok_or_else(|| anyhow!("paragraph index out of bounds"))
+        dataset
+            .paragraphs
+            .get(*idx)
+            .ok_or_else(|| anyhow!("paragraph index out of bounds"))
    }

    fn question<'a>(
@@ -181,7 +189,9 @@ impl DatasetIndex {
            .question_by_id
            .get(question_id)
            .ok_or_else(|| anyhow!("slice references unknown question '{question_id}'"))?;
-        let paragraph = dataset.paragraphs.get(*p_idx)
+        let paragraph = dataset
+            .paragraphs
+            .get(*p_idx)
            .ok_or_else(|| anyhow!("paragraph index out of bounds for question '{question_id}'"))?;
        let question = paragraph
            .questions
@@ -318,9 +328,7 @@ pub fn resolve_slice<'a>(
        .is_some_and(|manifest| manifest.version != SLICE_VERSION)
    {
        warn!(
-            slice = manifest
-                .as_ref()
-                .map_or("unknown", |m| m.slice_id.as_str()),
+            slice = manifest.as_ref().map_or("unknown", |m| m.slice_id.as_str()),
            found = manifest.as_ref().map_or(0, |m| m.version),
            expected = SLICE_VERSION,
            "Slice manifest version mismatch; regenerating"
@@ -919,7 +927,11 @@ fn ensure_shard_paths(manifest: &mut SliceManifest) -> bool {
    changed
 }

-#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_sign_loss)]
+#[allow(
+    clippy::cast_precision_loss,
+    clippy::cast_possible_truncation,
+    clippy::cast_sign_loss
+)]
 fn desired_negative_target(
    positive_count: usize,
    requested_corpus: usize,
@@ -1007,10 +1019,13 @@ fn compute_slice_id(key: &SliceKey<'_>) -> Result<String> {
    let mut hasher = Sha256::new();
    hasher.update(payload);
    let digest = hasher.finalize();
-    Ok(digest.iter().take(16).fold(String::with_capacity(32), |mut s, b| {
-        let _ = write!(s, "{b:02x}");
-        s
-    }))
+    Ok(digest
+        .iter()
+        .take(16)
+        .fold(String::with_capacity(32), |mut s, b| {
+            let _ = write!(s, "{b:02x}");
+            s
+        }))
 }

 #[allow(clippy::indexing_slicing)]
@@ -1050,10 +1065,7 @@ impl<'a> From<&'a Config> for SliceConfig<'a> {
    }
 }

-pub fn slice_config_with_limit(
-    config: &Config,
-    limit_override: Option<usize>,
-) -> SliceConfig<'_> {
+pub fn slice_config_with_limit(config: &Config, limit_override: Option<usize>) -> SliceConfig<'_> {
    SliceConfig {
        cache_dir: config.cache_dir.as_path(),
        force_convert: config.force_convert,
@@ -409,7 +409,10 @@ pub fn build_case_diagnostics(
    candidates: &[EvaluationCandidate],
    pipeline_stats: Option<Diagnostics>,
 ) -> CaseDiagnostics {
-    let expected_set: HashSet<&str> = expected_chunk_ids.iter().map(std::string::String::as_str).collect();
+    let expected_set: HashSet<&str> = expected_chunk_ids
+        .iter()
+        .map(std::string::String::as_str)
+        .collect();
    let mut seen_chunks: HashSet<String> = HashSet::new();
    let mut attached_chunk_ids = Vec::new();
    let mut entity_diagnostics = Vec::new();