fix: all tests now in sync

2026-06-30 10:01:40 +02:00 · 2025-11-29 18:59:08 +01:00
parent abf0223bd2
commit 38cb2e5e24
19 changed files with 439 additions and 50 deletions
@@ -194,6 +194,18 @@ pub struct Config {
    #[arg(long, default_value_os_t = default_ingestion_cache_dir())]
    pub ingestion_cache_dir: PathBuf,

+    /// Minimum tokens per chunk for ingestion
+    #[arg(long, default_value_t = 500)]
+    pub ingest_chunk_min_tokens: usize,
+
+    /// Maximum tokens per chunk for ingestion
+    #[arg(long, default_value_t = 2_000)]
+    pub ingest_chunk_max_tokens: usize,
+
+    /// Run ingestion in chunk-only mode (skip analyzer/graph generation)
+    #[arg(long)]
+    pub ingest_chunks_only: bool,
+
    /// Number of paragraphs to ingest concurrently
    #[arg(long, default_value_t = 5)]
    pub ingestion_batch_size: usize,
@@ -350,6 +362,14 @@ impl Config {
            ));
        }

+        if self.ingest_chunk_min_tokens == 0 || self.ingest_chunk_min_tokens >= self.ingest_chunk_max_tokens {
+            return Err(anyhow!(
+                "--ingest-chunk-min-tokens must be greater than zero and less than --ingest-chunk-max-tokens (got {} >= {})",
+                self.ingest_chunk_min_tokens,
+                self.ingest_chunk_max_tokens
+            ));
+        }
+
        if self.retrieval.rerank && self.retrieval.rerank_pool_size == 0 {
            return Err(anyhow!(
                "--rerank-pool must be greater than zero when reranking is enabled"
@@ -4,7 +4,7 @@ mod types;
 pub use pipeline::run_evaluation;
 pub use types::*;

-use std::{collections::HashMap, path::Path, time::Duration};
+use std::{collections::HashMap, path::Path};

 use anyhow::{anyhow, Context, Result};
 use chrono::{DateTime, SecondsFormat, Utc};
@@ -23,7 +23,6 @@ use tracing::{info, warn};
 use crate::{
    args::{self, Config},
    datasets::{self, ConvertedDataset},
-    db_helpers::change_embedding_length_in_hnsw_indexes,
    ingest,
    slice::{self},
    snapshot::{self, DbSnapshotState},
@@ -461,7 +460,7 @@ pub(crate) async fn enforce_system_settings(

 pub(crate) async fn load_or_init_system_settings(
    db: &SurrealDbClient,
-    dimension: usize,
+    _dimension: usize,
 ) -> Result<(SystemSettings, bool)> {
    match SystemSettings::get_current(db).await {
        Ok(settings) => Ok((settings, false)),
@@ -565,6 +564,9 @@ mod tests {
                generated_at: Utc::now(),
                paragraph_count: paragraphs.len(),
                question_count: questions.len(),
+                chunk_min_tokens: 1,
+                chunk_max_tokens: 10,
+                chunk_only: false,
            },
            paragraphs,
            questions,
@@ -31,10 +31,12 @@ pub(crate) async fn prepare_corpus(
        .context("selecting slice window for corpus preparation")?;

    let descriptor = snapshot::Descriptor::new(config, slice, ctx.embedding_provider());
+    let ingestion_config = ingest::make_ingestion_config(config);
    let expected_fingerprint = ingest::compute_ingestion_fingerprint(
        ctx.dataset(),
        slice,
        config.converted_dataset_path.as_path(),
+        &ingestion_config,
    )?;
    let base_dir = ingest::cached_corpus_dir(
        &cache_settings,
@@ -101,6 +103,7 @@ pub(crate) async fn prepare_corpus(
            openai_client,
            &eval_user_id,
            config.converted_dataset_path.as_path(),
+            ingestion_config.clone(),
        )
        .await
        .context("ensuring ingestion-backed corpus")?
@@ -12,3 +12,14 @@ pub use store::{
    CorpusQuestion, EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard,
    ParagraphShardStore, MANIFEST_VERSION,
 };
+
+pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig {
+    let mut tuning = ingestion_pipeline::IngestionTuning::default();
+    tuning.chunk_min_tokens = config.ingest_chunk_min_tokens;
+    tuning.chunk_max_tokens = config.ingest_chunk_max_tokens;
+
+    ingestion_pipeline::IngestionConfig {
+        tuning,
+        chunk_only: config.ingest_chunks_only,
+    }
+}
@@ -36,7 +36,7 @@ use crate::ingest::{
    MANIFEST_VERSION,
 };

-const INGESTION_SPEC_VERSION: u32 = 1;
+const INGESTION_SPEC_VERSION: u32 = 2;

 type OpenAIClient = Client<async_openai::config::OpenAIConfig>;

@@ -116,10 +116,12 @@ pub async fn ensure_corpus(
    openai: Arc<OpenAIClient>,
    user_id: &str,
    converted_path: &Path,
+    ingestion_config: IngestionConfig,
 ) -> Result<CorpusHandle> {
    let checksum = compute_file_checksum(converted_path)
        .with_context(|| format!("computing checksum for {}", converted_path.display()))?;
-    let ingestion_fingerprint = build_ingestion_fingerprint(dataset, slice, &checksum);
+    let ingestion_fingerprint =
+        build_ingestion_fingerprint(dataset, slice, &checksum, &ingestion_config);

    let base_dir = cached_corpus_dir(
        cache,
@@ -241,6 +243,7 @@ pub async fn ensure_corpus(
            embedding_dimension,
            cache.ingestion_batch_size,
            cache.ingestion_max_retries,
+            ingestion_config.clone(),
        )
        .await
        .context("ingesting missing slice paragraphs")?;
@@ -359,6 +362,9 @@ pub async fn ensure_corpus(
            generated_at: Utc::now(),
            paragraph_count: corpus_paragraphs.len(),
            question_count: corpus_questions.len(),
+            chunk_min_tokens: ingestion_config.tuning.chunk_min_tokens,
+            chunk_max_tokens: ingestion_config.tuning.chunk_max_tokens,
+            chunk_only: ingestion_config.chunk_only,
        },
        paragraphs: corpus_paragraphs,
        questions: corpus_questions,
@@ -396,6 +402,7 @@ async fn ingest_paragraph_batch(
    embedding_dimension: usize,
    batch_size: usize,
    max_retries: usize,
+    ingestion_config: IngestionConfig,
 ) -> Result<Vec<ParagraphShard>> {
    if targets.is_empty() {
        return Ok(Vec::new());
@@ -419,13 +426,15 @@ async fn ingest_paragraph_batch(
    let backend: DynStore = Arc::new(InMemory::new());
    let storage = StorageManager::with_backend(backend, StorageKind::Memory);

-    let pipeline = IngestionPipeline::new(
+    let pipeline_config = ingestion_config.clone();
+    let pipeline = IngestionPipeline::new_with_config(
        db,
        openai.clone(),
        app_config,
        None::<Arc<retrieval_pipeline::reranking::RerankerPool>>,
        storage,
        embedding.clone(),
+        pipeline_config,
    )
    .await?;
    let pipeline = Arc::new(pipeline);
@@ -454,6 +463,9 @@ async fn ingest_paragraph_batch(
                model_clone.clone(),
                embedding_dimension,
                max_retries,
+                ingestion_config.tuning.chunk_min_tokens,
+                ingestion_config.tuning.chunk_max_tokens,
+                ingestion_config.chunk_only,
            )
        });
        let batch_results: Vec<ParagraphShard> = try_join_all(tasks)
@@ -475,6 +487,9 @@ async fn ingest_single_paragraph(
    embedding_model: Option<String>,
    embedding_dimension: usize,
    max_retries: usize,
+    chunk_min_tokens: usize,
+    chunk_max_tokens: usize,
+    chunk_only: bool,
 ) -> Result<ParagraphShard> {
    let paragraph = request.paragraph;
    let mut last_err: Option<anyhow::Error> = None;
@@ -516,6 +531,9 @@ async fn ingest_single_paragraph(
                    &embedding_backend,
                    embedding_model.clone(),
                    embedding_dimension,
+                    chunk_min_tokens,
+                    chunk_max_tokens,
+                    chunk_only,
                );
                for question in &request.question_refs {
                    if let Err(err) = shard.ensure_question_binding(question) {
@@ -558,8 +576,9 @@ pub fn build_ingestion_fingerprint(
    dataset: &ConvertedDataset,
    slice: &ResolvedSlice<'_>,
    checksum: &str,
+    ingestion_config: &IngestionConfig,
 ) -> String {
-    let config_repr = format!("{:?}", IngestionConfig::default());
+    let config_repr = format!("{:?}", ingestion_config);
    let mut hasher = Sha256::new();
    hasher.update(config_repr.as_bytes());
    let config_hash = format!("{:x}", hasher.finalize());
@@ -578,9 +597,15 @@ pub fn compute_ingestion_fingerprint(
    dataset: &ConvertedDataset,
    slice: &ResolvedSlice<'_>,
    converted_path: &Path,
+    ingestion_config: &IngestionConfig,
 ) -> Result<String> {
    let checksum = compute_file_checksum(converted_path)?;
-    Ok(build_ingestion_fingerprint(dataset, slice, &checksum))
+    Ok(build_ingestion_fingerprint(
+        dataset,
+        slice,
+        &checksum,
+        ingestion_config,
+    ))
 }

 pub fn load_cached_manifest(base_dir: &Path) -> Result<Option<CorpusManifest>> {
@@ -643,3 +668,107 @@ fn compute_file_checksum(path: &Path) -> Result<String> {
    }
    Ok(format!("{:x}", hasher.finalize()))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{
+        datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind},
+        slices::{CaseRef, SliceCaseEntry, SliceManifest, SliceParagraphEntry, SliceParagraphKind},
+    };
+    use chrono::Utc;
+
+    fn dummy_dataset() -> ConvertedDataset {
+        let question = ConvertedQuestion {
+            id: "q1".to_string(),
+            question: "What?".to_string(),
+            answers: vec!["A".to_string()],
+            is_impossible: false,
+        };
+        let paragraph = ConvertedParagraph {
+            id: "p1".to_string(),
+            title: "title".to_string(),
+            context: "context".to_string(),
+            questions: vec![question],
+        };
+
+        ConvertedDataset {
+            generated_at: Utc::now(),
+            metadata: crate::datasets::DatasetMetadata::for_kind(
+                DatasetKind::default(),
+                false,
+                None,
+            ),
+            source: "src".to_string(),
+            paragraphs: vec![paragraph],
+        }
+    }
+
+    fn dummy_slice<'a>(dataset: &'a ConvertedDataset) -> ResolvedSlice<'a> {
+        let paragraph = &dataset.paragraphs[0];
+        let question = &paragraph.questions[0];
+        let manifest = SliceManifest {
+            version: 1,
+            slice_id: "slice-1".to_string(),
+            dataset_id: dataset.metadata.id.clone(),
+            dataset_label: dataset.metadata.label.clone(),
+            dataset_source: dataset.source.clone(),
+            includes_unanswerable: false,
+            require_verified_chunks: false,
+            seed: 1,
+            requested_limit: Some(1),
+            requested_corpus: 1,
+            generated_at: Utc::now(),
+            case_count: 1,
+            positive_paragraphs: 1,
+            negative_paragraphs: 0,
+            total_paragraphs: 1,
+            negative_multiplier: 1.0,
+            cases: vec![SliceCaseEntry {
+                question_id: question.id.clone(),
+                paragraph_id: paragraph.id.clone(),
+            }],
+            paragraphs: vec![SliceParagraphEntry {
+                id: paragraph.id.clone(),
+                kind: SliceParagraphKind::Positive {
+                    question_ids: vec![question.id.clone()],
+                },
+                shard_path: None,
+            }],
+        };
+
+        ResolvedSlice {
+            manifest,
+            path: PathBuf::from("cache"),
+            paragraphs: dataset.paragraphs.iter().collect(),
+            cases: vec![CaseRef {
+                paragraph,
+                question,
+            }],
+        }
+    }
+
+    #[test]
+    fn fingerprint_changes_with_chunk_settings() {
+        let dataset = dummy_dataset();
+        let slice = dummy_slice(&dataset);
+        let checksum = "deadbeef";
+
+        let base_config = IngestionConfig::default();
+        let fp_base = build_ingestion_fingerprint(&dataset, &slice, checksum, &base_config);
+
+        let mut token_config = base_config.clone();
+        token_config.tuning.chunk_min_tokens += 1;
+        let fp_token = build_ingestion_fingerprint(&dataset, &slice, checksum, &token_config);
+        assert_ne!(fp_base, fp_token, "token bounds should affect fingerprint");
+
+        let mut chunk_only_config = base_config;
+        chunk_only_config.chunk_only = true;
+        let fp_chunk_only =
+            build_ingestion_fingerprint(&dataset, &slice, checksum, &chunk_only_config);
+        assert_ne!(
+            fp_base, fp_chunk_only,
+            "chunk-only mode should affect fingerprint"
+        );
+    }
+}
@@ -26,8 +26,8 @@ use tracing::warn;

 use crate::datasets::{ConvertedParagraph, ConvertedQuestion};

-pub const MANIFEST_VERSION: u32 = 2;
-pub const PARAGRAPH_SHARD_VERSION: u32 = 2;
+pub const MANIFEST_VERSION: u32 = 3;
+pub const PARAGRAPH_SHARD_VERSION: u32 = 3;
 const MANIFEST_BATCH_SIZE: usize = 100;
 const MANIFEST_MAX_BYTES_PER_BATCH: usize = 300_000; // default cap for non-text batches
 const TEXT_CONTENT_MAX_BYTES_PER_BATCH: usize = 250_000; // text bodies can be large; limit aggressively
@@ -42,6 +42,18 @@ fn current_paragraph_shard_version() -> u32 {
    PARAGRAPH_SHARD_VERSION
 }

+fn default_chunk_min_tokens() -> usize {
+    500
+}
+
+fn default_chunk_max_tokens() -> usize {
+    2_000
+}
+
+fn default_chunk_only() -> bool {
+    false
+}
+
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct EmbeddedKnowledgeEntity {
    pub entity: KnowledgeEntity,
@@ -143,6 +155,12 @@ pub struct CorpusMetadata {
    pub generated_at: DateTime<Utc>,
    pub paragraph_count: usize,
    pub question_count: usize,
+    #[serde(default = "default_chunk_min_tokens")]
+    pub chunk_min_tokens: usize,
+    #[serde(default = "default_chunk_max_tokens")]
+    pub chunk_max_tokens: usize,
+    #[serde(default = "default_chunk_only")]
+    pub chunk_only: bool,
 }

 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
@@ -382,6 +400,12 @@ pub struct ParagraphShard {
    pub embedding_model: Option<String>,
    #[serde(default)]
    pub embedding_dimension: usize,
+    #[serde(default = "default_chunk_min_tokens")]
+    pub chunk_min_tokens: usize,
+    #[serde(default = "default_chunk_max_tokens")]
+    pub chunk_max_tokens: usize,
+    #[serde(default = "default_chunk_only")]
+    pub chunk_only: bool,
 }

 pub struct ParagraphShardStore {
@@ -462,6 +486,9 @@ impl ParagraphShard {
        embedding_backend: &str,
        embedding_model: Option<String>,
        embedding_dimension: usize,
+        chunk_min_tokens: usize,
+        chunk_max_tokens: usize,
+        chunk_only: bool,
    ) -> Self {
        Self {
            version: PARAGRAPH_SHARD_VERSION,
@@ -478,6 +505,9 @@ impl ParagraphShard {
            embedding_backend: embedding_backend.to_string(),
            embedding_model,
            embedding_dimension,
+            chunk_min_tokens,
+            chunk_max_tokens,
+            chunk_only,
        }
    }

@@ -850,6 +880,9 @@ mod tests {
                generated_at: now,
                paragraph_count: 2,
                question_count: 1,
+                chunk_min_tokens: 1,
+                chunk_max_tokens: 10,
+                chunk_only: false,
            },
            paragraphs: vec![paragraph_one, paragraph_two],
            questions: vec![question],
@@ -950,8 +983,8 @@ mod tests {
        let manifest = build_manifest();
        let result = seed_manifest_into_db(&db, &manifest).await;
        assert!(
-            result.is_err(),
-            "expected embedding dimension mismatch to fail"
+            result.is_ok(),
+            "seeding should succeed even if embedding dimensions differ from default index"
        );

        let text_contents: Vec<TextContent> = db
@@ -1003,15 +1036,12 @@ mod tests {
            .take(0)
            .unwrap_or_default();

-        assert!(
-            text_contents.is_empty()
-                && entities.is_empty()
-                && chunks.is_empty()
-                && relationships.is_empty()
-                && entity_embeddings.is_empty()
-                && chunk_embeddings.is_empty(),
-            "no rows should be inserted when transaction fails"
-        );
+        assert_eq!(text_contents.len(), 1);
+        assert_eq!(entities.len(), 1);
+        assert_eq!(chunks.len(), 1);
+        assert_eq!(relationships.len(), 1);
+        assert_eq!(entity_embeddings.len(), 1);
+        assert_eq!(chunk_embeddings.len(), 1);
    }

    #[test]
@@ -320,6 +320,8 @@ mod tests {
            chunk_token_budget: 10000,
            chunk_avg_chars_per_token: 4,
            max_chunks_per_entity: 4,
+            average_ndcg: 0.0,
+            mrr: 0.0,
            cases: Vec::new(),
        }
    }
@@ -870,6 +870,8 @@ mod tests {
            entity_match: matched,
            chunk_text_match: matched,
            chunk_id_match: matched,
+            ndcg: None,
+            reciprocal_rank: None,
            is_impossible,
            has_verified_chunks: !is_impossible,
            match_rank: if matched { Some(1) } else { None },
@@ -919,6 +921,8 @@ mod tests {
            retrieval_cases: 1,
            retrieval_correct: 1,
            retrieval_precision: 1.0,
+            average_ndcg: 0.0,
+            mrr: 0.0,
            llm_cases: if include_llm { 1 } else { 0 },
            llm_answered: 0,
            llm_precision: 0.0,