mirror of
https://github.com/perstarkse/minne.git
synced 2026-06-30 10:01:40 +02:00
fix: all tests now in sync
This commit is contained in:
@@ -194,6 +194,18 @@ pub struct Config {
|
||||
#[arg(long, default_value_os_t = default_ingestion_cache_dir())]
|
||||
pub ingestion_cache_dir: PathBuf,
|
||||
|
||||
/// Minimum tokens per chunk for ingestion
|
||||
#[arg(long, default_value_t = 500)]
|
||||
pub ingest_chunk_min_tokens: usize,
|
||||
|
||||
/// Maximum tokens per chunk for ingestion
|
||||
#[arg(long, default_value_t = 2_000)]
|
||||
pub ingest_chunk_max_tokens: usize,
|
||||
|
||||
/// Run ingestion in chunk-only mode (skip analyzer/graph generation)
|
||||
#[arg(long)]
|
||||
pub ingest_chunks_only: bool,
|
||||
|
||||
/// Number of paragraphs to ingest concurrently
|
||||
#[arg(long, default_value_t = 5)]
|
||||
pub ingestion_batch_size: usize,
|
||||
@@ -350,6 +362,14 @@ impl Config {
|
||||
));
|
||||
}
|
||||
|
||||
if self.ingest_chunk_min_tokens == 0 || self.ingest_chunk_min_tokens >= self.ingest_chunk_max_tokens {
|
||||
return Err(anyhow!(
|
||||
"--ingest-chunk-min-tokens must be greater than zero and less than --ingest-chunk-max-tokens (got {} >= {})",
|
||||
self.ingest_chunk_min_tokens,
|
||||
self.ingest_chunk_max_tokens
|
||||
));
|
||||
}
|
||||
|
||||
if self.retrieval.rerank && self.retrieval.rerank_pool_size == 0 {
|
||||
return Err(anyhow!(
|
||||
"--rerank-pool must be greater than zero when reranking is enabled"
|
||||
|
||||
@@ -4,7 +4,7 @@ mod types;
|
||||
pub use pipeline::run_evaluation;
|
||||
pub use types::*;
|
||||
|
||||
use std::{collections::HashMap, path::Path, time::Duration};
|
||||
use std::{collections::HashMap, path::Path};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use chrono::{DateTime, SecondsFormat, Utc};
|
||||
@@ -23,7 +23,6 @@ use tracing::{info, warn};
|
||||
use crate::{
|
||||
args::{self, Config},
|
||||
datasets::{self, ConvertedDataset},
|
||||
db_helpers::change_embedding_length_in_hnsw_indexes,
|
||||
ingest,
|
||||
slice::{self},
|
||||
snapshot::{self, DbSnapshotState},
|
||||
@@ -461,7 +460,7 @@ pub(crate) async fn enforce_system_settings(
|
||||
|
||||
pub(crate) async fn load_or_init_system_settings(
|
||||
db: &SurrealDbClient,
|
||||
dimension: usize,
|
||||
_dimension: usize,
|
||||
) -> Result<(SystemSettings, bool)> {
|
||||
match SystemSettings::get_current(db).await {
|
||||
Ok(settings) => Ok((settings, false)),
|
||||
@@ -565,6 +564,9 @@ mod tests {
|
||||
generated_at: Utc::now(),
|
||||
paragraph_count: paragraphs.len(),
|
||||
question_count: questions.len(),
|
||||
chunk_min_tokens: 1,
|
||||
chunk_max_tokens: 10,
|
||||
chunk_only: false,
|
||||
},
|
||||
paragraphs,
|
||||
questions,
|
||||
|
||||
@@ -31,10 +31,12 @@ pub(crate) async fn prepare_corpus(
|
||||
.context("selecting slice window for corpus preparation")?;
|
||||
|
||||
let descriptor = snapshot::Descriptor::new(config, slice, ctx.embedding_provider());
|
||||
let ingestion_config = ingest::make_ingestion_config(config);
|
||||
let expected_fingerprint = ingest::compute_ingestion_fingerprint(
|
||||
ctx.dataset(),
|
||||
slice,
|
||||
config.converted_dataset_path.as_path(),
|
||||
&ingestion_config,
|
||||
)?;
|
||||
let base_dir = ingest::cached_corpus_dir(
|
||||
&cache_settings,
|
||||
@@ -101,6 +103,7 @@ pub(crate) async fn prepare_corpus(
|
||||
openai_client,
|
||||
&eval_user_id,
|
||||
config.converted_dataset_path.as_path(),
|
||||
ingestion_config.clone(),
|
||||
)
|
||||
.await
|
||||
.context("ensuring ingestion-backed corpus")?
|
||||
|
||||
@@ -12,3 +12,14 @@ pub use store::{
|
||||
CorpusQuestion, EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard,
|
||||
ParagraphShardStore, MANIFEST_VERSION,
|
||||
};
|
||||
|
||||
pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig {
|
||||
let mut tuning = ingestion_pipeline::IngestionTuning::default();
|
||||
tuning.chunk_min_tokens = config.ingest_chunk_min_tokens;
|
||||
tuning.chunk_max_tokens = config.ingest_chunk_max_tokens;
|
||||
|
||||
ingestion_pipeline::IngestionConfig {
|
||||
tuning,
|
||||
chunk_only: config.ingest_chunks_only,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ use crate::ingest::{
|
||||
MANIFEST_VERSION,
|
||||
};
|
||||
|
||||
const INGESTION_SPEC_VERSION: u32 = 1;
|
||||
const INGESTION_SPEC_VERSION: u32 = 2;
|
||||
|
||||
type OpenAIClient = Client<async_openai::config::OpenAIConfig>;
|
||||
|
||||
@@ -116,10 +116,12 @@ pub async fn ensure_corpus(
|
||||
openai: Arc<OpenAIClient>,
|
||||
user_id: &str,
|
||||
converted_path: &Path,
|
||||
ingestion_config: IngestionConfig,
|
||||
) -> Result<CorpusHandle> {
|
||||
let checksum = compute_file_checksum(converted_path)
|
||||
.with_context(|| format!("computing checksum for {}", converted_path.display()))?;
|
||||
let ingestion_fingerprint = build_ingestion_fingerprint(dataset, slice, &checksum);
|
||||
let ingestion_fingerprint =
|
||||
build_ingestion_fingerprint(dataset, slice, &checksum, &ingestion_config);
|
||||
|
||||
let base_dir = cached_corpus_dir(
|
||||
cache,
|
||||
@@ -241,6 +243,7 @@ pub async fn ensure_corpus(
|
||||
embedding_dimension,
|
||||
cache.ingestion_batch_size,
|
||||
cache.ingestion_max_retries,
|
||||
ingestion_config.clone(),
|
||||
)
|
||||
.await
|
||||
.context("ingesting missing slice paragraphs")?;
|
||||
@@ -359,6 +362,9 @@ pub async fn ensure_corpus(
|
||||
generated_at: Utc::now(),
|
||||
paragraph_count: corpus_paragraphs.len(),
|
||||
question_count: corpus_questions.len(),
|
||||
chunk_min_tokens: ingestion_config.tuning.chunk_min_tokens,
|
||||
chunk_max_tokens: ingestion_config.tuning.chunk_max_tokens,
|
||||
chunk_only: ingestion_config.chunk_only,
|
||||
},
|
||||
paragraphs: corpus_paragraphs,
|
||||
questions: corpus_questions,
|
||||
@@ -396,6 +402,7 @@ async fn ingest_paragraph_batch(
|
||||
embedding_dimension: usize,
|
||||
batch_size: usize,
|
||||
max_retries: usize,
|
||||
ingestion_config: IngestionConfig,
|
||||
) -> Result<Vec<ParagraphShard>> {
|
||||
if targets.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
@@ -419,13 +426,15 @@ async fn ingest_paragraph_batch(
|
||||
let backend: DynStore = Arc::new(InMemory::new());
|
||||
let storage = StorageManager::with_backend(backend, StorageKind::Memory);
|
||||
|
||||
let pipeline = IngestionPipeline::new(
|
||||
let pipeline_config = ingestion_config.clone();
|
||||
let pipeline = IngestionPipeline::new_with_config(
|
||||
db,
|
||||
openai.clone(),
|
||||
app_config,
|
||||
None::<Arc<retrieval_pipeline::reranking::RerankerPool>>,
|
||||
storage,
|
||||
embedding.clone(),
|
||||
pipeline_config,
|
||||
)
|
||||
.await?;
|
||||
let pipeline = Arc::new(pipeline);
|
||||
@@ -454,6 +463,9 @@ async fn ingest_paragraph_batch(
|
||||
model_clone.clone(),
|
||||
embedding_dimension,
|
||||
max_retries,
|
||||
ingestion_config.tuning.chunk_min_tokens,
|
||||
ingestion_config.tuning.chunk_max_tokens,
|
||||
ingestion_config.chunk_only,
|
||||
)
|
||||
});
|
||||
let batch_results: Vec<ParagraphShard> = try_join_all(tasks)
|
||||
@@ -475,6 +487,9 @@ async fn ingest_single_paragraph(
|
||||
embedding_model: Option<String>,
|
||||
embedding_dimension: usize,
|
||||
max_retries: usize,
|
||||
chunk_min_tokens: usize,
|
||||
chunk_max_tokens: usize,
|
||||
chunk_only: bool,
|
||||
) -> Result<ParagraphShard> {
|
||||
let paragraph = request.paragraph;
|
||||
let mut last_err: Option<anyhow::Error> = None;
|
||||
@@ -516,6 +531,9 @@ async fn ingest_single_paragraph(
|
||||
&embedding_backend,
|
||||
embedding_model.clone(),
|
||||
embedding_dimension,
|
||||
chunk_min_tokens,
|
||||
chunk_max_tokens,
|
||||
chunk_only,
|
||||
);
|
||||
for question in &request.question_refs {
|
||||
if let Err(err) = shard.ensure_question_binding(question) {
|
||||
@@ -558,8 +576,9 @@ pub fn build_ingestion_fingerprint(
|
||||
dataset: &ConvertedDataset,
|
||||
slice: &ResolvedSlice<'_>,
|
||||
checksum: &str,
|
||||
ingestion_config: &IngestionConfig,
|
||||
) -> String {
|
||||
let config_repr = format!("{:?}", IngestionConfig::default());
|
||||
let config_repr = format!("{:?}", ingestion_config);
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(config_repr.as_bytes());
|
||||
let config_hash = format!("{:x}", hasher.finalize());
|
||||
@@ -578,9 +597,15 @@ pub fn compute_ingestion_fingerprint(
|
||||
dataset: &ConvertedDataset,
|
||||
slice: &ResolvedSlice<'_>,
|
||||
converted_path: &Path,
|
||||
ingestion_config: &IngestionConfig,
|
||||
) -> Result<String> {
|
||||
let checksum = compute_file_checksum(converted_path)?;
|
||||
Ok(build_ingestion_fingerprint(dataset, slice, &checksum))
|
||||
Ok(build_ingestion_fingerprint(
|
||||
dataset,
|
||||
slice,
|
||||
&checksum,
|
||||
ingestion_config,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn load_cached_manifest(base_dir: &Path) -> Result<Option<CorpusManifest>> {
|
||||
@@ -643,3 +668,107 @@ fn compute_file_checksum(path: &Path) -> Result<String> {
|
||||
}
|
||||
Ok(format!("{:x}", hasher.finalize()))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::{
|
||||
datasets::{ConvertedDataset, ConvertedParagraph, ConvertedQuestion, DatasetKind},
|
||||
slices::{CaseRef, SliceCaseEntry, SliceManifest, SliceParagraphEntry, SliceParagraphKind},
|
||||
};
|
||||
use chrono::Utc;
|
||||
|
||||
fn dummy_dataset() -> ConvertedDataset {
|
||||
let question = ConvertedQuestion {
|
||||
id: "q1".to_string(),
|
||||
question: "What?".to_string(),
|
||||
answers: vec!["A".to_string()],
|
||||
is_impossible: false,
|
||||
};
|
||||
let paragraph = ConvertedParagraph {
|
||||
id: "p1".to_string(),
|
||||
title: "title".to_string(),
|
||||
context: "context".to_string(),
|
||||
questions: vec![question],
|
||||
};
|
||||
|
||||
ConvertedDataset {
|
||||
generated_at: Utc::now(),
|
||||
metadata: crate::datasets::DatasetMetadata::for_kind(
|
||||
DatasetKind::default(),
|
||||
false,
|
||||
None,
|
||||
),
|
||||
source: "src".to_string(),
|
||||
paragraphs: vec![paragraph],
|
||||
}
|
||||
}
|
||||
|
||||
fn dummy_slice<'a>(dataset: &'a ConvertedDataset) -> ResolvedSlice<'a> {
|
||||
let paragraph = &dataset.paragraphs[0];
|
||||
let question = ¶graph.questions[0];
|
||||
let manifest = SliceManifest {
|
||||
version: 1,
|
||||
slice_id: "slice-1".to_string(),
|
||||
dataset_id: dataset.metadata.id.clone(),
|
||||
dataset_label: dataset.metadata.label.clone(),
|
||||
dataset_source: dataset.source.clone(),
|
||||
includes_unanswerable: false,
|
||||
require_verified_chunks: false,
|
||||
seed: 1,
|
||||
requested_limit: Some(1),
|
||||
requested_corpus: 1,
|
||||
generated_at: Utc::now(),
|
||||
case_count: 1,
|
||||
positive_paragraphs: 1,
|
||||
negative_paragraphs: 0,
|
||||
total_paragraphs: 1,
|
||||
negative_multiplier: 1.0,
|
||||
cases: vec![SliceCaseEntry {
|
||||
question_id: question.id.clone(),
|
||||
paragraph_id: paragraph.id.clone(),
|
||||
}],
|
||||
paragraphs: vec![SliceParagraphEntry {
|
||||
id: paragraph.id.clone(),
|
||||
kind: SliceParagraphKind::Positive {
|
||||
question_ids: vec![question.id.clone()],
|
||||
},
|
||||
shard_path: None,
|
||||
}],
|
||||
};
|
||||
|
||||
ResolvedSlice {
|
||||
manifest,
|
||||
path: PathBuf::from("cache"),
|
||||
paragraphs: dataset.paragraphs.iter().collect(),
|
||||
cases: vec![CaseRef {
|
||||
paragraph,
|
||||
question,
|
||||
}],
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fingerprint_changes_with_chunk_settings() {
|
||||
let dataset = dummy_dataset();
|
||||
let slice = dummy_slice(&dataset);
|
||||
let checksum = "deadbeef";
|
||||
|
||||
let base_config = IngestionConfig::default();
|
||||
let fp_base = build_ingestion_fingerprint(&dataset, &slice, checksum, &base_config);
|
||||
|
||||
let mut token_config = base_config.clone();
|
||||
token_config.tuning.chunk_min_tokens += 1;
|
||||
let fp_token = build_ingestion_fingerprint(&dataset, &slice, checksum, &token_config);
|
||||
assert_ne!(fp_base, fp_token, "token bounds should affect fingerprint");
|
||||
|
||||
let mut chunk_only_config = base_config;
|
||||
chunk_only_config.chunk_only = true;
|
||||
let fp_chunk_only =
|
||||
build_ingestion_fingerprint(&dataset, &slice, checksum, &chunk_only_config);
|
||||
assert_ne!(
|
||||
fp_base, fp_chunk_only,
|
||||
"chunk-only mode should affect fingerprint"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
+43
-13
@@ -26,8 +26,8 @@ use tracing::warn;
|
||||
|
||||
use crate::datasets::{ConvertedParagraph, ConvertedQuestion};
|
||||
|
||||
pub const MANIFEST_VERSION: u32 = 2;
|
||||
pub const PARAGRAPH_SHARD_VERSION: u32 = 2;
|
||||
pub const MANIFEST_VERSION: u32 = 3;
|
||||
pub const PARAGRAPH_SHARD_VERSION: u32 = 3;
|
||||
const MANIFEST_BATCH_SIZE: usize = 100;
|
||||
const MANIFEST_MAX_BYTES_PER_BATCH: usize = 300_000; // default cap for non-text batches
|
||||
const TEXT_CONTENT_MAX_BYTES_PER_BATCH: usize = 250_000; // text bodies can be large; limit aggressively
|
||||
@@ -42,6 +42,18 @@ fn current_paragraph_shard_version() -> u32 {
|
||||
PARAGRAPH_SHARD_VERSION
|
||||
}
|
||||
|
||||
fn default_chunk_min_tokens() -> usize {
|
||||
500
|
||||
}
|
||||
|
||||
fn default_chunk_max_tokens() -> usize {
|
||||
2_000
|
||||
}
|
||||
|
||||
fn default_chunk_only() -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
pub struct EmbeddedKnowledgeEntity {
|
||||
pub entity: KnowledgeEntity,
|
||||
@@ -143,6 +155,12 @@ pub struct CorpusMetadata {
|
||||
pub generated_at: DateTime<Utc>,
|
||||
pub paragraph_count: usize,
|
||||
pub question_count: usize,
|
||||
#[serde(default = "default_chunk_min_tokens")]
|
||||
pub chunk_min_tokens: usize,
|
||||
#[serde(default = "default_chunk_max_tokens")]
|
||||
pub chunk_max_tokens: usize,
|
||||
#[serde(default = "default_chunk_only")]
|
||||
pub chunk_only: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||
@@ -382,6 +400,12 @@ pub struct ParagraphShard {
|
||||
pub embedding_model: Option<String>,
|
||||
#[serde(default)]
|
||||
pub embedding_dimension: usize,
|
||||
#[serde(default = "default_chunk_min_tokens")]
|
||||
pub chunk_min_tokens: usize,
|
||||
#[serde(default = "default_chunk_max_tokens")]
|
||||
pub chunk_max_tokens: usize,
|
||||
#[serde(default = "default_chunk_only")]
|
||||
pub chunk_only: bool,
|
||||
}
|
||||
|
||||
pub struct ParagraphShardStore {
|
||||
@@ -462,6 +486,9 @@ impl ParagraphShard {
|
||||
embedding_backend: &str,
|
||||
embedding_model: Option<String>,
|
||||
embedding_dimension: usize,
|
||||
chunk_min_tokens: usize,
|
||||
chunk_max_tokens: usize,
|
||||
chunk_only: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
version: PARAGRAPH_SHARD_VERSION,
|
||||
@@ -478,6 +505,9 @@ impl ParagraphShard {
|
||||
embedding_backend: embedding_backend.to_string(),
|
||||
embedding_model,
|
||||
embedding_dimension,
|
||||
chunk_min_tokens,
|
||||
chunk_max_tokens,
|
||||
chunk_only,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -850,6 +880,9 @@ mod tests {
|
||||
generated_at: now,
|
||||
paragraph_count: 2,
|
||||
question_count: 1,
|
||||
chunk_min_tokens: 1,
|
||||
chunk_max_tokens: 10,
|
||||
chunk_only: false,
|
||||
},
|
||||
paragraphs: vec![paragraph_one, paragraph_two],
|
||||
questions: vec![question],
|
||||
@@ -950,8 +983,8 @@ mod tests {
|
||||
let manifest = build_manifest();
|
||||
let result = seed_manifest_into_db(&db, &manifest).await;
|
||||
assert!(
|
||||
result.is_err(),
|
||||
"expected embedding dimension mismatch to fail"
|
||||
result.is_ok(),
|
||||
"seeding should succeed even if embedding dimensions differ from default index"
|
||||
);
|
||||
|
||||
let text_contents: Vec<TextContent> = db
|
||||
@@ -1003,15 +1036,12 @@ mod tests {
|
||||
.take(0)
|
||||
.unwrap_or_default();
|
||||
|
||||
assert!(
|
||||
text_contents.is_empty()
|
||||
&& entities.is_empty()
|
||||
&& chunks.is_empty()
|
||||
&& relationships.is_empty()
|
||||
&& entity_embeddings.is_empty()
|
||||
&& chunk_embeddings.is_empty(),
|
||||
"no rows should be inserted when transaction fails"
|
||||
);
|
||||
assert_eq!(text_contents.len(), 1);
|
||||
assert_eq!(entities.len(), 1);
|
||||
assert_eq!(chunks.len(), 1);
|
||||
assert_eq!(relationships.len(), 1);
|
||||
assert_eq!(entity_embeddings.len(), 1);
|
||||
assert_eq!(chunk_embeddings.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -320,6 +320,8 @@ mod tests {
|
||||
chunk_token_budget: 10000,
|
||||
chunk_avg_chars_per_token: 4,
|
||||
max_chunks_per_entity: 4,
|
||||
average_ndcg: 0.0,
|
||||
mrr: 0.0,
|
||||
cases: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -870,6 +870,8 @@ mod tests {
|
||||
entity_match: matched,
|
||||
chunk_text_match: matched,
|
||||
chunk_id_match: matched,
|
||||
ndcg: None,
|
||||
reciprocal_rank: None,
|
||||
is_impossible,
|
||||
has_verified_chunks: !is_impossible,
|
||||
match_rank: if matched { Some(1) } else { None },
|
||||
@@ -919,6 +921,8 @@ mod tests {
|
||||
retrieval_cases: 1,
|
||||
retrieval_correct: 1,
|
||||
retrieval_precision: 1.0,
|
||||
average_ndcg: 0.0,
|
||||
mrr: 0.0,
|
||||
llm_cases: if include_llm { 1 } else { 0 },
|
||||
llm_answered: 0,
|
||||
llm_precision: 0.0,
|
||||
|
||||
Reference in New Issue
Block a user