chore: ingestion-pipeline refactor, sort technical debt, rustfmt

This commit is contained in:
Per Stark
2026-05-31 19:37:34 +02:00
parent 5c2d2e24d3
commit 3897345ab3
47 changed files with 1729 additions and 1343 deletions
+1 -2
View File
@@ -9,8 +9,7 @@ pub use orchestrator::{
};
pub use store::{
seed_manifest_into_db, window_manifest, CorpusHandle, CorpusManifest, CorpusMetadata,
CorpusQuestion, EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard,
ParagraphShardStore, MANIFEST_VERSION,
CorpusQuestion, ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
};
pub fn make_ingestion_config(config: &crate::args::Config) -> ingestion_pipeline::IngestionConfig {
+17 -24
View File
@@ -33,8 +33,7 @@ use crate::{
use crate::corpus::{
CorpusCacheConfig, CorpusHandle, CorpusManifest, CorpusMetadata, CorpusQuestion,
EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard, ParagraphShardStore,
MANIFEST_VERSION,
ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
};
const INGESTION_SPEC_VERSION: u32 = 2;
@@ -273,10 +272,19 @@ pub async fn ensure_corpus(
.context("shard record missing after ingestion run")?;
if cache.refresh_embeddings_only || shard_record.needs_reembed {
// Embeddings are now generated by the pipeline using FastEmbed - no need to re-embed
shard_record.shard.ingestion_fingerprint.clone_from(&ingestion_fingerprint);
shard_record
.shard
.ingestion_fingerprint
.clone_from(&ingestion_fingerprint);
shard_record.shard.ingested_at = Utc::now();
shard_record.shard.embedding_backend.clone_from(&embedding_backend_label);
shard_record.shard.embedding_model.clone_from(&embedding_model_code);
shard_record
.shard
.embedding_backend
.clone_from(&embedding_backend_label);
shard_record
.shard
.embedding_model
.clone_from(&embedding_model_code);
shard_record.shard.embedding_dimension = embedding_dimension;
shard_record.dirty = true;
shard_record.needs_reembed = false;
@@ -543,31 +551,16 @@ async fn ingest_single_paragraph(
let task = IngestionTask::new(payload, user_id.to_string());
match pipeline.produce_artifacts(&task).await {
Ok(artifacts) => {
let entities: Vec<EmbeddedKnowledgeEntity> = artifacts
.entities
.into_iter()
.map(|e| EmbeddedKnowledgeEntity {
entity: e.entity,
embedding: e.embedding,
})
.collect();
let chunks: Vec<EmbeddedTextChunk> = artifacts
.chunks
.into_iter()
.map(|c| EmbeddedTextChunk {
chunk: c.chunk,
embedding: c.embedding,
})
.collect();
// No need to reembed - pipeline now uses FastEmbed internally
// Artifacts already carry the shared `Embedded*` types and FastEmbed
// embeddings, so they can be persisted to the shard without re-mapping.
let mut shard = ParagraphShard::new(
paragraph,
request.shard_path,
ingestion_fingerprint,
artifacts.text_content,
entities,
artifacts.entities,
artifacts.relationships,
chunks,
artifacts.chunks,
&embedding_backend,
embedding_model.clone(),
embedding_dimension,
+3 -11
View File
@@ -54,17 +54,9 @@ fn default_chunk_only() -> bool {
false
}
/// A `KnowledgeEntity` stored together with an embedding vector.
///
/// Derives serde `Serialize`/`Deserialize` (alongside `Debug` and `Clone`), so
/// values can be written out and read back through serde.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EmbeddedKnowledgeEntity {
/// The wrapped entity record.
pub entity: KnowledgeEntity,
/// Embedding values as `f32`s. NOTE(review): presumably the vector computed
/// for `entity`; its length is not constrained here — confirm with the producer.
pub embedding: Vec<f32>,
}
/// A `TextChunk` stored together with an embedding vector.
///
/// Derives serde `Serialize`/`Deserialize` (alongside `Debug` and `Clone`), so
/// values can be written out and read back through serde.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EmbeddedTextChunk {
/// The wrapped chunk record.
pub chunk: TextChunk,
/// Embedding values as `f32`s. NOTE(review): presumably the vector computed
/// for `chunk`; its length is not constrained here — confirm with the producer.
pub embedding: Vec<f32>,
}
// Reuse the pipeline's canonical embedded-artifact types so the on-disk corpus
// format and the ingestion output never drift apart.
pub use ingestion_pipeline::{EmbeddedKnowledgeEntity, EmbeddedTextChunk};
#[derive(Debug, Clone, serde::Deserialize)]
struct LegacyKnowledgeEntity {
+5 -1
View File
@@ -11,7 +11,11 @@ use tracing::warn;
use super::{ConvertedParagraph, ConvertedQuestion};
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects, clippy::cast_sign_loss)]
#[allow(
clippy::too_many_lines,
clippy::arithmetic_side_effects,
clippy::cast_sign_loss
)]
pub fn convert_nq(
raw_path: &Path,
include_unanswerable: bool,
+1 -2
View File
@@ -166,8 +166,7 @@ async fn async_main() -> anyhow::Result<()> {
);
if parsed.config.slice_grow.is_some() {
eval::grow_slice(&dataset, &parsed.config)
.context("growing slice ledger")?;
eval::grow_slice(&dataset, &parsed.config).context("growing slice ledger")?;
return Ok(());
}
+20 -7
View File
@@ -14,7 +14,7 @@ use common::{
utils::embedding::EmbeddingProvider,
};
use retrieval_pipeline::{
pipeline::{StageTimings, RetrievalConfig},
pipeline::{RetrievalConfig, StageTimings},
reranking::RerankerPool,
};
@@ -122,11 +122,15 @@ impl<'a> EvaluationContext<'a> {
}
pub fn slice(&self) -> Result<&slice::ResolvedSlice<'a>> {
self.slice.as_ref().ok_or_else(|| anyhow!("slice has not been prepared"))
self.slice
.as_ref()
.ok_or_else(|| anyhow!("slice has not been prepared"))
}
pub fn db(&self) -> Result<&SurrealDbClient> {
self.db.as_ref().ok_or_else(|| anyhow!("database connection missing"))
self.db
.as_ref()
.ok_or_else(|| anyhow!("database connection missing"))
}
pub fn descriptor(&self) -> Result<&snapshot::Descriptor> {
@@ -142,15 +146,23 @@ impl<'a> EvaluationContext<'a> {
}
pub fn openai_client(&self) -> Result<Arc<Client<async_openai::config::OpenAIConfig>>> {
Ok(Arc::clone(self.openai_client.as_ref().ok_or_else(|| anyhow!("openai client missing"))?))
Ok(Arc::clone(
self.openai_client
.as_ref()
.ok_or_else(|| anyhow!("openai client missing"))?,
))
}
pub fn corpus_handle(&self) -> Result<&corpus::CorpusHandle> {
self.corpus_handle.as_ref().ok_or_else(|| anyhow!("corpus handle missing"))
self.corpus_handle
.as_ref()
.ok_or_else(|| anyhow!("corpus handle missing"))
}
pub fn evaluation_user(&self) -> Result<&User> {
self.eval_user.as_ref().ok_or_else(|| anyhow!("evaluation user missing"))
self.eval_user
.as_ref()
.ok_or_else(|| anyhow!("evaluation user missing"))
}
#[allow(clippy::arithmetic_side_effects)]
@@ -168,7 +180,8 @@ impl<'a> EvaluationContext<'a> {
}
pub fn into_summary(self) -> Result<EvaluationSummary> {
self.summary.ok_or_else(|| anyhow!("evaluation summary missing"))
self.summary
.ok_or_else(|| anyhow!("evaluation summary missing"))
}
}
+15 -16
View File
@@ -10,7 +10,7 @@ use crate::eval::{
CaseSummary, RetrievedSummary,
};
use retrieval_pipeline::{
pipeline::{self, StageTimings, RetrievalConfig},
pipeline::{self, RetrievalConfig, StageTimings},
reranking::RerankerPool,
};
use tokio::sync::Semaphore;
@@ -169,10 +169,10 @@ pub(crate) async fn run_queries(
let query_start = Instant::now();
debug!(question_id = %question_id, "Evaluating query");
let query_embedding =
embedding_provider.embed(&question).await.with_context(|| {
format!("generating embedding for question {question_id}")
})?;
let query_embedding = embedding_provider
.embed(&question)
.await
.with_context(|| format!("generating embedding for question {question_id}"))?;
let reranker = match rerank_pool.as_ref() {
Some(pool) => pool.checkout().await,
None => None,
@@ -204,8 +204,10 @@ pub(crate) async fn run_queries(
let mut match_rank = None;
let answers_lower: Vec<String> =
answers.iter().map(|ans| ans.to_ascii_lowercase()).collect();
let expected_chunk_ids_set: HashSet<&str> =
expected_chunk_ids.iter().map(std::string::String::as_str).collect();
let expected_chunk_ids_set: HashSet<&str> = expected_chunk_ids
.iter()
.map(std::string::String::as_str)
.collect();
let chunk_id_required = has_verified_chunks;
let mut entity_hit = false;
let mut chunk_text_hit = false;
@@ -304,15 +306,12 @@ pub(crate) async fn run_queries(
None
};
Ok::<
(
usize,
CaseSummary,
Option<CaseDiagnostics>,
StageTimings,
),
anyhow::Error,
>((idx, summary, diagnostics, stage_timings))
Ok::<(usize, CaseSummary, Option<CaseDiagnostics>, StageTimings), anyhow::Error>((
idx,
summary,
diagnostics,
stage_timings,
))
}
})
.buffer_unordered(concurrency)
+5 -1
View File
@@ -13,7 +13,11 @@ use super::super::{
};
use super::{map_guard_error, StageResult};
#[allow(clippy::too_many_lines, clippy::arithmetic_side_effects, clippy::cast_precision_loss)]
#[allow(
clippy::too_many_lines,
clippy::arithmetic_side_effects,
clippy::cast_precision_loss
)]
pub(crate) async fn summarize(
machine: EvaluationMachine<(), QueriesFinished>,
ctx: &mut EvaluationContext<'_>,
+147 -26
View File
@@ -403,11 +403,20 @@ pub fn write_reports(
})
}
#[allow(clippy::too_many_lines, clippy::write_with_newline, clippy::unwrap_used)]
#[allow(
clippy::too_many_lines,
clippy::write_with_newline,
clippy::unwrap_used
)]
fn render_markdown(report: &EvaluationReport) -> String {
let mut md = String::new();
write!(md, "# Retrieval Evaluation (k={})\\n\\n", report.retrieval.k).unwrap();
write!(
md,
"# Retrieval Evaluation (k={})\\n\\n",
report.retrieval.k
)
.unwrap();
md.push_str("## Overview\\n\\n");
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
@@ -424,34 +433,94 @@ fn render_markdown(report: &EvaluationReport) -> String {
)
.unwrap();
write!(md, "| Total Cases | {} |\\n", report.overview.total_cases).unwrap();
write!(md, "| Filtered Questions | {} |\\n", report.overview.filtered_questions).unwrap();
write!(
md,
"| Filtered Questions | {} |\\n",
report.overview.filtered_questions
)
.unwrap();
md.push_str("\\n## Dataset & Slice\\n\\n");
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
write!(md, "| Dataset | {} (`{}`) |\\n", report.dataset.label, report.dataset.id).unwrap();
write!(
md,
"| Dataset | {} (`{}`) |\\n",
report.dataset.label, report.dataset.id
)
.unwrap();
write!(md, "| Dataset Source | {} |\\n", report.dataset.source).unwrap();
write!(md, "| Includes Unanswerable | {} |\\n", bool_badge(report.dataset.includes_unanswerable)).unwrap();
write!(md, "| Require Verified Chunks | {} |\\n", bool_badge(report.dataset.require_verified_chunks)).unwrap();
write!(
md,
"| Includes Unanswerable | {} |\\n",
bool_badge(report.dataset.includes_unanswerable)
)
.unwrap();
write!(
md,
"| Require Verified Chunks | {} |\\n",
bool_badge(report.dataset.require_verified_chunks)
)
.unwrap();
let embedding_label = if let Some(model) = report.dataset.embedding_model.as_ref() {
format!("{} ({model})", report.dataset.embedding_backend)
} else {
report.dataset.embedding_backend.clone()
};
write!(md, "| Embedding | {embedding_label} |\\n").unwrap();
write!(md, "| Embedding Dim | {} |\\n", report.dataset.embedding_dimension).unwrap();
write!(
md,
"| Embedding Dim | {} |\\n",
report.dataset.embedding_dimension
)
.unwrap();
write!(md, "| Slice ID | `{}` |\\n", report.slice.id).unwrap();
write!(md, "| Slice Seed | {} |\\n", report.slice.seed).unwrap();
write!(md, "| Slice Window (offset/length) | {}/{} |\\n", report.slice.window_offset, report.slice.window_length).unwrap();
write!(md, "| Slice Questions (window/ledger) | {}/{} |\\n", report.slice.slice_cases, report.slice.ledger_total_cases).unwrap();
write!(md, "| Slice Positives / Negatives | {}/{} |\\n", report.slice.positives, report.slice.negatives).unwrap();
write!(md, "| Slice Paragraphs | {} |\\n", report.slice.total_paragraphs).unwrap();
write!(md, "| Negative Multiplier | {:.2} |\\n", report.slice.negative_multiplier).unwrap();
write!(
md,
"| Slice Window (offset/length) | {}/{} |\\n",
report.slice.window_offset, report.slice.window_length
)
.unwrap();
write!(
md,
"| Slice Questions (window/ledger) | {}/{} |\\n",
report.slice.slice_cases, report.slice.ledger_total_cases
)
.unwrap();
write!(
md,
"| Slice Positives / Negatives | {}/{} |\\n",
report.slice.positives, report.slice.negatives
)
.unwrap();
write!(
md,
"| Slice Paragraphs | {} |\\n",
report.slice.total_paragraphs
)
.unwrap();
write!(
md,
"| Negative Multiplier | {:.2} |\\n",
report.slice.negative_multiplier
)
.unwrap();
md.push_str("\\n## Retrieval Metrics\\n\\n");
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
write!(md, "| Cases | {} |\\n", report.retrieval.cases).unwrap();
write!(md, "| Correct@{} | {}/{} |\\n", report.retrieval.k, report.retrieval.correct, report.retrieval.cases).unwrap();
write!(md, "| Precision@{} | {:.3} |\\n", report.retrieval.k, report.retrieval.precision).unwrap();
write!(
md,
"| Correct@{} | {}/{} |\\n",
report.retrieval.k, report.retrieval.correct, report.retrieval.cases
)
.unwrap();
write!(
md,
"| Precision@{} | {:.3} |\\n",
report.retrieval.k, report.retrieval.precision
)
.unwrap();
write!(
md,
"| Precision@1/2/3 | {:.3} / {:.3} / {:.3} |\\n",
@@ -462,7 +531,12 @@ fn render_markdown(report: &EvaluationReport) -> String {
.unwrap();
write!(md, "| MRR | {:.3} |\\n", report.retrieval.mrr).unwrap();
write!(md, "| NDCG | {:.3} |\\n", report.retrieval.average_ndcg).unwrap();
write!(md, "| Latency Avg / P50 / P95 (ms) | {:.1} / {} / {} |\\n", report.retrieval.latency.avg, report.retrieval.latency.p50, report.retrieval.latency.p95).unwrap();
write!(
md,
"| Latency Avg / P50 / P95 (ms) | {:.1} / {} / {} |\\n",
report.retrieval.latency.avg, report.retrieval.latency.p50, report.retrieval.latency.p95
)
.unwrap();
write!(
md,
"| Resolve entities | {} |\\n",
@@ -473,8 +547,14 @@ fn render_markdown(report: &EvaluationReport) -> String {
if report.retrieval.rerank_enabled {
let pool = report
.retrieval
.rerank_pool_size.map_or_else(|| "?".into(), |size| size.to_string());
write!(md, "| Rerank | enabled (pool {pool}, keep top {}) |\\n", report.retrieval.rerank_keep_top).unwrap();
.rerank_pool_size
.map_or_else(|| "?".into(), |size| size.to_string());
write!(
md,
"| Rerank | enabled (pool {pool}, keep top {}) |\\n",
report.retrieval.rerank_keep_top
)
.unwrap();
} else {
md.push_str("| Rerank | disabled |\\n");
}
@@ -489,8 +569,18 @@ fn render_markdown(report: &EvaluationReport) -> String {
md.push_str("\\n## Performance\\n\\n");
md.push_str("| Metric | Value |\\n| --- | --- |\\n");
write!(md, "| OpenAI Base URL | {} |\\n", report.performance.openai_base_url).unwrap();
write!(md, "| Ingestion Duration | {} ms |\\n", report.performance.ingestion_ms).unwrap();
write!(
md,
"| OpenAI Base URL | {} |\\n",
report.performance.openai_base_url
)
.unwrap();
write!(
md,
"| Ingestion Duration | {} ms |\\n",
report.performance.ingestion_ms
)
.unwrap();
if let Some(seed) = report.performance.namespace_seed_ms {
write!(md, "| Namespace Seed | {seed} ms |\\n").unwrap();
}
@@ -504,14 +594,44 @@ fn render_markdown(report: &EvaluationReport) -> String {
}
)
.unwrap();
write!(md, "| Corpus Paragraphs | {} |\\n", report.performance.corpus_paragraphs).unwrap();
write!(
md,
"| Corpus Paragraphs | {} |\\n",
report.performance.corpus_paragraphs
)
.unwrap();
if report.detailed_report {
write!(md, "| Ingestion Cache | `{}` |\\n", report.performance.ingestion_cache_path).unwrap();
write!(md, "| Ingestion Reused | {} |\\n", bool_badge(report.performance.ingestion_reused)).unwrap();
write!(md, "| Embeddings Reused | {} |\\n", bool_badge(report.performance.embeddings_reused)).unwrap();
write!(
md,
"| Ingestion Cache | `{}` |\\n",
report.performance.ingestion_cache_path
)
.unwrap();
write!(
md,
"| Ingestion Reused | {} |\\n",
bool_badge(report.performance.ingestion_reused)
)
.unwrap();
write!(
md,
"| Embeddings Reused | {} |\\n",
bool_badge(report.performance.embeddings_reused)
)
.unwrap();
}
write!(md, "| Positives Cached | {} |\\n", report.performance.positive_paragraphs_reused).unwrap();
write!(md, "| Negatives Cached | {} |\\n", report.performance.negative_paragraphs_reused).unwrap();
write!(
md,
"| Positives Cached | {} |\\n",
report.performance.positive_paragraphs_reused
)
.unwrap();
write!(
md,
"| Negatives Cached | {} |\\n",
report.performance.negative_paragraphs_reused
)
.unwrap();
md.push_str("\\n## Retrieval Stage Timings\\n\\n");
md.push_str("| Stage | Avg (ms) | P50 (ms) | P95 (ms) |\\n| --- | --- | --- | --- |\\n");
@@ -583,7 +703,8 @@ fn render_markdown(report: &EvaluationReport) -> String {
for case in &report.llm_cases {
let retrieved = render_retrieved(&case.retrieved);
let rank = case
.match_rank.map_or_else(|| "-".into(), |rank| rank.to_string());
.match_rank
.map_or_else(|| "-".into(), |rank| rank.to_string());
write!(
md,
"| `{}` | {} | {} | {} |\\n",
+31 -19
View File
@@ -99,10 +99,13 @@ fn sanitize_identifier(input: &str) -> String {
let mut hasher = Sha256::new();
hasher.update(input.as_bytes());
let digest = hasher.finalize();
digest.iter().take(6).fold(String::with_capacity(12), |mut s, b| {
let _ = write!(s, "{b:02x}");
s
})
digest
.iter()
.take(6)
.fold(String::with_capacity(12), |mut s, b| {
let _ = write!(s, "{b:02x}");
s
})
} else {
trimmed
}
@@ -127,7 +130,9 @@ pub struct SliceWindow<'a> {
impl SliceWindow<'_> {
pub fn positive_ids(&self) -> impl Iterator<Item = &str> {
self.positive_paragraph_ids.iter().map(std::string::String::as_str)
self.positive_paragraph_ids
.iter()
.map(std::string::String::as_str)
}
}
@@ -169,7 +174,10 @@ impl DatasetIndex {
.paragraph_by_id
.get(id)
.ok_or_else(|| anyhow!("slice references unknown paragraph '{id}'"))?;
dataset.paragraphs.get(*idx).ok_or_else(|| anyhow!("paragraph index out of bounds"))
dataset
.paragraphs
.get(*idx)
.ok_or_else(|| anyhow!("paragraph index out of bounds"))
}
fn question<'a>(
@@ -181,7 +189,9 @@ impl DatasetIndex {
.question_by_id
.get(question_id)
.ok_or_else(|| anyhow!("slice references unknown question '{question_id}'"))?;
let paragraph = dataset.paragraphs.get(*p_idx)
let paragraph = dataset
.paragraphs
.get(*p_idx)
.ok_or_else(|| anyhow!("paragraph index out of bounds for question '{question_id}'"))?;
let question = paragraph
.questions
@@ -318,9 +328,7 @@ pub fn resolve_slice<'a>(
.is_some_and(|manifest| manifest.version != SLICE_VERSION)
{
warn!(
slice = manifest
.as_ref()
.map_or("unknown", |m| m.slice_id.as_str()),
slice = manifest.as_ref().map_or("unknown", |m| m.slice_id.as_str()),
found = manifest.as_ref().map_or(0, |m| m.version),
expected = SLICE_VERSION,
"Slice manifest version mismatch; regenerating"
@@ -919,7 +927,11 @@ fn ensure_shard_paths(manifest: &mut SliceManifest) -> bool {
changed
}
#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_sign_loss)]
#[allow(
clippy::cast_precision_loss,
clippy::cast_possible_truncation,
clippy::cast_sign_loss
)]
fn desired_negative_target(
positive_count: usize,
requested_corpus: usize,
@@ -1007,10 +1019,13 @@ fn compute_slice_id(key: &SliceKey<'_>) -> Result<String> {
let mut hasher = Sha256::new();
hasher.update(payload);
let digest = hasher.finalize();
Ok(digest.iter().take(16).fold(String::with_capacity(32), |mut s, b| {
let _ = write!(s, "{b:02x}");
s
}))
Ok(digest
.iter()
.take(16)
.fold(String::with_capacity(32), |mut s, b| {
let _ = write!(s, "{b:02x}");
s
}))
}
#[allow(clippy::indexing_slicing)]
@@ -1050,10 +1065,7 @@ impl<'a> From<&'a Config> for SliceConfig<'a> {
}
}
pub fn slice_config_with_limit(
config: &Config,
limit_override: Option<usize>,
) -> SliceConfig<'_> {
pub fn slice_config_with_limit(config: &Config, limit_override: Option<usize>) -> SliceConfig<'_> {
SliceConfig {
cache_dir: config.cache_dir.as_path(),
force_convert: config.force_convert,
+4 -1
View File
@@ -409,7 +409,10 @@ pub fn build_case_diagnostics(
candidates: &[EvaluationCandidate],
pipeline_stats: Option<Diagnostics>,
) -> CaseDiagnostics {
let expected_set: HashSet<&str> = expected_chunk_ids.iter().map(std::string::String::as_str).collect();
let expected_set: HashSet<&str> = expected_chunk_ids
.iter()
.map(std::string::String::as_str)
.collect();
let mut seen_chunks: HashSet<String> = HashSet::new();
let mut attached_chunk_ids = Vec::new();
let mut entity_diagnostics = Vec::new();