chore: refactor ingestion pipeline, sort out technical debt, apply rustfmt

This commit is contained in:
Per Stark
2026-05-31 19:37:34 +02:00
parent 5c2d2e24d3
commit 3897345ab3
47 changed files with 1729 additions and 1343 deletions
+17 -24
View File
@@ -33,8 +33,7 @@ use crate::{
use crate::corpus::{
CorpusCacheConfig, CorpusHandle, CorpusManifest, CorpusMetadata, CorpusQuestion,
EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard, ParagraphShardStore,
MANIFEST_VERSION,
ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
};
const INGESTION_SPEC_VERSION: u32 = 2;
@@ -273,10 +272,19 @@ pub async fn ensure_corpus(
.context("shard record missing after ingestion run")?;
if cache.refresh_embeddings_only || shard_record.needs_reembed {
// Embeddings are now generated by the pipeline using FastEmbed - no need to re-embed
shard_record.shard.ingestion_fingerprint.clone_from(&ingestion_fingerprint);
shard_record
.shard
.ingestion_fingerprint
.clone_from(&ingestion_fingerprint);
shard_record.shard.ingested_at = Utc::now();
shard_record.shard.embedding_backend.clone_from(&embedding_backend_label);
shard_record.shard.embedding_model.clone_from(&embedding_model_code);
shard_record
.shard
.embedding_backend
.clone_from(&embedding_backend_label);
shard_record
.shard
.embedding_model
.clone_from(&embedding_model_code);
shard_record.shard.embedding_dimension = embedding_dimension;
shard_record.dirty = true;
shard_record.needs_reembed = false;
@@ -543,31 +551,16 @@ async fn ingest_single_paragraph(
let task = IngestionTask::new(payload, user_id.to_string());
match pipeline.produce_artifacts(&task).await {
Ok(artifacts) => {
let entities: Vec<EmbeddedKnowledgeEntity> = artifacts
.entities
.into_iter()
.map(|e| EmbeddedKnowledgeEntity {
entity: e.entity,
embedding: e.embedding,
})
.collect();
let chunks: Vec<EmbeddedTextChunk> = artifacts
.chunks
.into_iter()
.map(|c| EmbeddedTextChunk {
chunk: c.chunk,
embedding: c.embedding,
})
.collect();
// No need to reembed - pipeline now uses FastEmbed internally
// Artifacts already carry the shared `Embedded*` types and FastEmbed
// embeddings, so they can be persisted to the shard without re-mapping.
let mut shard = ParagraphShard::new(
paragraph,
request.shard_path,
ingestion_fingerprint,
artifacts.text_content,
entities,
artifacts.entities,
artifacts.relationships,
chunks,
artifacts.chunks,
&embedding_backend,
embedding_model.clone(),
embedding_dimension,