mirror of
https://github.com/perstarkse/minne.git
synced 2026-06-29 13:26:22 +02:00
chore: ingestion-pipeline refactor, sort technical debt, rustfmt
This commit is contained in:
@@ -33,8 +33,7 @@ use crate::{
|
||||
|
||||
use crate::corpus::{
|
||||
CorpusCacheConfig, CorpusHandle, CorpusManifest, CorpusMetadata, CorpusQuestion,
|
||||
EmbeddedKnowledgeEntity, EmbeddedTextChunk, ParagraphShard, ParagraphShardStore,
|
||||
MANIFEST_VERSION,
|
||||
ParagraphShard, ParagraphShardStore, MANIFEST_VERSION,
|
||||
};
|
||||
|
||||
const INGESTION_SPEC_VERSION: u32 = 2;
|
||||
@@ -273,10 +272,19 @@ pub async fn ensure_corpus(
|
||||
.context("shard record missing after ingestion run")?;
|
||||
if cache.refresh_embeddings_only || shard_record.needs_reembed {
|
||||
// Embeddings are now generated by the pipeline using FastEmbed - no need to re-embed
|
||||
shard_record.shard.ingestion_fingerprint.clone_from(&ingestion_fingerprint);
|
||||
shard_record
|
||||
.shard
|
||||
.ingestion_fingerprint
|
||||
.clone_from(&ingestion_fingerprint);
|
||||
shard_record.shard.ingested_at = Utc::now();
|
||||
shard_record.shard.embedding_backend.clone_from(&embedding_backend_label);
|
||||
shard_record.shard.embedding_model.clone_from(&embedding_model_code);
|
||||
shard_record
|
||||
.shard
|
||||
.embedding_backend
|
||||
.clone_from(&embedding_backend_label);
|
||||
shard_record
|
||||
.shard
|
||||
.embedding_model
|
||||
.clone_from(&embedding_model_code);
|
||||
shard_record.shard.embedding_dimension = embedding_dimension;
|
||||
shard_record.dirty = true;
|
||||
shard_record.needs_reembed = false;
|
||||
@@ -543,31 +551,16 @@ async fn ingest_single_paragraph(
|
||||
let task = IngestionTask::new(payload, user_id.to_string());
|
||||
match pipeline.produce_artifacts(&task).await {
|
||||
Ok(artifacts) => {
|
||||
let entities: Vec<EmbeddedKnowledgeEntity> = artifacts
|
||||
.entities
|
||||
.into_iter()
|
||||
.map(|e| EmbeddedKnowledgeEntity {
|
||||
entity: e.entity,
|
||||
embedding: e.embedding,
|
||||
})
|
||||
.collect();
|
||||
let chunks: Vec<EmbeddedTextChunk> = artifacts
|
||||
.chunks
|
||||
.into_iter()
|
||||
.map(|c| EmbeddedTextChunk {
|
||||
chunk: c.chunk,
|
||||
embedding: c.embedding,
|
||||
})
|
||||
.collect();
|
||||
// No need to reembed - pipeline now uses FastEmbed internally
|
||||
// Artifacts already carry the shared `Embedded*` types and FastEmbed
|
||||
// embeddings, so they can be persisted to the shard without re-mapping.
|
||||
let mut shard = ParagraphShard::new(
|
||||
paragraph,
|
||||
request.shard_path,
|
||||
ingestion_fingerprint,
|
||||
artifacts.text_content,
|
||||
entities,
|
||||
artifacts.entities,
|
||||
artifacts.relationships,
|
||||
chunks,
|
||||
artifacts.chunks,
|
||||
&embedding_backend,
|
||||
embedding_model.clone(),
|
||||
embedding_dimension,
|
||||
|
||||
Reference in New Issue
Block a user