fix: atomic ingestion persist with task reclaim and shared cleanup.

One transaction per task replaces prior artifact rows; workers skip the pipeline when content already exists, eval seeding reuses persist_artifacts, and deletes clear graph children via shared SQL.
2026-06-24 02:46:29 +02:00 · 2026-06-12 16:27:07 +02:00
parent cf69cb7b05
commit 1013035731
15 changed files with 794 additions and 487 deletions
@@ -1,7 +1,11 @@
 # Changelog
 ## Unreleased
 - Performance: ingestion skips per-task index rebuild; worker runs scheduled `REBUILD INDEX` (default every 24h via `index_rebuild_interval_secs`, `0` disables)
+- Performance: ingestion persists all artifacts in a single SurrealDB transaction per task (atomic replace by task id)
+- Fix: ingestion reclaims tasks after a successful persist without re-running the pipeline when `mark_succeeded` failed
+- Fix: content deletion clears graph relationships via shared `TextContent::clear_ingested_children`
 - Fix: regression re suggestion of relationships
+- Internal: eval corpus DB seed uses `persist_artifacts` instead of a separate batched insert path

 ## 1.0.3 (2026-06-12)
 - Search: filter results by type — knowledge entities, ingested content, or both
@@ -150,7 +150,7 @@ impl SurrealDbClient {

    /// Upsert an object in SurrealDB, replacing any existing record with the same ID.
    ///
-    /// Useful for idempotent ingestion flows.
+    /// Useful when a single record should be replaced by id (admin updates, embedding rows, etc.).
    ///
    /// # Errors
    ///
@@ -245,8 +245,9 @@ impl KnowledgeEntity {
        Ok(())
    }

-    /// Atomically store a knowledge entity and its embedding.
-    /// Writes the entity to `knowledge_entity` and the embedding to `knowledge_entity_embedding`.
+    /// Atomically store one knowledge entity and its embedding (single-record path).
+    ///
+    /// Bulk ingestion uses `ingestion_pipeline::persist_artifacts` instead.
    pub async fn store_with_embedding(
        entity: KnowledgeEntity,
        embedding: Vec<f32>,
@@ -61,8 +61,9 @@ impl TextChunk {
        Ok(())
    }

-    /// Atomically store a text chunk and its embedding.
-    /// Writes the chunk to `text_chunk` and the embedding to `text_chunk_embedding`.
+    /// Atomically store one text chunk and its embedding (single-record path).
+    ///
+    /// Bulk ingestion uses `ingestion_pipeline::persist_artifacts` instead.
    pub async fn store_with_embedding(
        chunk: TextChunk,
        embedding: Vec<f32>,
@@ -96,6 +96,41 @@ impl TextContent {
        }
    }

+    /// SurrealQL deletes for ingested child rows keyed by `source_id` (no transaction wrapper).
+    ///
+    /// Used inside larger transactions (e.g. ingestion `persist_artifacts`) and mirrored by
+    /// [`Self::clear_ingested_children`].
+    pub const CLEAR_INGESTED_CHILD_ROWS_SURQL: &'static str = r"
+DELETE relates_to WHERE metadata.source_id = $source_id AND metadata.user_id = $user_id;
+DELETE text_chunk_embedding WHERE source_id = $source_id;
+DELETE text_chunk WHERE source_id = $source_id;
+DELETE knowledge_entity_embedding WHERE source_id = $source_id;
+DELETE knowledge_entity WHERE source_id = $source_id;
+";
+
+    /// Removes chunks, embeddings, entities, and relationships for one ingested document snapshot.
+    pub async fn clear_ingested_children(
+        source_id: &str,
+        user_id: &str,
+        db: &SurrealDbClient,
+    ) -> Result<(), AppError> {
+        let query = format!(
+            "BEGIN TRANSACTION;\n{} COMMIT TRANSACTION;",
+            Self::CLEAR_INGESTED_CHILD_ROWS_SURQL
+        );
+
+        db.client
+            .query(query)
+            .bind(("source_id", source_id.to_string()))
+            .bind(("user_id", user_id.to_string()))
+            .await
+            .map_err(AppError::from)?
+            .check()
+            .map_err(AppError::from)?;
+
+        Ok(())
+    }
+
    pub async fn patch(
        id: &str,
        context: &str,
@@ -364,7 +399,14 @@ mod tests {
    use anyhow::{self, Context};

    use super::*;
-    use crate::test_utils::setup_test_db_with_runtime_indexes;
+    use crate::{
+        storage::types::{
+            knowledge_entity::{KnowledgeEntity, KnowledgeEntityType},
+            knowledge_relationship::KnowledgeRelationship,
+            text_chunk::TextChunk,
+        },
+        test_utils::{setup_test_db, setup_test_db_with_runtime_indexes},
+    };

    #[tokio::test]
    async fn test_text_content_creation() -> anyhow::Result<()> {
@@ -638,4 +680,81 @@ mod tests {
        );
        Ok(())
    }
+
+    #[tokio::test]
+    async fn clear_ingested_children_removes_chunks_entities_and_relationships(
+    ) -> anyhow::Result<()> {
+        let db = setup_test_db().await?;
+        let user_id = "clear-user";
+        let source_id = Uuid::new_v4().to_string();
+
+        let entity_a = KnowledgeEntity::new(
+            source_id.clone(),
+            "entity-a".to_string(),
+            "desc-a".to_string(),
+            KnowledgeEntityType::Idea,
+            None,
+            user_id.to_string(),
+        );
+        let entity_b = KnowledgeEntity::new(
+            source_id.clone(),
+            "entity-b".to_string(),
+            "desc-b".to_string(),
+            KnowledgeEntityType::Idea,
+            None,
+            user_id.to_string(),
+        );
+        KnowledgeEntity::store_with_embedding(entity_a.clone(), vec![0.1; 3], 3, &db)
+            .await
+            .context("store entity a")?;
+        KnowledgeEntity::store_with_embedding(entity_b.clone(), vec![0.2; 3], 3, &db)
+            .await
+            .context("store entity b")?;
+
+        let chunk = TextChunk::new(source_id.clone(), "chunk".to_string(), user_id.to_string());
+        TextChunk::store_with_embedding(chunk, vec![0.3; 3], 3, &db)
+            .await
+            .context("store chunk")?;
+
+        KnowledgeRelationship::new(
+            entity_a.id.clone(),
+            entity_b.id,
+            user_id.to_string(),
+            source_id.clone(),
+            "relates_to".to_string(),
+        )
+        .store_relationship(&db)
+        .await
+        .context("store relationship")?;
+
+        TextContent::clear_ingested_children(&source_id, user_id, &db)
+            .await
+            .context("clear ingested children")?;
+
+        let chunks: Vec<TextChunk> = db
+            .client
+            .query("SELECT * FROM text_chunk WHERE source_id = $source_id;")
+            .bind(("source_id", source_id.clone()))
+            .await?
+            .take(0)?;
+        assert!(chunks.is_empty());
+
+        let entities: Vec<KnowledgeEntity> = db
+            .client
+            .query("SELECT * FROM knowledge_entity WHERE source_id = $source_id;")
+            .bind(("source_id", source_id.clone()))
+            .await?
+            .take(0)?;
+        assert!(entities.is_empty());
+
+        let relationships: Vec<KnowledgeRelationship> = db
+            .client
+            .query("SELECT * FROM relates_to WHERE metadata.source_id = $source_id;")
+            .bind(("source_id", source_id))
+            .await?
+            .take(0)?;
+        assert!(relationships.is_empty());
+
+        Ok(())
+    }
 }
@@ -7,33 +7,24 @@ use std::{

 use anyhow::{anyhow, Context, Result};
 use chrono::{DateTime, Utc};
-use common::storage::types::StoredObject;
 use common::storage::{
    db::SurrealDbClient,
    types::{
        knowledge_entity::KnowledgeEntity,
-        knowledge_entity_embedding::KnowledgeEntityEmbedding,
-        knowledge_relationship::{KnowledgeRelationship, RelationshipMetadata},
+        knowledge_relationship::KnowledgeRelationship,
        text_chunk::TextChunk,
-        text_chunk_embedding::TextChunkEmbedding,
        text_content::TextContent,
+        StoredObject,
    },
 };
+use ingestion_pipeline::{persist_artifacts, IngestionTuning, PipelineArtifacts};
 use serde::Deserialize;
-use serde::Serialize;
-use surrealdb::sql::Thing;
 use tracing::{debug, warn};

 use crate::datasets::{ConvertedParagraph, ConvertedQuestion};

 pub const MANIFEST_VERSION: u32 = 3;
 pub const PARAGRAPH_SHARD_VERSION: u32 = 3;
-const MANIFEST_BATCH_SIZE: usize = 100;
-const MANIFEST_MAX_BYTES_PER_BATCH: usize = 300_000; // default cap for non-text batches
-const TEXT_CONTENT_MAX_BYTES_PER_BATCH: usize = 250_000; // text bodies can be large; limit aggressively
-const MAX_BATCHES_PER_REQUEST: usize = 24;
-const REQUEST_MAX_BYTES: usize = 800_000; // total payload cap per Surreal query request
-
 fn current_manifest_version() -> u32 {
    MANIFEST_VERSION
 }
@@ -251,130 +242,6 @@ pub fn window_manifest(
    Ok(narrowed)
 }

-#[derive(Debug, Clone, Serialize)]
-struct RelationInsert {
-    #[serde(rename = "in")]
-    pub in_: Thing,
-    #[serde(rename = "out")]
-    pub out: Thing,
-    pub id: String,
-    pub metadata: RelationshipMetadata,
-}
-
-#[derive(Debug)]
-struct SizedBatch<T> {
-    approx_bytes: usize,
-    items: Vec<T>,
-}
-
-struct ManifestBatches {
-    text_contents: Vec<SizedBatch<TextContent>>,
-    entities: Vec<SizedBatch<KnowledgeEntity>>,
-    entity_embeddings: Vec<SizedBatch<KnowledgeEntityEmbedding>>,
-    relationships: Vec<SizedBatch<RelationInsert>>,
-    chunks: Vec<SizedBatch<TextChunk>>,
-    chunk_embeddings: Vec<SizedBatch<TextChunkEmbedding>>,
-}
-
-fn build_manifest_batches(manifest: &CorpusManifest) -> Result<ManifestBatches> {
-    let mut text_contents = Vec::new();
-    let mut entities = Vec::new();
-    let mut entity_embeddings = Vec::new();
-    let mut relationships = Vec::new();
-    let mut chunks = Vec::new();
-    let mut chunk_embeddings = Vec::new();
-
-    let mut seen_text_content = HashSet::new();
-    let mut seen_entities = HashSet::new();
-    let mut seen_relationships = HashSet::new();
-    let mut seen_chunks = HashSet::new();
-
-    for paragraph in &manifest.paragraphs {
-        if seen_text_content.insert(paragraph.text_content.id.clone()) {
-            text_contents.push(paragraph.text_content.clone());
-        }
-
-        for embedded_entity in &paragraph.entities {
-            if seen_entities.insert(embedded_entity.entity.id.clone()) {
-                let entity = embedded_entity.entity.clone();
-                entities.push(entity.clone());
-                entity_embeddings.push(KnowledgeEntityEmbedding::new(
-                    &entity.id,
-                    entity.source_id.clone(),
-                    embedded_entity.embedding.clone(),
-                    entity.user_id.clone(),
-                ));
-            }
-        }
-
-        for relationship in &paragraph.relationships {
-            if seen_relationships.insert(relationship.id.clone()) {
-                let table = KnowledgeEntity::table_name();
-                let in_id = relationship
-                    .in_
-                    .strip_prefix(&format!("{table}:"))
-                    .unwrap_or(&relationship.in_);
-                let out_id = relationship
-                    .out
-                    .strip_prefix(&format!("{table}:"))
-                    .unwrap_or(&relationship.out);
-                let in_thing = Thing::from((table, in_id));
-                let out_thing = Thing::from((table, out_id));
-                relationships.push(RelationInsert {
-                    in_: in_thing,
-                    out: out_thing,
-                    id: relationship.id.clone(),
-                    metadata: relationship.metadata.clone(),
-                });
-            }
-        }
-
-        for embedded_chunk in &paragraph.chunks {
-            if seen_chunks.insert(embedded_chunk.chunk.id.clone()) {
-                let chunk = embedded_chunk.chunk.clone();
-                chunks.push(chunk.clone());
-                chunk_embeddings.push(TextChunkEmbedding::new(
-                    &chunk.id,
-                    chunk.source_id.clone(),
-                    embedded_chunk.embedding.clone(),
-                    chunk.user_id.clone(),
-                ));
-            }
-        }
-    }
-
-    Ok(ManifestBatches {
-        text_contents: chunk_items(
-            &text_contents,
-            MANIFEST_BATCH_SIZE,
-            TEXT_CONTENT_MAX_BYTES_PER_BATCH,
-        )
-        .context("chunking text_content payloads")?,
-        entities: chunk_items(&entities, MANIFEST_BATCH_SIZE, MANIFEST_MAX_BYTES_PER_BATCH)
-            .context("chunking knowledge_entity payloads")?,
-        entity_embeddings: chunk_items(
-            &entity_embeddings,
-            MANIFEST_BATCH_SIZE,
-            MANIFEST_MAX_BYTES_PER_BATCH,
-        )
-        .context("chunking knowledge_entity_embedding payloads")?,
-        relationships: chunk_items(
-            &relationships,
-            MANIFEST_BATCH_SIZE,
-            MANIFEST_MAX_BYTES_PER_BATCH,
-        )
-        .context("chunking relationship payloads")?,
-        chunks: chunk_items(&chunks, MANIFEST_BATCH_SIZE, MANIFEST_MAX_BYTES_PER_BATCH)
-            .context("chunking text_chunk payloads")?,
-        chunk_embeddings: chunk_items(
-            &chunk_embeddings,
-            MANIFEST_BATCH_SIZE,
-            MANIFEST_MAX_BYTES_PER_BATCH,
-        )
-        .context("chunking text_chunk_embedding payloads")?,
-    })
-}
-
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct ParagraphShard {
    #[serde(default = "current_paragraph_shard_version")]
@@ -599,157 +466,28 @@ fn normalize_answer_text(text: &str) -> String {
        .join(" ")
 }

-#[allow(clippy::arithmetic_side_effects, clippy::indexing_slicing)]
-fn chunk_items<T: Clone + Serialize>(
-    items: &[T],
-    max_items: usize,
-    max_bytes: usize,
-) -> Result<Vec<SizedBatch<T>>> {
-    if items.is_empty() {
-        return Ok(Vec::new());
-    }
-
-    let mut batches = Vec::new();
-    let mut current = Vec::new();
-    let mut current_bytes = 0usize;
-
-    for item in items {
-        let size = serde_json::to_vec(item)
-            .map(|buf| buf.len())
-            .context("serialising batch item for sizing")?;
-
-        let would_overflow_items = !current.is_empty() && current.len() >= max_items;
-        let would_overflow_bytes = !current.is_empty() && current_bytes + size > max_bytes;
-
-        if would_overflow_items || would_overflow_bytes {
-            batches.push(SizedBatch {
-                approx_bytes: current_bytes.max(1),
-                items: std::mem::take(&mut current),
-            });
-            current_bytes = 0;
-        }
-
-        current_bytes += size;
-        current.push(item.clone());
-    }
-
-    if !current.is_empty() {
-        batches.push(SizedBatch {
-            approx_bytes: current_bytes.max(1),
-            items: current,
-        });
-    }
-
-    Ok(batches)
-}
-
-#[allow(clippy::arithmetic_side_effects, clippy::indexing_slicing)]
-async fn execute_batched_inserts<T: Clone + Serialize + 'static>(
-    db: &SurrealDbClient,
-    statement: impl AsRef<str>,
-    prefix: &str,
-    batches: &[SizedBatch<T>],
-) -> Result<()> {
-    if batches.is_empty() {
-        return Ok(());
-    }
-
-    let mut start = 0;
-    while start < batches.len() {
-        let mut group_bytes = 0usize;
-        let mut group_end = start;
-        let mut group_count = 0usize;
-
-        while group_end < batches.len() {
-            let batch_bytes = batches[group_end].approx_bytes.max(1);
-            if group_count > 0
-                && (group_bytes + batch_bytes > REQUEST_MAX_BYTES
-                    || group_count >= MAX_BATCHES_PER_REQUEST)
-            {
-                break;
-            }
-            group_bytes += batch_bytes;
-            group_end += 1;
-            group_count += 1;
-        }
-
-        let slice = &batches[start..group_end];
-        let mut query = db.client.query("BEGIN TRANSACTION;");
-        for (bind_index, batch) in slice.iter().enumerate() {
-            let name = format!("{prefix}{bind_index}");
-            query = query
-                .query(format!("{} ${};", statement.as_ref(), name))
-                .bind((name, batch.items.clone()));
-        }
-        let response = query
-            .query("COMMIT TRANSACTION;")
-            .await
-            .context("executing batched insert transaction")?;
-        if let Err(err) = response.check() {
-            return Err(anyhow!(
-                "batched insert failed for statement '{}': {err:?}",
-                statement.as_ref()
-            ));
-        }
-
-        start = group_end;
-    }
-
-    Ok(())
-}
-
 pub async fn seed_manifest_into_db(db: &SurrealDbClient, manifest: &CorpusManifest) -> Result<()> {
-    let batches = build_manifest_batches(manifest).context("preparing manifest batches")?;
+    let tuning = IngestionTuning::default();
+    let embedding_dimensions = manifest.metadata.embedding_dimension;
+    let mut seen_text_content = HashSet::new();

    let result = async {
-        execute_batched_inserts(
-            db,
-            format!("INSERT INTO {}", TextContent::table_name()),
-            "tc",
-            &batches.text_contents,
-        )
-        .await?;
+        for paragraph in &manifest.paragraphs {
+            if !seen_text_content.insert(paragraph.text_content.id.clone()) {
+                continue;
+            }

-        execute_batched_inserts(
-            db,
-            format!("INSERT INTO {}", KnowledgeEntity::table_name()),
-            "ke",
-            &batches.entities,
-        )
-        .await?;
-
-        execute_batched_inserts(
-            db,
-            format!("INSERT INTO {}", TextChunk::table_name()),
-            "ch",
-            &batches.chunks,
-        )
-        .await?;
-
-        execute_batched_inserts(
-            db,
-            "INSERT RELATION INTO relates_to",
-            "rel",
-            &batches.relationships,
-        )
-        .await?;
-
-        execute_batched_inserts(
-            db,
-            format!("INSERT INTO {}", KnowledgeEntityEmbedding::table_name()),
-            "kee",
-            &batches.entity_embeddings,
-        )
-        .await?;
-
-        execute_batched_inserts(
-            db,
-            format!("INSERT INTO {}", TextChunkEmbedding::table_name()),
-            "tce",
-            &batches.chunk_embeddings,
-        )
-        .await?;
+            let artifacts = PipelineArtifacts {
+                text_content: paragraph.text_content.clone(),
+                entities: paragraph.entities.clone(),
+                relationships: paragraph.relationships.clone(),
+                chunks: paragraph.chunks.clone(),
+            };

+            persist_artifacts(db, &tuning, embedding_dimensions, artifacts)
+                .await
+                .map_err(|err| anyhow!("persist manifest paragraph: {err}"))?;
+        }
        Ok(())
    }
    .await;
@@ -778,7 +516,10 @@ pub async fn seed_manifest_into_db(db: &SurrealDbClient, manifest: &CorpusManife
 mod tests {
    use super::*;
    use chrono::Utc;
-    use common::storage::types::knowledge_entity::KnowledgeEntityType;
+    use common::storage::types::{
+        knowledge_entity::{KnowledgeEntity, KnowledgeEntityType},
+        text_chunk::TextChunk,
+    };
    use uuid::Uuid;

    #[allow(clippy::too_many_lines)]
@@ -5,10 +5,7 @@ use axum::{
 use axum_htmx::{HxBoosted, HxRequest, HxTarget};
 use serde::{Deserialize, Serialize};

-use common::storage::types::{
-    file_info::FileInfo, knowledge_entity::KnowledgeEntity, text_chunk::TextChunk,
-    text_content::TextContent, user::User,
-};
+use common::storage::types::{file_info::FileInfo, text_content::TextContent, user::User};

 use crate::{
    html_state::HtmlState,
@@ -180,9 +177,7 @@ pub async fn delete_text_content(
        }
    }

-    // Delete related knowledge entities and text chunks
-    KnowledgeEntity::delete_by_source_id(&id, &state.db).await?;
-    TextChunk::delete_by_source_id(&id, &state.db).await?;
+    TextContent::clear_ingested_children(&id, &user.id, &state.db).await?;

    // Delete the text content
    state.db.delete_item::<TextContent>(&id).await?;
@@ -23,9 +23,7 @@ use common::storage::types::user::DashboardStats;
 use common::{
    error::AppError,
    storage::types::{
-        file_info::FileInfo, ingestion_task::IngestionTask, knowledge_entity::KnowledgeEntity,
-        knowledge_relationship::KnowledgeRelationship, text_chunk::TextChunk,
-        text_content::TextContent, user::User,
+        file_info::FileInfo, ingestion_task::IngestionTask, text_content::TextContent, user::User,
    },
 };

@@ -81,11 +79,7 @@ pub async fn delete_text_content(
        }
    }

-    // Delete the text content and any related data
-    TextChunk::delete_by_source_id(&text_content.id, &state.db).await?;
-    KnowledgeEntity::delete_by_source_id(&text_content.id, &state.db).await?;
-    KnowledgeRelationship::delete_relationships_by_source_id(&text_content.id, &user.id, &state.db)
-        .await?;
+    TextContent::clear_ingested_children(&text_content.id, &user.id, &state.db).await?;
    state
        .db
        .delete_item::<TextContent>(&text_content.id)
@@ -10,8 +10,8 @@ use common::storage::{
    types::ingestion_task::{IngestionTask, DEFAULT_LEASE_SECS},
 };
 pub use pipeline::{
-    EmbeddedKnowledgeEntity, EmbeddedTextChunk, IngestionConfig, IngestionPipeline,
-    IngestionTuning, PipelineArtifacts,
+    persist_artifacts, EmbeddedKnowledgeEntity, EmbeddedTextChunk, IngestionConfig,
+    IngestionPipeline, IngestionTuning, PipelineArtifacts,
 };
 use std::sync::Arc;
 use tokio::time::{sleep, Duration};
@@ -3,9 +3,9 @@ pub struct IngestionTuning {
    pub retry_base_delay_secs: u64,
    pub retry_max_delay_secs: u64,
    pub retry_backoff_cap_exponent: u32,
-    pub graph_store_attempts: usize,
-    pub graph_initial_backoff_ms: u64,
-    pub graph_max_backoff_ms: u64,
+    pub persist_attempts: usize,
+    pub persist_initial_backoff_ms: u64,
+    pub persist_max_backoff_ms: u64,
    pub chunk_min_tokens: usize,
    pub chunk_max_tokens: usize,
    pub chunk_overlap_tokens: usize,
@@ -21,9 +21,9 @@ impl Default for IngestionTuning {
            retry_base_delay_secs: 30,
            retry_max_delay_secs: 15 * 60,
            retry_backoff_cap_exponent: 5,
-            graph_store_attempts: 3,
-            graph_initial_backoff_ms: 50,
-            graph_max_backoff_ms: 800,
+            persist_attempts: 3,
+            persist_initial_backoff_ms: 50,
+            persist_max_backoff_ms: 800,
            chunk_min_tokens: 256,
            chunk_max_tokens: 512,
            chunk_overlap_tokens: 50,
@@ -12,6 +12,8 @@ pub use config::{IngestionConfig, IngestionTuning};
 pub use context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk, PipelineArtifacts};
 pub use enrichment_result::{LLMEnrichmentResult, LLMKnowledgeEntity, LLMRelationship};
 #[allow(clippy::module_name_repetitions)]
+pub use persistence::persist_artifacts;
+#[allow(clippy::module_name_repetitions)]
 pub use services::{DefaultPipelineServices, PipelineServices};

 use std::{
@@ -28,11 +30,13 @@ use common::{
        types::{
            ingestion_payload::IngestionPayload,
            ingestion_task::{IngestionTask, TaskErrorInfo},
+            text_content::TextContent,
        },
    },
    utils::config::AppConfig,
 };
 use retrieval_pipeline::reranking::RerankerPool;
+use tokio::time::sleep;
 use tracing::{debug, info, warn};

 use self::{
@@ -120,29 +124,31 @@ impl IngestionPipeline {
    )]
    pub async fn process_task(&self, task: IngestionTask) -> Result<(), AppError> {
        let mut processing_task = task.mark_processing(&self.db).await?;
-        let payload = processing_task.take_content();

-        match self
-            .drive_pipeline(&processing_task, payload)
-            .await
-            .map_err(|err| {
-                debug!(
-                    task_id = %processing_task.id,
-                    attempt = processing_task.attempts,
-                    error = %err,
-                    "ingestion pipeline failed"
-                );
-                err
-            }) {
-            Ok(()) => {
-                processing_task.mark_succeeded(&self.db).await?;
-                tracing::info!(
-                    task_id = %processing_task.id,
-                    attempt = processing_task.attempts,
-                    "ingestion task succeeded"
-                );
-                Ok(())
-            }
+        let pipeline_result = if self.artifacts_persisted(&processing_task.id).await? {
+            info!(
+                task_id = %processing_task.id,
+                attempt = processing_task.attempts,
+                "ingestion artifacts already persisted; skipping pipeline"
+            );
+            Ok(())
+        } else {
+            let payload = processing_task.take_content();
+            self.drive_pipeline(&processing_task, payload)
+                .await
+                .map_err(|err| {
+                    debug!(
+                        task_id = %processing_task.id,
+                        attempt = processing_task.attempts,
+                        error = %err,
+                        "ingestion pipeline failed"
+                    );
+                    err
+                })
+        };
+
+        match pipeline_result {
+            Ok(()) => self.finalize_succeeded(&processing_task).await,
            Err(err) => {
                let reason = err.to_string();
                let retryable = !matches!(err, AppError::Validation(_));
@@ -179,6 +185,51 @@ impl IngestionPipeline {
        }
    }

+    async fn artifacts_persisted(&self, task_id: &str) -> Result<bool, AppError> {
+        Ok(self
+            .db
+            .get_item::<TextContent>(task_id)
+            .await?
+            .is_some())
+    }
+
+    async fn finalize_succeeded(&self, task: &IngestionTask) -> Result<(), AppError> {
+        let tuning = &self.pipeline_config.tuning;
+        let mut backoff_ms = tuning.persist_initial_backoff_ms;
+        let last_attempt = tuning.persist_attempts.saturating_sub(1);
+
+        for attempt in 0..tuning.persist_attempts {
+            match task.mark_succeeded(&self.db).await {
+                Ok(_) => {
+                    info!(
+                        task_id = %task.id,
+                        attempt = task.attempts,
+                        "ingestion task succeeded"
+                    );
+                    return Ok(());
+                }
+                Err(err) if attempt < last_attempt => {
+                    let next_attempt = attempt.saturating_add(1);
+                    warn!(
+                        task_id = %task.id,
+                        attempt = next_attempt,
+                        error = %err,
+                        "failed to mark ingestion task succeeded; retrying"
+                    );
+                    sleep(Duration::from_millis(backoff_ms)).await;
+                    backoff_ms = backoff_ms
+                        .saturating_mul(2)
+                        .min(tuning.persist_max_backoff_ms);
+                }
+                Err(err) => return Err(err),
+            }
+        }
+
+        Err(AppError::InternalError(
+            "failed to mark ingestion task succeeded after retries".into(),
+        ))
+    }
+
    fn retry_delay(&self, attempt: u32) -> Duration {
        let tuning = &self.pipeline_config.tuning;
        let capped_attempt = attempt
@@ -291,5 +342,8 @@ impl IngestionPipeline {
    }
 }

+#[cfg(test)]
+mod test_support;
+
 #[cfg(test)]
 mod tests;
@@ -1,8 +1,9 @@
-//! Low-level database write mechanics for the persist stage.
+//! Atomic persistence for ingested artifacts.
 //!
-//! This module owns *how* ingested artifacts reach `SurrealDB` (per-item store loops,
-//! the relationship transaction, and conflict retry/backoff). The persist stage in
-//! [`super::stages`] owns *what* gets written and in which order.
+//! All rows for one ingestion task are written inside a single `SurrealDB` transaction:
+//! clear any prior rows for the task's `source_id`, then insert the new snapshot.
+//! `SurrealDB` does not cap transaction row count; request payload size is the practical
+//! limit (~4 MiB gRPC on `TiKV`). Typical single-document ingests fit comfortably.

 use std::sync::Arc;

@@ -11,131 +12,332 @@ use common::{
    storage::{
        db::SurrealDbClient,
        types::{
-            knowledge_entity::KnowledgeEntity, knowledge_relationship::KnowledgeRelationship,
+            knowledge_entity::KnowledgeEntity,
+            knowledge_entity_embedding::KnowledgeEntityEmbedding,
            text_chunk::TextChunk,
+            text_chunk_embedding::TextChunkEmbedding,
+            text_content::TextContent,
        },
    },
 };
 use tokio::time::{sleep, Duration};
-use tracing::{debug, warn};
+use tracing::warn;

 use super::{
    config::IngestionTuning,
-    context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk},
+    context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk, PipelineArtifacts},
 };

-const STORE_RELATIONSHIPS: &str = r"
-    BEGIN TRANSACTION;
-    LET $relationships = $relationships;
-
-    FOR $relationship IN $relationships {
-        LET $in_node = type::thing('knowledge_entity', $relationship.in);
-        LET $out_node = type::thing('knowledge_entity', $relationship.out);
-        RELATE $in_node->relates_to->$out_node CONTENT {
-            id: type::thing('relates_to', $relationship.id),
-            metadata: $relationship.metadata
-        };
-    };
-
-    COMMIT TRANSACTION;
-";
-
-/// Persists chunk embeddings to the vector store.
-///
-/// Chunks are written serially on purpose. Concurrent/batched inserts were
-/// trialed and did not reliably improve throughput; see `ingestion-pipeline/AGENTS.md`
-/// for the rationale and as a candidate for future refactoring/benchmarking.
-pub(super) async fn store_vector_chunks(
-    db: &SurrealDbClient,
-    task_id: &str,
-    embedding_dimensions: usize,
-    chunks: Vec<EmbeddedTextChunk>,
-) -> Result<usize, AppError> {
-    let chunk_count = chunks.len();
-    for embedded in chunks {
-        debug!(
-            task_id = %task_id,
-            chunk_id = %embedded.chunk.id,
-            chunk_len = embedded.chunk.chunk.chars().count(),
-            "chunk persisted"
-        );
-        TextChunk::store_with_embedding(
-            embedded.chunk,
-            embedded.embedding,
-            embedding_dimensions,
-            db,
-        )
-        .await?;
-    }
-
-    Ok(chunk_count)
+#[derive(Debug, Clone, Copy)]
+#[allow(clippy::struct_field_names)]
+pub struct PersistCounts {
+    pub chunk_count: usize,
+    pub entity_count: usize,
+    pub relationship_count: usize,
 }

-/// Persists knowledge entities and their relationships.
-///
-/// Entities are stored serially (see `store_vector_chunks` and AGENTS.md for why).
-/// Relationships are written via a single transaction with bounded conflict retry.
-pub(super) async fn store_graph_entities(
+/// Persists all pipeline artifacts in one database transaction.
+pub async fn persist_artifacts(
    db: &SurrealDbClient,
    tuning: &IngestionTuning,
    embedding_dimensions: usize,
-    entities: Vec<EmbeddedKnowledgeEntity>,
-    relationships: Vec<KnowledgeRelationship>,
-) -> Result<(), AppError> {
-    for embedded in entities {
-        KnowledgeEntity::store_with_embedding(
-            embedded.entity,
-            embedded.embedding,
-            embedding_dimensions,
-            db,
-        )
-        .await?;
-    }
+    artifacts: PipelineArtifacts,
+) -> Result<PersistCounts, AppError> {
+    let PipelineArtifacts {
+        text_content,
+        entities,
+        relationships,
+        chunks,
+    } = artifacts;

-    if relationships.is_empty() {
-        return Ok(());
-    }
+    let source_id = text_content.id.clone();
+    let user_id = text_content.user_id.clone();
+    let chunk_count = chunks.len();
+    let entity_count = entities.len();
+    let relationship_count = relationships.len();

-    let relationships = Arc::new(relationships);
+    let (entities, entity_embeddings) = prepare_entity_rows(entities, embedding_dimensions)?;
+    let (chunks, chunk_embeddings) = prepare_chunk_rows(chunks, embedding_dimensions)?;

-    let mut backoff_ms = tuning.graph_initial_backoff_ms;
-    let last_attempt = tuning.graph_store_attempts.saturating_sub(1);
+    let payload = PersistPayload {
+        source_id: Arc::from(source_id),
+        user_id: Arc::from(user_id),
+        text_content: Arc::new(text_content),
+        entities: Arc::from(entities.into_boxed_slice()),
+        entity_embeddings: Arc::from(entity_embeddings.into_boxed_slice()),
+        chunks: Arc::from(chunks.into_boxed_slice()),
+        chunk_embeddings: Arc::from(chunk_embeddings.into_boxed_slice()),
+        relationships: relationships.into(),
+    };

-    for attempt in 0..tuning.graph_store_attempts {
-        let result = db
-            .client
-            .query(STORE_RELATIONSHIPS)
-            .bind(("relationships", Arc::clone(&relationships)))
-            .await;
+    let mut backoff_ms = tuning.persist_initial_backoff_ms;
+    let last_attempt = tuning.persist_attempts.saturating_sub(1);
+
+    for attempt in 0..tuning.persist_attempts {
+        let result = execute_persist_transaction(db, &payload).await;

        match result {
-            Ok(_) => return Ok(()),
+            Ok(()) => {
+                return Ok(PersistCounts {
+                    chunk_count,
+                    entity_count,
+                    relationship_count,
+                });
+            }
            Err(err) => {
                if is_retryable_conflict(&err) && attempt < last_attempt {
                    let next_attempt = attempt.saturating_add(1);
                    warn!(
                        attempt = next_attempt,
-                        "Transient SurrealDB conflict while storing graph data; retrying"
+                        "Transient SurrealDB conflict while persisting ingestion artifacts; retrying"
                    );
                    sleep(Duration::from_millis(backoff_ms)).await;
                    backoff_ms = backoff_ms
                        .saturating_mul(2)
-                        .min(tuning.graph_max_backoff_ms);
+                        .min(tuning.persist_max_backoff_ms);
                    continue;
                }

-                return Err(AppError::from(err));
+                return Err(err);
            }
        }
    }

    Err(AppError::InternalError(
-        "Failed to store graph entities after retries".to_string(),
+        "Failed to persist ingestion artifacts after retries".to_string(),
    ))
 }

-fn is_retryable_conflict(error: &surrealdb::Error) -> bool {
+struct PersistPayload {
+    source_id: Arc<str>,
+    user_id: Arc<str>,
+    text_content: Arc<TextContent>,
+    entities: Arc<[KnowledgeEntity]>,
+    entity_embeddings: Arc<[KnowledgeEntityEmbedding]>,
+    chunks: Arc<[TextChunk]>,
+    chunk_embeddings: Arc<[TextChunkEmbedding]>,
+    relationships:
+        Arc<[common::storage::types::knowledge_relationship::KnowledgeRelationship]>,
+}
+
+async fn execute_persist_transaction(
+    db: &SurrealDbClient,
+    payload: &PersistPayload,
+) -> Result<(), AppError> {
+    let mut query = String::from("BEGIN TRANSACTION;\n");
+    query.push_str(TextContent::CLEAR_INGESTED_CHILD_ROWS_SURQL);
+    query.push_str(
+        "DELETE type::thing('text_content', $source_id);
+         UPSERT type::thing('text_content', $source_id) CONTENT $text_content;",
+    );
+
+    if !payload.entities.is_empty() {
+        query.push_str("\nINSERT INTO knowledge_entity $entities;");
+        query.push_str("\nINSERT INTO knowledge_entity_embedding $entity_embeddings;");
+    }
+    if !payload.chunks.is_empty() {
+        query.push_str("\nINSERT INTO text_chunk $chunks;");
+        query.push_str("\nINSERT INTO text_chunk_embedding $chunk_embeddings;");
+    }
+    if !payload.relationships.is_empty() {
+        query.push_str(
+            r#"
+LET $relationships = $relationships;
+FOR $relationship IN $relationships {
+    LET $in_node = type::thing('knowledge_entity', $relationship.`in`);
+    LET $out_node = type::thing('knowledge_entity', $relationship.out);
+    RELATE $in_node->relates_to->$out_node CONTENT {
+        id: type::thing('relates_to', $relationship.id),
+        metadata: $relationship.metadata
+    };
+};"#,
+        );
+    }
+
+    query.push_str("\nCOMMIT TRANSACTION;");
+
+    let mut request = db
+        .client
+        .query(query)
+        .bind(("source_id", Arc::clone(&payload.source_id)))
+        .bind(("user_id", Arc::clone(&payload.user_id)))
+        .bind(("text_content", Arc::clone(&payload.text_content)));
+
+    if !payload.entities.is_empty() {
+        request = request
+            .bind(("entities", Arc::clone(&payload.entities)))
+            .bind(("entity_embeddings", Arc::clone(&payload.entity_embeddings)));
+    }
+    if !payload.chunks.is_empty() {
+        request = request
+            .bind(("chunks", Arc::clone(&payload.chunks)))
+            .bind(("chunk_embeddings", Arc::clone(&payload.chunk_embeddings)));
+    }
+    if !payload.relationships.is_empty() {
+        request = request.bind(("relationships", Arc::clone(&payload.relationships)));
+    }
+
+    request
+        .await
+        .map_err(AppError::from)?
+        .check()
+        .map_err(AppError::from)?;
+
+    Ok(())
+}
+
+fn prepare_entity_rows(
+    embedded: Vec<EmbeddedKnowledgeEntity>,
+    embedding_dimensions: usize,
+) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeEntityEmbedding>), AppError> {
+    let mut entities = Vec::with_capacity(embedded.len());
+    let mut entity_embeddings = Vec::with_capacity(embedded.len());
+
+    for item in embedded {
+        KnowledgeEntityEmbedding::validate_dimension(&item.embedding, embedding_dimensions)?;
+        let entity = item.entity;
+        entity_embeddings.push(KnowledgeEntityEmbedding::new(
+            &entity.id,
+            entity.source_id.clone(),
+            item.embedding,
+            entity.user_id.clone(),
+        ));
+        entities.push(entity);
+    }
+
+    Ok((entities, entity_embeddings))
+}
+
+fn prepare_chunk_rows(
+    embedded: Vec<EmbeddedTextChunk>,
+    embedding_dimensions: usize,
+) -> Result<(Vec<TextChunk>, Vec<TextChunkEmbedding>), AppError> {
+    let mut chunks = Vec::with_capacity(embedded.len());
+    let mut chunk_embeddings = Vec::with_capacity(embedded.len());
+
+    for item in embedded {
+        TextChunkEmbedding::validate_dimension(&item.embedding, embedding_dimensions)?;
+        let chunk = item.chunk;
+        chunk_embeddings.push(TextChunkEmbedding::new(
+            &chunk.id,
+            chunk.source_id.clone(),
+            item.embedding,
+            chunk.user_id.clone(),
+        ));
+        chunks.push(chunk);
+    }
+
+    Ok((chunks, chunk_embeddings))
+}
+
+fn is_retryable_conflict(error: &AppError) -> bool {
    error
        .to_string()
        .contains("Failed to commit transaction due to a read or write conflict")
 }
+
+#[cfg(test)]
+mod tests {
+    use common::storage::types::text_content::TextContent;
+
+    use super::*;
+    use crate::pipeline::test_support::{
+        self, count_chunks_for_source, count_entities_for_source, count_relationships_for_source,
+        large_artifacts, persist, sample_artifacts, setup_db, TEST_EMBEDDING_DIM,
+    };
+
+    #[tokio::test]
+    async fn persist_artifacts_is_idempotent_for_same_source() -> anyhow::Result<()> {
+        let db = setup_db().await?;
+        let source_id = uuid::Uuid::new_v4().to_string();
+        let user_id = "persist-idempotent";
+
+        persist(&db, sample_artifacts(&source_id, user_id)).await?;
+        persist(&db, sample_artifacts(&source_id, user_id)).await?;
+
+        assert_eq!(count_chunks_for_source(&db, &source_id).await?, 1);
+        assert_eq!(count_entities_for_source(&db, &source_id).await?, 1);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn persist_artifacts_rejects_invalid_embedding_before_write() -> anyhow::Result<()> {
+        let db = setup_db().await?;
+        let source_id = uuid::Uuid::new_v4().to_string();
+        let user_id = "persist-validate";
+        let mut artifacts = sample_artifacts(&source_id, user_id);
+        if let Some(chunk) = artifacts.chunks.first_mut() {
+            chunk.embedding = vec![0.1; 2];
+        }
+
+        let result =
+            persist_artifacts(&db, &test_support::tuning(), TEST_EMBEDDING_DIM, artifacts).await;
+        assert!(result.is_err());
+
+        let text: Option<TextContent> = db.get_item(&source_id).await?;
+        assert!(text.is_none());
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn persist_large_snapshot() -> anyhow::Result<()> {
+        let db = setup_db().await?;
+        let source_id = uuid::Uuid::new_v4().to_string();
+        let user_id = "persist-large";
+        let chunk_count = 100;
+        let entity_count = 20;
+        let relationship_count = 30;
+
+        persist(
+            &db,
+            large_artifacts(
+                &source_id,
+                user_id,
+                chunk_count,
+                entity_count,
+                relationship_count,
+                TEST_EMBEDDING_DIM,
+            ),
+        )
+        .await?;
+
+        assert_eq!(count_chunks_for_source(&db, &source_id).await?, chunk_count);
+        assert_eq!(
+            count_entities_for_source(&db, &source_id).await?,
+            entity_count
+        );
+        assert_eq!(
+            count_relationships_for_source(&db, &source_id).await?,
+            relationship_count
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn persist_does_not_touch_other_source_ids() -> anyhow::Result<()> {
+        let db = setup_db().await?;
+        let source_a = uuid::Uuid::new_v4().to_string();
+        let source_b = uuid::Uuid::new_v4().to_string();
+        let user_id = "persist-isolation";
+
+        persist(&db, large_artifacts(&source_a, user_id, 5, 3, 4, TEST_EMBEDDING_DIM)).await?;
+        persist(&db, large_artifacts(&source_b, user_id, 2, 1, 1, TEST_EMBEDDING_DIM)).await?;
+        persist(
+            &db,
+            large_artifacts(&source_a, user_id, 7, 4, 6, TEST_EMBEDDING_DIM),
+        )
+        .await?;
+
+        assert_eq!(count_chunks_for_source(&db, &source_a).await?, 7);
+        assert_eq!(count_entities_for_source(&db, &source_a).await?, 4);
+        assert_eq!(count_relationships_for_source(&db, &source_a).await?, 6);
+        assert_eq!(count_chunks_for_source(&db, &source_b).await?, 2);
+        assert_eq!(count_entities_for_source(&db, &source_b).await?, 1);
+        assert_eq!(count_relationships_for_source(&db, &source_b).await?, 1);
+
+        Ok(())
+    }
+}
@@ -12,9 +12,9 @@ use state_machines::core::GuardError;
 use tracing::{debug, instrument};

 use super::{
-    context::{PipelineArtifacts, PipelineContext},
+    context::PipelineContext,
    enrichment_result::LLMEnrichmentResult,
-    persistence::{store_graph_entities, store_vector_chunks},
+    persistence::persist_artifacts,
    state::{ContentPrepared, Enriched, IngestionMachine, Persisted, Ready, Retrieved},
 };

@@ -28,7 +28,8 @@ pub async fn prepare_content(
    ctx: &mut PipelineContext<'_>,
    payload: IngestionPayload,
 ) -> Result<IngestionMachine<(), ContentPrepared>, AppError> {
-    let text_content = ctx.services.prepare_text_content(payload).await?;
+    let mut text_content = ctx.services.prepare_text_content(payload).await?;
+    text_content.id.clone_from(&ctx.task_id);

    let text_len = text_content.text.chars().count();
    let preview: String = text_content.text.chars().take(120).collect();
@@ -146,43 +147,26 @@ pub async fn persist(
    machine: IngestionMachine<(), Enriched>,
    ctx: &mut PipelineContext<'_>,
 ) -> Result<IngestionMachine<(), Persisted>, AppError> {
-    let PipelineArtifacts {
-        text_content,
-        entities,
-        relationships,
-        chunks,
-    } = ctx.build_artifacts().await?;
-    let entity_count = entities.len();
-    let relationship_count = relationships.len();
-
+    let artifacts = ctx.build_artifacts().await?;
    let settings = SystemSettings::get_current(ctx.db).await?;
    let embedding_dimensions = usize::try_from(settings.embedding_dimensions).map_err(|_| {
        AppError::InternalError("system_settings.embedding_dimensions exceeds usize::MAX".into())
    })?;

-    let chunk_count = store_vector_chunks(
-        ctx.db,
-        ctx.task_id.as_str(),
-        embedding_dimensions,
-        chunks,
-    )
-    .await?;
-    store_graph_entities(
+    let counts = persist_artifacts(
        ctx.db,
        &ctx.pipeline_config.tuning,
        embedding_dimensions,
-        entities,
-        relationships,
+        artifacts,
    )
    .await?;
-    ctx.db.store_item(text_content).await?;

    debug!(
        task_id = %ctx.task_id,
        attempt = ctx.attempt,
-        entity_count,
-        relationship_count,
-        chunk_count,
+        entity_count = counts.entity_count,
+        relationship_count = counts.relationship_count,
+        chunk_count = counts.chunk_count,
        "ingestion persistence flushed to database"
    );

@@ -0,0 +1,178 @@
+//! Shared helpers for ingestion pipeline integration and persistence tests.
+
+use chrono::Utc;
+use common::{
+    error::AppError,
+    storage::{
+        db::SurrealDbClient,
+        types::{
+            knowledge_entity::{KnowledgeEntity, KnowledgeEntityType},
+            knowledge_relationship::KnowledgeRelationship,
+            text_chunk::TextChunk,
+            text_content::TextContent,
+        },
+    },
+};
+use uuid::Uuid;
+
+use super::{
+    config::IngestionTuning,
+    context::{EmbeddedKnowledgeEntity, EmbeddedTextChunk, PipelineArtifacts},
+    persistence::persist_artifacts,
+};
+
+pub const TEST_EMBEDDING_DIM: usize = 3;
+
+pub async fn setup_db() -> anyhow::Result<SurrealDbClient> {
+    let namespace = "ingestion_pipeline_test";
+    let database = Uuid::new_v4().to_string();
+    let db = SurrealDbClient::memory(namespace, &database).await?;
+    db.apply_migrations().await?;
+    Ok(db)
+}
+
+pub fn tuning() -> IngestionTuning {
+    IngestionTuning::default()
+}
+
+pub fn sample_artifacts(source_id: &str, user_id: &str) -> PipelineArtifacts {
+    large_artifacts(source_id, user_id, 1, 1, 1, TEST_EMBEDDING_DIM)
+}
+
+#[allow(
+    clippy::too_many_arguments,
+    clippy::arithmetic_side_effects,
+    clippy::expect_used
+)]
+pub fn large_artifacts(
+    source_id: &str,
+    user_id: &str,
+    chunk_count: usize,
+    entity_count: usize,
+    relationship_count: usize,
+    embedding_dim: usize,
+) -> PipelineArtifacts {
+    let now = Utc::now();
+    let embedding = vec![0.1_f32; embedding_dim];
+
+    let mut entities = Vec::with_capacity(entity_count);
+    let mut entity_ids = Vec::with_capacity(entity_count);
+
+    for index in 0..entity_count {
+        let entity_id = Uuid::new_v4().to_string();
+        entity_ids.push(entity_id.clone());
+        entities.push(EmbeddedKnowledgeEntity {
+            entity: KnowledgeEntity {
+                id: entity_id,
+                created_at: now,
+                updated_at: now,
+                name: format!("entity-{index}"),
+                description: format!("description-{index}"),
+                entity_type: KnowledgeEntityType::Idea,
+                source_id: source_id.to_string(),
+                metadata: None,
+                user_id: user_id.to_string(),
+            },
+            embedding: embedding.clone(),
+        });
+    }
+
+    let mut relationships = Vec::with_capacity(relationship_count);
+    assert!(
+        entity_count > 0 || relationship_count == 0,
+        "large_artifacts requires entity_count > 0 when relationship_count > 0"
+    );
+    for index in 0..relationship_count {
+        let in_id = entity_ids
+            .get(index % entity_count)
+            .expect("entity_count > 0 when relationship_count > 0")
+            .clone();
+        let out_id = entity_ids
+            .get((index + 1) % entity_count)
+            .expect("entity_count > 0 when relationship_count > 0")
+            .clone();
+        relationships.push(KnowledgeRelationship::new(
+            in_id,
+            out_id,
+            user_id.to_string(),
+            source_id.to_string(),
+            "relates_to".to_string(),
+        ));
+    }
+
+    let mut chunks = Vec::with_capacity(chunk_count);
+    for index in 0..chunk_count {
+        chunks.push(EmbeddedTextChunk {
+            chunk: TextChunk {
+                id: Uuid::new_v4().to_string(),
+                created_at: now,
+                updated_at: now,
+                source_id: source_id.to_string(),
+                chunk: format!("chunk body {index}"),
+                user_id: user_id.to_string(),
+            },
+            embedding: embedding.clone(),
+        });
+    }
+
+    PipelineArtifacts {
+        text_content: TextContent {
+            id: source_id.to_string(),
+            created_at: now,
+            updated_at: now,
+            text: format!("document with {chunk_count} chunks"),
+            file_info: None,
+            url_info: None,
+            context: None,
+            category: "notes".to_string(),
+            user_id: user_id.to_string(),
+        },
+        entities,
+        relationships,
+        chunks,
+    }
+}
+
+pub async fn persist(
+    db: &SurrealDbClient,
+    artifacts: PipelineArtifacts,
+) -> Result<(), AppError> {
+    persist_artifacts(db, &tuning(), TEST_EMBEDDING_DIM, artifacts).await?;
+    Ok(())
+}
+
+pub async fn count_chunks_for_source(db: &SurrealDbClient, source_id: &str) -> anyhow::Result<usize> {
+    let chunks: Vec<TextChunk> = db
+        .client
+        .query("SELECT * FROM text_chunk WHERE source_id = $source_id;")
+        .bind(("source_id", source_id.to_string()))
+        .await?
+        .take(0)?;
+    Ok(chunks.len())
+}
+
+pub async fn count_entities_for_source(
+    db: &SurrealDbClient,
+    source_id: &str,
+) -> anyhow::Result<usize> {
+    let entities: Vec<KnowledgeEntity> = db
+        .client
+        .query("SELECT * FROM knowledge_entity WHERE source_id = $source_id;")
+        .bind(("source_id", source_id.to_string()))
+        .await?
+        .take(0)?;
+    Ok(entities.len())
+}
+
+pub async fn count_relationships_for_source(
+    db: &SurrealDbClient,
+    source_id: &str,
+) -> anyhow::Result<usize> {
+    let relationships: Vec<KnowledgeRelationship> = db
+        .client
+        .query("SELECT * FROM relates_to WHERE metadata.source_id = $source_id;")
+        .bind(("source_id", source_id.to_string()))
+        .await?
+        .take(0)?;
+    Ok(relationships.len())
+}
@@ -20,12 +20,14 @@ use common::{
 };
 use retrieval_pipeline::{RetrievedChunk, RetrievedEntity};
 use tokio::sync::Mutex;
-use uuid::Uuid;
-
 use super::{
    config::{IngestionConfig, IngestionTuning},
    enrichment_result::LLMEnrichmentResult,
    services::PipelineServices,
+    test_support::{
+        count_chunks_for_source, count_entities_for_source, count_relationships_for_source,
+        persist, sample_artifacts, setup_db,
+    },
    IngestionPipeline,
 };

@@ -141,15 +143,30 @@ impl PipelineServices for MockServices {

    async fn convert_analysis(
        &self,
-        _content: &TextContent,
+        content: &TextContent,
        _analysis: &LLMEnrichmentResult,
        _entity_concurrency: usize,
    ) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
        self.record("convert").await;
-        Ok((
-            self.graph_entities.clone(),
-            self.graph_relationships.clone(),
-        ))
+        let entities = self
+            .graph_entities
+            .iter()
+            .map(|embedded| {
+                let mut embedded = embedded.clone();
+                embedded.entity.source_id = content.id.clone();
+                embedded
+            })
+            .collect();
+        let relationships = self
+            .graph_relationships
+            .iter()
+            .map(|relationship| {
+                let mut relationship = relationship.clone();
+                relationship.metadata.source_id = content.id.clone();
+                relationship
+            })
+            .collect();
+        Ok((entities, relationships))
    }

    async fn prepare_chunks(
@@ -266,14 +283,6 @@ impl PipelineServices for ValidationServices {
    }
 }

-async fn setup_db() -> anyhow::Result<SurrealDbClient> {
-    let namespace = "pipeline_test";
-    let database = Uuid::new_v4().to_string();
-    let db = SurrealDbClient::memory(namespace, &database).await?;
-    db.apply_migrations().await?;
-    Ok(db)
-}
-
 fn pipeline_config() -> IngestionConfig {
    IngestionConfig {
        tuning: IngestionTuning {
@@ -319,7 +328,41 @@ async fn retry_delay_grows_exponentially_and_caps() -> anyhow::Result<()> {
 }

 #[tokio::test]
-async fn ingestion_pipeline_happy_path_persists_entities() -> anyhow::Result<()> {
+async fn process_task_skips_pipeline_when_artifacts_already_persisted() -> anyhow::Result<()> {
+    let db = setup_db().await?;
+    let worker_id = "worker-persisted-skip";
+    let user_id = "user-skip";
+    let services = Arc::new(FailingServices {
+        inner: MockServices::new(user_id),
+    });
+    let pipeline =
+        IngestionPipeline::with_services(Arc::new(db.clone()), pipeline_config(), services)?;
+
+    let task = reserve_task(
+        &db,
+        worker_id,
+        IngestionPayload::Text {
+            text: "Already persisted payload".into(),
+            context: "Context".into(),
+            category: "notes".into(),
+            user_id: user_id.into(),
+        },
+        user_id,
+    )
+    .await?;
+
+    persist(&db, sample_artifacts(&task.id, user_id)).await?;
+
+    pipeline.process_task(task.clone()).await?;
+
+    let stored_task: IngestionTask = db.get_item(&task.id).await?.context("task present")?;
+    assert_eq!(stored_task.state, TaskState::Succeeded);
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn ingestion_pipeline_happy_path_persists_artifacts() -> anyhow::Result<()> {
    let db = setup_db().await?;
    let worker_id = "worker-happy";
    let user_id = "user-123";
@@ -346,14 +389,17 @@ async fn ingestion_pipeline_happy_path_persists_entities() -> anyhow::Result<()>
    let stored_task: IngestionTask = db.get_item(&task.id).await?.context("task present")?;
    assert_eq!(stored_task.state, TaskState::Succeeded);

-    let stored_entities: Vec<KnowledgeEntity> =
-        db.get_all_stored_items::<KnowledgeEntity>().await?;
-    assert!(!stored_entities.is_empty(), "entities should be stored");
-
-    let stored_chunks: Vec<TextChunk> = db.get_all_stored_items::<TextChunk>().await?;
-    assert!(
-        !stored_chunks.is_empty(),
-        "chunks should be stored for ingestion text"
+    let text_content: TextContent = db.get_item(&task.id).await?.context("text content")?;
+    assert_eq!(
+        text_content.id, task.id,
+        "ingested text_content id should equal the ingestion task id"
+    );
+    assert_eq!(count_chunks_for_source(&db, &task.id).await?, 1);
+    assert_eq!(count_entities_for_source(&db, &task.id).await?, 1);
+    assert_eq!(
+        count_relationships_for_source(&db, &task.id).await?,
+        1,
+        "graph relationships should be persisted"
    );

    let call_log = services.calls.lock().await.clone();
@@ -397,26 +443,14 @@ async fn ingestion_pipeline_chunk_only_skips_analysis() -> anyhow::Result<()> {

    pipeline.process_task(task.clone()).await?;

-    let stored_entities: Vec<KnowledgeEntity> =
-        db.get_all_stored_items::<KnowledgeEntity>().await?;
-    assert!(
-        stored_entities.is_empty(),
-        "chunk-only ingestion should not persist entities"
-    );
-    let relationship_count: Option<i64> = db
-        .client
-        .query("SELECT count() as count FROM relates_to;")
-        .await?
-        .take::<Option<i64>>(0)
-        .unwrap_or_default();
    assert_eq!(
-        relationship_count.unwrap_or(0),
+        count_relationships_for_source(&db, &task.id).await?,
        0,
        "chunk-only ingestion should not persist relationships"
    );
-    let stored_chunks: Vec<TextChunk> = db.get_all_stored_items::<TextChunk>().await?;
-    assert!(
-        !stored_chunks.is_empty(),
+    assert_eq!(
+        count_chunks_for_source(&db, &task.id).await?,
+        1,
        "chunk-only ingestion should still persist chunks"
    );