mirror of
https://github.com/perstarkse/minne.git
synced 2026-06-28 12:56:26 +02:00
fix: atomic ingestion persist with task reclaim and shared cleanup.
One transaction per task replaces prior artifact rows; workers skip the pipeline when content already exists, eval seeding reuses persist_artifacts, and deletes clear graph children via shared SQL.
This commit is contained in:
@@ -150,7 +150,7 @@ impl SurrealDbClient {
|
||||
|
||||
/// Upsert an object in SurrealDB, replacing any existing record with the same ID.
|
||||
///
|
||||
/// Useful for idempotent ingestion flows.
|
||||
/// Useful when a single record should be replaced by id (admin updates, embedding rows, etc.).
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
|
||||
@@ -245,8 +245,9 @@ impl KnowledgeEntity {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Atomically store a knowledge entity and its embedding.
|
||||
/// Writes the entity to `knowledge_entity` and the embedding to `knowledge_entity_embedding`.
|
||||
/// Atomically store one knowledge entity and its embedding (single-record path).
|
||||
///
|
||||
/// Bulk ingestion uses `ingestion_pipeline::persist_artifacts` instead.
|
||||
pub async fn store_with_embedding(
|
||||
entity: KnowledgeEntity,
|
||||
embedding: Vec<f32>,
|
||||
|
||||
@@ -61,8 +61,9 @@ impl TextChunk {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Atomically store a text chunk and its embedding.
|
||||
/// Writes the chunk to `text_chunk` and the embedding to `text_chunk_embedding`.
|
||||
/// Atomically store one text chunk and its embedding (single-record path).
|
||||
///
|
||||
/// Bulk ingestion uses `ingestion_pipeline::persist_artifacts` instead.
|
||||
pub async fn store_with_embedding(
|
||||
chunk: TextChunk,
|
||||
embedding: Vec<f32>,
|
||||
|
||||
@@ -96,6 +96,41 @@ impl TextContent {
|
||||
}
|
||||
}
|
||||
|
||||
/// SurrealQL deletes for ingested child rows keyed by `source_id` (no transaction wrapper).
|
||||
///
|
||||
/// Used inside larger transactions (e.g. ingestion `persist_artifacts`) and mirrored by
|
||||
/// [`Self::clear_ingested_children`].
|
||||
pub const CLEAR_INGESTED_CHILD_ROWS_SURQL: &'static str = r"
|
||||
DELETE relates_to WHERE metadata.source_id = $source_id AND metadata.user_id = $user_id;
|
||||
DELETE text_chunk_embedding WHERE source_id = $source_id;
|
||||
DELETE text_chunk WHERE source_id = $source_id;
|
||||
DELETE knowledge_entity_embedding WHERE source_id = $source_id;
|
||||
DELETE knowledge_entity WHERE source_id = $source_id;
|
||||
";
|
||||
|
||||
/// Removes chunks, embeddings, entities, and relationships for one ingested document snapshot.
|
||||
pub async fn clear_ingested_children(
|
||||
source_id: &str,
|
||||
user_id: &str,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<(), AppError> {
|
||||
let query = format!(
|
||||
"BEGIN TRANSACTION;\n{} COMMIT TRANSACTION;",
|
||||
Self::CLEAR_INGESTED_CHILD_ROWS_SURQL
|
||||
);
|
||||
|
||||
db.client
|
||||
.query(query)
|
||||
.bind(("source_id", source_id.to_string()))
|
||||
.bind(("user_id", user_id.to_string()))
|
||||
.await
|
||||
.map_err(AppError::from)?
|
||||
.check()
|
||||
.map_err(AppError::from)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn patch(
|
||||
id: &str,
|
||||
context: &str,
|
||||
@@ -364,7 +399,14 @@ mod tests {
|
||||
use anyhow::{self, Context};
|
||||
|
||||
use super::*;
|
||||
use crate::test_utils::setup_test_db_with_runtime_indexes;
|
||||
use crate::{
|
||||
storage::types::{
|
||||
knowledge_entity::{KnowledgeEntity, KnowledgeEntityType},
|
||||
knowledge_relationship::KnowledgeRelationship,
|
||||
text_chunk::TextChunk,
|
||||
},
|
||||
test_utils::{setup_test_db, setup_test_db_with_runtime_indexes},
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_text_content_creation() -> anyhow::Result<()> {
|
||||
@@ -638,4 +680,81 @@ mod tests {
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn clear_ingested_children_removes_chunks_entities_and_relationships(
|
||||
) -> anyhow::Result<()> {
|
||||
let db = setup_test_db().await?;
|
||||
let user_id = "clear-user";
|
||||
let source_id = Uuid::new_v4().to_string();
|
||||
|
||||
let entity_a = KnowledgeEntity::new(
|
||||
source_id.clone(),
|
||||
"entity-a".to_string(),
|
||||
"desc-a".to_string(),
|
||||
KnowledgeEntityType::Idea,
|
||||
None,
|
||||
user_id.to_string(),
|
||||
);
|
||||
let entity_b = KnowledgeEntity::new(
|
||||
source_id.clone(),
|
||||
"entity-b".to_string(),
|
||||
"desc-b".to_string(),
|
||||
KnowledgeEntityType::Idea,
|
||||
None,
|
||||
user_id.to_string(),
|
||||
);
|
||||
KnowledgeEntity::store_with_embedding(entity_a.clone(), vec![0.1; 3], 3, &db)
|
||||
.await
|
||||
.context("store entity a")?;
|
||||
KnowledgeEntity::store_with_embedding(entity_b.clone(), vec![0.2; 3], 3, &db)
|
||||
.await
|
||||
.context("store entity b")?;
|
||||
|
||||
let chunk = TextChunk::new(source_id.clone(), "chunk".to_string(), user_id.to_string());
|
||||
TextChunk::store_with_embedding(chunk, vec![0.3; 3], 3, &db)
|
||||
.await
|
||||
.context("store chunk")?;
|
||||
|
||||
KnowledgeRelationship::new(
|
||||
entity_a.id.clone(),
|
||||
entity_b.id,
|
||||
user_id.to_string(),
|
||||
source_id.clone(),
|
||||
"relates_to".to_string(),
|
||||
)
|
||||
.store_relationship(&db)
|
||||
.await
|
||||
.context("store relationship")?;
|
||||
|
||||
TextContent::clear_ingested_children(&source_id, user_id, &db)
|
||||
.await
|
||||
.context("clear ingested children")?;
|
||||
|
||||
let chunks: Vec<TextChunk> = db
|
||||
.client
|
||||
.query("SELECT * FROM text_chunk WHERE source_id = $source_id;")
|
||||
.bind(("source_id", source_id.clone()))
|
||||
.await?
|
||||
.take(0)?;
|
||||
assert!(chunks.is_empty());
|
||||
|
||||
let entities: Vec<KnowledgeEntity> = db
|
||||
.client
|
||||
.query("SELECT * FROM knowledge_entity WHERE source_id = $source_id;")
|
||||
.bind(("source_id", source_id.clone()))
|
||||
.await?
|
||||
.take(0)?;
|
||||
assert!(entities.is_empty());
|
||||
|
||||
let relationships: Vec<KnowledgeRelationship> = db
|
||||
.client
|
||||
.query("SELECT * FROM relates_to WHERE metadata.source_id = $source_id;")
|
||||
.bind(("source_id", source_id))
|
||||
.await?
|
||||
.take(0)?;
|
||||
assert!(relationships.is_empty());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user