mirror of
https://github.com/perstarkse/minne.git
synced 2026-07-02 11:01:38 +02:00
perf: batch entity embeddings during ingest and expand retry tests.
Entity enrichment now uses embed_batch like chunks; the unused entity_embedding_concurrency knob is removed and ingest retry paths gain test coverage.
This commit is contained in:
@@ -2,10 +2,12 @@
|
|||||||
## Unreleased
|
## Unreleased
|
||||||
- Performance: ingestion skips per-task index rebuild; worker runs scheduled `REBUILD INDEX` (default every 24h via `index_rebuild_interval_secs`, `0` disables)
|
- Performance: ingestion skips per-task index rebuild; worker runs scheduled `REBUILD INDEX` (default every 24h via `index_rebuild_interval_secs`, `0` disables)
|
||||||
- Performance: ingestion persists all artifacts in a single SurrealDB transaction per task (atomic replace by task id)
|
- Performance: ingestion persists all artifacts in a single SurrealDB transaction per task (atomic replace by task id)
|
||||||
|
- Performance: entity embeddings during ingestion use batched `embed_batch`, matching chunk embedding
|
||||||
- Fix: ingestion reclaims tasks after a successful persist without re-running the pipeline when `mark_succeeded` failed
|
- Fix: ingestion reclaims tasks after a successful persist without re-running the pipeline when `mark_succeeded` failed
|
||||||
- Fix: content deletion clears graph relationships via shared `TextContent::clear_ingested_children`
|
- Fix: content deletion clears graph relationships via shared `TextContent::clear_ingested_children`
|
||||||
- Fix: regression re suggestion of relationships
|
- Fix: regression re suggestion of relationships
|
||||||
- Internal: eval corpus DB seed uses `persist_artifacts` instead of a separate batched insert path
|
- Internal: eval corpus DB seed uses `persist_artifacts` instead of a separate batched insert path
|
||||||
|
- Internal: removed unused `entity_embedding_concurrency` ingest tuning knob
|
||||||
|
|
||||||
## 1.0.3 (2026-06-12)
|
## 1.0.3 (2026-06-12)
|
||||||
- Search: filter results by type — knowledge entities, ingested content, or both
|
- Search: filter results by type — knowledge entities, ingested content, or both
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ pub struct IngestionTuning {
|
|||||||
pub chunk_min_tokens: usize,
|
pub chunk_min_tokens: usize,
|
||||||
pub chunk_max_tokens: usize,
|
pub chunk_max_tokens: usize,
|
||||||
pub chunk_overlap_tokens: usize,
|
pub chunk_overlap_tokens: usize,
|
||||||
pub entity_embedding_concurrency: usize,
|
|
||||||
/// Maximum characters of content body used to build the similarity-search query
|
/// Maximum characters of content body used to build the similarity-search query
|
||||||
/// during retrieval. Longer bodies are truncated to keep embedding inputs bounded.
|
/// during retrieval. Longer bodies are truncated to keep embedding inputs bounded.
|
||||||
pub embedding_query_char_limit: usize,
|
pub embedding_query_char_limit: usize,
|
||||||
@@ -27,7 +26,6 @@ impl Default for IngestionTuning {
|
|||||||
chunk_min_tokens: 256,
|
chunk_min_tokens: 256,
|
||||||
chunk_max_tokens: 512,
|
chunk_max_tokens: 512,
|
||||||
chunk_overlap_tokens: 50,
|
chunk_overlap_tokens: 50,
|
||||||
entity_embedding_concurrency: 4,
|
|
||||||
embedding_query_char_limit: 12_000,
|
embedding_query_char_limit: 12_000,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -111,11 +111,7 @@ impl<'a> PipelineContext<'a> {
|
|||||||
|
|
||||||
let (entities, relationships) = self
|
let (entities, relationships) = self
|
||||||
.services
|
.services
|
||||||
.convert_analysis(
|
.convert_analysis(&content, &analysis)
|
||||||
&content,
|
|
||||||
&analysis,
|
|
||||||
self.pipeline_config.tuning.entity_embedding_concurrency,
|
|
||||||
)
|
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
let chunk_range = self.chunk_token_range();
|
let chunk_range = self.chunk_token_range();
|
||||||
|
|||||||
@@ -1,7 +1,4 @@
|
|||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use chrono::Utc;
|
use chrono::Utc;
|
||||||
use futures::stream::{self, StreamExt, TryStreamExt};
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use common::{
|
use common::{
|
||||||
@@ -43,22 +40,15 @@ impl LLMEnrichmentResult {
|
|||||||
&self,
|
&self,
|
||||||
source_id: &str,
|
source_id: &str,
|
||||||
user_id: &str,
|
user_id: &str,
|
||||||
entity_concurrency: usize,
|
|
||||||
embedding_provider: &EmbeddingProvider,
|
embedding_provider: &EmbeddingProvider,
|
||||||
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
||||||
let mapper = Arc::new(self.create_mapper());
|
let mapper = self.create_mapper();
|
||||||
|
|
||||||
let entities = self
|
let entities = self
|
||||||
.process_entities(
|
.process_entities(source_id, user_id, &mapper, embedding_provider)
|
||||||
source_id,
|
|
||||||
user_id,
|
|
||||||
Arc::clone(&mapper),
|
|
||||||
entity_concurrency,
|
|
||||||
embedding_provider,
|
|
||||||
)
|
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
let relationships = self.process_relationships(source_id, user_id, mapper.as_ref())?;
|
let relationships = self.process_relationships(source_id, user_id, &mapper)?;
|
||||||
|
|
||||||
Ok((entities, relationships))
|
Ok((entities, relationships))
|
||||||
}
|
}
|
||||||
@@ -77,36 +67,64 @@ impl LLMEnrichmentResult {
|
|||||||
&self,
|
&self,
|
||||||
source_id: &str,
|
source_id: &str,
|
||||||
user_id: &str,
|
user_id: &str,
|
||||||
mapper: Arc<GraphMapper>,
|
mapper: &GraphMapper,
|
||||||
entity_concurrency: usize,
|
|
||||||
embedding_provider: &EmbeddingProvider,
|
embedding_provider: &EmbeddingProvider,
|
||||||
) -> Result<Vec<EmbeddedKnowledgeEntity>, AppError> {
|
) -> Result<Vec<EmbeddedKnowledgeEntity>, AppError> {
|
||||||
let tasks: Vec<_> = self
|
if self.knowledge_entities.is_empty() {
|
||||||
.knowledge_entities
|
return Ok(Vec::new());
|
||||||
.iter()
|
}
|
||||||
.map(|entity| {
|
|
||||||
let llm_entity = entity.clone();
|
|
||||||
let mapper = Arc::clone(&mapper);
|
|
||||||
let source_id = source_id.to_string();
|
|
||||||
let user_id = user_id.to_string();
|
|
||||||
|
|
||||||
async move {
|
let now = Utc::now();
|
||||||
create_single_entity(
|
let mut prepared = Vec::with_capacity(self.knowledge_entities.len());
|
||||||
llm_entity,
|
let mut embedding_inputs = Vec::with_capacity(self.knowledge_entities.len());
|
||||||
&source_id,
|
|
||||||
&user_id,
|
|
||||||
mapper,
|
|
||||||
embedding_provider,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
stream::iter(tasks)
|
for llm_entity in &self.knowledge_entities {
|
||||||
.buffer_unordered(entity_concurrency.max(1))
|
let assigned_id = mapper.get_id(&llm_entity.key)?.to_string();
|
||||||
.try_collect()
|
let entity_type = KnowledgeEntityType::from(llm_entity.entity_type.clone());
|
||||||
|
embedding_inputs.push(KnowledgeEntity::embedding_input_text(
|
||||||
|
&llm_entity.name,
|
||||||
|
&llm_entity.description,
|
||||||
|
entity_type,
|
||||||
|
));
|
||||||
|
prepared.push((llm_entity, assigned_id, entity_type));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Embed all entities from this document in one batch: a single lock acquisition and one
|
||||||
|
// blocking hop, letting the backend batch the inference internally.
|
||||||
|
let embeddings = embedding_provider
|
||||||
|
.embed_batch(&embedding_inputs)
|
||||||
.await
|
.await
|
||||||
|
.map_err(|e| AppError::InternalError(format!("entity embedding batch failed: {e}")))?;
|
||||||
|
|
||||||
|
if embeddings.len() != prepared.len() {
|
||||||
|
return Err(AppError::InternalError(format!(
|
||||||
|
"embedding batch returned {} vectors for {} entities",
|
||||||
|
embeddings.len(),
|
||||||
|
prepared.len()
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut entities = Vec::with_capacity(prepared.len());
|
||||||
|
for ((llm_entity, assigned_id, entity_type), embedding) in
|
||||||
|
prepared.into_iter().zip(embeddings)
|
||||||
|
{
|
||||||
|
entities.push(EmbeddedKnowledgeEntity {
|
||||||
|
entity: KnowledgeEntity {
|
||||||
|
id: assigned_id,
|
||||||
|
created_at: now,
|
||||||
|
updated_at: now,
|
||||||
|
name: llm_entity.name.clone(),
|
||||||
|
description: llm_entity.description.clone(),
|
||||||
|
entity_type,
|
||||||
|
source_id: source_id.to_string(),
|
||||||
|
metadata: None,
|
||||||
|
user_id: user_id.to_string(),
|
||||||
|
},
|
||||||
|
embedding,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(entities)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process_relationships(
|
fn process_relationships(
|
||||||
@@ -133,44 +151,11 @@ impl LLMEnrichmentResult {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn create_single_entity(
|
|
||||||
llm_entity: LLMKnowledgeEntity,
|
|
||||||
source_id: &str,
|
|
||||||
user_id: &str,
|
|
||||||
mapper: Arc<GraphMapper>,
|
|
||||||
embedding_provider: &EmbeddingProvider,
|
|
||||||
) -> Result<EmbeddedKnowledgeEntity, AppError> {
|
|
||||||
let assigned_id = mapper.get_id(&llm_entity.key)?.to_string();
|
|
||||||
|
|
||||||
let entity_type = KnowledgeEntityType::from(llm_entity.entity_type);
|
|
||||||
let embedding_input = KnowledgeEntity::embedding_input_text(
|
|
||||||
&llm_entity.name,
|
|
||||||
&llm_entity.description,
|
|
||||||
entity_type,
|
|
||||||
);
|
|
||||||
|
|
||||||
let embedding = embedding_provider.embed(&embedding_input).await?;
|
|
||||||
|
|
||||||
let now = Utc::now();
|
|
||||||
let entity = KnowledgeEntity {
|
|
||||||
id: assigned_id,
|
|
||||||
created_at: now,
|
|
||||||
updated_at: now,
|
|
||||||
name: llm_entity.name,
|
|
||||||
description: llm_entity.description,
|
|
||||||
entity_type,
|
|
||||||
source_id: source_id.to_string(),
|
|
||||||
metadata: None,
|
|
||||||
user_id: user_id.into(),
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(EmbeddedKnowledgeEntity { entity, embedding })
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
#![allow(clippy::expect_used)]
|
#![allow(clippy::expect_used)]
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use common::utils::embedding::EmbeddingProvider;
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
||||||
fn entity(key: &str) -> LLMKnowledgeEntity {
|
fn entity(key: &str) -> LLMKnowledgeEntity {
|
||||||
@@ -247,6 +232,32 @@ mod tests {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn process_entities_batches_embeddings_and_preserves_order() -> anyhow::Result<()> {
|
||||||
|
let result = LLMEnrichmentResult {
|
||||||
|
knowledge_entities: vec![entity("k1"), entity("k2"), entity("k3")],
|
||||||
|
relationships: Vec::new(),
|
||||||
|
};
|
||||||
|
let mapper = result.create_mapper();
|
||||||
|
let provider = EmbeddingProvider::new_hashed(8)?;
|
||||||
|
|
||||||
|
let entities = result
|
||||||
|
.process_entities("source-1", "user-1", &mapper, &provider)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
assert_eq!(entities.len(), 3);
|
||||||
|
let first = entities.first().expect("first entity");
|
||||||
|
let second = entities.get(1).expect("second entity");
|
||||||
|
let third = entities.get(2).expect("third entity");
|
||||||
|
assert_eq!(first.entity.name, "name-k1");
|
||||||
|
assert_eq!(second.entity.name, "name-k2");
|
||||||
|
assert_eq!(third.entity.name, "name-k3");
|
||||||
|
assert!(entities.iter().all(|item| item.embedding.len() == 8));
|
||||||
|
assert_ne!(first.embedding, second.embedding);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn process_relationships_errors_on_unknown_endpoint() {
|
fn process_relationships_errors_on_unknown_endpoint() {
|
||||||
let result = LLMEnrichmentResult {
|
let result = LLMEnrichmentResult {
|
||||||
|
|||||||
@@ -347,3 +347,85 @@ mod test_support;
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests;
|
mod tests;
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod finalize_tests {
|
||||||
|
use std::{sync::Arc, time::Duration};
|
||||||
|
|
||||||
|
use common::storage::types::{
|
||||||
|
ingestion_payload::IngestionPayload,
|
||||||
|
ingestion_task::{IngestionTask, TaskState},
|
||||||
|
};
|
||||||
|
use tokio::time::sleep;
|
||||||
|
|
||||||
|
use super::{
|
||||||
|
config::IngestionTuning,
|
||||||
|
test_support::setup_db,
|
||||||
|
tests::{pipeline_config, reserve_task, MockServices},
|
||||||
|
IngestionPipeline, PipelineServices,
|
||||||
|
};
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn finalize_succeeded_retries_mark_succeeded() -> anyhow::Result<()> {
|
||||||
|
use anyhow::Context;
|
||||||
|
let db = setup_db().await?;
|
||||||
|
let worker_id = "worker-finalize-retry";
|
||||||
|
let user_id = "user-finalize-retry";
|
||||||
|
let services: Arc<dyn PipelineServices> = Arc::new(MockServices::new(user_id));
|
||||||
|
let mut config = pipeline_config();
|
||||||
|
config.tuning = IngestionTuning {
|
||||||
|
persist_attempts: 3,
|
||||||
|
persist_initial_backoff_ms: 10,
|
||||||
|
persist_max_backoff_ms: 10,
|
||||||
|
..IngestionTuning::default()
|
||||||
|
};
|
||||||
|
let pipeline =
|
||||||
|
IngestionPipeline::with_services(Arc::new(db.clone()), config, services)?;
|
||||||
|
|
||||||
|
let task = reserve_task(
|
||||||
|
&db,
|
||||||
|
worker_id,
|
||||||
|
IngestionPayload::Text {
|
||||||
|
text: "Finalize retry payload".into(),
|
||||||
|
context: "Context".into(),
|
||||||
|
category: "notes".into(),
|
||||||
|
user_id: user_id.into(),
|
||||||
|
},
|
||||||
|
user_id,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let processing = task.mark_processing(&db).await?;
|
||||||
|
|
||||||
|
db.client
|
||||||
|
.query(
|
||||||
|
"UPDATE type::thing('ingestion_task', $id) SET worker_id = $wrong_worker;",
|
||||||
|
)
|
||||||
|
.bind(("id", processing.id.clone()))
|
||||||
|
.bind(("wrong_worker", "wrong-worker"))
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let task_id = processing.id.clone();
|
||||||
|
let db_fix = db.clone();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
sleep(Duration::from_millis(5)).await;
|
||||||
|
let _ = db_fix
|
||||||
|
.client
|
||||||
|
.query(
|
||||||
|
"UPDATE type::thing('ingestion_task', $id) SET worker_id = $worker_id;",
|
||||||
|
)
|
||||||
|
.bind(("id", task_id))
|
||||||
|
.bind(("worker_id", worker_id))
|
||||||
|
.await;
|
||||||
|
});
|
||||||
|
|
||||||
|
pipeline.finalize_succeeded(&processing).await?;
|
||||||
|
|
||||||
|
let stored: IngestionTask = db
|
||||||
|
.get_item(&processing.id)
|
||||||
|
.await?
|
||||||
|
.context("task stored")?;
|
||||||
|
assert_eq!(stored.state, TaskState::Succeeded);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -124,6 +124,13 @@ async fn execute_persist_transaction(
|
|||||||
db: &SurrealDbClient,
|
db: &SurrealDbClient,
|
||||||
payload: &PersistPayload,
|
payload: &PersistPayload,
|
||||||
) -> Result<(), AppError> {
|
) -> Result<(), AppError> {
|
||||||
|
#[cfg(test)]
|
||||||
|
if test_persist_should_fail() {
|
||||||
|
return Err(AppError::InternalError(
|
||||||
|
"Failed to commit transaction due to a read or write conflict".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
let mut query = String::from("BEGIN TRANSACTION;\n");
|
let mut query = String::from("BEGIN TRANSACTION;\n");
|
||||||
query.push_str(TextContent::CLEAR_INGESTED_CHILD_ROWS_SURQL);
|
query.push_str(TextContent::CLEAR_INGESTED_CHILD_ROWS_SURQL);
|
||||||
query.push_str(
|
query.push_str(
|
||||||
@@ -236,6 +243,24 @@ fn is_retryable_conflict(error: &AppError) -> bool {
|
|||||||
.contains("Failed to commit transaction due to a read or write conflict")
|
.contains("Failed to commit transaction due to a read or write conflict")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
static TEST_PERSIST_FAILURES: std::sync::atomic::AtomicUsize = std::sync::atomic::AtomicUsize::new(0);
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
fn set_test_persist_failures(count: usize) {
|
||||||
|
TEST_PERSIST_FAILURES.store(count, std::sync::atomic::Ordering::SeqCst);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
fn test_persist_should_fail() -> bool {
|
||||||
|
let remaining = TEST_PERSIST_FAILURES.load(std::sync::atomic::Ordering::SeqCst);
|
||||||
|
if remaining == 0 {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
TEST_PERSIST_FAILURES.fetch_sub(1, std::sync::atomic::Ordering::SeqCst);
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use common::storage::types::text_content::TextContent;
|
use common::storage::types::text_content::TextContent;
|
||||||
@@ -340,4 +365,44 @@ mod tests {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn is_retryable_conflict_matches_surreal_transaction_conflict() {
|
||||||
|
let err = AppError::InternalError(
|
||||||
|
"Failed to commit transaction due to a read or write conflict".into(),
|
||||||
|
);
|
||||||
|
assert!(is_retryable_conflict(&err));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn is_retryable_conflict_rejects_unrelated_errors() {
|
||||||
|
let err = AppError::Validation("invalid payload".into());
|
||||||
|
assert!(!is_retryable_conflict(&err));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn persist_artifacts_retries_transient_conflicts() -> anyhow::Result<()> {
|
||||||
|
set_test_persist_failures(2);
|
||||||
|
|
||||||
|
let db = setup_db().await?;
|
||||||
|
let source_id = uuid::Uuid::new_v4().to_string();
|
||||||
|
let user_id = "persist-retry";
|
||||||
|
let mut tuning = test_support::tuning();
|
||||||
|
tuning.persist_attempts = 3;
|
||||||
|
tuning.persist_initial_backoff_ms = 1;
|
||||||
|
tuning.persist_max_backoff_ms = 1;
|
||||||
|
|
||||||
|
let counts = persist_artifacts(
|
||||||
|
&db,
|
||||||
|
&tuning,
|
||||||
|
TEST_EMBEDDING_DIM,
|
||||||
|
sample_artifacts(&source_id, user_id),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
assert_eq!(counts.chunk_count, 1);
|
||||||
|
assert_eq!(count_chunks_for_source(&db, &source_id).await?, 1);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -51,7 +51,6 @@ pub trait PipelineServices: Send + Sync {
|
|||||||
&self,
|
&self,
|
||||||
content: &TextContent,
|
content: &TextContent,
|
||||||
analysis: &LLMEnrichmentResult,
|
analysis: &LLMEnrichmentResult,
|
||||||
entity_concurrency: usize,
|
|
||||||
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError>;
|
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError>;
|
||||||
|
|
||||||
async fn prepare_chunks(
|
async fn prepare_chunks(
|
||||||
@@ -230,15 +229,9 @@ impl PipelineServices for DefaultPipelineServices {
|
|||||||
&self,
|
&self,
|
||||||
content: &TextContent,
|
content: &TextContent,
|
||||||
analysis: &LLMEnrichmentResult,
|
analysis: &LLMEnrichmentResult,
|
||||||
entity_concurrency: usize,
|
|
||||||
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
||||||
analysis
|
analysis
|
||||||
.to_database_entities(
|
.to_database_entities(content.id(), &content.user_id, &self.embedding_provider)
|
||||||
content.id(),
|
|
||||||
&content.user_id,
|
|
||||||
entity_concurrency,
|
|
||||||
&self.embedding_provider,
|
|
||||||
)
|
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ use super::{
|
|||||||
IngestionPipeline,
|
IngestionPipeline,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct MockServices {
|
pub(crate) struct MockServices {
|
||||||
text_content: TextContent,
|
text_content: TextContent,
|
||||||
similar_entities: Vec<RetrievedEntity>,
|
similar_entities: Vec<RetrievedEntity>,
|
||||||
analysis: LLMEnrichmentResult,
|
analysis: LLMEnrichmentResult,
|
||||||
@@ -42,7 +42,7 @@ struct MockServices {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl MockServices {
|
impl MockServices {
|
||||||
fn new(user_id: &str) -> Self {
|
pub(crate) fn new(user_id: &str) -> Self {
|
||||||
const TEST_EMBEDDING_DIM: usize = 1536;
|
const TEST_EMBEDDING_DIM: usize = 1536;
|
||||||
let text_content = TextContent::new(
|
let text_content = TextContent::new(
|
||||||
"Example document for ingestion pipeline.".into(),
|
"Example document for ingestion pipeline.".into(),
|
||||||
@@ -145,7 +145,6 @@ impl PipelineServices for MockServices {
|
|||||||
&self,
|
&self,
|
||||||
content: &TextContent,
|
content: &TextContent,
|
||||||
_analysis: &LLMEnrichmentResult,
|
_analysis: &LLMEnrichmentResult,
|
||||||
_entity_concurrency: usize,
|
|
||||||
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
||||||
self.record("convert").await;
|
self.record("convert").await;
|
||||||
let entities = self
|
let entities = self
|
||||||
@@ -221,10 +220,9 @@ impl PipelineServices for FailingServices {
|
|||||||
&self,
|
&self,
|
||||||
content: &TextContent,
|
content: &TextContent,
|
||||||
analysis: &LLMEnrichmentResult,
|
analysis: &LLMEnrichmentResult,
|
||||||
entity_concurrency: usize,
|
|
||||||
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
||||||
self.inner
|
self.inner
|
||||||
.convert_analysis(content, analysis, entity_concurrency)
|
.convert_analysis(content, analysis)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -268,7 +266,6 @@ impl PipelineServices for ValidationServices {
|
|||||||
&self,
|
&self,
|
||||||
_content: &TextContent,
|
_content: &TextContent,
|
||||||
_analysis: &LLMEnrichmentResult,
|
_analysis: &LLMEnrichmentResult,
|
||||||
_entity_concurrency: usize,
|
|
||||||
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
|
||||||
unreachable!("convert_analysis should not be called after validation failure")
|
unreachable!("convert_analysis should not be called after validation failure")
|
||||||
}
|
}
|
||||||
@@ -283,19 +280,18 @@ impl PipelineServices for ValidationServices {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pipeline_config() -> IngestionConfig {
|
pub(crate) fn pipeline_config() -> IngestionConfig {
|
||||||
IngestionConfig {
|
IngestionConfig {
|
||||||
tuning: IngestionTuning {
|
tuning: IngestionTuning {
|
||||||
chunk_min_tokens: 4,
|
chunk_min_tokens: 4,
|
||||||
chunk_max_tokens: 64,
|
chunk_max_tokens: 64,
|
||||||
entity_embedding_concurrency: 2,
|
|
||||||
..IngestionTuning::default()
|
..IngestionTuning::default()
|
||||||
},
|
},
|
||||||
chunk_only: false,
|
chunk_only: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn reserve_task(
|
pub(crate) async fn reserve_task(
|
||||||
db: &SurrealDbClient,
|
db: &SurrealDbClient,
|
||||||
worker_id: &str,
|
worker_id: &str,
|
||||||
payload: IngestionPayload,
|
payload: IngestionPayload,
|
||||||
@@ -459,6 +455,34 @@ async fn ingestion_pipeline_chunk_only_skips_analysis() -> anyhow::Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn produce_artifacts_returns_enriched_snapshot_without_persisting() -> anyhow::Result<()> {
|
||||||
|
let db = setup_db().await?;
|
||||||
|
let user_id = "user-produce";
|
||||||
|
let services = Arc::new(MockServices::new(user_id));
|
||||||
|
let pipeline =
|
||||||
|
IngestionPipeline::with_services(Arc::new(db.clone()), pipeline_config(), services)?;
|
||||||
|
|
||||||
|
let payload = IngestionPayload::Text {
|
||||||
|
text: "Produce artifacts payload".into(),
|
||||||
|
context: "Context".into(),
|
||||||
|
category: "notes".into(),
|
||||||
|
user_id: user_id.into(),
|
||||||
|
};
|
||||||
|
let task = IngestionTask::new(payload, user_id.to_string());
|
||||||
|
|
||||||
|
let artifacts = pipeline.produce_artifacts(&task).await?;
|
||||||
|
|
||||||
|
assert_eq!(artifacts.text_content.user_id, user_id);
|
||||||
|
assert_eq!(artifacts.chunks.len(), 1);
|
||||||
|
assert_eq!(artifacts.entities.len(), 1);
|
||||||
|
assert_eq!(artifacts.relationships.len(), 1);
|
||||||
|
assert_eq!(count_chunks_for_source(&db, &task.id).await?, 0);
|
||||||
|
assert_eq!(count_entities_for_source(&db, &task.id).await?, 0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn ingestion_pipeline_failure_marks_retry() -> anyhow::Result<()> {
|
async fn ingestion_pipeline_failure_marks_retry() -> anyhow::Result<()> {
|
||||||
let db = setup_db().await?;
|
let db = setup_db().await?;
|
||||||
|
|||||||
Reference in New Issue
Block a user