retrieval simplified

This commit is contained in:
Per Stark
2025-12-09 20:35:42 +01:00
parent a8d10f265c
commit a090a8c76e
55 changed files with 469 additions and 1208 deletions

View File

@@ -1,3 +1,8 @@
#![allow(
clippy::missing_docs_in_private_items,
clippy::result_large_err
)]
pub mod pipeline;
pub mod utils;

View File

@@ -31,17 +31,8 @@ impl Default for IngestionTuning {
}
}
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Default)]
pub struct IngestionConfig {
pub tuning: IngestionTuning,
pub chunk_only: bool,
}
impl Default for IngestionConfig {
fn default() -> Self {
Self {
tuning: IngestionTuning::default(),
chunk_only: false,
}
}
}

View File

@@ -52,7 +52,7 @@ impl LLMEnrichmentResult {
entity_concurrency: usize,
embedding_provider: Option<&EmbeddingProvider>,
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
let mapper = Arc::new(self.create_mapper()?);
let mapper = Arc::new(self.create_mapper());
let entities = self
.process_entities(
@@ -66,21 +66,22 @@ impl LLMEnrichmentResult {
)
.await?;
let relationships = self.process_relationships(source_id, user_id, Arc::clone(&mapper))?;
let relationships = self.process_relationships(source_id, user_id, mapper.as_ref())?;
Ok((entities, relationships))
}
fn create_mapper(&self) -> Result<GraphMapper, AppError> {
fn create_mapper(&self) -> GraphMapper {
let mut mapper = GraphMapper::new();
for entity in &self.knowledge_entities {
mapper.assign_id(&entity.key);
}
Ok(mapper)
mapper
}
#[allow(clippy::too_many_arguments)]
async fn process_entities(
&self,
source_id: &str,
@@ -91,7 +92,7 @@ impl LLMEnrichmentResult {
entity_concurrency: usize,
embedding_provider: Option<&EmbeddingProvider>,
) -> Result<Vec<EmbeddedKnowledgeEntity>, AppError> {
stream::iter(self.knowledge_entities.iter().cloned().map(|entity| {
stream::iter(self.knowledge_entities.clone().into_iter().map(|entity| {
let mapper = Arc::clone(&mapper);
let openai_client = openai_client.clone();
let source_id = source_id.to_string();
@@ -120,7 +121,7 @@ impl LLMEnrichmentResult {
&self,
source_id: &str,
user_id: &str,
mapper: Arc<GraphMapper>,
mapper: &GraphMapper,
) -> Result<Vec<KnowledgeRelationship>, AppError> {
self.relationships
.iter()
@@ -170,9 +171,9 @@ async fn create_single_entity(
id: assigned_id,
created_at: now,
updated_at: now,
name: llm_entity.name.to_string(),
description: llm_entity.description.to_string(),
entity_type: KnowledgeEntityType::from(llm_entity.entity_type.to_string()),
name: llm_entity.name.clone(),
description: llm_entity.description.clone(),
entity_type: KnowledgeEntityType::from(llm_entity.entity_type.clone()),
source_id: source_id.to_string(),
metadata: None,
user_id: user_id.into(),

View File

@@ -8,6 +8,7 @@ mod state;
pub use config::{IngestionConfig, IngestionTuning};
pub use enrichment_result::{LLMEnrichmentResult, LLMKnowledgeEntity, LLMRelationship};
#[allow(clippy::module_name_repetitions)]
pub use services::{DefaultPipelineServices, PipelineServices};
use std::{
@@ -37,6 +38,7 @@ use self::{
state::ready,
};
#[allow(clippy::module_name_repetitions)]
pub struct IngestionPipeline {
db: Arc<SurrealDbClient>,
pipeline_config: IngestionConfig,
@@ -44,7 +46,7 @@ pub struct IngestionPipeline {
}
impl IngestionPipeline {
pub async fn new(
pub fn new(
db: Arc<SurrealDbClient>,
openai_client: Arc<Client<async_openai::config::OpenAIConfig>>,
config: AppConfig,
@@ -61,10 +63,9 @@ impl IngestionPipeline {
embedding_provider,
IngestionConfig::default(),
)
.await
}
pub async fn new_with_config(
pub fn new_with_config(
db: Arc<SurrealDbClient>,
openai_client: Arc<Client<async_openai::config::OpenAIConfig>>,
config: AppConfig,
@@ -74,9 +75,9 @@ impl IngestionPipeline {
pipeline_config: IngestionConfig,
) -> Result<Self, AppError> {
let services = DefaultPipelineServices::new(
db.clone(),
openai_client.clone(),
config.clone(),
Arc::clone(&db),
openai_client,
config,
reranker_pool,
storage,
embedding_provider,
@@ -181,11 +182,17 @@ impl IngestionPipeline {
.saturating_sub(1)
.min(tuning.retry_backoff_cap_exponent);
let multiplier = 2_u64.pow(capped_attempt);
let delay = tuning.retry_base_delay_secs * multiplier;
let delay = tuning
.retry_base_delay_secs
.saturating_mul(multiplier);
Duration::from_secs(delay.min(tuning.retry_max_delay_secs))
}
/// Total whole milliseconds in `duration`, clamped to `u64::MAX` when the
/// 128-bit millisecond count does not fit in a `u64`.
fn duration_millis(duration: Duration) -> u64 {
    let millis = duration.as_millis();
    // `as_millis` returns u128; saturate instead of truncating on overflow.
    if millis > u128::from(u64::MAX) {
        u64::MAX
    } else {
        millis as u64
    }
}
#[tracing::instrument(
skip_all,
fields(task_id = %task.id, attempt = task.attempts, user_id = %task.user_id)
@@ -231,14 +238,14 @@ impl IngestionPipeline {
let persist_duration = stage_start.elapsed();
let total_duration = pipeline_started.elapsed();
let prepare_ms = prepare_duration.as_millis() as u64;
let retrieve_ms = retrieve_duration.as_millis() as u64;
let enrich_ms = enrich_duration.as_millis() as u64;
let persist_ms = persist_duration.as_millis() as u64;
let prepare_ms = Self::duration_millis(prepare_duration);
let retrieve_ms = Self::duration_millis(retrieve_duration);
let enrich_ms = Self::duration_millis(enrich_duration);
let persist_ms = Self::duration_millis(persist_duration);
info!(
task_id = %ctx.task_id,
attempt = ctx.attempt,
total_ms = total_duration.as_millis() as u64,
total_ms = Self::duration_millis(total_duration),
prepare_ms,
retrieve_ms,
enrich_ms,

View File

@@ -228,7 +228,7 @@ impl PipelineServices for DefaultPipelineServices {
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
analysis
.to_database_entities(
&content.get_id(),
content.get_id(),
&content.user_id,
&self.openai_client,
&self.db,
@@ -327,13 +327,13 @@ fn truncate_for_embedding(text: &str, max_chars: usize) -> String {
return text.to_string();
}
let mut truncated = String::with_capacity(max_chars + 3);
let mut truncated = String::with_capacity(max_chars.saturating_add(3));
for (idx, ch) in text.chars().enumerate() {
if idx >= max_chars {
break;
}
truncated.push(ch);
}
truncated.push_str("");
truncated.push('…');
truncated
}

View File

@@ -20,6 +20,22 @@ use super::{
state::{ContentPrepared, Enriched, IngestionMachine, Persisted, Ready, Retrieved},
};
// SurrealQL script used when persisting graph relationships: within one
// transaction it iterates the bound `$relationships` array and RELATEs each
// pair of `knowledge_entity` records through a `relates_to` edge. Record ids
// are built with `type::thing` from the relationship's `in`/`out`/`id`
// fields, and the relationship's `metadata` is stored on the edge.
// NOTE(review): assumes each bound relationship object carries `in`, `out`,
// `id`, and `metadata` fields — confirm against the caller's bind site.
const STORE_RELATIONSHIPS: &str = r"
BEGIN TRANSACTION;
LET $relationships = $relationships;
FOR $relationship IN $relationships {
LET $in_node = type::thing('knowledge_entity', $relationship.in);
LET $out_node = type::thing('knowledge_entity', $relationship.out);
RELATE $in_node->relates_to->$out_node CONTENT {
id: type::thing('relates_to', $relationship.id),
metadata: $relationship.metadata
};
};
COMMIT TRANSACTION;
";
#[instrument(
level = "trace",
skip_all,
@@ -40,8 +56,7 @@ pub async fn prepare_content(
let context_len = text_content
.context
.as_ref()
.map(|c| c.chars().count())
.unwrap_or(0);
.map_or(0, |c| c.chars().count());
tracing::info!(
task_id = %ctx.task_id,
@@ -65,7 +80,7 @@ pub async fn prepare_content(
machine
.prepare()
.map_err(|(_, guard)| map_guard_error("prepare", guard))
.map_err(|(_, guard)| map_guard_error("prepare", &guard))
}
#[instrument(
@@ -80,7 +95,7 @@ pub async fn retrieve_related(
if ctx.pipeline_config.chunk_only {
return machine
.retrieve()
.map_err(|(_, guard)| map_guard_error("retrieve", guard));
.map_err(|(_, guard)| map_guard_error("retrieve", &guard));
}
let content = ctx.text_content()?;
@@ -97,7 +112,7 @@ pub async fn retrieve_related(
machine
.retrieve()
.map_err(|(_, guard)| map_guard_error("retrieve", guard))
.map_err(|(_, guard)| map_guard_error("retrieve", &guard))
}
#[instrument(
@@ -116,7 +131,7 @@ pub async fn enrich(
});
return machine
.enrich()
.map_err(|(_, guard)| map_guard_error("enrich", guard));
.map_err(|(_, guard)| map_guard_error("enrich", &guard));
}
let content = ctx.text_content()?;
@@ -137,7 +152,7 @@ pub async fn enrich(
machine
.enrich()
.map_err(|(_, guard)| map_guard_error("enrich", guard))
.map_err(|(_, guard)| map_guard_error("enrich", &guard))
}
#[instrument(
@@ -182,10 +197,10 @@ pub async fn persist(
machine
.persist()
.map_err(|(_, guard)| map_guard_error("persist", guard))
.map_err(|(_, guard)| map_guard_error("persist", &guard))
}
fn map_guard_error(event: &str, guard: GuardError) -> AppError {
fn map_guard_error(event: &str, guard: &GuardError) -> AppError {
AppError::InternalError(format!(
"invalid ingestion pipeline transition during {event}: {guard:?}"
))
@@ -206,43 +221,31 @@ async fn store_graph_entities(
return Ok(());
}
const STORE_RELATIONSHIPS: &str = r"
BEGIN TRANSACTION;
LET $relationships = $relationships;
FOR $relationship IN $relationships {
LET $in_node = type::thing('knowledge_entity', $relationship.in);
LET $out_node = type::thing('knowledge_entity', $relationship.out);
RELATE $in_node->relates_to->$out_node CONTENT {
id: type::thing('relates_to', $relationship.id),
metadata: $relationship.metadata
};
};
COMMIT TRANSACTION;
";
let relationships = Arc::new(relationships);
let mut backoff_ms = tuning.graph_initial_backoff_ms;
let last_attempt = tuning.graph_store_attempts.saturating_sub(1);
for attempt in 0..tuning.graph_store_attempts {
let result = db
.client
.query(STORE_RELATIONSHIPS)
.bind(("relationships", relationships.clone()))
.bind(("relationships", Arc::clone(&relationships)))
.await;
match result {
Ok(_) => return Ok(()),
Err(err) => {
if is_retryable_conflict(&err) && attempt + 1 < tuning.graph_store_attempts {
if is_retryable_conflict(&err) && attempt < last_attempt {
let next_attempt = attempt.saturating_add(1);
warn!(
attempt = attempt + 1,
attempt = next_attempt,
"Transient SurrealDB conflict while storing graph data; retrying"
);
sleep(Duration::from_millis(backoff_ms)).await;
backoff_ms = (backoff_ms * 2).min(tuning.graph_max_backoff_ms);
backoff_ms = backoff_ms
.saturating_mul(2)
.min(tuning.graph_max_backoff_ms);
continue;
}

View File

@@ -65,7 +65,7 @@ fn infer_extension(file_info: &FileInfo) -> Option<String> {
Path::new(&file_info.path)
.extension()
.and_then(|ext| ext.to_str())
.map(|ext| ext.to_string())
.map(std::string::ToString::to_string)
}
pub async fn extract_text_from_file(

View File

@@ -116,6 +116,7 @@ async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, AppError> {
}
/// Uses the existing headless Chrome dependency to rasterize the requested PDF pages into PNGs.
#[allow(clippy::too_many_lines)]
async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
let file_url = url::Url::from_file_path(file_path)
.map_err(|()| AppError::Processing("Unable to construct PDF file URL".into()))?;
@@ -148,7 +149,7 @@ async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>
loaded = true;
break;
}
if attempt + 1 < NAVIGATION_RETRY_ATTEMPTS {
if attempt < NAVIGATION_RETRY_ATTEMPTS.saturating_sub(1) {
sleep(Duration::from_millis(NAVIGATION_RETRY_INTERVAL_MS)).await;
}
}
@@ -172,7 +173,7 @@ async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>
break;
}
Ok(None) => {
if attempt + 1 < CANVAS_VIEWPORT_ATTEMPTS {
if attempt < CANVAS_VIEWPORT_ATTEMPTS.saturating_sub(1) {
tokio::time::sleep(Duration::from_millis(CANVAS_VIEWPORT_WAIT_MS)).await;
}
}
@@ -260,6 +261,7 @@ fn create_browser() -> Result<Browser, AppError> {
}
/// Sends one or more rendered pages to the configured multimodal model and stitches the resulting Markdown chunks together.
#[allow(clippy::too_many_lines)]
async fn vision_markdown(
rendered_pages: Vec<Vec<u8>>,
db: &SurrealDbClient,
@@ -303,10 +305,11 @@ async fn vision_markdown(
let mut batch_markdown: Option<String> = None;
let last_attempt = MAX_VISION_ATTEMPTS.saturating_sub(1);
for attempt in 0..MAX_VISION_ATTEMPTS {
let prompt_text = prompt_for_attempt(attempt, prompt);
let mut content_parts = Vec::with_capacity(encoded_images.len() + 1);
let mut content_parts = Vec::with_capacity(encoded_images.len().saturating_add(1));
content_parts.push(
ChatCompletionRequestMessageContentPartTextArgs::default()
.text(prompt_text)
@@ -375,7 +378,7 @@ async fn vision_markdown(
batch = batch_idx,
attempt, "Vision model returned low quality response"
);
if attempt + 1 == MAX_VISION_ATTEMPTS {
if attempt == last_attempt {
return Err(AppError::Processing(
"Vision model failed to transcribe PDF page contents".into(),
));
@@ -400,6 +403,7 @@ async fn vision_markdown(
}
/// Heuristic that determines whether the fast-path text looks like well-formed prose.
#[allow(clippy::cast_precision_loss)]
fn looks_good_enough(text: &str) -> bool {
if text.len() < FAST_PATH_MIN_LEN {
return false;

View File

@@ -50,7 +50,7 @@ pub async fn extract_text_from_url(
)?;
let mut tmp_file = NamedTempFile::new()?;
let temp_path_str = format!("{:?}", tmp_file.path());
let temp_path_str = tmp_file.path().display().to_string();
tmp_file.write_all(&screenshot)?;
tmp_file.as_file().sync_all()?;
@@ -108,14 +108,11 @@ fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
}
}
let host = match url.host_str() {
Some(host) => host,
None => {
warn!(%url, "Rejected ingestion URL missing host");
return Err(AppError::Validation(
"URL is missing a host component".to_string(),
));
}
let Some(host) = url.host_str() else {
warn!(%url, "Rejected ingestion URL missing host");
return Err(AppError::Validation(
"URL is missing a host component".to_string(),
));
};
if host.eq_ignore_ascii_case("localhost") {