retrieval simplified

This commit is contained in:
Per Stark
2025-12-09 20:35:42 +01:00
parent a8d10f265c
commit a090a8c76e
55 changed files with 469 additions and 1208 deletions

View File

@@ -1,3 +1,8 @@
#![allow(
clippy::missing_docs_in_private_items,
clippy::result_large_err
)]
pub mod pipeline;
pub mod utils;

View File

@@ -31,17 +31,8 @@ impl Default for IngestionTuning {
}
}
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Default)]
pub struct IngestionConfig {
pub tuning: IngestionTuning,
pub chunk_only: bool,
}
impl Default for IngestionConfig {
fn default() -> Self {
Self {
tuning: IngestionTuning::default(),
chunk_only: false,
}
}
}

View File

@@ -52,7 +52,7 @@ impl LLMEnrichmentResult {
entity_concurrency: usize,
embedding_provider: Option<&EmbeddingProvider>,
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
let mapper = Arc::new(self.create_mapper()?);
let mapper = Arc::new(self.create_mapper());
let entities = self
.process_entities(
@@ -66,21 +66,22 @@ impl LLMEnrichmentResult {
)
.await?;
let relationships = self.process_relationships(source_id, user_id, Arc::clone(&mapper))?;
let relationships = self.process_relationships(source_id, user_id, mapper.as_ref())?;
Ok((entities, relationships))
}
fn create_mapper(&self) -> Result<GraphMapper, AppError> {
fn create_mapper(&self) -> GraphMapper {
let mut mapper = GraphMapper::new();
for entity in &self.knowledge_entities {
mapper.assign_id(&entity.key);
}
Ok(mapper)
mapper
}
#[allow(clippy::too_many_arguments)]
async fn process_entities(
&self,
source_id: &str,
@@ -91,7 +92,7 @@ impl LLMEnrichmentResult {
entity_concurrency: usize,
embedding_provider: Option<&EmbeddingProvider>,
) -> Result<Vec<EmbeddedKnowledgeEntity>, AppError> {
stream::iter(self.knowledge_entities.iter().cloned().map(|entity| {
stream::iter(self.knowledge_entities.clone().into_iter().map(|entity| {
let mapper = Arc::clone(&mapper);
let openai_client = openai_client.clone();
let source_id = source_id.to_string();
@@ -120,7 +121,7 @@ impl LLMEnrichmentResult {
&self,
source_id: &str,
user_id: &str,
mapper: Arc<GraphMapper>,
mapper: &GraphMapper,
) -> Result<Vec<KnowledgeRelationship>, AppError> {
self.relationships
.iter()
@@ -170,9 +171,9 @@ async fn create_single_entity(
id: assigned_id,
created_at: now,
updated_at: now,
name: llm_entity.name.to_string(),
description: llm_entity.description.to_string(),
entity_type: KnowledgeEntityType::from(llm_entity.entity_type.to_string()),
name: llm_entity.name.clone(),
description: llm_entity.description.clone(),
entity_type: KnowledgeEntityType::from(llm_entity.entity_type.clone()),
source_id: source_id.to_string(),
metadata: None,
user_id: user_id.into(),

View File

@@ -8,6 +8,7 @@ mod state;
pub use config::{IngestionConfig, IngestionTuning};
pub use enrichment_result::{LLMEnrichmentResult, LLMKnowledgeEntity, LLMRelationship};
#[allow(clippy::module_name_repetitions)]
pub use services::{DefaultPipelineServices, PipelineServices};
use std::{
@@ -37,6 +38,7 @@ use self::{
state::ready,
};
#[allow(clippy::module_name_repetitions)]
pub struct IngestionPipeline {
db: Arc<SurrealDbClient>,
pipeline_config: IngestionConfig,
@@ -44,7 +46,7 @@ pub struct IngestionPipeline {
}
impl IngestionPipeline {
pub async fn new(
pub fn new(
db: Arc<SurrealDbClient>,
openai_client: Arc<Client<async_openai::config::OpenAIConfig>>,
config: AppConfig,
@@ -61,10 +63,9 @@ impl IngestionPipeline {
embedding_provider,
IngestionConfig::default(),
)
.await
}
pub async fn new_with_config(
pub fn new_with_config(
db: Arc<SurrealDbClient>,
openai_client: Arc<Client<async_openai::config::OpenAIConfig>>,
config: AppConfig,
@@ -74,9 +75,9 @@ impl IngestionPipeline {
pipeline_config: IngestionConfig,
) -> Result<Self, AppError> {
let services = DefaultPipelineServices::new(
db.clone(),
openai_client.clone(),
config.clone(),
Arc::clone(&db),
openai_client,
config,
reranker_pool,
storage,
embedding_provider,
@@ -181,11 +182,17 @@ impl IngestionPipeline {
.saturating_sub(1)
.min(tuning.retry_backoff_cap_exponent);
let multiplier = 2_u64.pow(capped_attempt);
let delay = tuning.retry_base_delay_secs * multiplier;
let delay = tuning
.retry_base_delay_secs
.saturating_mul(multiplier);
Duration::from_secs(delay.min(tuning.retry_max_delay_secs))
}
/// Total whole milliseconds in `duration`, clamped to `u64::MAX` when the
/// 128-bit millisecond count does not fit in a `u64`.
fn duration_millis(duration: Duration) -> u64 {
    let millis = duration.as_millis();
    // `as_millis` returns u128; saturate instead of truncating on overflow.
    if millis > u128::from(u64::MAX) {
        u64::MAX
    } else {
        millis as u64
    }
}
#[tracing::instrument(
skip_all,
fields(task_id = %task.id, attempt = task.attempts, user_id = %task.user_id)
@@ -231,14 +238,14 @@ impl IngestionPipeline {
let persist_duration = stage_start.elapsed();
let total_duration = pipeline_started.elapsed();
let prepare_ms = prepare_duration.as_millis() as u64;
let retrieve_ms = retrieve_duration.as_millis() as u64;
let enrich_ms = enrich_duration.as_millis() as u64;
let persist_ms = persist_duration.as_millis() as u64;
let prepare_ms = Self::duration_millis(prepare_duration);
let retrieve_ms = Self::duration_millis(retrieve_duration);
let enrich_ms = Self::duration_millis(enrich_duration);
let persist_ms = Self::duration_millis(persist_duration);
info!(
task_id = %ctx.task_id,
attempt = ctx.attempt,
total_ms = total_duration.as_millis() as u64,
total_ms = Self::duration_millis(total_duration),
prepare_ms,
retrieve_ms,
enrich_ms,

View File

@@ -228,7 +228,7 @@ impl PipelineServices for DefaultPipelineServices {
) -> Result<(Vec<EmbeddedKnowledgeEntity>, Vec<KnowledgeRelationship>), AppError> {
analysis
.to_database_entities(
&content.get_id(),
content.get_id(),
&content.user_id,
&self.openai_client,
&self.db,
@@ -327,13 +327,13 @@ fn truncate_for_embedding(text: &str, max_chars: usize) -> String {
return text.to_string();
}
let mut truncated = String::with_capacity(max_chars + 3);
let mut truncated = String::with_capacity(max_chars.saturating_add(3));
for (idx, ch) in text.chars().enumerate() {
if idx >= max_chars {
break;
}
truncated.push(ch);
}
truncated.push_str("");
truncated.push('…');
truncated
}

View File

@@ -20,6 +20,22 @@ use super::{
state::{ContentPrepared, Enriched, IngestionMachine, Persisted, Ready, Retrieved},
};
// SurrealQL script used when persisting graph relationships: within one
// transaction it iterates the bound `$relationships` array and RELATEs each
// pair of `knowledge_entity` records through a `relates_to` edge. Record ids
// are built with `type::thing` from the relationship's `in`/`out`/`id`
// fields, and the relationship's `metadata` is stored on the edge.
// NOTE(review): assumes each bound relationship object carries `in`, `out`,
// `id`, and `metadata` fields — confirm against the caller's bind site.
const STORE_RELATIONSHIPS: &str = r"
BEGIN TRANSACTION;
LET $relationships = $relationships;
FOR $relationship IN $relationships {
LET $in_node = type::thing('knowledge_entity', $relationship.in);
LET $out_node = type::thing('knowledge_entity', $relationship.out);
RELATE $in_node->relates_to->$out_node CONTENT {
id: type::thing('relates_to', $relationship.id),
metadata: $relationship.metadata
};
};
COMMIT TRANSACTION;
";
#[instrument(
level = "trace",
skip_all,
@@ -40,8 +56,7 @@ pub async fn prepare_content(
let context_len = text_content
.context
.as_ref()
.map(|c| c.chars().count())
.unwrap_or(0);
.map_or(0, |c| c.chars().count());
tracing::info!(
task_id = %ctx.task_id,
@@ -65,7 +80,7 @@ pub async fn prepare_content(
machine
.prepare()
.map_err(|(_, guard)| map_guard_error("prepare", guard))
.map_err(|(_, guard)| map_guard_error("prepare", &guard))
}
#[instrument(
@@ -80,7 +95,7 @@ pub async fn retrieve_related(
if ctx.pipeline_config.chunk_only {
return machine
.retrieve()
.map_err(|(_, guard)| map_guard_error("retrieve", guard));
.map_err(|(_, guard)| map_guard_error("retrieve", &guard));
}
let content = ctx.text_content()?;
@@ -97,7 +112,7 @@ pub async fn retrieve_related(
machine
.retrieve()
.map_err(|(_, guard)| map_guard_error("retrieve", guard))
.map_err(|(_, guard)| map_guard_error("retrieve", &guard))
}
#[instrument(
@@ -116,7 +131,7 @@ pub async fn enrich(
});
return machine
.enrich()
.map_err(|(_, guard)| map_guard_error("enrich", guard));
.map_err(|(_, guard)| map_guard_error("enrich", &guard));
}
let content = ctx.text_content()?;
@@ -137,7 +152,7 @@ pub async fn enrich(
machine
.enrich()
.map_err(|(_, guard)| map_guard_error("enrich", guard))
.map_err(|(_, guard)| map_guard_error("enrich", &guard))
}
#[instrument(
@@ -182,10 +197,10 @@ pub async fn persist(
machine
.persist()
.map_err(|(_, guard)| map_guard_error("persist", guard))
.map_err(|(_, guard)| map_guard_error("persist", &guard))
}
fn map_guard_error(event: &str, guard: GuardError) -> AppError {
fn map_guard_error(event: &str, guard: &GuardError) -> AppError {
AppError::InternalError(format!(
"invalid ingestion pipeline transition during {event}: {guard:?}"
))
@@ -206,43 +221,31 @@ async fn store_graph_entities(
return Ok(());
}
const STORE_RELATIONSHIPS: &str = r"
BEGIN TRANSACTION;
LET $relationships = $relationships;
FOR $relationship IN $relationships {
LET $in_node = type::thing('knowledge_entity', $relationship.in);
LET $out_node = type::thing('knowledge_entity', $relationship.out);
RELATE $in_node->relates_to->$out_node CONTENT {
id: type::thing('relates_to', $relationship.id),
metadata: $relationship.metadata
};
};
COMMIT TRANSACTION;
";
let relationships = Arc::new(relationships);
let mut backoff_ms = tuning.graph_initial_backoff_ms;
let last_attempt = tuning.graph_store_attempts.saturating_sub(1);
for attempt in 0..tuning.graph_store_attempts {
let result = db
.client
.query(STORE_RELATIONSHIPS)
.bind(("relationships", relationships.clone()))
.bind(("relationships", Arc::clone(&relationships)))
.await;
match result {
Ok(_) => return Ok(()),
Err(err) => {
if is_retryable_conflict(&err) && attempt + 1 < tuning.graph_store_attempts {
if is_retryable_conflict(&err) && attempt < last_attempt {
let next_attempt = attempt.saturating_add(1);
warn!(
attempt = attempt + 1,
attempt = next_attempt,
"Transient SurrealDB conflict while storing graph data; retrying"
);
sleep(Duration::from_millis(backoff_ms)).await;
backoff_ms = (backoff_ms * 2).min(tuning.graph_max_backoff_ms);
backoff_ms = backoff_ms
.saturating_mul(2)
.min(tuning.graph_max_backoff_ms);
continue;
}

View File

@@ -65,7 +65,7 @@ fn infer_extension(file_info: &FileInfo) -> Option<String> {
Path::new(&file_info.path)
.extension()
.and_then(|ext| ext.to_str())
.map(|ext| ext.to_string())
.map(std::string::ToString::to_string)
}
pub async fn extract_text_from_file(

View File

@@ -116,6 +116,7 @@ async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, AppError> {
}
/// Uses the existing headless Chrome dependency to rasterize the requested PDF pages into PNGs.
#[allow(clippy::too_many_lines)]
async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
let file_url = url::Url::from_file_path(file_path)
.map_err(|()| AppError::Processing("Unable to construct PDF file URL".into()))?;
@@ -148,7 +149,7 @@ async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>
loaded = true;
break;
}
if attempt + 1 < NAVIGATION_RETRY_ATTEMPTS {
if attempt < NAVIGATION_RETRY_ATTEMPTS.saturating_sub(1) {
sleep(Duration::from_millis(NAVIGATION_RETRY_INTERVAL_MS)).await;
}
}
@@ -172,7 +173,7 @@ async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>
break;
}
Ok(None) => {
if attempt + 1 < CANVAS_VIEWPORT_ATTEMPTS {
if attempt < CANVAS_VIEWPORT_ATTEMPTS.saturating_sub(1) {
tokio::time::sleep(Duration::from_millis(CANVAS_VIEWPORT_WAIT_MS)).await;
}
}
@@ -260,6 +261,7 @@ fn create_browser() -> Result<Browser, AppError> {
}
/// Sends one or more rendered pages to the configured multimodal model and stitches the resulting Markdown chunks together.
#[allow(clippy::too_many_lines)]
async fn vision_markdown(
rendered_pages: Vec<Vec<u8>>,
db: &SurrealDbClient,
@@ -303,10 +305,11 @@ async fn vision_markdown(
let mut batch_markdown: Option<String> = None;
let last_attempt = MAX_VISION_ATTEMPTS.saturating_sub(1);
for attempt in 0..MAX_VISION_ATTEMPTS {
let prompt_text = prompt_for_attempt(attempt, prompt);
let mut content_parts = Vec::with_capacity(encoded_images.len() + 1);
let mut content_parts = Vec::with_capacity(encoded_images.len().saturating_add(1));
content_parts.push(
ChatCompletionRequestMessageContentPartTextArgs::default()
.text(prompt_text)
@@ -375,7 +378,7 @@ async fn vision_markdown(
batch = batch_idx,
attempt, "Vision model returned low quality response"
);
if attempt + 1 == MAX_VISION_ATTEMPTS {
if attempt == last_attempt {
return Err(AppError::Processing(
"Vision model failed to transcribe PDF page contents".into(),
));
@@ -400,6 +403,7 @@ async fn vision_markdown(
}
/// Heuristic that determines whether the fast-path text looks like well-formed prose.
#[allow(clippy::cast_precision_loss)]
fn looks_good_enough(text: &str) -> bool {
if text.len() < FAST_PATH_MIN_LEN {
return false;

View File

@@ -50,7 +50,7 @@ pub async fn extract_text_from_url(
)?;
let mut tmp_file = NamedTempFile::new()?;
let temp_path_str = format!("{:?}", tmp_file.path());
let temp_path_str = tmp_file.path().display().to_string();
tmp_file.write_all(&screenshot)?;
tmp_file.as_file().sync_all()?;
@@ -108,14 +108,11 @@ fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
}
}
let host = match url.host_str() {
Some(host) => host,
None => {
warn!(%url, "Rejected ingestion URL missing host");
return Err(AppError::Validation(
"URL is missing a host component".to_string(),
));
}
let Some(host) = url.host_str() else {
warn!(%url, "Rejected ingestion URL missing host");
return Err(AppError::Validation(
"URL is missing a host component".to_string(),
));
};
if host.eq_ignore_ascii_case("localhost") {