feat: refactoring complete?

2026-04-17 22:49:43 +02:00 · 2024-11-21 21:23:49 +01:00
parent 94f328e542
commit 1e789e1153
27 changed files with 428 additions and 338 deletions
--- a/src/ingress/content_processor.rs
+++ b/src/ingress/content_processor.rs
@@ -0,0 +1,119 @@
+use text_splitter::TextSplitter;
+use tracing::{debug, info};
+
+use crate::{
+    error::ProcessingError,
+    retrieval::vector::find_items_by_vector_similarity,
+    storage::{
+        db::{store_item, SurrealDbClient},
+        types::{
+            knowledge_entity::KnowledgeEntity, knowledge_relationship::KnowledgeRelationship,
+            text_chunk::TextChunk, text_content::TextContent,
+        },
+    },
+    utils::embedding::generate_embedding,
+};
+
+use super::analysis::{
+    ingress_analyser::IngressAnalyzer, types::llm_analysis_result::LLMGraphAnalysisResult,
+};
+
+/// Drives the ingestion of a single [`TextContent`] item.
+///
+/// Owns the two clients that every ingestion step in this file borrows.
+pub struct ContentProcessor {
+    /// Database client handed to `store_item`, the similarity search, the analyser and the
+    /// index rebuild.
+    db_client: SurrealDbClient,
+    /// OpenAI client handed to the analyser, the similarity search, the entity conversion and
+    /// `generate_embedding`.
+    openai_client: async_openai::Client<async_openai::config::OpenAIConfig>,
+}
+
+impl ContentProcessor {
+    /// Creates a processor backed by a new database client and an OpenAI client built with
+    /// `async_openai::Client::new()`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a [`ProcessingError`] if the database client cannot be created.
+    pub async fn new() -> Result<Self, ProcessingError> {
+        let db_client = SurrealDbClient::new().await?;
+        let openai_client = async_openai::Client::new();
+
+        Ok(Self {
+            db_client,
+            openai_client,
+        })
+    }
+
+    /// Ingests one piece of content end to end.
+    ///
+    /// Steps, in order:
+    /// 1. store the original content,
+    /// 2. run the semantic analysis and the similarity lookup concurrently,
+    /// 3. convert the analysis into entities and relationships,
+    /// 4. store the graph data and the embedded text chunks concurrently,
+    /// 5. rebuild the database indexes.
+    ///
+    /// # Errors
+    ///
+    /// Returns the first [`ProcessingError`] raised by any step. This function performs no
+    /// rollback, so items written by earlier steps stay stored when a later step fails.
+    pub async fn process(&self, content: &TextContent) -> Result<(), ProcessingError> {
+        // Persist the raw content before any derived data is produced.
+        store_item(&self.db_client, content.clone()).await?;
+
+        // The two lookups do not depend on each other, so they are awaited together.
+        // NOTE(review): the similarity result is discarded here, yet the lookup still runs and
+        // a failure in it aborts the whole ingestion - confirm it is still wanted.
+        let (analysis, _similar_chunks) = tokio::try_join!(
+            self.perform_semantic_analysis(content),
+            self.find_similar_content(content),
+        )?;
+
+        // Turn the analysis result into storable graph items linked to this content's id.
+        let (entities, relationships) = analysis
+            .to_database_entities(&content.id, &self.openai_client)
+            .await?;
+
+        // Graph storage and chunk storage are independent of each other.
+        tokio::try_join!(
+            self.store_graph_entities(entities, relationships),
+            self.store_vector_chunks(content),
+        )?;
+
+        self.db_client.rebuild_indexes().await?;
+        Ok(())
+    }
+
+    /// Runs the ingress analyser over the content's category, instructions and text.
+    ///
+    /// # Errors
+    ///
+    /// Propagates whatever [`ProcessingError`] the analyser returns.
+    async fn perform_semantic_analysis(
+        &self,
+        content: &TextContent,
+    ) -> Result<LLMGraphAnalysisResult, ProcessingError> {
+        IngressAnalyzer::new(&self.db_client, &self.openai_client)
+            .analyze_content(&content.category, &content.instructions, &content.text)
+            .await
+    }
+
+    /// Searches the `text_chunk` table for stored chunks similar to the full content text.
+    ///
+    /// # Errors
+    ///
+    /// Propagates whatever [`ProcessingError`] the similarity search returns.
+    async fn find_similar_content(
+        &self,
+        content: &TextContent,
+    ) -> Result<Vec<TextChunk>, ProcessingError> {
+        // NOTE(review): `3` is presumably the maximum number of matches - confirm against
+        // `find_items_by_vector_similarity`.
+        find_items_by_vector_similarity(
+            3,
+            content.text.clone(),
+            &self.db_client,
+            "text_chunk".to_string(),
+            &self.openai_client,
+        )
+        .await
+    }
+
+    /// Stores every entity, then every relationship, one item at a time.
+    ///
+    /// Both vectors are consumed, so each item is moved into `store_item` without being
+    /// cloned.
+    ///
+    /// # Errors
+    ///
+    /// Stops at and returns the first [`ProcessingError`] from `store_item`; items stored
+    /// before the failure are not removed.
+    async fn store_graph_entities(
+        &self,
+        entities: Vec<KnowledgeEntity>,
+        relationships: Vec<KnowledgeRelationship>,
+    ) -> Result<(), ProcessingError> {
+        // Record the counts up front: the loops below consume both vectors.
+        let entity_count = entities.len();
+        let relationship_count = relationships.len();
+
+        for entity in entities {
+            debug!("Storing entity: {:?}", entity);
+            store_item(&self.db_client, entity).await?;
+        }
+
+        for relationship in relationships {
+            debug!("Storing relationship: {:?}", relationship);
+            store_item(&self.db_client, relationship).await?;
+        }
+
+        info!(
+            "Stored {} entities and {} relationships",
+            entity_count, relationship_count
+        );
+        Ok(())
+    }
+
+    /// Splits the content text into chunks, embeds each chunk and stores it as a
+    /// [`TextChunk`] that carries the content's id.
+    ///
+    /// # Errors
+    ///
+    /// Stops at and returns the first [`ProcessingError`] from embedding or storing a chunk;
+    /// chunks stored before the failure are not removed.
+    async fn store_vector_chunks(&self, content: &TextContent) -> Result<(), ProcessingError> {
+        // NOTE(review): `500..2000` is the chunk size range given to the splitter; the unit
+        // (presumably characters) depends on the `text_splitter` configuration - confirm.
+        let splitter = TextSplitter::new(500..2000);
+        let chunks = splitter.chunks(&content.text);
+
+        // Chunks are handled sequentially; this could be run with bounded concurrency.
+        for chunk in chunks {
+            // One copy goes to the embedding call, the other is stored with the chunk.
+            let chunk_text = chunk.to_string();
+            let embedding = generate_embedding(&self.openai_client, chunk_text.clone()).await?;
+            let text_chunk = TextChunk::new(content.id.to_string(), chunk_text, embedding);
+            store_item(&self.db_client, text_chunk).await?;
+        }
+
+        Ok(())
+    }
+}