From 200707af9015074d487e8407d5996950c8c178f2 Mon Sep 17 00:00:00 2001 From: Per Stark Date: Thu, 21 Nov 2024 09:23:42 +0100 Subject: [PATCH] refactoring: completed storage, now using new fn to construct --- src/models/graph_entities.rs | 60 ------------------- src/models/ingress_object.rs | 40 +++++-------- src/models/text_content.rs | 65 +++------------------ src/storage/types/knowledge_entity.rs | 55 +++++++++++++++++ src/storage/types/knowledge_relationship.rs | 26 +++++++++ src/storage/types/mod.rs | 4 +- src/storage/types/text_content.rs | 11 ++-- src/utils/llm.rs | 8 ++- 8 files changed, 121 insertions(+), 148 deletions(-) create mode 100644 src/storage/types/knowledge_entity.rs create mode 100644 src/storage/types/knowledge_relationship.rs diff --git a/src/models/graph_entities.rs b/src/models/graph_entities.rs index b8b1e1b..efc0d51 100644 --- a/src/models/graph_entities.rs +++ b/src/models/graph_entities.rs @@ -1,66 +1,6 @@ -use serde::Deserialize; -use serde::Deserializer; -use serde::Serialize; use std::collections::HashMap; -use surrealdb::sql::Thing; use uuid::Uuid; -/// Represents a generic knowledge entity in the graph. -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct KnowledgeEntity { - #[serde(deserialize_with = "thing_to_string")] - pub id: String, - pub name: String, - pub description: String, - pub entity_type: KnowledgeEntityType, - pub source_id: String, - pub metadata: Option, - pub embedding: Option>, -} - -pub fn thing_to_string<'de, D>(deserializer: D) -> Result -where - D: Deserializer<'de>, -{ - let thing = Thing::deserialize(deserializer)?; - Ok(thing.id.to_raw()) -} - -#[derive(Debug, Serialize, Deserialize, Clone)] -pub enum KnowledgeEntityType { - Idea, - Project, - Document, - Page, - TextSnippet, - // Add more types as needed -} - -impl From for KnowledgeEntityType { - fn from(s: String) -> Self { - match s.to_lowercase().as_str() { - "idea" => KnowledgeEntityType::Idea, - "project" => KnowledgeEntityType::Project, - "document" => KnowledgeEntityType::Document, - "page" => KnowledgeEntityType::Page, - "textsnippet" => KnowledgeEntityType::TextSnippet, - _ => KnowledgeEntityType::Document, // Default case - } - } -} - -/// Represents a relationship between two knowledge entities. -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct KnowledgeRelationship { - #[serde(deserialize_with = "thing_to_string")] - pub id: String, - #[serde(rename = "in")] - pub in_: String, // Target KnowledgeEntity ID - pub out: String, // Source KnowledgeEntity ID - pub relationship_type: String, // e.g., RelatedTo, RelevantTo - pub metadata: Option, // Additional metadata -} - /// Intermediate struct to hold mapping between LLM keys and generated IDs. #[derive(Clone)] pub struct GraphMapper { diff --git a/src/models/ingress_object.rs b/src/models/ingress_object.rs index b37e5ad..3a2e69d 100644 --- a/src/models/ingress_object.rs +++ b/src/models/ingress_object.rs @@ -40,43 +40,35 @@ impl IngressObject { category, } => { let text = Self::fetch_text_from_url(url).await?; - let id = Uuid::new_v4(); - Ok(TextContent { - id: id.to_string(), + Ok(TextContent::new( text, - instructions: instructions.clone(), - category: category.clone(), - file_info: None, - }) + instructions.into(), + category.into(), + None, + )) } IngressObject::Text { text, instructions, category, - } => { - let id = Uuid::new_v4(); - Ok(TextContent { - id: id.to_string(), - text: text.clone(), - instructions: instructions.clone(), - category: category.clone(), - file_info: None, - }) - } + } => Ok(TextContent::new( + text.into(), + instructions.into(), + category.into(), + None, + )), IngressObject::File { file_info, instructions, category, } => { - let id = Uuid::new_v4(); let text = Self::extract_text_from_file(file_info).await?; - Ok(TextContent { - id: id.to_string(), + Ok(TextContent::new( text, - instructions: instructions.clone(), - category: category.clone(), - file_info: Some(file_info.clone()), - }) + instructions.into(), + category.into(), + Some(file_info.to_owned()), + )) } } } diff --git a/src/models/text_content.rs b/src/models/text_content.rs index 5a42a1a..89df5f2 100644 --- a/src/models/text_content.rs +++ b/src/models/text_content.rs @@ -1,5 +1,6 @@ -use crate::storage; use crate::storage::db::store_item; +use crate::storage::types::knowledge_entity::KnowledgeEntity; +use crate::storage::types::knowledge_relationship::KnowledgeRelationship; use crate::storage::types::text_chunk::TextChunk; use crate::storage::types::text_content::TextContent; use crate::{ @@ -10,29 +11,6 @@ use crate::{ use surrealdb::{engine::remote::ws::Client, Surreal}; use text_splitter::TextSplitter; use tracing::{debug, info}; -use uuid::Uuid; - -use super::graph_entities::{KnowledgeEntity, KnowledgeRelationship}; - -// #[derive(Serialize, Deserialize, Debug)] -// struct TextChunk { -// #[serde(deserialize_with = "thing_to_string")] -// id: String, -// source_id: String, -// chunk: String, -// embedding: Vec, -// } - -/// Represents a single piece of text content extracted from various sources. -// #[derive(Debug, Serialize, Deserialize, Clone)] -// pub struct TextContent { -// #[serde(deserialize_with = "thing_to_string")] -// pub id: String, -// pub text: String, -// pub file_info: Option, -// pub instructions: String, -// pub category: String, -// } async fn vector_comparison( take: u8, @@ -70,14 +48,14 @@ async fn get_related_nodes( impl TextContent { /// Processes the `TextContent` by sending it to an LLM, storing in a graph DB, and vector DB. pub async fn process(&self) -> Result<(), ProcessingError> { - // Store TextContent let db_client = SurrealDbClient::new().await?; let openai_client = async_openai::Client::new(); - let create_operation = storage::db::store_item(&db_client, self.clone()).await?; + // Store TextContent + let create_operation = store_item(&db_client, self.clone()).await?; info!("{:?}", create_operation); - // self.store_text_content(&db_client).await?; + // Get related nodes let closest_text_content: Vec = vector_comparison( 3, self.text.clone(), @@ -148,42 +126,20 @@ impl TextContent { db_client: &Surreal, ) -> Result<(), ProcessingError> { for entity in &entities { - info!( + debug!( "{:?}, {:?}, {:?}", &entity.id, &entity.name, &entity.description ); - let _created: Option = db_client - .create(("knowledge_entity", &entity.id.to_string())) - .content(entity.clone()) - .await?; - - debug!("{:?}", _created); + store_item(db_client, entity.clone()).await?; } for relationship in &relationships { - // info!("{:?}", relationship); + debug!("{:?}", relationship); - let _created: Option = db_client - .insert(("knowledge_relationship", &relationship.id.to_string())) - .content(relationship.clone()) - .await?; - - debug!("{:?}", _created); + store_item(db_client, relationship.clone()).await?; } - // for relationship in &relationships { - // let in_entity: Option = db_client.select(("knowledge_entity",relationship.in_.to_string())).await?; - // let out_entity: Option = db_client.select(("knowledge_entity", relationship.out.to_string())).await?; - - // if let (Some(in_), Some(out)) = (in_entity, out_entity) { - // info!("{} - {} is {} to {} - {}", in_.id, in_.name, relationship.relationship_type, out.id, out.name); - // } - // else { - // info!("No in or out entities found"); - // } - // } - info!( "Inserted to database: {:?} entities, {:?} relationships", entities.len(), @@ -194,7 +150,6 @@ impl TextContent { } /// Splits text and stores it in a vector database. - #[allow(dead_code)] async fn store_in_vector_db( &self, db_client: &Surreal, @@ -210,8 +165,6 @@ impl TextContent { let embedding = generate_embedding(&openai_client, chunk.to_string()).await?; let text_chunk = TextChunk::new(self.id.to_string(), chunk.to_string(), embedding); - info!("{:?}", text_chunk); - store_item(db_client, text_chunk).await?; } diff --git a/src/storage/types/knowledge_entity.rs b/src/storage/types/knowledge_entity.rs new file mode 100644 index 0000000..8419c6a --- /dev/null +++ b/src/storage/types/knowledge_entity.rs @@ -0,0 +1,55 @@ +use crate::stored_object; +use uuid::Uuid; + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum KnowledgeEntityType { + Idea, + Project, + Document, + Page, + TextSnippet, + // Add more types as needed +} + +impl From for KnowledgeEntityType { + fn from(s: String) -> Self { + match s.to_lowercase().as_str() { + "idea" => KnowledgeEntityType::Idea, + "project" => KnowledgeEntityType::Project, + "document" => KnowledgeEntityType::Document, + "page" => KnowledgeEntityType::Page, + "textsnippet" => KnowledgeEntityType::TextSnippet, + _ => KnowledgeEntityType::Document, // Default case + } + } +} + +stored_object!(KnowledgeEntity, "knowledge_entity", { + source_id: String, + name: String, + description: String, + entity_type: KnowledgeEntityType, + metadata: Option, + embedding: Vec +}); + +impl KnowledgeEntity { + pub fn new( + source_id: String, + name: String, + description: String, + entity_type: KnowledgeEntityType, + metadata: Option, + embedding: Vec, + ) -> Self { + Self { + id: Uuid::new_v4().to_string(), + source_id, + name, + description, + entity_type, + metadata, + embedding, + } + } +} diff --git a/src/storage/types/knowledge_relationship.rs b/src/storage/types/knowledge_relationship.rs new file mode 100644 index 0000000..a588721 --- /dev/null +++ b/src/storage/types/knowledge_relationship.rs @@ -0,0 +1,26 @@ +use crate::stored_object; +use uuid::Uuid; + +stored_object!(KnowledgeRelationship, "knowledge_relationship", { + in_: String, + out: String, + relationship_type: String, + metadata: Option +}); + +impl KnowledgeRelationship { + pub fn new( + in_: String, + out: String, + relationship_type: String, + metadata: Option, + ) -> Self { + Self { + id: Uuid::new_v4().to_string(), + in_, + out, + relationship_type, + metadata, + } + } +} diff --git a/src/storage/types/mod.rs b/src/storage/types/mod.rs index c1d619d..4c40e90 100644 --- a/src/storage/types/mod.rs +++ b/src/storage/types/mod.rs @@ -1,5 +1,7 @@ use axum::async_trait; use serde::{Deserialize, Serialize}; +pub mod knowledge_entity; +pub mod knowledge_relationship; pub mod text_chunk; pub mod text_content; @@ -11,7 +13,7 @@ pub trait StoredObject: Serialize + for<'de> Deserialize<'de> { #[macro_export] macro_rules! stored_object { - ($name:ident, $table:expr, {$($field:ident: $ty:ty),*}) => { + ($name:ident, $table:expr, {$($(#[$attr:meta])* $field:ident: $ty:ty),*}) => { use axum::async_trait; use serde::{Deserialize, Deserializer, Serialize}; use surrealdb::sql::Thing; diff --git a/src/storage/types/text_content.rs b/src/storage/types/text_content.rs index f3d4f77..314de66 100644 --- a/src/storage/types/text_content.rs +++ b/src/storage/types/text_content.rs @@ -11,15 +11,18 @@ stored_object!(TextContent, "text_content", { }); impl TextContent { - pub fn new(text: String, instructions: String, category: String) -> Self { + pub fn new( + text: String, + instructions: String, + category: String, + file_info: Option, + ) -> Self { Self { id: Uuid::new_v4().to_string(), text, - file_info: None, + file_info, instructions, category, } } - - // Other methods... } diff --git a/src/utils/llm.rs b/src/utils/llm.rs index aa4495d..dbedc1f 100644 --- a/src/utils/llm.rs +++ b/src/utils/llm.rs @@ -1,7 +1,9 @@ use crate::{ error::ProcessingError, - models::graph_entities::{ - GraphMapper, KnowledgeEntity, KnowledgeEntityType, KnowledgeRelationship, + models::graph_entities::GraphMapper, + storage::types::{ + knowledge_entity::{KnowledgeEntity, KnowledgeEntityType}, + knowledge_relationship::KnowledgeRelationship, }, }; use async_openai::types::{ @@ -119,7 +121,7 @@ impl LLMGraphAnalysisResult { entity_type: KnowledgeEntityType::from(llm_entity.entity_type.clone()), source_id: source_id.to_string(), metadata: None, - embedding: Some(embedding), + embedding, }; entities.push(knowledge_entity);