refactoring: completed storage, now using new fn to construct

This commit is contained in:
Per Stark
2024-11-21 09:23:42 +01:00
parent fc20526789
commit 200707af90
8 changed files with 121 additions and 148 deletions

View File

@@ -1,66 +1,6 @@
use serde::Deserialize;
use serde::Deserializer;
use serde::Serialize;
use std::collections::HashMap; use std::collections::HashMap;
use surrealdb::sql::Thing;
use uuid::Uuid; use uuid::Uuid;
/// Represents a generic knowledge entity in the graph.
///
/// The type derives `Serialize`/`Deserialize`, so the field names below are also
/// the keys used in its serialized form.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct KnowledgeEntity {
/// Identifier of the entity. On deserialization the value is read as a SurrealDB
/// `Thing` and reduced to its raw id string by `thing_to_string`.
#[serde(deserialize_with = "thing_to_string")]
pub id: String,
/// Name of the entity.
pub name: String,
/// Free-text description of the entity.
pub description: String,
/// Category of the entity.
pub entity_type: KnowledgeEntityType,
/// Identifier of the source this entity belongs to.
// NOTE(review): presumably the id of the originating `TextContent` — confirm against callers.
pub source_id: String,
/// Optional arbitrary JSON metadata.
pub metadata: Option<serde_json::Value>,
/// Optional embedding vector for the entity.
pub embedding: Option<Vec<f32>>,
}
/// Serde helper that deserializes a SurrealDB [`Thing`] and yields only its raw id
/// as a `String`.
///
/// Intended for use with `#[serde(deserialize_with = "thing_to_string")]` on
/// `String` id fields.
///
/// # Errors
///
/// Returns the deserializer's error when the input cannot be read as a `Thing`.
pub fn thing_to_string<'de, D>(deserializer: D) -> Result<String, D::Error>
where
    D: Deserializer<'de>,
{
    Thing::deserialize(deserializer).map(|record| record.id.to_raw())
}
/// Category assigned to a knowledge entity.
///
/// `Document` is also the variant chosen by the `From<String>` conversion when a
/// label is not recognised.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum KnowledgeEntityType {
Idea,
Project,
Document,
Page,
TextSnippet,
// Add more types as needed
}
impl From<String> for KnowledgeEntityType {
fn from(s: String) -> Self {
match s.to_lowercase().as_str() {
"idea" => KnowledgeEntityType::Idea,
"project" => KnowledgeEntityType::Project,
"document" => KnowledgeEntityType::Document,
"page" => KnowledgeEntityType::Page,
"textsnippet" => KnowledgeEntityType::TextSnippet,
_ => KnowledgeEntityType::Document, // Default case
}
}
}
/// Represents a relationship between two knowledge entities.
///
/// The type derives `Serialize`/`Deserialize`; note that `in_` is exchanged under
/// the key `in` because of its `rename` attribute.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct KnowledgeRelationship {
/// Identifier of the relationship. On deserialization the value is read as a
/// SurrealDB `Thing` and reduced to its raw id string by `thing_to_string`.
#[serde(deserialize_with = "thing_to_string")]
pub id: String,
// `in` is a reserved word in Rust, hence the trailing underscore plus the serde rename.
#[serde(rename = "in")]
pub in_: String, // Target KnowledgeEntity ID
pub out: String, // Source KnowledgeEntity ID
pub relationship_type: String, // e.g., RelatedTo, RelevantTo
pub metadata: Option<serde_json::Value>, // Additional metadata
}
/// Intermediate struct to hold mapping between LLM keys and generated IDs. /// Intermediate struct to hold mapping between LLM keys and generated IDs.
#[derive(Clone)] #[derive(Clone)]
pub struct GraphMapper { pub struct GraphMapper {

View File

@@ -40,43 +40,35 @@ impl IngressObject {
category, category,
} => { } => {
let text = Self::fetch_text_from_url(url).await?; let text = Self::fetch_text_from_url(url).await?;
let id = Uuid::new_v4(); Ok(TextContent::new(
Ok(TextContent {
id: id.to_string(),
text, text,
instructions: instructions.clone(), instructions.into(),
category: category.clone(), category.into(),
file_info: None, None,
}) ))
} }
IngressObject::Text { IngressObject::Text {
text, text,
instructions, instructions,
category, category,
} => { } => Ok(TextContent::new(
let id = Uuid::new_v4(); text.into(),
Ok(TextContent { instructions.into(),
id: id.to_string(), category.into(),
text: text.clone(), None,
instructions: instructions.clone(), )),
category: category.clone(),
file_info: None,
})
}
IngressObject::File { IngressObject::File {
file_info, file_info,
instructions, instructions,
category, category,
} => { } => {
let id = Uuid::new_v4();
let text = Self::extract_text_from_file(file_info).await?; let text = Self::extract_text_from_file(file_info).await?;
Ok(TextContent { Ok(TextContent::new(
id: id.to_string(),
text, text,
instructions: instructions.clone(), instructions.into(),
category: category.clone(), category.into(),
file_info: Some(file_info.clone()), Some(file_info.to_owned()),
}) ))
} }
} }
} }

View File

@@ -1,5 +1,6 @@
use crate::storage;
use crate::storage::db::store_item; use crate::storage::db::store_item;
use crate::storage::types::knowledge_entity::KnowledgeEntity;
use crate::storage::types::knowledge_relationship::KnowledgeRelationship;
use crate::storage::types::text_chunk::TextChunk; use crate::storage::types::text_chunk::TextChunk;
use crate::storage::types::text_content::TextContent; use crate::storage::types::text_content::TextContent;
use crate::{ use crate::{
@@ -10,29 +11,6 @@ use crate::{
use surrealdb::{engine::remote::ws::Client, Surreal}; use surrealdb::{engine::remote::ws::Client, Surreal};
use text_splitter::TextSplitter; use text_splitter::TextSplitter;
use tracing::{debug, info}; use tracing::{debug, info};
use uuid::Uuid;
use super::graph_entities::{KnowledgeEntity, KnowledgeRelationship};
// #[derive(Serialize, Deserialize, Debug)]
// struct TextChunk {
// #[serde(deserialize_with = "thing_to_string")]
// id: String,
// source_id: String,
// chunk: String,
// embedding: Vec<f32>,
// }
/// Represents a single piece of text content extracted from various sources.
// #[derive(Debug, Serialize, Deserialize, Clone)]
// pub struct TextContent {
// #[serde(deserialize_with = "thing_to_string")]
// pub id: String,
// pub text: String,
// pub file_info: Option<FileInfo>,
// pub instructions: String,
// pub category: String,
// }
async fn vector_comparison<T>( async fn vector_comparison<T>(
take: u8, take: u8,
@@ -70,14 +48,14 @@ async fn get_related_nodes(
impl TextContent { impl TextContent {
/// Processes the `TextContent` by sending it to an LLM, storing in a graph DB, and vector DB. /// Processes the `TextContent` by sending it to an LLM, storing in a graph DB, and vector DB.
pub async fn process(&self) -> Result<(), ProcessingError> { pub async fn process(&self) -> Result<(), ProcessingError> {
// Store TextContent
let db_client = SurrealDbClient::new().await?; let db_client = SurrealDbClient::new().await?;
let openai_client = async_openai::Client::new(); let openai_client = async_openai::Client::new();
let create_operation = storage::db::store_item(&db_client, self.clone()).await?; // Store TextContent
let create_operation = store_item(&db_client, self.clone()).await?;
info!("{:?}", create_operation); info!("{:?}", create_operation);
// self.store_text_content(&db_client).await?;
// Get related nodes
let closest_text_content: Vec<TextChunk> = vector_comparison( let closest_text_content: Vec<TextChunk> = vector_comparison(
3, 3,
self.text.clone(), self.text.clone(),
@@ -148,42 +126,20 @@ impl TextContent {
db_client: &Surreal<Client>, db_client: &Surreal<Client>,
) -> Result<(), ProcessingError> { ) -> Result<(), ProcessingError> {
for entity in &entities { for entity in &entities {
info!( debug!(
"{:?}, {:?}, {:?}", "{:?}, {:?}, {:?}",
&entity.id, &entity.name, &entity.description &entity.id, &entity.name, &entity.description
); );
let _created: Option<KnowledgeEntity> = db_client store_item(db_client, entity.clone()).await?;
.create(("knowledge_entity", &entity.id.to_string()))
.content(entity.clone())
.await?;
debug!("{:?}", _created);
} }
for relationship in &relationships { for relationship in &relationships {
// info!("{:?}", relationship); debug!("{:?}", relationship);
let _created: Option<KnowledgeRelationship> = db_client store_item(db_client, relationship.clone()).await?;
.insert(("knowledge_relationship", &relationship.id.to_string()))
.content(relationship.clone())
.await?;
debug!("{:?}", _created);
} }
// for relationship in &relationships {
// let in_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity",relationship.in_.to_string())).await?;
// let out_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity", relationship.out.to_string())).await?;
// if let (Some(in_), Some(out)) = (in_entity, out_entity) {
// info!("{} - {} is {} to {} - {}", in_.id, in_.name, relationship.relationship_type, out.id, out.name);
// }
// else {
// info!("No in or out entities found");
// }
// }
info!( info!(
"Inserted to database: {:?} entities, {:?} relationships", "Inserted to database: {:?} entities, {:?} relationships",
entities.len(), entities.len(),
@@ -194,7 +150,6 @@ impl TextContent {
} }
/// Splits text and stores it in a vector database. /// Splits text and stores it in a vector database.
#[allow(dead_code)]
async fn store_in_vector_db( async fn store_in_vector_db(
&self, &self,
db_client: &Surreal<Client>, db_client: &Surreal<Client>,
@@ -210,8 +165,6 @@ impl TextContent {
let embedding = generate_embedding(&openai_client, chunk.to_string()).await?; let embedding = generate_embedding(&openai_client, chunk.to_string()).await?;
let text_chunk = TextChunk::new(self.id.to_string(), chunk.to_string(), embedding); let text_chunk = TextChunk::new(self.id.to_string(), chunk.to_string(), embedding);
info!("{:?}", text_chunk);
store_item(db_client, text_chunk).await?; store_item(db_client, text_chunk).await?;
} }

View File

@@ -0,0 +1,55 @@
use crate::stored_object;
use uuid::Uuid;
/// Category assigned to a knowledge entity.
///
/// `Document` is also the variant chosen by the `From<String>` conversion when a
/// label is not recognised.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum KnowledgeEntityType {
Idea,
Project,
Document,
Page,
TextSnippet,
// Add more types as needed
}
impl From<String> for KnowledgeEntityType {
fn from(s: String) -> Self {
match s.to_lowercase().as_str() {
"idea" => KnowledgeEntityType::Idea,
"project" => KnowledgeEntityType::Project,
"document" => KnowledgeEntityType::Document,
"page" => KnowledgeEntityType::Page,
"textsnippet" => KnowledgeEntityType::TextSnippet,
_ => KnowledgeEntityType::Document, // Default case
}
}
}
// Declares the `KnowledgeEntity` type through the `stored_object!` macro, passing
// "knowledge_entity" as the macro's table expression and the fields listed below.
// NOTE(review): `KnowledgeEntity::new` also sets an `id` field that is not listed
// here, so the macro presumably adds it — confirm against the macro body.
// `embedding` is a plain `Vec<f32>` here, i.e. an embedding is required.
stored_object!(KnowledgeEntity, "knowledge_entity", {
source_id: String,
name: String,
description: String,
entity_type: KnowledgeEntityType,
metadata: Option<serde_json::Value>,
embedding: Vec<f32>
});
impl KnowledgeEntity {
    /// Builds a `KnowledgeEntity` from the supplied field values.
    ///
    /// The `id` is not taken from the caller: it is a freshly generated v4 UUID
    /// rendered as a string. All other arguments are stored unchanged.
    pub fn new(
        source_id: String,
        name: String,
        description: String,
        entity_type: KnowledgeEntityType,
        metadata: Option<serde_json::Value>,
        embedding: Vec<f32>,
    ) -> Self {
        let id = Uuid::new_v4().to_string();

        Self {
            id,
            source_id,
            name,
            description,
            entity_type,
            metadata,
            embedding,
        }
    }
}

View File

@@ -0,0 +1,26 @@
use crate::stored_object;
use uuid::Uuid;
// Declares the `KnowledgeRelationship` type through the `stored_object!` macro,
// passing "knowledge_relationship" as the macro's table expression.
// NOTE(review): `KnowledgeRelationship::new` also sets an `id` field that is not
// listed here, so the macro presumably adds it — confirm against the macro body.
// NOTE(review): the earlier hand-written struct carried `#[serde(rename = "in")]`
// on `in_`; no such attribute is passed here, so the field would be serialized
// under the key `in_` unless the macro handles it — confirm this is intended.
stored_object!(KnowledgeRelationship, "knowledge_relationship", {
in_: String,
out: String,
relationship_type: String,
metadata: Option<serde_json::Value>
});
impl KnowledgeRelationship {
    /// Builds a `KnowledgeRelationship` from the supplied field values.
    ///
    /// The `id` is not taken from the caller: it is a freshly generated v4 UUID
    /// rendered as a string. All other arguments are stored unchanged.
    pub fn new(
        in_: String,
        out: String,
        relationship_type: String,
        metadata: Option<serde_json::Value>,
    ) -> Self {
        let id = Uuid::new_v4().to_string();

        Self {
            id,
            in_,
            out,
            relationship_type,
            metadata,
        }
    }
}

View File

@@ -1,5 +1,7 @@
use axum::async_trait; use axum::async_trait;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
pub mod knowledge_entity;
pub mod knowledge_relationship;
pub mod text_chunk; pub mod text_chunk;
pub mod text_content; pub mod text_content;
@@ -11,7 +13,7 @@ pub trait StoredObject: Serialize + for<'de> Deserialize<'de> {
#[macro_export] #[macro_export]
macro_rules! stored_object { macro_rules! stored_object {
($name:ident, $table:expr, {$($field:ident: $ty:ty),*}) => { ($name:ident, $table:expr, {$($(#[$attr:meta])* $field:ident: $ty:ty),*}) => {
use axum::async_trait; use axum::async_trait;
use serde::{Deserialize, Deserializer, Serialize}; use serde::{Deserialize, Deserializer, Serialize};
use surrealdb::sql::Thing; use surrealdb::sql::Thing;

View File

@@ -11,15 +11,18 @@ stored_object!(TextContent, "text_content", {
}); });
impl TextContent { impl TextContent {
pub fn new(text: String, instructions: String, category: String) -> Self { pub fn new(
text: String,
instructions: String,
category: String,
file_info: Option<FileInfo>,
) -> Self {
Self { Self {
id: Uuid::new_v4().to_string(), id: Uuid::new_v4().to_string(),
text, text,
file_info: None, file_info,
instructions, instructions,
category, category,
} }
} }
// Other methods...
} }

View File

@@ -1,7 +1,9 @@
use crate::{ use crate::{
error::ProcessingError, error::ProcessingError,
models::graph_entities::{ models::graph_entities::GraphMapper,
GraphMapper, KnowledgeEntity, KnowledgeEntityType, KnowledgeRelationship, storage::types::{
knowledge_entity::{KnowledgeEntity, KnowledgeEntityType},
knowledge_relationship::KnowledgeRelationship,
}, },
}; };
use async_openai::types::{ use async_openai::types::{
@@ -119,7 +121,7 @@ impl LLMGraphAnalysisResult {
entity_type: KnowledgeEntityType::from(llm_entity.entity_type.clone()), entity_type: KnowledgeEntityType::from(llm_entity.entity_type.clone()),
source_id: source_id.to_string(), source_id: source_id.to_string(),
metadata: None, metadata: None,
embedding: Some(embedding), embedding,
}; };
entities.push(knowledge_entity); entities.push(knowledge_entity);