refactoring: completed the storage module; stored types are now constructed via their `new` functions

This commit is contained in:
Per Stark
2024-11-21 09:23:42 +01:00
parent fc20526789
commit 200707af90
8 changed files with 121 additions and 148 deletions

View File

@@ -1,66 +1,6 @@
use serde::Deserialize;
use serde::Deserializer;
use serde::Serialize;
use std::collections::HashMap;
use surrealdb::sql::Thing;
use uuid::Uuid;
/// Represents a generic knowledge entity in the graph.
///
/// Derives `Serialize`/`Deserialize`, so it can be round-tripped through serde;
/// only the `id` field uses a custom deserializer.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct KnowledgeEntity {
/// Record id. Deserialized through `thing_to_string`, which reads a SurrealDB
/// `Thing` and keeps only its raw id part.
#[serde(deserialize_with = "thing_to_string")]
pub id: String,
/// Name of the entity.
pub name: String,
/// Free-text description of the entity.
pub description: String,
/// Kind of entity (see `KnowledgeEntityType`).
pub entity_type: KnowledgeEntityType,
/// Presumably the id of the content this entity was derived from — TODO confirm against callers.
pub source_id: String,
/// Optional arbitrary JSON metadata.
pub metadata: Option<serde_json::Value>,
/// Optional embedding vector; `None` when no embedding is attached.
pub embedding: Option<Vec<f32>>,
}
/// Serde helper that deserializes a SurrealDB [`Thing`] and yields its raw id as a `String`.
///
/// Meant to be referenced from `#[serde(deserialize_with = "thing_to_string")]`.
///
/// # Errors
///
/// Returns the deserializer's error unchanged when the input cannot be read as a `Thing`.
pub fn thing_to_string<'de, D>(deserializer: D) -> Result<String, D::Error>
where
    D: Deserializer<'de>,
{
    Thing::deserialize(deserializer).map(|record| record.id.to_raw())
}
/// Kind of a `KnowledgeEntity`.
///
/// The `From<String>` impl in this file parses these names case-insensitively
/// and falls back to `Document` for any unrecognised input.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum KnowledgeEntityType {
Idea,
Project,
Document,
Page,
TextSnippet,
// Add more types as needed
}
impl From<String> for KnowledgeEntityType {
fn from(s: String) -> Self {
match s.to_lowercase().as_str() {
"idea" => KnowledgeEntityType::Idea,
"project" => KnowledgeEntityType::Project,
"document" => KnowledgeEntityType::Document,
"page" => KnowledgeEntityType::Page,
"textsnippet" => KnowledgeEntityType::TextSnippet,
_ => KnowledgeEntityType::Document, // Default case
}
}
}
/// Represents a relationship between two knowledge entities.
///
/// NOTE(review): SurrealDB graph edges conventionally use `in` for the record the
/// edge starts from and `out` for the record it points to; the inline comments on
/// `in_`/`out` below state the opposite direction — confirm which is intended.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct KnowledgeRelationship {
/// Record id. Deserialized through `thing_to_string`, which reads a SurrealDB
/// `Thing` and keeps only its raw id part.
#[serde(deserialize_with = "thing_to_string")]
pub id: String,
/// Serialized under the key `in`; the Rust field carries a trailing underscore
/// because `in` is a reserved keyword.
#[serde(rename = "in")]
pub in_: String, // Target KnowledgeEntity ID
pub out: String, // Source KnowledgeEntity ID
pub relationship_type: String, // e.g., RelatedTo, RelevantTo
pub metadata: Option<serde_json::Value>, // Additional metadata
}
/// Intermediate struct to hold mapping between LLM keys and generated IDs.
#[derive(Clone)]
pub struct GraphMapper {

View File

@@ -40,43 +40,35 @@ impl IngressObject {
category,
} => {
let text = Self::fetch_text_from_url(url).await?;
let id = Uuid::new_v4();
Ok(TextContent {
id: id.to_string(),
Ok(TextContent::new(
text,
instructions: instructions.clone(),
category: category.clone(),
file_info: None,
})
instructions.into(),
category.into(),
None,
))
}
IngressObject::Text {
text,
instructions,
category,
} => {
let id = Uuid::new_v4();
Ok(TextContent {
id: id.to_string(),
text: text.clone(),
instructions: instructions.clone(),
category: category.clone(),
file_info: None,
})
}
} => Ok(TextContent::new(
text.into(),
instructions.into(),
category.into(),
None,
)),
IngressObject::File {
file_info,
instructions,
category,
} => {
let id = Uuid::new_v4();
let text = Self::extract_text_from_file(file_info).await?;
Ok(TextContent {
id: id.to_string(),
Ok(TextContent::new(
text,
instructions: instructions.clone(),
category: category.clone(),
file_info: Some(file_info.clone()),
})
instructions.into(),
category.into(),
Some(file_info.to_owned()),
))
}
}
}

View File

@@ -1,5 +1,6 @@
use crate::storage;
use crate::storage::db::store_item;
use crate::storage::types::knowledge_entity::KnowledgeEntity;
use crate::storage::types::knowledge_relationship::KnowledgeRelationship;
use crate::storage::types::text_chunk::TextChunk;
use crate::storage::types::text_content::TextContent;
use crate::{
@@ -10,29 +11,6 @@ use crate::{
use surrealdb::{engine::remote::ws::Client, Surreal};
use text_splitter::TextSplitter;
use tracing::{debug, info};
use uuid::Uuid;
use super::graph_entities::{KnowledgeEntity, KnowledgeRelationship};
// #[derive(Serialize, Deserialize, Debug)]
// struct TextChunk {
// #[serde(deserialize_with = "thing_to_string")]
// id: String,
// source_id: String,
// chunk: String,
// embedding: Vec<f32>,
// }
/// Represents a single piece of text content extracted from various sources.
// #[derive(Debug, Serialize, Deserialize, Clone)]
// pub struct TextContent {
// #[serde(deserialize_with = "thing_to_string")]
// pub id: String,
// pub text: String,
// pub file_info: Option<FileInfo>,
// pub instructions: String,
// pub category: String,
// }
async fn vector_comparison<T>(
take: u8,
@@ -70,14 +48,14 @@ async fn get_related_nodes(
impl TextContent {
/// Processes the `TextContent` by sending it to an LLM, storing in a graph DB, and vector DB.
pub async fn process(&self) -> Result<(), ProcessingError> {
// Store TextContent
let db_client = SurrealDbClient::new().await?;
let openai_client = async_openai::Client::new();
let create_operation = storage::db::store_item(&db_client, self.clone()).await?;
// Store TextContent
let create_operation = store_item(&db_client, self.clone()).await?;
info!("{:?}", create_operation);
// self.store_text_content(&db_client).await?;
// Get related nodes
let closest_text_content: Vec<TextChunk> = vector_comparison(
3,
self.text.clone(),
@@ -148,42 +126,20 @@ impl TextContent {
db_client: &Surreal<Client>,
) -> Result<(), ProcessingError> {
for entity in &entities {
info!(
debug!(
"{:?}, {:?}, {:?}",
&entity.id, &entity.name, &entity.description
);
let _created: Option<KnowledgeEntity> = db_client
.create(("knowledge_entity", &entity.id.to_string()))
.content(entity.clone())
.await?;
debug!("{:?}", _created);
store_item(db_client, entity.clone()).await?;
}
for relationship in &relationships {
// info!("{:?}", relationship);
debug!("{:?}", relationship);
let _created: Option<KnowledgeRelationship> = db_client
.insert(("knowledge_relationship", &relationship.id.to_string()))
.content(relationship.clone())
.await?;
debug!("{:?}", _created);
store_item(db_client, relationship.clone()).await?;
}
// for relationship in &relationships {
// let in_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity",relationship.in_.to_string())).await?;
// let out_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity", relationship.out.to_string())).await?;
// if let (Some(in_), Some(out)) = (in_entity, out_entity) {
// info!("{} - {} is {} to {} - {}", in_.id, in_.name, relationship.relationship_type, out.id, out.name);
// }
// else {
// info!("No in or out entities found");
// }
// }
info!(
"Inserted to database: {:?} entities, {:?} relationships",
entities.len(),
@@ -194,7 +150,6 @@ impl TextContent {
}
/// Splits text and stores it in a vector database.
#[allow(dead_code)]
async fn store_in_vector_db(
&self,
db_client: &Surreal<Client>,
@@ -210,8 +165,6 @@ impl TextContent {
let embedding = generate_embedding(&openai_client, chunk.to_string()).await?;
let text_chunk = TextChunk::new(self.id.to_string(), chunk.to_string(), embedding);
info!("{:?}", text_chunk);
store_item(db_client, text_chunk).await?;
}

View File

@@ -0,0 +1,55 @@
use crate::stored_object;
use uuid::Uuid;
/// Kind of a `KnowledgeEntity`.
///
/// The `From<String>` impl in this file parses these names case-insensitively
/// and falls back to `Document` for any unrecognised input.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum KnowledgeEntityType {
Idea,
Project,
Document,
Page,
TextSnippet,
// Add more types as needed
}
impl From<String> for KnowledgeEntityType {
fn from(s: String) -> Self {
match s.to_lowercase().as_str() {
"idea" => KnowledgeEntityType::Idea,
"project" => KnowledgeEntityType::Project,
"document" => KnowledgeEntityType::Document,
"page" => KnowledgeEntityType::Page,
"textsnippet" => KnowledgeEntityType::TextSnippet,
_ => KnowledgeEntityType::Document, // Default case
}
}
}
// Declares the `KnowledgeEntity` type through the `stored_object!` macro, passing
// "knowledge_entity" as the table argument.
// The `id` field assigned in `KnowledgeEntity::new` is not listed here, so it is
// presumably added by the macro — confirm against the macro body.
// `embedding` is a plain `Vec<f32>` here (not an `Option`), so every entity carries one.
stored_object!(KnowledgeEntity, "knowledge_entity", {
source_id: String,
name: String,
description: String,
entity_type: KnowledgeEntityType,
metadata: Option<serde_json::Value>,
embedding: Vec<f32>
});
impl KnowledgeEntity {
    /// Builds a `KnowledgeEntity` with a freshly generated UUID v4 string as its `id`.
    ///
    /// Every other field is stored exactly as passed in.
    pub fn new(
        source_id: String,
        name: String,
        description: String,
        entity_type: KnowledgeEntityType,
        metadata: Option<serde_json::Value>,
        embedding: Vec<f32>,
    ) -> Self {
        // Generate the identifier up front so the struct literal is pure field shorthand.
        let id = Uuid::new_v4().to_string();
        Self {
            id,
            source_id,
            name,
            description,
            entity_type,
            metadata,
            embedding,
        }
    }
}

View File

@@ -0,0 +1,26 @@
use crate::stored_object;
use uuid::Uuid;
// Declares the `KnowledgeRelationship` type through the `stored_object!` macro,
// passing "knowledge_relationship" as the table argument.
// The `id` field assigned in `KnowledgeRelationship::new` is not listed here, so it
// is presumably added by the macro — confirm against the macro body.
stored_object!(KnowledgeRelationship, "knowledge_relationship", {
    // `in` is a reserved keyword, hence the `in_` field name. The earlier hand-written
    // struct serialized this field under the key "in"; keep that wire name so the
    // serialized shape does not silently change to "in_".
    // NOTE(review): relies on `stored_object!` forwarding field attributes onto the
    // generated struct (its pattern accepts `#[...]` before each field) — confirm.
    #[serde(rename = "in")]
    in_: String,
    out: String,
    relationship_type: String,
    metadata: Option<serde_json::Value>
});
impl KnowledgeRelationship {
    /// Builds a `KnowledgeRelationship` with a freshly generated UUID v4 string as its `id`.
    ///
    /// `in_`, `out`, `relationship_type` and `metadata` are stored exactly as passed in.
    pub fn new(
        in_: String,
        out: String,
        relationship_type: String,
        metadata: Option<serde_json::Value>,
    ) -> Self {
        // Generate the identifier up front so the struct literal is pure field shorthand.
        let id = Uuid::new_v4().to_string();
        Self {
            id,
            in_,
            out,
            relationship_type,
            metadata,
        }
    }
}

View File

@@ -1,5 +1,7 @@
use axum::async_trait;
use serde::{Deserialize, Serialize};
pub mod knowledge_entity;
pub mod knowledge_relationship;
pub mod text_chunk;
pub mod text_content;
@@ -11,7 +13,7 @@ pub trait StoredObject: Serialize + for<'de> Deserialize<'de> {
#[macro_export]
macro_rules! stored_object {
($name:ident, $table:expr, {$($field:ident: $ty:ty),*}) => {
($name:ident, $table:expr, {$($(#[$attr:meta])* $field:ident: $ty:ty),*}) => {
use axum::async_trait;
use serde::{Deserialize, Deserializer, Serialize};
use surrealdb::sql::Thing;

View File

@@ -11,15 +11,18 @@ stored_object!(TextContent, "text_content", {
});
impl TextContent {
pub fn new(text: String, instructions: String, category: String) -> Self {
pub fn new(
text: String,
instructions: String,
category: String,
file_info: Option<FileInfo>,
) -> Self {
Self {
id: Uuid::new_v4().to_string(),
text,
file_info: None,
file_info,
instructions,
category,
}
}
// Other methods...
}

View File

@@ -1,7 +1,9 @@
use crate::{
error::ProcessingError,
models::graph_entities::{
GraphMapper, KnowledgeEntity, KnowledgeEntityType, KnowledgeRelationship,
models::graph_entities::GraphMapper,
storage::types::{
knowledge_entity::{KnowledgeEntity, KnowledgeEntityType},
knowledge_relationship::KnowledgeRelationship,
},
};
use async_openai::types::{
@@ -119,7 +121,7 @@ impl LLMGraphAnalysisResult {
entity_type: KnowledgeEntityType::from(llm_entity.entity_type.clone()),
source_id: source_id.to_string(),
metadata: None,
embedding: Some(embedding),
embedding,
};
entities.push(knowledge_entity);