feat: text splitting and storage

This commit is contained in:
Per Stark
2024-11-20 12:10:23 +01:00
parent 8ba853a329
commit c3ccb8c034
6 changed files with 257 additions and 89 deletions

131
Cargo.lock generated
View File

@@ -261,7 +261,7 @@ checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
"synstructure", "synstructure",
] ]
@@ -273,7 +273,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -391,7 +391,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"strum", "strum",
"syn 2.0.77", "syn 2.0.87",
"thiserror", "thiserror",
] ]
@@ -535,7 +535,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -552,7 +552,7 @@ checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -572,6 +572,18 @@ version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
[[package]]
name = "auto_enums"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "459b77b7e855f875fd15f101064825cd79eb83185a961d66e6298560126facfb"
dependencies = [
"derive_utils",
"proc-macro2",
"quote",
"syn 2.0.87",
]
[[package]] [[package]]
name = "autocfg" name = "autocfg"
version = "1.3.0" version = "1.3.0"
@@ -643,7 +655,7 @@ checksum = "57d123550fa8d071b7255cb0cc04dc302baa6c8c4a79f55701552684d8399bce"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -675,7 +687,7 @@ dependencies = [
"heck 0.5.0", "heck 0.5.0",
"proc-macro-error", "proc-macro-error",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
"ubyte", "ubyte",
] ]
@@ -870,7 +882,7 @@ dependencies = [
"proc-macro-crate", "proc-macro-crate",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
"syn_derive", "syn_derive",
] ]
@@ -1178,7 +1190,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"strsim", "strsim",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -1189,7 +1201,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
dependencies = [ dependencies = [
"darling_core", "darling_core",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -1246,7 +1258,7 @@ checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -1277,7 +1289,7 @@ dependencies = [
"darling", "darling",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -1287,7 +1299,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc" checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc"
dependencies = [ dependencies = [
"derive_builder_core", "derive_builder_core",
"syn 2.0.77", "syn 2.0.87",
]
[[package]]
name = "derive_utils"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65f152f4b8559c4da5d574bafc7af85454d706b4c5fe8b530d508cacbb6807ea"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.87",
] ]
[[package]] [[package]]
@@ -1345,7 +1368,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -1643,7 +1666,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -1864,7 +1887,7 @@ dependencies = [
"markup5ever", "markup5ever",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -2350,7 +2373,7 @@ checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -2419,7 +2442,7 @@ dependencies = [
"cfg-if", "cfg-if",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -2616,9 +2639,9 @@ dependencies = [
[[package]] [[package]]
name = "once_cell" name = "once_cell"
version = "1.19.0" version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]] [[package]]
name = "openssl-probe" name = "openssl-probe"
@@ -2824,7 +2847,7 @@ dependencies = [
"phf_shared 0.11.2", "phf_shared 0.11.2",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
"unicase", "unicase",
] ]
@@ -2870,7 +2893,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -3278,7 +3301,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -3419,7 +3442,7 @@ checksum = "5f0ec466e5d8dca9965eb6871879677bef5590cf7525ad96cae14376efb75073"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -3828,7 +3851,7 @@ checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -3893,7 +3916,7 @@ dependencies = [
"darling", "darling",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -4157,7 +4180,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"rustversion", "rustversion",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -4312,9 +4335,9 @@ dependencies = [
[[package]] [[package]]
name = "syn" name = "syn"
version = "2.0.77" version = "2.0.87"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@@ -4330,7 +4353,7 @@ dependencies = [
"proc-macro-error", "proc-macro-error",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -4356,7 +4379,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -4419,23 +4442,40 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
[[package]] [[package]]
name = "thiserror" name = "text-splitter"
version = "1.0.63" version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" checksum = "189450e9eaff1a8037cca4d60ca62c134a7e601187430bdd487c86e25e8d6641"
dependencies = [
"ahash 0.8.11",
"auto_enums",
"either",
"itertools 0.13.0",
"once_cell",
"regex",
"strum",
"thiserror",
"unicode-segmentation",
]
[[package]]
name = "thiserror"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [ dependencies = [
"thiserror-impl", "thiserror-impl",
] ]
[[package]] [[package]]
name = "thiserror-impl" name = "thiserror-impl"
version = "1.0.63" version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -4529,7 +4569,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -4649,7 +4689,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -4805,6 +4845,12 @@ dependencies = [
"unicode-script", "unicode-script",
] ]
[[package]]
name = "unicode-segmentation"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
[[package]] [[package]]
name = "unicode-width" name = "unicode-width"
version = "0.1.14" version = "0.1.14"
@@ -4923,7 +4969,7 @@ dependencies = [
"once_cell", "once_cell",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
"wasm-bindgen-shared", "wasm-bindgen-shared",
] ]
@@ -4957,7 +5003,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
"wasm-bindgen-backend", "wasm-bindgen-backend",
"wasm-bindgen-shared", "wasm-bindgen-shared",
] ]
@@ -5324,7 +5370,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.77", "syn 2.0.87",
] ]
[[package]] [[package]]
@@ -5351,6 +5397,7 @@ dependencies = [
"sha2", "sha2",
"surrealdb", "surrealdb",
"tempfile", "tempfile",
"text-splitter",
"thiserror", "thiserror",
"tokio", "tokio",
"tracing", "tracing",

View File

@@ -18,6 +18,7 @@ serde_json = "1.0.128"
sha2 = "0.10.8" sha2 = "0.10.8"
surrealdb = "2.0.4" surrealdb = "2.0.4"
tempfile = "3.12.0" tempfile = "3.12.0"
text-splitter = "0.18.1"
thiserror = "1.0.63" thiserror = "1.0.63"
tokio = { version = "1.40.0", features = ["full"] } tokio = { version = "1.40.0", features = ["full"] }
tracing = "0.1.40" tracing = "0.1.40"

View File

@@ -18,7 +18,7 @@ pub struct KnowledgeEntity {
pub embedding: Option<Vec<f32>>, pub embedding: Option<Vec<f32>>,
} }
fn thing_to_string<'de, D>(deserializer: D) -> Result<String, D::Error> pub fn thing_to_string<'de, D>(deserializer: D) -> Result<String, D::Error>
where where
D: Deserializer<'de>, D: Deserializer<'de>,
{ {

View File

@@ -38,7 +38,7 @@ impl IngressObject {
let text = Self::fetch_text_from_url(url).await?; let text = Self::fetch_text_from_url(url).await?;
let id = Uuid::new_v4(); let id = Uuid::new_v4();
Ok(TextContent { Ok(TextContent {
id, id: id.to_string(),
text, text,
instructions: instructions.clone(), instructions: instructions.clone(),
category: category.clone(), category: category.clone(),
@@ -48,7 +48,7 @@ impl IngressObject {
IngressObject::Text { text, instructions, category } => { IngressObject::Text { text, instructions, category } => {
let id = Uuid::new_v4(); let id = Uuid::new_v4();
Ok(TextContent { Ok(TextContent {
id, id: id.to_string(),
text: text.clone(), text: text.clone(),
instructions: instructions.clone(), instructions: instructions.clone(),
category: category.clone(), category: category.clone(),
@@ -59,7 +59,7 @@ impl IngressObject {
let id = Uuid::new_v4(); let id = Uuid::new_v4();
let text = Self::extract_text_from_file(file_info).await?; let text = Self::extract_text_from_file(file_info).await?;
Ok(TextContent { Ok(TextContent {
id, id: id.to_string(),
text, text,
instructions: instructions.clone(), instructions: instructions.clone(),
category: category.clone(), category: category.clone(),

View File

@@ -1,17 +1,32 @@
use crate::{
models::file_info::FileInfo,
surrealdb::{SurrealDbClient, SurrealError},
utils::llm::{create_json_ld, generate_embedding},
};
use async_openai::error::OpenAIError; use async_openai::error::OpenAIError;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use surrealdb::{engine::remote::ws::Client, Surreal}; use surrealdb::{engine::remote::ws::Client, Surreal};
use text_splitter::TextSplitter;
use thiserror::Error;
use tracing::{debug, info}; use tracing::{debug, info};
use uuid::Uuid; use uuid::Uuid;
use crate::{models::file_info::FileInfo, surrealdb::{SurrealDbClient, SurrealError}, utils::llm::create_json_ld};
use thiserror::Error;
use super::graph_entities::{KnowledgeEntity, KnowledgeRelationship}; use super::graph_entities::{thing_to_string, KnowledgeEntity, KnowledgeRelationship};
/// One chunk of a larger text together with the embedding computed for it.
///
/// Values of this type are created in `TextContent::store_in_vector_db` and
/// written to the `text_chunk` table; `TextContent::process` reads nearest
/// neighbours from that table back into this type via `vector_comparison`.
#[derive(Serialize, Deserialize, Debug)]
struct TextChunk {
/// Identifier of the chunk; a freshly generated UUID v4 string when the chunk is created.
/// Deserialized through `thing_to_string` — presumably to turn the database
/// record id into a plain `String`; confirm against that helper.
#[serde(deserialize_with = "thing_to_string")]
id: String,
/// `id` of the `TextContent` this chunk was split from.
source_id: String,
/// The text of the chunk itself.
chunk: String,
/// Embedding vector returned by `generate_embedding` for `chunk`.
embedding: Vec<f32>,
}
/// Represents a single piece of text content extracted from various sources. /// Represents a single piece of text content extracted from various sources.
#[derive(Debug, Serialize, Deserialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TextContent { pub struct TextContent {
pub id: Uuid, #[serde(deserialize_with = "thing_to_string")]
pub id: String,
pub text: String, pub text: String,
pub file_info: Option<FileInfo>, pub file_info: Option<FileInfo>,
pub instructions: String, pub instructions: String,
@@ -26,13 +41,13 @@ pub enum ProcessingError {
#[error("SurrealDB error: {0}")] #[error("SurrealDB error: {0}")]
SurrealError(#[from] SurrealError), SurrealError(#[from] SurrealError),
#[error("SurrealDb error: {0}")] #[error("SurrealDb error: {0}")]
SurrealDbError(#[from] surrealdb::Error), SurrealDbError(#[from] surrealdb::Error),
#[error("Graph DB storage error: {0}")] #[error("Graph DB storage error: {0}")]
GraphDBError(String), GraphDBError(String),
#[error("Vector DB storage error: {0}")] #[error("Vector DB storage error: {0}")]
VectorDBError(String), VectorDBError(String),
@@ -43,39 +58,106 @@ pub enum ProcessingError {
OpenAIerror(#[from] OpenAIError), OpenAIerror(#[from] OpenAIError),
} }
/// Returns the records in `table` whose `embedding` lies closest to the
/// embedding of `input_text`.
///
/// # Arguments
///
/// * `take` - Number of nearest neighbours requested from the KNN operator.
/// * `input_text` - Text that is embedded and used as the search vector.
/// * `db_client` - SurrealDB client the query is executed on.
/// * `table` - Name of the table to search; it is interpolated into the query as-is.
/// * `openai_client` - Client handed to `generate_embedding`.
///
/// # Returns
///
/// The rows of the first statement's result, deserialized into `T` and ordered
/// by ascending `distance`.
///
/// # Errors
///
/// Propagates any error from `generate_embedding`, from running the query, or
/// from deserializing the result.
async fn vector_comparison<T>(
    take: u8,
    input_text: String,
    db_client: &Surreal<Client>,
    table: String,
    openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<Vec<T>, ProcessingError>
where
    T: for<'de> serde::Deserialize<'de>,
{
    // Embed the search text first; the resulting vector is written inline into the query.
    let search_vector = generate_embedding(openai_client, input_text).await?;

    // NOTE(review): the literal 40 is presumably the HNSW `ef` search parameter — confirm.
    let knn_query = format!(
        "SELECT *, vector::distance::knn() AS distance FROM {} WHERE embedding <|{},40|> {:?} ORDER BY distance",
        table, take, search_vector
    );

    // The query holds a single statement, so its rows sit at result index 0.
    let mut response = db_client.query(knn_query).await?;
    let nearest: Vec<T> = response.take(0)?;

    Ok(nearest)
}
/// Looks up the knowledge entities that are reachable through a
/// `knowledge_relationship` edge from entities belonging to one source.
///
/// # Arguments
///
/// * `id` - Value matched against the `source_id` field of `knowledge_entity` records.
/// * `db_client` - SurrealDB client the query is executed on.
///
/// # Returns
///
/// The rows of the query result deserialized as `KnowledgeEntity`.
///
/// # Errors
///
/// Propagates any error from running the query or from deserializing its result.
async fn get_related_nodes(
    id: String,
    db_client: &Surreal<Client>,
) -> Result<Vec<KnowledgeEntity>, ProcessingError> {
    // The id is passed as a bound parameter instead of being formatted into the
    // query text: it is then always compared as a string value (backticks in
    // SurrealQL quote identifiers, not strings) and cannot alter the statement.
    let query = "SELECT -> knowledge_relationship -> knowledge_entity as related_nodes FROM knowledge_entity WHERE source_id = $source_id";

    // NOTE(review): each row carries a single `related_nodes` projection —
    // confirm that this shape really deserializes into `KnowledgeEntity`.
    let mut response = db_client
        .query(query)
        .bind(("source_id", id))
        .await?;
    let related_nodes: Vec<KnowledgeEntity> = response.take(0)?;

    Ok(related_nodes)
}
impl TextContent { impl TextContent {
/// Processes the `TextContent` by sending it to an LLM, storing in a graph DB, and vector DB. /// Processes the `TextContent` by sending it to an LLM, storing in a graph DB, and vector DB.
pub async fn process(&self) -> Result<(), ProcessingError> { pub async fn process(&self) -> Result<(), ProcessingError> {
// Store TextContent // Store TextContent
let db_client = SurrealDbClient::new().await?; let db_client = SurrealDbClient::new().await?;
let openai_client = async_openai::Client::new();
// let deleted: Vec<KnowledgeEntity> = db_client.delete("knowledge_entity").await?; self.store_text_content(&db_client).await?;
let closest_text_content: Vec<TextChunk> = vector_comparison(
4,
self.text.clone(),
&db_client,
"text_chunk".to_string(),
&openai_client,
)
.await?;
for node in closest_text_content {
info!("{}-{}", node.id, node.source_id);
let related_nodes = get_related_nodes(node.source_id, &db_client).await?;
info!("{:?}", related_nodes);
}
panic!("STOPPING");
// let deleted: Vec<TextChunk> = db_client.delete("text_chunk").await?;
// info! {"{:?} KnowledgeEntities deleted", deleted.len()}; // info! {"{:?} KnowledgeEntities deleted", deleted.len()};
// let relationships_deleted: Vec<KnowledgeRelationship> = // let relationships_deleted: Vec<KnowledgeRelationship> =
// db_client.delete("knowledge_relationship").await?; // db_client.delete("knowledge_relationship").await?;
// info!("{:?} Relationships deleted", relationships_deleted.len()); // info!("{:?} Relationships deleted", relationships_deleted.len());
// panic!("STOP"); // panic!("STOP");
// db_client.query("REMOVE INDEX embeddings ON knowledge_entity").await?; // db_client.query("REMOVE INDEX embeddings ON knowledge_entity").await?;
// db_client.query("DEFINE INDEX embeddings ON knowledge_entity FIELDS embedding HNSW DIMENSION 1536").await?; // db_client
db_client.query("REBUILD INDEX IF EXISTS embeddings ON knowledge_entity").await?; // .query("DEFINE INDEX idx_embedding ON text_chunk FIELDS embedding HNSW DIMENSION 1536")
// .await?;
db_client
.query("REBUILD INDEX IF EXISTS idx_embedding ON text_chunk")
.await?;
db_client
.query("REBUILD INDEX IF EXISTS embeddings ON knowledge_entity")
.await?;
// Step 1: Send to LLM for analysis // Step 1: Send to LLM for analysis
let analysis = create_json_ld(&self.category, &self.instructions, &self.text, &db_client).await?; let analysis = create_json_ld(
&self.category,
&self.instructions,
&self.text,
&db_client,
&openai_client,
)
.await?;
// info!("{:#?}", &analysis); // info!("{:#?}", &analysis);
// Step 2: Convert LLM analysis to database entities // Step 2: Convert LLM analysis to database entities
let (entities, relationships) = analysis.to_database_entities(&self.id).await?; let (entities, relationships) = analysis
.to_database_entities(&self.id, &openai_client)
.await?;
// Step 3: Store in database // Step 3: Store in database
self.store_in_graph_db(entities, relationships, &db_client).await?; self.store_in_graph_db(entities, relationships, &db_client)
.await?;
// Step 4: Split text and store in Vector DB // Step 4: Split text and store in Vector DB
// self.store_in_vector_db().await?; self.store_in_vector_db(&db_client, &openai_client).await?;
Ok(()) Ok(())
} }
@@ -87,14 +169,17 @@ impl TextContent {
db_client: &Surreal<Client>, db_client: &Surreal<Client>,
) -> Result<(), ProcessingError> { ) -> Result<(), ProcessingError> {
for entity in &entities { for entity in &entities {
info!("{:?}, {:?}, {:?}", &entity.id, &entity.name, &entity.description); info!(
"{:?}, {:?}, {:?}",
&entity.id, &entity.name, &entity.description
);
let _created: Option<KnowledgeEntity> = db_client let _created: Option<KnowledgeEntity> = db_client
.create(("knowledge_entity", &entity.id.to_string())) .create(("knowledge_entity", &entity.id.to_string()))
.content(entity.clone()) .content(entity.clone())
.await?; .await?;
debug!("{:?}",_created); debug!("{:?}", _created);
} }
for relationship in &relationships { for relationship in &relationships {
@@ -105,13 +190,13 @@ impl TextContent {
.content(relationship.clone()) .content(relationship.clone())
.await?; .await?;
debug!("{:?}",_created); debug!("{:?}", _created);
} }
// for relationship in &relationships { // for relationship in &relationships {
// let in_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity",relationship.in_.to_string())).await?; // let in_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity",relationship.in_.to_string())).await?;
// let out_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity", relationship.out.to_string())).await?; // let out_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity", relationship.out.to_string())).await?;
// if let (Some(in_), Some(out)) = (in_entity, out_entity) { // if let (Some(in_), Some(out)) = (in_entity, out_entity) {
// info!("{} - {} is {} to {} - {}", in_.id, in_.name, relationship.relationship_type, out.id, out.name); // info!("{} - {} is {} to {} - {}", in_.id, in_.name, relationship.relationship_type, out.id, out.name);
// } // }
@@ -120,24 +205,59 @@ impl TextContent {
// } // }
// } // }
info!("Inserted to database: {:?} entities, {:?} relationships", entities.len(), relationships.len()); info!(
"Inserted to database: {:?} entities, {:?} relationships",
entities.len(),
relationships.len()
);
Ok(()) Ok(())
} }
/// Splits text and stores it in a vector database. /// Splits text and stores it in a vector database.
#[allow(dead_code)] #[allow(dead_code)]
async fn store_in_vector_db(&self) -> Result<(), ProcessingError> { async fn store_in_vector_db(
// TODO: Implement text splitting and vector storage logic. &self,
// Example: db_client: &Surreal<Client>,
/* openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
let chunks = text_splitter::split(&self.text); ) -> Result<(), ProcessingError> {
let vector_db = VectorDB::new("http://vector-db:5000"); let max_characters = 500..2000;
let splitter = TextSplitter::new(max_characters);
let chunks = splitter.chunks(self.text.as_str());
for chunk in chunks { for chunk in chunks {
vector_db.insert(chunk).await.map_err(|e| ProcessingError::VectorDBError(e.to_string()))?; info!("Chunk: {}", chunk);
let embedding = generate_embedding(&openai_client, chunk.to_string()).await?;
let text_chunk = TextChunk {
id: Uuid::new_v4().to_string(),
source_id: self.id.clone(),
chunk: chunk.to_string(),
embedding,
};
info!("{:?}", text_chunk);
let _created: Option<TextChunk> = db_client
.create(("text_chunk", text_chunk.id.clone()))
.content(text_chunk)
.await?;
debug!("{:?}", _created);
} }
*/
unimplemented!() Ok(())
}
/// Stores text content in database
async fn store_text_content(&self, db_client: &Surreal<Client>) -> Result<(), ProcessingError> {
let _created: Option<TextContent> = db_client
.create(("text_content", self.id.clone()))
.content(self.clone())
.await?;
debug!("{:?}", _created);
Ok(())
} }
} }

View File

@@ -38,7 +38,7 @@ pub struct LLMGraphAnalysisResult {
pub relationships: Vec<LLMRelationship>, pub relationships: Vec<LLMRelationship>,
} }
async fn generate_embedding( pub async fn generate_embedding(
client: &async_openai::Client<async_openai::config::OpenAIConfig>, client: &async_openai::Client<async_openai::config::OpenAIConfig>,
input: String, input: String,
) -> Result<Vec<f32>, ProcessingError> { ) -> Result<Vec<f32>, ProcessingError> {
@@ -73,13 +73,15 @@ impl LLMGraphAnalysisResult {
/// # Arguments /// # Arguments
/// ///
/// * `source_id` - A UUID representing the source identifier. /// * `source_id` - A UUID representing the source identifier.
/// * `openai_client` - OpenAI client for LLM calls.
/// ///
/// # Returns /// # Returns
/// ///
/// * `Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), ProcessingError>` - A tuple containing vectors of `KnowledgeEntity` and `KnowledgeRelationship`. /// * `Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), ProcessingError>` - A tuple containing vectors of `KnowledgeEntity` and `KnowledgeRelationship`.
pub async fn to_database_entities( pub async fn to_database_entities(
&self, &self,
source_id: &Uuid, source_id: &String,
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), ProcessingError> { ) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), ProcessingError> {
let mut mapper = GraphMapper::new(); let mut mapper = GraphMapper::new();
@@ -88,7 +90,6 @@ impl LLMGraphAnalysisResult {
mapper.assign_id(&llm_entity.key); mapper.assign_id(&llm_entity.key);
} }
let openai_client = async_openai::Client::new();
let mut entities = vec![]; let mut entities = vec![];
@@ -154,15 +155,13 @@ pub async fn create_json_ld(
instructions: &str, instructions: &str,
text: &str, text: &str,
db_client: &Surreal<Client>, db_client: &Surreal<Client>,
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<LLMGraphAnalysisResult, ProcessingError> { ) -> Result<LLMGraphAnalysisResult, ProcessingError> {
// Initialize llm client
let client = async_openai::Client::new();
// Format the input for more cohesive comparison // Format the input for more cohesive comparison
let input_text = format!("content: {:?}, category: {:?}, user_instructions: {:?}", text, category, instructions); let input_text = format!("content: {:?}, category: {:?}, user_instructions: {:?}", text, category, instructions);
// Generate embedding of the input // Generate embedding of the input
let input_embedding = generate_embedding(&client, input_text).await?; let input_embedding = generate_embedding(&openai_client, input_text).await?;
let number_of_entities_to_get = 10; let number_of_entities_to_get = 10;
@@ -276,6 +275,7 @@ pub async fn create_json_ld(
6. You will be presented with a few existing KnowledgeEntities that are similar to the current ones. They will have an existing UUID. When creating relationships to these entities, use their UUID. 6. You will be presented with a few existing KnowledgeEntities that are similar to the current ones. They will have an existing UUID. When creating relationships to these entities, use their UUID.
7. Only create relationships between existing KnowledgeEntities. 7. Only create relationships between existing KnowledgeEntities.
8. Entities that exist already in the database should NOT be created again. If there is only a minor overlap, skip creating a new entity. 8. Entities that exist already in the database should NOT be created again. If there is only a minor overlap, skip creating a new entity.
9. A new relationship MUST include a newly created KnowledgeEntity.
"#; "#;
let user_message = format!( let user_message = format!(
@@ -297,7 +297,7 @@ pub async fn create_json_ld(
.map_err(|e| ProcessingError::LLMError(e.to_string()))?; .map_err(|e| ProcessingError::LLMError(e.to_string()))?;
// Send the request to OpenAI // Send the request to OpenAI
let response = client let response = openai_client
.chat() .chat()
.create(request) .create(request)
.await .await