mirror of
https://github.com/perstarkse/minne.git
synced 2026-05-05 23:43:55 +02:00
feat: text splitting and storage
This commit is contained in:
131
Cargo.lock
generated
131
Cargo.lock
generated
@@ -261,7 +261,7 @@ checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
@@ -273,7 +273,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -391,7 +391,7 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strum",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
@@ -535,7 +535,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -552,7 +552,7 @@ checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -572,6 +572,18 @@ version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
|
||||
|
||||
[[package]]
|
||||
name = "auto_enums"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "459b77b7e855f875fd15f101064825cd79eb83185a961d66e6298560126facfb"
|
||||
dependencies = [
|
||||
"derive_utils",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.3.0"
|
||||
@@ -643,7 +655,7 @@ checksum = "57d123550fa8d071b7255cb0cc04dc302baa6c8c4a79f55701552684d8399bce"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -675,7 +687,7 @@ dependencies = [
|
||||
"heck 0.5.0",
|
||||
"proc-macro-error",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
"ubyte",
|
||||
]
|
||||
|
||||
@@ -870,7 +882,7 @@ dependencies = [
|
||||
"proc-macro-crate",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
"syn_derive",
|
||||
]
|
||||
|
||||
@@ -1178,7 +1190,7 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1189,7 +1201,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1246,7 +1258,7 @@ checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1277,7 +1289,7 @@ dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1287,7 +1299,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc"
|
||||
dependencies = [
|
||||
"derive_builder_core",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_utils"
|
||||
version = "0.14.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "65f152f4b8559c4da5d574bafc7af85454d706b4c5fe8b530d508cacbb6807ea"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1345,7 +1368,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1643,7 +1666,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1864,7 +1887,7 @@ dependencies = [
|
||||
"markup5ever",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2350,7 +2373,7 @@ checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2419,7 +2442,7 @@ dependencies = [
|
||||
"cfg-if",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2616,9 +2639,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.19.0"
|
||||
version = "1.20.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
||||
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-probe"
|
||||
@@ -2824,7 +2847,7 @@ dependencies = [
|
||||
"phf_shared 0.11.2",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
@@ -2870,7 +2893,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3278,7 +3301,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3419,7 +3442,7 @@ checksum = "5f0ec466e5d8dca9965eb6871879677bef5590cf7525ad96cae14376efb75073"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3828,7 +3851,7 @@ checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3893,7 +3916,7 @@ dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4157,7 +4180,7 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustversion",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4312,9 +4335,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.77"
|
||||
version = "2.0.87"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed"
|
||||
checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -4330,7 +4353,7 @@ dependencies = [
|
||||
"proc-macro-error",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4356,7 +4379,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4419,23 +4442,40 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.63"
|
||||
name = "text-splitter"
|
||||
version = "0.18.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
|
||||
checksum = "189450e9eaff1a8037cca4d60ca62c134a7e601187430bdd487c86e25e8d6641"
|
||||
dependencies = [
|
||||
"ahash 0.8.11",
|
||||
"auto_enums",
|
||||
"either",
|
||||
"itertools 0.13.0",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"strum",
|
||||
"thiserror",
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.63"
|
||||
version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
|
||||
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4529,7 +4569,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4649,7 +4689,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4805,6 +4845,12 @@ dependencies = [
|
||||
"unicode-script",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-segmentation"
|
||||
version = "1.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.1.14"
|
||||
@@ -4923,7 +4969,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
@@ -4957,7 +5003,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
@@ -5324,7 +5370,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.77",
|
||||
"syn 2.0.87",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5351,6 +5397,7 @@ dependencies = [
|
||||
"sha2",
|
||||
"surrealdb",
|
||||
"tempfile",
|
||||
"text-splitter",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
|
||||
@@ -18,6 +18,7 @@ serde_json = "1.0.128"
|
||||
sha2 = "0.10.8"
|
||||
surrealdb = "2.0.4"
|
||||
tempfile = "3.12.0"
|
||||
text-splitter = "0.18.1"
|
||||
thiserror = "1.0.63"
|
||||
tokio = { version = "1.40.0", features = ["full"] }
|
||||
tracing = "0.1.40"
|
||||
|
||||
@@ -18,7 +18,7 @@ pub struct KnowledgeEntity {
|
||||
pub embedding: Option<Vec<f32>>,
|
||||
}
|
||||
|
||||
fn thing_to_string<'de, D>(deserializer: D) -> Result<String, D::Error>
|
||||
pub fn thing_to_string<'de, D>(deserializer: D) -> Result<String, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
|
||||
@@ -38,7 +38,7 @@ impl IngressObject {
|
||||
let text = Self::fetch_text_from_url(url).await?;
|
||||
let id = Uuid::new_v4();
|
||||
Ok(TextContent {
|
||||
id,
|
||||
id: id.to_string(),
|
||||
text,
|
||||
instructions: instructions.clone(),
|
||||
category: category.clone(),
|
||||
@@ -48,7 +48,7 @@ impl IngressObject {
|
||||
IngressObject::Text { text, instructions, category } => {
|
||||
let id = Uuid::new_v4();
|
||||
Ok(TextContent {
|
||||
id,
|
||||
id: id.to_string(),
|
||||
text: text.clone(),
|
||||
instructions: instructions.clone(),
|
||||
category: category.clone(),
|
||||
@@ -59,7 +59,7 @@ impl IngressObject {
|
||||
let id = Uuid::new_v4();
|
||||
let text = Self::extract_text_from_file(file_info).await?;
|
||||
Ok(TextContent {
|
||||
id,
|
||||
id: id.to_string(),
|
||||
text,
|
||||
instructions: instructions.clone(),
|
||||
category: category.clone(),
|
||||
|
||||
@@ -1,17 +1,32 @@
|
||||
use crate::{
|
||||
models::file_info::FileInfo,
|
||||
surrealdb::{SurrealDbClient, SurrealError},
|
||||
utils::llm::{create_json_ld, generate_embedding},
|
||||
};
|
||||
use async_openai::error::OpenAIError;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use surrealdb::{engine::remote::ws::Client, Surreal};
|
||||
use text_splitter::TextSplitter;
|
||||
use thiserror::Error;
|
||||
use tracing::{debug, info};
|
||||
use uuid::Uuid;
|
||||
use crate::{models::file_info::FileInfo, surrealdb::{SurrealDbClient, SurrealError}, utils::llm::create_json_ld};
|
||||
use thiserror::Error;
|
||||
|
||||
use super::graph_entities::{KnowledgeEntity, KnowledgeRelationship};
|
||||
use super::graph_entities::{thing_to_string, KnowledgeEntity, KnowledgeRelationship};
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct TextChunk {
|
||||
#[serde(deserialize_with = "thing_to_string")]
|
||||
id: String,
|
||||
source_id: String,
|
||||
chunk: String,
|
||||
embedding: Vec<f32>,
|
||||
}
|
||||
|
||||
/// Represents a single piece of text content extracted from various sources.
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TextContent {
|
||||
pub id: Uuid,
|
||||
#[serde(deserialize_with = "thing_to_string")]
|
||||
pub id: String,
|
||||
pub text: String,
|
||||
pub file_info: Option<FileInfo>,
|
||||
pub instructions: String,
|
||||
@@ -26,13 +41,13 @@ pub enum ProcessingError {
|
||||
|
||||
#[error("SurrealDB error: {0}")]
|
||||
SurrealError(#[from] SurrealError),
|
||||
|
||||
|
||||
#[error("SurrealDb error: {0}")]
|
||||
SurrealDbError(#[from] surrealdb::Error),
|
||||
|
||||
|
||||
#[error("Graph DB storage error: {0}")]
|
||||
GraphDBError(String),
|
||||
|
||||
|
||||
#[error("Vector DB storage error: {0}")]
|
||||
VectorDBError(String),
|
||||
|
||||
@@ -43,39 +58,106 @@ pub enum ProcessingError {
|
||||
OpenAIerror(#[from] OpenAIError),
|
||||
}
|
||||
|
||||
async fn vector_comparison<T>(
|
||||
take: u8,
|
||||
input_text: String,
|
||||
db_client: &Surreal<Client>,
|
||||
table: String,
|
||||
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||
) -> Result<Vec<T>, ProcessingError>
|
||||
where
|
||||
T: for<'de> serde::Deserialize<'de>, // Add this trait bound for deserialization
|
||||
{
|
||||
let input_embedding = generate_embedding(&openai_client, input_text).await?;
|
||||
|
||||
// Construct the query
|
||||
let closest_query = format!("SELECT *, vector::distance::knn() AS distance FROM {} WHERE embedding <|{},40|> {:?} ORDER BY distance",table, take, input_embedding);
|
||||
|
||||
// Perform query and deserialize to struct
|
||||
let closest_entities: Vec<T> = db_client.query(closest_query).await?.take(0)?;
|
||||
|
||||
Ok(closest_entities)
|
||||
}
|
||||
|
||||
async fn get_related_nodes(
|
||||
id: String,
|
||||
db_client: &Surreal<Client>,
|
||||
) -> Result<Vec<KnowledgeEntity>, ProcessingError> {
|
||||
let query = format!("SELECT -> knowledge_relationship -> knowledge_entity as related_nodes FROM knowledge_entity WHERE source_id = `{}`", id);
|
||||
|
||||
// let query = format!("SELECT * FROM knowledge_entity WHERE in OR out {}", id);
|
||||
let related_nodes: Vec<KnowledgeEntity> = db_client.query(query).await?.take(0)?;
|
||||
|
||||
Ok(related_nodes)
|
||||
}
|
||||
|
||||
impl TextContent {
|
||||
/// Processes the `TextContent` by sending it to an LLM, storing in a graph DB, and vector DB.
|
||||
pub async fn process(&self) -> Result<(), ProcessingError> {
|
||||
// Store TextContent
|
||||
let db_client = SurrealDbClient::new().await?;
|
||||
let openai_client = async_openai::Client::new();
|
||||
|
||||
// let deleted: Vec<KnowledgeEntity> = db_client.delete("knowledge_entity").await?;
|
||||
self.store_text_content(&db_client).await?;
|
||||
|
||||
let closest_text_content: Vec<TextChunk> = vector_comparison(
|
||||
4,
|
||||
self.text.clone(),
|
||||
&db_client,
|
||||
"text_chunk".to_string(),
|
||||
&openai_client,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for node in closest_text_content {
|
||||
info!("{}-{}", node.id, node.source_id);
|
||||
let related_nodes = get_related_nodes(node.source_id, &db_client).await?;
|
||||
info!("{:?}", related_nodes);
|
||||
}
|
||||
|
||||
panic!("STOPPING");
|
||||
// let deleted: Vec<TextChunk> = db_client.delete("text_chunk").await?;
|
||||
// info! {"{:?} KnowledgeEntities deleted", deleted.len()};
|
||||
|
||||
|
||||
// let relationships_deleted: Vec<KnowledgeRelationship> =
|
||||
// db_client.delete("knowledge_relationship").await?;
|
||||
// info!("{:?} Relationships deleted", relationships_deleted.len());
|
||||
|
||||
|
||||
// panic!("STOP");
|
||||
|
||||
// db_client.query("REMOVE INDEX embeddings ON knowledge_entity").await?;
|
||||
// db_client.query("DEFINE INDEX embeddings ON knowledge_entity FIELDS embedding HNSW DIMENSION 1536").await?;
|
||||
db_client.query("REBUILD INDEX IF EXISTS embeddings ON knowledge_entity").await?;
|
||||
|
||||
// db_client
|
||||
// .query("DEFINE INDEX idx_embedding ON text_chunk FIELDS embedding HNSW DIMENSION 1536")
|
||||
// .await?;
|
||||
db_client
|
||||
.query("REBUILD INDEX IF EXISTS idx_embedding ON text_chunk")
|
||||
.await?;
|
||||
db_client
|
||||
.query("REBUILD INDEX IF EXISTS embeddings ON knowledge_entity")
|
||||
.await?;
|
||||
|
||||
// Step 1: Send to LLM for analysis
|
||||
let analysis = create_json_ld(&self.category, &self.instructions, &self.text, &db_client).await?;
|
||||
let analysis = create_json_ld(
|
||||
&self.category,
|
||||
&self.instructions,
|
||||
&self.text,
|
||||
&db_client,
|
||||
&openai_client,
|
||||
)
|
||||
.await?;
|
||||
// info!("{:#?}", &analysis);
|
||||
|
||||
// Step 2: Convert LLM analysis to database entities
|
||||
let (entities, relationships) = analysis.to_database_entities(&self.id).await?;
|
||||
|
||||
let (entities, relationships) = analysis
|
||||
.to_database_entities(&self.id, &openai_client)
|
||||
.await?;
|
||||
|
||||
// Step 3: Store in database
|
||||
self.store_in_graph_db(entities, relationships, &db_client).await?;
|
||||
|
||||
self.store_in_graph_db(entities, relationships, &db_client)
|
||||
.await?;
|
||||
|
||||
// Step 4: Split text and store in Vector DB
|
||||
// self.store_in_vector_db().await?;
|
||||
self.store_in_vector_db(&db_client, &openai_client).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -87,14 +169,17 @@ impl TextContent {
|
||||
db_client: &Surreal<Client>,
|
||||
) -> Result<(), ProcessingError> {
|
||||
for entity in &entities {
|
||||
info!("{:?}, {:?}, {:?}", &entity.id, &entity.name, &entity.description);
|
||||
|
||||
info!(
|
||||
"{:?}, {:?}, {:?}",
|
||||
&entity.id, &entity.name, &entity.description
|
||||
);
|
||||
|
||||
let _created: Option<KnowledgeEntity> = db_client
|
||||
.create(("knowledge_entity", &entity.id.to_string()))
|
||||
.content(entity.clone())
|
||||
.await?;
|
||||
|
||||
debug!("{:?}",_created);
|
||||
debug!("{:?}", _created);
|
||||
}
|
||||
|
||||
for relationship in &relationships {
|
||||
@@ -105,13 +190,13 @@ impl TextContent {
|
||||
.content(relationship.clone())
|
||||
.await?;
|
||||
|
||||
debug!("{:?}",_created);
|
||||
debug!("{:?}", _created);
|
||||
}
|
||||
|
||||
// for relationship in &relationships {
|
||||
// let in_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity",relationship.in_.to_string())).await?;
|
||||
// let out_entity: Option<KnowledgeEntity> = db_client.select(("knowledge_entity", relationship.out.to_string())).await?;
|
||||
|
||||
|
||||
// if let (Some(in_), Some(out)) = (in_entity, out_entity) {
|
||||
// info!("{} - {} is {} to {} - {}", in_.id, in_.name, relationship.relationship_type, out.id, out.name);
|
||||
// }
|
||||
@@ -120,24 +205,59 @@ impl TextContent {
|
||||
// }
|
||||
// }
|
||||
|
||||
info!("Inserted to database: {:?} entities, {:?} relationships", entities.len(), relationships.len());
|
||||
info!(
|
||||
"Inserted to database: {:?} entities, {:?} relationships",
|
||||
entities.len(),
|
||||
relationships.len()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Splits text and stores it in a vector database.
|
||||
#[allow(dead_code)]
|
||||
async fn store_in_vector_db(&self) -> Result<(), ProcessingError> {
|
||||
// TODO: Implement text splitting and vector storage logic.
|
||||
// Example:
|
||||
/*
|
||||
let chunks = text_splitter::split(&self.text);
|
||||
let vector_db = VectorDB::new("http://vector-db:5000");
|
||||
async fn store_in_vector_db(
|
||||
&self,
|
||||
db_client: &Surreal<Client>,
|
||||
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||
) -> Result<(), ProcessingError> {
|
||||
let max_characters = 500..2000;
|
||||
let splitter = TextSplitter::new(max_characters);
|
||||
|
||||
let chunks = splitter.chunks(self.text.as_str());
|
||||
|
||||
for chunk in chunks {
|
||||
vector_db.insert(chunk).await.map_err(|e| ProcessingError::VectorDBError(e.to_string()))?;
|
||||
info!("Chunk: {}", chunk);
|
||||
let embedding = generate_embedding(&openai_client, chunk.to_string()).await?;
|
||||
let text_chunk = TextChunk {
|
||||
id: Uuid::new_v4().to_string(),
|
||||
source_id: self.id.clone(),
|
||||
chunk: chunk.to_string(),
|
||||
embedding,
|
||||
};
|
||||
|
||||
info!("{:?}", text_chunk);
|
||||
|
||||
let _created: Option<TextChunk> = db_client
|
||||
.create(("text_chunk", text_chunk.id.clone()))
|
||||
.content(text_chunk)
|
||||
.await?;
|
||||
|
||||
debug!("{:?}", _created);
|
||||
}
|
||||
*/
|
||||
unimplemented!()
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stores text content in database
|
||||
async fn store_text_content(&self, db_client: &Surreal<Client>) -> Result<(), ProcessingError> {
|
||||
let _created: Option<TextContent> = db_client
|
||||
.create(("text_content", self.id.clone()))
|
||||
.content(self.clone())
|
||||
.await?;
|
||||
|
||||
debug!("{:?}", _created);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@ pub struct LLMGraphAnalysisResult {
|
||||
pub relationships: Vec<LLMRelationship>,
|
||||
}
|
||||
|
||||
async fn generate_embedding(
|
||||
pub async fn generate_embedding(
|
||||
client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||
input: String,
|
||||
) -> Result<Vec<f32>, ProcessingError> {
|
||||
@@ -73,13 +73,15 @@ impl LLMGraphAnalysisResult {
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `source_id` - A UUID representing the source identifier.
|
||||
/// * `openai_client` - OpenAI client for LLM calls.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), ProcessingError>` - A tuple containing vectors of `KnowledgeEntity` and `KnowledgeRelationship`.
|
||||
pub async fn to_database_entities(
|
||||
&self,
|
||||
source_id: &Uuid,
|
||||
source_id: &String,
|
||||
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||
) -> Result<(Vec<KnowledgeEntity>, Vec<KnowledgeRelationship>), ProcessingError> {
|
||||
let mut mapper = GraphMapper::new();
|
||||
|
||||
@@ -88,7 +90,6 @@ impl LLMGraphAnalysisResult {
|
||||
mapper.assign_id(&llm_entity.key);
|
||||
}
|
||||
|
||||
let openai_client = async_openai::Client::new();
|
||||
|
||||
let mut entities = vec![];
|
||||
|
||||
@@ -154,15 +155,13 @@ pub async fn create_json_ld(
|
||||
instructions: &str,
|
||||
text: &str,
|
||||
db_client: &Surreal<Client>,
|
||||
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||
) -> Result<LLMGraphAnalysisResult, ProcessingError> {
|
||||
// Initialize llm client
|
||||
let client = async_openai::Client::new();
|
||||
|
||||
// Format the input for more cohesive comparison
|
||||
let input_text = format!("content: {:?}, category: {:?}, user_instructions: {:?}", text, category, instructions);
|
||||
|
||||
// Generate embedding of the input
|
||||
let input_embedding = generate_embedding(&client, input_text).await?;
|
||||
let input_embedding = generate_embedding(&openai_client, input_text).await?;
|
||||
|
||||
let number_of_entities_to_get = 10;
|
||||
|
||||
@@ -276,6 +275,7 @@ pub async fn create_json_ld(
|
||||
6. You will be presented with a few existing KnowledgeEntities that are similar to the current ones. They will have an existing UUID. When creating relationships to these entities, use their UUID.
|
||||
7. Only create relationships between existing KnowledgeEntities.
|
||||
8. Entities that exist already in the database should NOT be created again. If there is only a minor overlap, skip creating a new entity.
|
||||
9. A new relationship MUST include a newly created KnowledgeEntity.
|
||||
"#;
|
||||
|
||||
let user_message = format!(
|
||||
@@ -297,7 +297,7 @@ pub async fn create_json_ld(
|
||||
.map_err(|e| ProcessingError::LLMError(e.to_string()))?;
|
||||
|
||||
// Send the request to OpenAI
|
||||
let response = client
|
||||
let response = openai_client
|
||||
.chat()
|
||||
.create(request)
|
||||
.await
|
||||
|
||||
Reference in New Issue
Block a user