diff --git a/database.md b/database.md deleted file mode 100644 index 16c50be..0000000 --- a/database.md +++ /dev/null @@ -1,61 +0,0 @@ -# SurrealDB only - -Right now we have the FileInfo stored in "files" - -- Change the uuid to Uuid type, and have the database layer still use String. Means parsing and unparsing but thats fine. - -``` -pub struct FileInfo { - pub uuid: String, - pub sha256: String, - pub path: String, - pub mime_type: String, -} -``` - -We create TextContent objects, which we should store? - -- We store the "snippets" along with the vectors, but it would make sense to store the whole textcontent, at least for not enormous files? - -``` -pub struct TextContent { - pub id: Uuid, - pub text: String, - pub file_info: Option, - pub instructions: String, - pub category: String, -} -``` - -We create KnowledgeSource, which we will store as a node, and its relationship as edges - -- Add a uuid to we can link the textcontent and files to the knowledge sources? -- Rename id to name, and use it as the id? Id represents the TextContent link, from which several knowledgeentities can be gathered. - -``` -pub struct KnowledgeSource { - pub name: String, - pub id: Uuid, - pub title: String, - pub description: String, - pub relationships: Vec, -} -``` - -We will create embeddings and vector representations of TextContent, possibly split up and store in vector DB - -``` -pub struct VectorEmbeddingOfTextContent { - pub id: Uuid, - pub vectors: Vec(or something), - pub text_content: String, - pub category: String, -} -``` - -______________________________________________________________________ - -## Goals - -- Smooth operations when updating, removing and adding data -- Smooth queries where one can search, get a vector snippet, which links to a graph node and its edges, and also the fulltext document. 
diff --git a/src/analysis/ingress/ingress_analyser.rs b/src/analysis/ingress/ingress_analyser.rs new file mode 100644 index 0000000..cbf65d1 --- /dev/null +++ b/src/analysis/ingress/ingress_analyser.rs @@ -0,0 +1,147 @@ +use crate::{ + analysis::ingress::{ + prompt::{get_ingress_analysis_schema, INGRESS_ANALYSIS_SYSTEM_MESSAGE}, + types::llm_analysis_result::LLMGraphAnalysisResult, + }, + error::ProcessingError, + retrieval::vector::find_items_by_vector_similarity, + storage::types::{knowledge_entity::KnowledgeEntity, StoredObject}, +}; +use async_openai::types::{ + ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage, + CreateChatCompletionRequest, CreateChatCompletionRequestArgs, ResponseFormat, + ResponseFormatJsonSchema, +}; +use serde_json::json; +use surrealdb::engine::remote::ws::Client; +use surrealdb::Surreal; +use tracing::{debug, instrument}; + +pub struct IngressAnalyzer<'a> { + db_client: &'a Surreal, + openai_client: &'a async_openai::Client, +} + +impl<'a> IngressAnalyzer<'a> { + pub fn new( + db_client: &'a Surreal, + openai_client: &'a async_openai::Client, + ) -> Self { + Self { + db_client, + openai_client, + } + } + + #[instrument(skip(self))] + pub async fn analyze_content( + &self, + category: &str, + instructions: &str, + text: &str, + ) -> Result { + let similar_entities = self + .find_similar_entities(category, instructions, text) + .await?; + let llm_request = + self.prepare_llm_request(category, instructions, text, &similar_entities)?; + self.perform_analysis(llm_request).await + } + + #[instrument(skip(self))] + async fn find_similar_entities( + &self, + category: &str, + instructions: &str, + text: &str, + ) -> Result, ProcessingError> { + let input_text = format!( + "content: {}, category: {}, user_instructions: {}", + text, category, instructions + ); + + find_items_by_vector_similarity( + 10, + input_text, + self.db_client, + KnowledgeEntity::table_name().to_string(), + self.openai_client, + ) + .await + } + + 
#[instrument(skip(self))] + fn prepare_llm_request( + &self, + category: &str, + instructions: &str, + text: &str, + similar_entities: &[KnowledgeEntity], + ) -> Result { + let entities_json = json!(similar_entities + .iter() + .map(|entity| { + json!({ + "KnowledgeEntity": { + "id": entity.id, + "name": entity.name, + "description": entity.description + } + }) + }) + .collect::>()); + + let user_message = format!( + "Category:\n{}\nInstructions:\n{}\nContent:\n{}\nExisting KnowledgeEntities in database:\n{}", + category, instructions, text, entities_json + ); + + debug!("Prepared LLM request message: {}", user_message); + + let response_format = ResponseFormat::JsonSchema { + json_schema: ResponseFormatJsonSchema { + description: Some("Structured analysis of the submitted content".into()), + name: "content_analysis".into(), + schema: Some(get_ingress_analysis_schema()), + strict: Some(true), + }, + }; + + CreateChatCompletionRequestArgs::default() + .model("gpt-4o-mini") + .temperature(0.2) + .max_tokens(2048u32) + .messages([ + ChatCompletionRequestSystemMessage::from(INGRESS_ANALYSIS_SYSTEM_MESSAGE).into(), + ChatCompletionRequestUserMessage::from(user_message).into(), + ]) + .response_format(response_format) + .build() + .map_err(|e| ProcessingError::LLMParsingError(e.to_string())) + } + + #[instrument(skip(self, request))] + async fn perform_analysis( + &self, + request: CreateChatCompletionRequest, + ) -> Result { + let response = self.openai_client.chat().create(request).await?; + debug!("Received LLM response: {:?}", response); + + response + .choices + .first() + .and_then(|choice| choice.message.content.as_ref()) + .ok_or(ProcessingError::LLMParsingError( + "No content found in LLM response".into(), + )) + .and_then(|content| { + serde_json::from_str(content).map_err(|e| { + ProcessingError::LLMParsingError(format!( + "Failed to parse LLM response into analysis: {}", + e + )) + }) + }) + } +} diff --git a/src/analysis/ingress/mod.rs
b/src/analysis/ingress/mod.rs new file mode 100644 index 0000000..5a2916f --- /dev/null +++ b/src/analysis/ingress/mod.rs @@ -0,0 +1,3 @@ +pub mod ingress_analyser; +pub mod prompt; +pub mod types; diff --git a/src/analysis/ingress/prompt.rs b/src/analysis/ingress/prompt.rs new file mode 100644 index 0000000..a3c028f --- /dev/null +++ b/src/analysis/ingress/prompt.rs @@ -0,0 +1,81 @@ +use serde_json::{json, Value}; + +pub static INGRESS_ANALYSIS_SYSTEM_MESSAGE: &str = r#" + You are an expert document analyzer. You will receive a document's text content, along with user instructions and a category. Your task is to provide a structured JSON object representing the content in a graph format suitable for a graph database. You will also be presented with some existing knowledge_entities from the database, do not replicate these! + + The JSON should have the following structure: + + { + "knowledge_entities": [ + { + "key": "unique-key-1", + "name": "Entity Name", + "description": "A detailed description of the entity.", + "entity_type": "TypeOfEntity" + }, + // More entities... + ], + "relationships": [ + { + "type": "RelationshipType", + "source": "unique-key-1 or UUID from existing database", + "target": "unique-key-1 or UUID from existing database" + }, + // More relationships... + ] + } + + Guidelines: + 1. Do NOT generate any IDs or UUIDs. Use a unique `key` for each knowledge entity. + 2. Each KnowledgeEntity should have a unique `key`, a meaningful `name`, and a descriptive `description`. + 3. Define the type of each KnowledgeEntity using the following categories: Idea, Project, Document, Page, TextSnippet. + 4. Establish relationships between entities using types like RelatedTo, RelevantTo, SimilarTo. + 5. Use the `source` key to indicate the originating entity and the `target` key to indicate the related entity" + 6. You will be presented with a few existing KnowledgeEntities that are similar to the current ones. They will have an existing UUID. 
When creating relationships to these entities, use their UUID. + 7. Only create relationships between existing KnowledgeEntities. + 8. Entities that exist already in the database should NOT be created again. If there is only a minor overlap, skip creating a new entity. + 9. A new relationship MUST include a newly created KnowledgeEntity. + "#; + +pub fn get_ingress_analysis_schema() -> Value { + json!({ + "type": "object", + "properties": { + "knowledge_entities": { + "type": "array", + "items": { + "type": "object", + "properties": { + "key": { "type": "string" }, + "name": { "type": "string" }, + "description": { "type": "string" }, + "entity_type": { + "type": "string", + "enum": ["idea", "project", "document", "page", "textsnippet"] + } + }, + "required": ["key", "name", "description", "entity_type"], + "additionalProperties": false + } + }, + "relationships": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["RelatedTo", "RelevantTo", "SimilarTo"] + }, + "source": { "type": "string" }, + "target": { "type": "string" } + }, + "required": ["type", "source", "target"], + "additionalProperties": false + } + } + }, + "required": ["knowledge_entities", "relationships"], + "additionalProperties": false + }) +} diff --git a/src/analysis/ingress/types/llm_analysis_result.rs b/src/analysis/ingress/types/llm_analysis_result.rs new file mode 100644 index 0000000..b97aaab --- /dev/null +++ b/src/analysis/ingress/types/llm_analysis_result.rs @@ -0,0 +1,174 @@ +use std::sync::{Arc, Mutex}; + +use serde::{Deserialize, Serialize}; +use tokio::task; + +use crate::{ + error::ProcessingError, + models::graph_entities::GraphMapper, + storage::types::{ + knowledge_entity::{KnowledgeEntity, KnowledgeEntityType}, + knowledge_relationship::KnowledgeRelationship, + }, + utils::embedding::generate_embedding, +}; +use futures::future::try_join_all; // For future parallelization + +#[derive(Debug, Serialize, Deserialize, 
Clone)] +pub struct LLMKnowledgeEntity { + pub key: String, // Temporary identifier + pub name: String, + pub description: String, + pub entity_type: String, // Should match KnowledgeEntityType variants +} + +/// Represents a single relationship from the LLM. +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct LLMRelationship { + #[serde(rename = "type")] + pub type_: String, // e.g., RelatedTo, RelevantTo + pub source: String, // Key of the source entity + pub target: String, // Key of the target entity +} + +/// Represents the entire graph analysis result from the LLM. +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct LLMGraphAnalysisResult { + pub knowledge_entities: Vec, + pub relationships: Vec, +} + +/// Converts the LLM graph analysis result into database entities and relationships. +/// Processes embeddings sequentially for simplicity. +/// +/// # Arguments +/// +/// * `source_id` - A UUID representing the source identifier. +/// * `openai_client` - OpenAI client for LLM calls. +/// +/// # Returns +/// +/// * `Result<(Vec, Vec), ProcessingError>` - A tuple containing vectors of `KnowledgeEntity` and `KnowledgeRelationship`. 
+ +impl LLMGraphAnalysisResult { + // Split the main function into smaller, focused functions + pub async fn to_database_entities( + &self, + source_id: &str, + openai_client: &async_openai::Client, + ) -> Result<(Vec, Vec), ProcessingError> { + // Create mapper and pre-assign IDs + let mapper = Arc::new(Mutex::new(self.create_mapper()?)); + + // Process entities (prepared for future parallelization) + let entities = self + .process_entities(source_id, Arc::clone(&mapper), openai_client) + .await?; + + // Process relationships + let relationships = self.process_relationships(Arc::clone(&mapper))?; + + Ok((entities, relationships)) + } + + fn create_mapper(&self) -> Result { + let mut mapper = GraphMapper::new(); + + // Pre-assign all IDs + for entity in &self.knowledge_entities { + mapper.assign_id(&entity.key); + } + + Ok(mapper) + } + + async fn process_entities( + &self, + source_id: &str, + mapper: Arc>, + openai_client: &async_openai::Client, + ) -> Result, ProcessingError> { + let futures: Vec<_> = self + .knowledge_entities + .iter() + .map(|entity| { + let mapper = Arc::clone(&mapper); + let openai_client = openai_client.clone(); + let source_id = source_id.to_string(); + let entity = entity.clone(); + + task::spawn(async move { + create_single_entity(&entity, &source_id, mapper, &openai_client).await + }) + }) + .collect(); + + let results = try_join_all(futures) + .await? 
+ .into_iter() + .collect::, _>>()?; + + Ok(results) + } + + fn process_relationships( + &self, + mapper: Arc>, + ) -> Result, ProcessingError> { + let mut mapper_guard = mapper + .lock() + .map_err(|_| ProcessingError::GraphProcessingError("Failed to lock mapper".into()))?; + self.relationships + .iter() + .map(|rel| { + let source_db_id = mapper_guard.get_or_parse_id(&rel.source); + let target_db_id = mapper_guard.get_or_parse_id(&rel.target); + + Ok(KnowledgeRelationship::new( + source_db_id.to_string(), + target_db_id.to_string(), + rel.type_.clone(), + None, + )) + }) + .collect() + } +} +async fn create_single_entity( + llm_entity: &LLMKnowledgeEntity, + source_id: &str, + mapper: Arc>, + openai_client: &async_openai::Client, +) -> Result { + let assigned_id = { + let mapper = mapper + .lock() + .map_err(|_| ProcessingError::GraphProcessingError("Failed to lock mapper".into()))?; + mapper + .get_id(&llm_entity.key) + .ok_or_else(|| { + ProcessingError::GraphProcessingError(format!( + "ID not found for key: {}", + llm_entity.key + )) + })? 
+ .to_string() + }; + + let embedding_input = format!( + "name: {}, description: {}, type: {}", + llm_entity.name, llm_entity.description, llm_entity.entity_type + ); + + let embedding = generate_embedding(openai_client, embedding_input).await?; + + Ok(KnowledgeEntity { + id: assigned_id, + name: llm_entity.name.to_string(), + description: llm_entity.description.to_string(), + entity_type: KnowledgeEntityType::from(llm_entity.entity_type.to_string()), + source_id: source_id.to_string(), + metadata: None, + embedding, + }) +} diff --git a/src/analysis/ingress/types/mod.rs b/src/analysis/ingress/types/mod.rs new file mode 100644 index 0000000..3f2223e --- /dev/null +++ b/src/analysis/ingress/types/mod.rs @@ -0,0 +1 @@ +pub mod llm_analysis_result; diff --git a/src/analysis/mod.rs b/src/analysis/mod.rs new file mode 100644 index 0000000..d8d3864 --- /dev/null +++ b/src/analysis/mod.rs @@ -0,0 +1 @@ +pub mod ingress; diff --git a/src/error.rs b/src/error.rs index 7545043..b25cb88 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,5 +1,6 @@ use async_openai::error::OpenAIError; use thiserror::Error; +use tokio::task::JoinError; /// Error types for processing `TextContent`. 
#[derive(Error, Debug)] @@ -18,4 +19,7 @@ pub enum ProcessingError { #[error("LLM parsing error: {0}")] LLMParsingError(String), + + #[error("Task join error: {0}")] + JoinError(#[from] JoinError), } diff --git a/src/lib.rs b/src/lib.rs index 046b5f0..380e0b4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +pub mod analysis; pub mod error; pub mod models; pub mod rabbitmq; diff --git a/src/models/ingress_object.rs b/src/models/ingress_object.rs index 3a2e69d..2b1297a 100644 --- a/src/models/ingress_object.rs +++ b/src/models/ingress_object.rs @@ -2,7 +2,6 @@ use super::ingress_content::IngressContentError; use crate::models::file_info::FileInfo; use crate::storage::types::text_content::TextContent; use serde::{Deserialize, Serialize}; -use uuid::Uuid; /// Knowledge object type, containing the content or reference to it, as well as metadata #[derive(Debug, Serialize, Deserialize, Clone)] diff --git a/src/models/text_content.rs b/src/models/text_content.rs index 4c2b530..e3cf7b5 100644 --- a/src/models/text_content.rs +++ b/src/models/text_content.rs @@ -1,3 +1,4 @@ +use crate::analysis::ingress::ingress_analyser::IngressAnalyzer; use crate::retrieval::graph::find_entities_by_source_id; use crate::retrieval::vector::find_items_by_vector_similarity; use crate::storage::db::store_item; @@ -7,7 +8,7 @@ use crate::storage::types::text_chunk::TextChunk; use crate::storage::types::text_content::TextContent; use crate::storage::types::StoredObject; use crate::utils::embedding::generate_embedding; -use crate::{error::ProcessingError, surrealdb::SurrealDbClient, utils::llm::create_json_ld}; +use crate::{error::ProcessingError, surrealdb::SurrealDbClient}; use surrealdb::{engine::remote::ws::Client, Surreal}; use text_splitter::TextSplitter; use tracing::{debug, info}; @@ -19,8 +20,7 @@ impl TextContent { let openai_client = async_openai::Client::new(); // Store TextContent - let create_operation = store_item(&db_client, self.clone()).await?; - info!("{:?}", 
create_operation); + store_item(&db_client, self.clone()).await?; // Get related nodes let closest_text_content: Vec = find_items_by_vector_similarity( @@ -48,15 +48,10 @@ impl TextContent { db_client.rebuild_indexes().await?; // Step 1: Send to LLM for analysis - let analysis = create_json_ld( - &self.category, - &self.instructions, - &self.text, - &db_client, - &openai_client, - ) - .await?; - // info!("{:#?}", &analysis); + let analyser = IngressAnalyzer::new(&db_client, &openai_client); + let analysis = analyser + .analyze_content(&self.category, &self.instructions, &self.text) + .await?; // Step 2: Convert LLM analysis to database entities let (entities, relationships) = analysis @@ -116,7 +111,7 @@ impl TextContent { for chunk in chunks { info!("Chunk: {}", chunk); - let embedding = generate_embedding(&openai_client, chunk.to_string()).await?; + let embedding = generate_embedding(openai_client, chunk.to_string()).await?; let text_chunk = TextChunk::new(self.id.to_string(), chunk.to_string(), embedding); store_item(db_client, text_chunk).await?; diff --git a/src/utils/llm.rs b/src/utils/llm.rs deleted file mode 100644 index 7138346..0000000 --- a/src/utils/llm.rs +++ /dev/null @@ -1,299 +0,0 @@ -use crate::{ - error::ProcessingError, - models::graph_entities::GraphMapper, - retrieval::vector::find_items_by_vector_similarity, - storage::types::{ - knowledge_entity::{KnowledgeEntity, KnowledgeEntityType}, - knowledge_relationship::KnowledgeRelationship, - StoredObject, - }, -}; -use async_openai::types::{ - ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage, - CreateChatCompletionRequestArgs, -}; -use serde::{Deserialize, Serialize}; -use serde_json::json; -use surrealdb::engine::remote::ws::Client; -use surrealdb::Surreal; -use tracing::debug; -use uuid::Uuid; - -use super::embedding::generate_embedding; - -/// Represents a single knowledge entity from the LLM. 
-#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct LLMKnowledgeEntity { - pub key: String, // Temporary identifier - pub name: String, - pub description: String, - pub entity_type: String, // Should match KnowledgeEntityType variants -} - -/// Represents a single relationship from the LLM. -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct LLMRelationship { - #[serde(rename = "type")] - pub type_: String, // e.g., RelatedTo, RelevantTo - pub source: String, // Key of the source entity - pub target: String, // Key of the target entity -} - -/// Represents the entire graph analysis result from the LLM. -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct LLMGraphAnalysisResult { - pub knowledge_entities: Vec, - pub relationships: Vec, -} - -impl LLMGraphAnalysisResult { - /// Converts the LLM graph analysis result into database entities and relationships. - /// Processes embeddings sequentially for simplicity. - /// - /// # Arguments - /// - /// * `source_id` - A UUID representing the source identifier. - /// * `openai_client` - OpenAI client for LLM calls. - /// - /// # Returns - /// - /// * `Result<(Vec, Vec), ProcessingError>` - A tuple containing vectors of `KnowledgeEntity` and `KnowledgeRelationship`. - pub async fn to_database_entities( - &self, - source_id: &String, - openai_client: &async_openai::Client, - ) -> Result<(Vec, Vec), ProcessingError> { - let mut mapper = GraphMapper::new(); - - // Step 1: Assign unique IDs to all knowledge entities upfront - for llm_entity in &self.knowledge_entities { - mapper.assign_id(&llm_entity.key); - } - - let mut entities = vec![]; - - // Step 2: Process each knowledge entity sequentially - for llm_entity in &self.knowledge_entities { - // Retrieve the assigned ID for the current entity - let assigned_id = mapper - .get_id(&llm_entity.key) - .ok_or_else(|| { - ProcessingError::GraphProcessingError(format!( - "ID not found for key: {}", - llm_entity.key - )) - })? 
- .clone(); - - // Prepare the embedding input - let embedding_input = format!( - "name: {}, description: {}, type: {}", - llm_entity.name, llm_entity.description, llm_entity.entity_type - ); - - // Generate embedding - let embedding = generate_embedding(&openai_client, embedding_input).await?; - - // Construct the KnowledgeEntity with embedding - let knowledge_entity = KnowledgeEntity { - id: assigned_id.to_string(), - name: llm_entity.name.clone(), - description: llm_entity.description.clone(), - entity_type: KnowledgeEntityType::from(llm_entity.entity_type.clone()), - source_id: source_id.to_string(), - metadata: None, - embedding, - }; - - entities.push(knowledge_entity); - } - - // Step 3: Process relationships using the pre-assigned IDs - let relationships: Vec = self - .relationships - .iter() - .filter_map(|llm_rel| { - let source_db_id = mapper.get_or_parse_id(&llm_rel.source); - let target_db_id = mapper.get_or_parse_id(&llm_rel.target); - debug!("IN: {}, OUT: {}", &source_db_id, &target_db_id); - - Some(KnowledgeRelationship::new( - source_db_id.to_string(), - target_db_id.to_string(), - llm_rel.type_.to_owned(), - None, - )) - }) - .collect(); - - Ok((entities, relationships)) - } -} - -/// Sends text to an LLM for analysis. 
-pub async fn create_json_ld( - category: &str, - instructions: &str, - text: &str, - db_client: &Surreal, - openai_client: &async_openai::Client, -) -> Result { - // Format the input for more cohesive comparison - let input_text = format!( - "content: {}, category: {}, user_instructions: {}", - text, category, instructions - ); - - let closest_entities: Vec = find_items_by_vector_similarity( - 10, - input_text, - db_client, - KnowledgeEntity::table_name().to_string(), - openai_client, - ) - .await?; - - // Format the KnowledgeEntity, remove redudant fields - let closest_entities_to_llm = json!(closest_entities - .iter() - .map(|entity| { - json!({ - "KnowledgeEntity": { - "id": entity.id, - "name": entity.name, - "description": entity.description - } - }) - }) - .collect::>()); - - // let db_context = serde_json::to_string_pretty(&closest_entities_to_llm).unwrap(); - - let schema = json!({ - "type": "object", - "properties": { - "knowledge_entities": { - "type": "array", - "items": { - "type": "object", - "properties": { - "key": { "type": "string" }, - "name": { "type": "string" }, - "description": { "type": "string" }, - "entity_type": { - "type": "string", - "enum": ["idea", "project", "document", "page", "textsnippet"] - } - }, - "required": ["key", "name", "description", "entity_type"], - "additionalProperties": false - } - }, - "relationships": { - "type": "array", - "items": { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": ["RelatedTo", "RelevantTo", "SimilarTo"] - }, - "source": { "type": "string" }, - "target": { "type": "string" } - }, - "required": ["type", "source", "target"], - "additionalProperties": false - } - } - }, - "required": ["knowledge_entities", "relationships"], - "additionalProperties": false - }); - - let response_format = async_openai::types::ResponseFormat::JsonSchema { - json_schema: async_openai::types::ResponseFormatJsonSchema { - description: Some("Structured analysis of the submitted 
content".into()), - name: "content_analysis".into(), - schema: Some(schema), - strict: Some(true), - }, - }; - - // Construct the system and user messages - let system_message = r#" - You are an expert document analyzer. You will receive a document's text content, along with user instructions and a category. Your task is to provide a structured JSON object representing the content in a graph format suitable for a graph database. You will also be presented with some existing knowledge_entities from the database, do not replicate these! - - The JSON should have the following structure: - - { - "knowledge_entities": [ - { - "key": "unique-key-1", - "name": "Entity Name", - "description": "A detailed description of the entity.", - "entity_type": "TypeOfEntity" - }, - // More entities... - ], - "relationships": [ - { - "type": "RelationshipType", - "source": "unique-key-1 or UUID from existing database", - "target": "unique-key-1 or UUID from existing database" - }, - // More relationships... - ] - } - - Guidelines: - 1. Do NOT generate any IDs or UUIDs. Use a unique `key` for each knowledge entity. - 2. Each KnowledgeEntity should have a unique `key`, a meaningful `name`, and a descriptive `description`. - 3. Define the type of each KnowledgeEntity using the following categories: Idea, Project, Document, Page, TextSnippet. - 4. Establish relationships between entities using types like RelatedTo, RelevantTo, SimilarTo. - 5. Use the `source` key to indicate the originating entity and the `target` key to indicate the related entity" - 6. You will be presented with a few existing KnowledgeEntities that are similar to the current ones. They will have an existing UUID. When creating relationships to these entities, use their UUID. - 7. Only create relationships between existing KnowledgeEntities. - 8. Entities that exist already in the database should NOT be created again. If there is only a minor overlap, skip creating a new entity. - 9. 
A new relationship MUST include a newly created KnowledgeEntity. - "#; - - let user_message = format!( - "Category:\n{}\nInstructions:\n{}\nContent:\n{}\nExisting KnowledgeEntities in database:\n{}", - category, instructions, text, closest_entities_to_llm - ); - - debug!("{}", user_message); - - // Build the chat completion request - let request = CreateChatCompletionRequestArgs::default() - .model("gpt-4o-mini") - .temperature(0.2) - .max_tokens(2048u32) - .messages([ - ChatCompletionRequestSystemMessage::from(system_message).into(), - ChatCompletionRequestUserMessage::from(user_message).into(), - ]) - .response_format(response_format) - .build()?; - - // Send the request to OpenAI - let response = openai_client.chat().create(request).await?; - - debug!("{:?}", response); - - response - .choices - .first() - .and_then(|choice| choice.message.content.as_ref()) - .ok_or(ProcessingError::LLMParsingError( - "No content found in LLM response".into(), - )) - .and_then(|content| { - serde_json::from_str(content).map_err(|e| { - ProcessingError::LLMParsingError(format!( - "Failed to parse LLM response into analysis: {}", - e - )) - }) - }) -} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 4bc235f..1768e8b 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,2 +1 @@ pub mod embedding; -pub mod llm;