refactoring: started work on llm

This commit is contained in:
Per Stark
2024-11-21 14:47:51 +01:00
parent bbab5a381f
commit 22abd3d731
2 changed files with 36 additions and 57 deletions

View File

@@ -44,20 +44,7 @@ impl TextContent {
} }
} }
// panic!("STOPPING"); // Rebuild indexes
// let deleted: Vec<TextChunk> = db_client.delete("text_chunk").await?;
// info! {"{:?} KnowledgeEntities deleted", deleted.len()};
// let relationships_deleted: Vec<KnowledgeRelationship> =
// db_client.delete("knowledge_relationship").await?;
// info!("{:?} Relationships deleted", relationships_deleted.len());
// panic!("STOP");
// db_client.query("REMOVE INDEX embeddings ON knowledge_entity").await?;
// db_client
// .query("DEFINE INDEX idx_embedding ON text_chunk FIELDS embedding HNSW DIMENSION 1536")
// .await?;
db_client.rebuild_indexes().await?; db_client.rebuild_indexes().await?;
// Step 1: Send to LLM for analysis // Step 1: Send to LLM for analysis

View File

@@ -1,20 +1,22 @@
use crate::{ use crate::{
error::ProcessingError, error::ProcessingError,
models::graph_entities::GraphMapper, models::graph_entities::GraphMapper,
retrieval::vector::find_items_by_vector_similarity,
storage::types::{ storage::types::{
knowledge_entity::{KnowledgeEntity, KnowledgeEntityType}, knowledge_entity::{KnowledgeEntity, KnowledgeEntityType},
knowledge_relationship::KnowledgeRelationship, knowledge_relationship::KnowledgeRelationship,
StoredObject,
}, },
}; };
use async_openai::types::{ use async_openai::types::{
ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage, ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage,
CreateChatCompletionRequestArgs, CreateEmbeddingRequestArgs, CreateChatCompletionRequestArgs,
}; };
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_json::json; use serde_json::json;
use surrealdb::engine::remote::ws::Client; use surrealdb::engine::remote::ws::Client;
use surrealdb::Surreal; use surrealdb::Surreal;
use tracing::{debug, info}; use tracing::debug;
use uuid::Uuid; use uuid::Uuid;
use super::embedding::generate_embedding; use super::embedding::generate_embedding;
@@ -115,13 +117,12 @@ impl LLMGraphAnalysisResult {
let target_db_id = mapper.get_or_parse_id(&llm_rel.target); let target_db_id = mapper.get_or_parse_id(&llm_rel.target);
debug!("IN: {}, OUT: {}", &source_db_id, &target_db_id); debug!("IN: {}, OUT: {}", &source_db_id, &target_db_id);
Some(KnowledgeRelationship { Some(KnowledgeRelationship::new(
id: Uuid::new_v4().to_string(), source_db_id.to_string(),
in_: source_db_id.to_string(), target_db_id.to_string(),
out: target_db_id.to_string(), llm_rel.type_.to_owned(),
relationship_type: llm_rel.type_.clone(), None,
metadata: None, ))
})
}) })
.collect(); .collect();
@@ -139,45 +140,34 @@ pub async fn create_json_ld(
) -> Result<LLMGraphAnalysisResult, ProcessingError> { ) -> Result<LLMGraphAnalysisResult, ProcessingError> {
// Format the input for more cohesive comparison // Format the input for more cohesive comparison
let input_text = format!( let input_text = format!(
"content: {:?}, category: {:?}, user_instructions: {:?}", "content: {}, category: {}, user_instructions: {}",
text, category, instructions text, category, instructions
); );
// Generate embedding of the input let closest_entities: Vec<KnowledgeEntity> = find_items_by_vector_similarity(
let input_embedding = generate_embedding(&openai_client, input_text).await?; 10,
input_text,
db_client,
KnowledgeEntity::table_name().to_string(),
openai_client,
)
.await?;
let number_of_entities_to_get = 10; // Format the KnowledgeEntity, remove redundant fields
let closest_entities_to_llm = json!(closest_entities
// Construct the query .iter()
let closest_query = format!("SELECT *, vector::distance::knn() AS distance FROM knowledge_entity WHERE embedding <|{},40|> {:?} ORDER BY distance",number_of_entities_to_get, input_embedding); .map(|entity| {
json!({
// Perform query and deserialize to struct "KnowledgeEntity": {
let closest_entities: Vec<KnowledgeEntity> = db_client.query(closest_query).await?.take(0)?; "id": entity.id,
#[allow(dead_code)] "name": entity.name,
#[derive(Debug)] "description": entity.description
struct KnowledgeEntityToLLM { }
id: String, })
name: String,
description: String,
}
info!(
"Number of KnowledgeEntities sent as context: {}",
closest_entities.len()
);
// Only keep most relevant information
let closest_entities_to_llm: Vec<KnowledgeEntityToLLM> = closest_entities
.clone()
.into_iter()
.map(|entity| KnowledgeEntityToLLM {
id: entity.id,
name: entity.name,
description: entity.description,
}) })
.collect(); .collect::<Vec<_>>());
debug!("{:?}", closest_entities_to_llm); // let db_context = serde_json::to_string_pretty(&closest_entities_to_llm).unwrap();
let schema = json!({ let schema = json!({
"type": "object", "type": "object",
@@ -268,10 +258,12 @@ pub async fn create_json_ld(
"#; "#;
let user_message = format!( let user_message = format!(
"Category: {}\nInstructions: {}\nContent:\n{}\nExisting KnowledgeEntities in database:{:?}", "Category:\n{}\nInstructions:\n{}\nContent:\n{}\nExisting KnowledgeEntities in database:\n{}",
category, instructions, text, closest_entities_to_llm category, instructions, text, closest_entities_to_llm
); );
debug!("{}", user_message);
// Build the chat completion request // Build the chat completion request
let request = CreateChatCompletionRequestArgs::default() let request = CreateChatCompletionRequestArgs::default()
.model("gpt-4o-mini") .model("gpt-4o-mini")