comments, llm schema

This commit is contained in:
Per Stark
2024-10-01 19:01:29 +02:00
parent a348950234
commit 8e7e762130
15 changed files with 442 additions and 72 deletions
+1 -14
View File
@@ -24,6 +24,7 @@ pub struct FileInfo {
pub mime_type: String,
}
/// Errors that can occur during FileInfo operations
#[derive(Error, Debug)]
pub enum FileError {
#[error("IO error occurred: {0}")]
@@ -98,11 +99,9 @@ impl FileInfo {
/// Creates a new `FileInfo` instance from uploaded field data.
///
/// # Arguments
///
/// * `field_data` - The uploaded file data.
///
/// # Returns
///
/// * `Result<FileInfo, FileError>` - The created `FileInfo` or an error.
pub async fn new(field_data: FieldData<NamedTempFile>, redis_client: &RedisClient) -> Result<FileInfo, FileError> {
let file = field_data.contents; // NamedTempFile
@@ -157,11 +156,9 @@ impl FileInfo {
/// Retrieves `FileInfo` based on UUID.
///
/// # Arguments
///
/// * `uuid` - The UUID of the file.
///
/// # Returns
///
/// * `Result<FileInfo, FileError>` - The `FileInfo` or an error.
pub async fn get(uuid: Uuid, redis_client: &RedisClient) -> Result<FileInfo, FileError> {
// Fetch SHA256 from UUID mapping
@@ -178,13 +175,11 @@ impl FileInfo {
/// Updates an existing file identified by UUID with new file data.
///
/// # Arguments
///
/// * `uuid` - The UUID of the file to update.
/// * `new_field_data` - The new file data.
/// * `redis_client` - Reference to the RedisClient.
///
/// # Returns
///
/// * `Result<FileInfo, FileError>` - The updated `FileInfo` or an error.
pub async fn update(uuid: Uuid, new_field_data: FieldData<NamedTempFile>, redis_client: &RedisClient) -> Result<FileInfo, FileError> {
let new_file = new_field_data.contents;
@@ -237,12 +232,10 @@ impl FileInfo {
/// Deletes a file and its corresponding metadata based on UUID.
///
/// # Arguments
///
/// * `uuid` - The UUID of the file to delete.
/// * `redis_client` - Reference to the RedisClient.
///
/// # Returns
///
/// * `Result<(), FileError>` - Empty result or an error.
pub async fn delete(uuid: Uuid, redis_client: &RedisClient) -> Result<(), FileError> {
// Retrieve FileInfo to get SHA256 and path
@@ -279,13 +272,11 @@ impl FileInfo {
/// Persists the file to the filesystem under `./data/{uuid}/{file_name}`.
///
/// # Arguments
///
/// * `uuid` - The UUID of the file.
/// * `file` - The temporary file to persist.
/// * `file_name` - The sanitized file name.
///
/// # Returns
///
/// * `Result<PathBuf, FileError>` - The persisted file path or an error.
async fn persist_file(uuid: &Uuid, file: NamedTempFile, file_name: &str) -> Result<PathBuf, FileError> {
let base_dir = Path::new("./data");
@@ -309,11 +300,9 @@ impl FileInfo {
/// Calculates the SHA256 hash of the given file.
///
/// # Arguments
///
/// * `file` - The file to hash.
///
/// # Returns
///
/// * `Result<String, FileError>` - The SHA256 hash as a hex string or an error.
async fn get_sha(file: &NamedTempFile) -> Result<String, FileError> {
let mut reader = BufReader::new(file.as_file());
@@ -335,11 +324,9 @@ impl FileInfo {
/// Guesses the MIME type based on the file extension.
///
/// # Arguments
///
/// * `path` - The path to the file.
///
/// # Returns
///
/// * `String` - The guessed MIME type as a string.
fn guess_mime_type(path: &Path) -> String {
from_path(path)
+11
View File
@@ -7,6 +7,7 @@ use crate::redis::client::RedisClient;
use super::{file_info::FileInfo, ingress_object::IngressObject };
/// Struct defining the expected body when ingressing content.
#[derive(Serialize, Deserialize, Debug)]
pub struct IngressInput {
pub content: Option<String>,
@@ -41,6 +42,13 @@ pub enum IngressContentError {
}
/// Function to create ingress objects from input.
///
/// # Arguments
/// * `input` - IngressInput containing information needed to ingress content.
/// * `redis_client` - Initialized Redis client needed to retrieve file information.
///
/// # Returns
/// * `Vec<IngressObject>` - A list containing the ingressed objects, one file or content type per object.
pub async fn create_ingress_objects(
input: IngressInput,
redis_client: &RedisClient,
@@ -48,6 +56,7 @@ pub async fn create_ingress_objects(
// Initialize list
let mut object_list = Vec::new();
// Create an IngressObject from input.content if it exists, checking for URL or text
if let Some(input_content) = input.content {
match Url::parse(&input_content) {
Ok(url) => {
@@ -69,6 +78,7 @@ pub async fn create_ingress_objects(
}
}
// Look up FileInfo objects using the redis db and the submitted uuids in input.files
if let Some(file_uuids) = input.files {
for uuid_str in file_uuids {
let uuid = Uuid::parse_str(&uuid_str)?;
@@ -88,6 +98,7 @@ pub async fn create_ingress_objects(
}
}
// If no objects are constructed, we return Err
if object_list.is_empty() {
return Err(IngressContentError::MimeDetection(
"No valid content or files provided".into(),
+14
View File
@@ -3,6 +3,7 @@ use serde::{Deserialize, Serialize};
use super::{ingress_content::IngressContentError, text_content::TextContent};
/// Knowledge object type, containing the content or reference to it, as well as metadata
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum IngressObject {
Url {
@@ -21,7 +22,15 @@ pub enum IngressObject {
category: String,
},
}
impl IngressObject {
/// Creates a new `TextContent` instance from an `IngressObject`.
///
/// # Arguments
/// * `&self` - A reference to the `IngressObject`.
///
/// # Returns
/// * `Result<TextContent, IngressContentError>` - An object containing a text representation of the object (for example a scraped URL or a parsed PDF), or an error.
pub async fn to_text_content(&self) -> Result<TextContent, IngressContentError> {
match self {
IngressObject::Url { url, instructions, category } => {
@@ -66,6 +75,11 @@ impl IngressObject {
let content = tokio::fs::read_to_string(&file_info.path).await?;
Ok(content)
}
"text/markdown" => {
// Read the file and return its content
let content = tokio::fs::read_to_string(&file_info.path).await?;
Ok(content)
}
"application/pdf" => {
// TODO: Implement PDF text extraction using a crate like `pdf-extract` or `lopdf`
Err(IngressContentError::UnsupportedMime(file_info.mime_type.clone()))
+54 -19
View File
@@ -70,26 +70,61 @@ impl TextContent {
let client = async_openai::Client::new();
// Define the JSON Schema for the expected response
let schema = json!({
"type": "object",
"properties": {
"json_ld": {
"type": "object",
"properties": {
"@context": { "type": "string" },
"@type": { "type": "string" },
"name": { "type": "string" }
// Define only the essential properties
},
"required": ["@context", "@type", "name"],
"additionalProperties": false
// let schema = json!({
// "type": "object",
// "properties": {
// "json_ld": {
// "type": "object",
// "properties": {
// "@context": { "type": "string" },
// "@type": { "type": "string" },
// "name": { "type": "string" }
// // Define only the essential properties
// },
// "required": ["@context", "@type", "name"],
// "additionalProperties": false,
// },
// "description": { "type": "string" },
// "related_category": { "type": "string" },
// "instructions": { "type": "string" }
// },
// "required": ["json_ld", "description", "related_category", "instructions"],
// "additionalProperties": false
// });
let schema = json!({
"type": "object",
"properties": {
"knowledge_sources": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {"type": "string"},
"type": {"type": "string", "enum": ["Document", "Page", "TextSnippet"]},
"title": {"type": "string"},
"description": {"type": "string"},
"relationships": {
"type": "array",
"items": {
"type": "object",
"properties": {
"type": {"type": "string", "enum": ["RelatedTo", "RelevantTo", "SimilarTo"]},
"target": {"type": "string", "description": "ID of the related knowledge source"}
},
"required": ["type", "target"],
"additionalProperties": false,
}
}
},
"description": { "type": "string" },
"related_category": { "type": "string" },
"instructions": { "type": "string" }
"required": ["id", "type", "title", "description", "relationships"],
"additionalProperties": false,
}
},
"required": ["json_ld", "description", "related_category", "instructions"],
"additionalProperties": false
"category": {"type": "string"},
"instructions": {"type": "string"}
},
"required": ["knowledge_sources", "category", "instructions"],
"additionalProperties": false
});
let response_format = async_openai::types::ResponseFormat::JsonSchema {
@@ -114,7 +149,7 @@ impl TextContent {
// Build the chat completion request
let request = CreateChatCompletionRequestArgs::default()
.model("gpt-4o-mini")
.max_tokens(1024u32)
.max_tokens(2048u32)
.messages([
ChatCompletionRequestSystemMessage::from(system_message).into(),
ChatCompletionRequestUserMessage::from(user_message).into(),