mirror of
https://github.com/perstarkse/minne.git
synced 2026-04-27 03:08:41 +02:00
feat: refactoring complete?
This commit is contained in:
107
src/ingress/types/ingress_input.rs
Normal file
107
src/ingress/types/ingress_input.rs
Normal file
@@ -0,0 +1,107 @@
|
||||
use super::ingress_object::IngressObject;
|
||||
use crate::storage::{db::SurrealDbClient, types::file_info::FileInfo};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
use tracing::info;
|
||||
use url::Url;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Struct defining the expected body when ingressing content.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct IngressInput {
|
||||
pub content: Option<String>,
|
||||
pub instructions: String,
|
||||
pub category: String,
|
||||
pub files: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
/// Error types for processing ingress content.
|
||||
#[derive(Error, Debug)]
|
||||
pub enum IngressContentError {
|
||||
#[error("IO error occurred: {0}")]
|
||||
Io(#[from] std::io::Error),
|
||||
|
||||
#[error("UTF-8 conversion error: {0}")]
|
||||
Utf8(#[from] std::string::FromUtf8Error),
|
||||
|
||||
#[error("MIME type detection failed for input: {0}")]
|
||||
MimeDetection(String),
|
||||
|
||||
#[error("Unsupported MIME type: {0}")]
|
||||
UnsupportedMime(String),
|
||||
|
||||
#[error("URL parse error: {0}")]
|
||||
UrlParse(#[from] url::ParseError),
|
||||
|
||||
#[error("UUID parse error: {0}")]
|
||||
UuidParse(#[from] uuid::Error),
|
||||
|
||||
#[error("Redis error: {0}")]
|
||||
RedisError(String),
|
||||
}
|
||||
|
||||
/// Function to create ingress objects from input.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `input` - IngressInput containing information needed to ingress content.
|
||||
/// * `redis_client` - Initialized redis client needed to retrieve file information
|
||||
///
|
||||
/// # Returns
|
||||
/// * `Vec<IngressObject>` - An array containing the ingressed objects, one file/contenttype per object.
|
||||
pub async fn create_ingress_objects(
|
||||
input: IngressInput,
|
||||
db_client: &SurrealDbClient,
|
||||
) -> Result<Vec<IngressObject>, IngressContentError> {
|
||||
// Initialize list
|
||||
let mut object_list = Vec::new();
|
||||
|
||||
// Create a IngressObject from input.content if it exists, checking for URL or text
|
||||
if let Some(input_content) = input.content {
|
||||
match Url::parse(&input_content) {
|
||||
Ok(url) => {
|
||||
info!("Detected URL: {}", url);
|
||||
object_list.push(IngressObject::Url {
|
||||
url: url.to_string(),
|
||||
instructions: input.instructions.clone(),
|
||||
category: input.category.clone(),
|
||||
});
|
||||
}
|
||||
Err(_) => {
|
||||
info!("Treating input as plain text");
|
||||
object_list.push(IngressObject::Text {
|
||||
text: input_content.to_string(),
|
||||
instructions: input.instructions.clone(),
|
||||
category: input.category.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Look up FileInfo objects using the db and the submitted uuids in input.files
|
||||
if let Some(file_uuids) = input.files {
|
||||
for uuid_str in file_uuids {
|
||||
let uuid = Uuid::parse_str(&uuid_str)?;
|
||||
match FileInfo::get_by_uuid(uuid, db_client).await {
|
||||
Ok(file_info) => {
|
||||
object_list.push(IngressObject::File {
|
||||
file_info,
|
||||
instructions: input.instructions.clone(),
|
||||
category: input.category.clone(),
|
||||
});
|
||||
}
|
||||
_ => {
|
||||
info!("No file with UUID: {}", uuid);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If no objects are constructed, we return Err
|
||||
if object_list.is_empty() {
|
||||
return Err(IngressContentError::MimeDetection(
|
||||
"No valid content or files provided".into(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(object_list)
|
||||
}
|
||||
119
src/ingress/types/ingress_object.rs
Normal file
119
src/ingress/types/ingress_object.rs
Normal file
@@ -0,0 +1,119 @@
|
||||
use crate::storage::types::{file_info::FileInfo, text_content::TextContent};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::ingress_input::IngressContentError;
|
||||
|
||||
/// Knowledge object type, containing the content or reference to it, as well as metadata
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub enum IngressObject {
|
||||
Url {
|
||||
url: String,
|
||||
instructions: String,
|
||||
category: String,
|
||||
},
|
||||
Text {
|
||||
text: String,
|
||||
instructions: String,
|
||||
category: String,
|
||||
},
|
||||
File {
|
||||
file_info: FileInfo,
|
||||
instructions: String,
|
||||
category: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl IngressObject {
|
||||
/// Creates a new `TextContent` instance from a `IngressObject`.
|
||||
///
|
||||
/// # Arguments
|
||||
/// `&self` - A reference to the `IngressObject`.
|
||||
///
|
||||
/// # Returns
|
||||
/// `TextContent` - An object containing a text representation of the object, could be a scraped URL, parsed PDF, etc.
|
||||
pub async fn to_text_content(&self) -> Result<TextContent, IngressContentError> {
|
||||
match self {
|
||||
IngressObject::Url {
|
||||
url,
|
||||
instructions,
|
||||
category,
|
||||
} => {
|
||||
let text = Self::fetch_text_from_url(url).await?;
|
||||
Ok(TextContent::new(
|
||||
text,
|
||||
instructions.into(),
|
||||
category.into(),
|
||||
None,
|
||||
))
|
||||
}
|
||||
IngressObject::Text {
|
||||
text,
|
||||
instructions,
|
||||
category,
|
||||
} => Ok(TextContent::new(
|
||||
text.into(),
|
||||
instructions.into(),
|
||||
category.into(),
|
||||
None,
|
||||
)),
|
||||
IngressObject::File {
|
||||
file_info,
|
||||
instructions,
|
||||
category,
|
||||
} => {
|
||||
let text = Self::extract_text_from_file(file_info).await?;
|
||||
Ok(TextContent::new(
|
||||
text,
|
||||
instructions.into(),
|
||||
category.into(),
|
||||
Some(file_info.to_owned()),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Fetches and extracts text from a URL.
|
||||
async fn fetch_text_from_url(_url: &str) -> Result<String, IngressContentError> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
/// Extracts text from a file based on its MIME type.
|
||||
async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, IngressContentError> {
|
||||
match file_info.mime_type.as_str() {
|
||||
"text/plain" => {
|
||||
// Read the file and return its content
|
||||
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
||||
Ok(content)
|
||||
}
|
||||
"text/markdown" => {
|
||||
// Read the file and return its content
|
||||
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
||||
Ok(content)
|
||||
}
|
||||
"application/pdf" => {
|
||||
// TODO: Implement PDF text extraction using a crate like `pdf-extract` or `lopdf`
|
||||
Err(IngressContentError::UnsupportedMime(
|
||||
file_info.mime_type.clone(),
|
||||
))
|
||||
}
|
||||
"image/png" | "image/jpeg" => {
|
||||
// TODO: Implement OCR on image using a crate like `tesseract`
|
||||
Err(IngressContentError::UnsupportedMime(
|
||||
file_info.mime_type.clone(),
|
||||
))
|
||||
}
|
||||
"application/octet-stream" => {
|
||||
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
||||
Ok(content)
|
||||
}
|
||||
"text/x-rust" => {
|
||||
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
||||
Ok(content)
|
||||
}
|
||||
// Handle other MIME types as needed
|
||||
_ => Err(IngressContentError::UnsupportedMime(
|
||||
file_info.mime_type.clone(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
2
src/ingress/types/mod.rs
Normal file
2
src/ingress/types/mod.rs
Normal file
@@ -0,0 +1,2 @@
|
||||
pub mod ingress_input;
|
||||
pub mod ingress_object;
|
||||
Reference in New Issue
Block a user