mirror of
https://github.com/perstarkse/minne.git
synced 2026-07-02 11:01:38 +02:00
refactor: implemented state machine for ingestion pipeline, improved performance
changelog; additional moving around; moved files around a bit
This commit is contained in:
@@ -0,0 +1,63 @@
|
||||
use common::{
|
||||
error::AppError,
|
||||
storage::{db::SurrealDbClient, store, types::file_info::FileInfo},
|
||||
utils::config::AppConfig,
|
||||
};
|
||||
|
||||
use super::{
|
||||
audio_transcription::transcribe_audio_file, image_parsing::extract_text_from_image,
|
||||
pdf_ingestion::extract_pdf_content,
|
||||
};
|
||||
|
||||
pub async fn extract_text_from_file(
|
||||
file_info: &FileInfo,
|
||||
db_client: &SurrealDbClient,
|
||||
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||
config: &AppConfig,
|
||||
) -> Result<String, AppError> {
|
||||
let base_path = store::resolve_base_dir(config);
|
||||
let absolute_path = base_path.join(&file_info.path);
|
||||
|
||||
match file_info.mime_type.as_str() {
|
||||
"text/plain" | "text/markdown" | "application/octet-stream" | "text/x-rust" => {
|
||||
let content = tokio::fs::read_to_string(&absolute_path).await?;
|
||||
Ok(content)
|
||||
}
|
||||
"application/pdf" => {
|
||||
extract_pdf_content(
|
||||
&absolute_path,
|
||||
db_client,
|
||||
openai_client,
|
||||
&config.pdf_ingest_mode,
|
||||
)
|
||||
.await
|
||||
}
|
||||
"image/png" | "image/jpeg" => {
|
||||
let path_str = absolute_path
|
||||
.to_str()
|
||||
.ok_or_else(|| {
|
||||
AppError::Processing(format!(
|
||||
"Encountered a non-UTF8 path while reading image {}",
|
||||
file_info.id
|
||||
))
|
||||
})?
|
||||
.to_string();
|
||||
let content = extract_text_from_image(&path_str, db_client, openai_client).await?;
|
||||
Ok(content)
|
||||
}
|
||||
"audio/mpeg" | "audio/mp3" | "audio/wav" | "audio/x-wav" | "audio/webm" | "audio/mp4"
|
||||
| "audio/ogg" | "audio/flac" => {
|
||||
let path_str = absolute_path
|
||||
.to_str()
|
||||
.ok_or_else(|| {
|
||||
AppError::Processing(format!(
|
||||
"Encountered a non-UTF8 path while reading audio {}",
|
||||
file_info.id
|
||||
))
|
||||
})?
|
||||
.to_string();
|
||||
transcribe_audio_file(&path_str, db_client, openai_client).await
|
||||
}
|
||||
_ => Err(AppError::NotFound(file_info.mime_type.clone())),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
use common::error::AppError;
|
||||
use std::collections::HashMap;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Intermediate struct to hold mapping between LLM keys and generated IDs.
|
||||
#[derive(Clone)]
|
||||
pub struct GraphMapper {
|
||||
pub key_to_id: HashMap<String, Uuid>,
|
||||
}
|
||||
|
||||
impl Default for GraphMapper {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl GraphMapper {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
key_to_id: HashMap::new(),
|
||||
}
|
||||
}
|
||||
/// Tries to get an ID by first parsing the key as a UUID,
|
||||
/// and if that fails, looking it up in the internal map.
|
||||
pub fn get_or_parse_id(&self, key: &str) -> Result<Uuid, AppError> {
|
||||
// First, try to parse the key as a UUID.
|
||||
if let Ok(parsed_uuid) = Uuid::parse_str(key) {
|
||||
return Ok(parsed_uuid);
|
||||
}
|
||||
|
||||
// If parsing fails, look it up in the map.
|
||||
self.key_to_id.get(key).copied().ok_or_else(|| {
|
||||
AppError::GraphMapper(format!(
|
||||
"Key '{key}' is not a valid UUID and was not found in the map."
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
/// Assigns a new UUID for a given key. (No changes needed here)
|
||||
pub fn assign_id(&mut self, key: &str) -> Uuid {
|
||||
let id = Uuid::new_v4();
|
||||
self.key_to_id.insert(key.to_string(), id);
|
||||
id
|
||||
}
|
||||
|
||||
/// Retrieves the UUID for a given key, returning a Result for consistency.
|
||||
pub fn get_id(&self, key: &str) -> Result<Uuid, AppError> {
|
||||
self.key_to_id
|
||||
.get(key)
|
||||
.copied()
|
||||
.ok_or_else(|| AppError::GraphMapper(format!("Key '{key}' not found in map.")))
|
||||
}
|
||||
}
|
||||
@@ -1,58 +1,7 @@
|
||||
pub mod audio_transcription;
|
||||
pub mod file_text_extraction;
|
||||
pub mod graph_mapper;
|
||||
pub mod image_parsing;
|
||||
pub mod llm_instructions;
|
||||
pub mod pdf_ingestion;
|
||||
|
||||
use common::error::AppError;
|
||||
use std::collections::HashMap;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Intermediate struct to hold mapping between LLM keys and generated IDs.
|
||||
#[derive(Clone)]
|
||||
pub struct GraphMapper {
|
||||
pub key_to_id: HashMap<String, Uuid>,
|
||||
}
|
||||
|
||||
impl Default for GraphMapper {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl GraphMapper {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
key_to_id: HashMap::new(),
|
||||
}
|
||||
}
|
||||
/// Tries to get an ID by first parsing the key as a UUID,
|
||||
/// and if that fails, looking it up in the internal map.
|
||||
pub fn get_or_parse_id(&self, key: &str) -> Result<Uuid, AppError> {
|
||||
// First, try to parse the key as a UUID.
|
||||
if let Ok(parsed_uuid) = Uuid::parse_str(key) {
|
||||
return Ok(parsed_uuid);
|
||||
}
|
||||
|
||||
// If parsing fails, look it up in the map.
|
||||
self.key_to_id.get(key).copied().ok_or_else(|| {
|
||||
AppError::GraphMapper(format!(
|
||||
"Key '{key}' is not a valid UUID and was not found in the map."
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
/// Assigns a new UUID for a given key. (No changes needed here)
|
||||
pub fn assign_id(&mut self, key: &str) -> Uuid {
|
||||
let id = Uuid::new_v4();
|
||||
self.key_to_id.insert(key.to_string(), id);
|
||||
id
|
||||
}
|
||||
|
||||
/// Retrieves the UUID for a given key, returning a Result for consistency.
|
||||
pub fn get_id(&self, key: &str) -> Result<Uuid, AppError> {
|
||||
self.key_to_id
|
||||
.get(key)
|
||||
.copied()
|
||||
.ok_or_else(|| AppError::GraphMapper(format!("Key '{key}' not found in map.")))
|
||||
}
|
||||
}
|
||||
pub mod url_text_retrieval;
|
||||
|
||||
@@ -0,0 +1,174 @@
|
||||
use axum::http::HeaderMap;
|
||||
use axum_typed_multipart::{FieldData, FieldMetadata};
|
||||
use chrono::Utc;
|
||||
use common::{
|
||||
error::AppError,
|
||||
storage::{db::SurrealDbClient, types::file_info::FileInfo},
|
||||
utils::config::AppConfig,
|
||||
};
|
||||
use dom_smoothie::{Article, Readability, TextMode};
|
||||
use headless_chrome::Browser;
|
||||
use std::{
|
||||
io::{Seek, SeekFrom, Write},
|
||||
net::IpAddr,
|
||||
time::Instant,
|
||||
};
|
||||
use tempfile::NamedTempFile;
|
||||
use tracing::{error, info, warn};
|
||||
pub async fn extract_text_from_url(
|
||||
url: &str,
|
||||
db: &SurrealDbClient,
|
||||
user_id: &str,
|
||||
config: &AppConfig,
|
||||
) -> Result<(Article, FileInfo), AppError> {
|
||||
info!("Fetching URL: {}", url);
|
||||
let now = Instant::now();
|
||||
|
||||
let browser = {
|
||||
#[cfg(feature = "docker")]
|
||||
{
|
||||
let options = headless_chrome::LaunchOptionsBuilder::default()
|
||||
.sandbox(false)
|
||||
.build()
|
||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
||||
Browser::new(options)?
|
||||
}
|
||||
#[cfg(not(feature = "docker"))]
|
||||
{
|
||||
Browser::default()?
|
||||
}
|
||||
};
|
||||
|
||||
let tab = browser.new_tab()?;
|
||||
let page = tab.navigate_to(url)?;
|
||||
let loaded_page = page.wait_until_navigated()?;
|
||||
let raw_content = loaded_page.get_content()?;
|
||||
let screenshot = loaded_page.capture_screenshot(
|
||||
headless_chrome::protocol::cdp::Page::CaptureScreenshotFormatOption::Jpeg,
|
||||
None,
|
||||
None,
|
||||
true,
|
||||
)?;
|
||||
|
||||
let mut tmp_file = NamedTempFile::new()?;
|
||||
let temp_path_str = format!("{:?}", tmp_file.path());
|
||||
|
||||
tmp_file.write_all(&screenshot)?;
|
||||
tmp_file.as_file().sync_all()?;
|
||||
|
||||
if let Err(e) = tmp_file.seek(SeekFrom::Start(0)) {
|
||||
error!(
|
||||
"URL: {}. Failed to seek temp file {} to start: {:?}. Proceeding, but hashing might fail.",
|
||||
url, temp_path_str, e
|
||||
);
|
||||
}
|
||||
|
||||
let parsed_url =
|
||||
url::Url::parse(url).map_err(|_| AppError::Validation("Invalid URL".to_string()))?;
|
||||
|
||||
let domain = ensure_ingestion_url_allowed(&parsed_url)?;
|
||||
let timestamp = Utc::now().format("%Y%m%d%H%M%S");
|
||||
let file_name = format!("{}_{}_{}.jpg", domain, "screenshot", timestamp);
|
||||
|
||||
let metadata = FieldMetadata {
|
||||
file_name: Some(file_name),
|
||||
content_type: Some("image/jpeg".to_string()),
|
||||
name: None,
|
||||
headers: HeaderMap::new(),
|
||||
};
|
||||
let field_data = FieldData {
|
||||
contents: tmp_file,
|
||||
metadata,
|
||||
};
|
||||
|
||||
let file_info = FileInfo::new(field_data, db, user_id, config).await?;
|
||||
|
||||
let config = dom_smoothie::Config {
|
||||
text_mode: TextMode::Markdown,
|
||||
..Default::default()
|
||||
};
|
||||
let mut readability = Readability::new(raw_content, None, Some(config))?;
|
||||
let article: Article = readability.parse()?;
|
||||
let end = now.elapsed();
|
||||
info!(
|
||||
"URL: {}. Total time: {:?}. Final File ID: {}",
|
||||
url, end, file_info.id
|
||||
);
|
||||
|
||||
Ok((article, file_info))
|
||||
}
|
||||
|
||||
fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
|
||||
match url.scheme() {
|
||||
"http" | "https" => {}
|
||||
scheme => {
|
||||
warn!(%url, %scheme, "Rejected ingestion URL due to unsupported scheme");
|
||||
return Err(AppError::Validation(
|
||||
"Unsupported URL scheme for ingestion".to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let host = match url.host_str() {
|
||||
Some(host) => host,
|
||||
None => {
|
||||
warn!(%url, "Rejected ingestion URL missing host");
|
||||
return Err(AppError::Validation(
|
||||
"URL is missing a host component".to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
if host.eq_ignore_ascii_case("localhost") {
|
||||
warn!(%url, host, "Rejected ingestion URL to localhost");
|
||||
return Err(AppError::Validation(
|
||||
"Ingestion URL host is not allowed".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if let Ok(ip) = host.parse::<IpAddr>() {
|
||||
let is_disallowed = match ip {
|
||||
IpAddr::V4(v4) => v4.is_private() || v4.is_link_local(),
|
||||
IpAddr::V6(v6) => v6.is_unique_local() || v6.is_unicast_link_local(),
|
||||
};
|
||||
|
||||
if ip.is_loopback() || ip.is_unspecified() || ip.is_multicast() || is_disallowed {
|
||||
warn!(%url, host, %ip, "Rejected ingestion URL pointing to restricted network range");
|
||||
return Err(AppError::Validation(
|
||||
"Ingestion URL host is not allowed".to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(host.replace(|c: char| !c.is_alphanumeric(), "_"))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn rejects_unsupported_scheme() {
|
||||
let url = url::Url::parse("ftp://example.com").expect("url");
|
||||
assert!(ensure_ingestion_url_allowed(&url).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rejects_localhost() {
|
||||
let url = url::Url::parse("http://localhost/resource").expect("url");
|
||||
assert!(ensure_ingestion_url_allowed(&url).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rejects_private_ipv4() {
|
||||
let url = url::Url::parse("http://192.168.1.10/index.html").expect("url");
|
||||
assert!(ensure_ingestion_url_allowed(&url).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn allows_public_domain_and_sanitizes() {
|
||||
let url = url::Url::parse("https://sub.example.com/path").expect("url");
|
||||
let sanitized = ensure_ingestion_url_allowed(&url).expect("allowed");
|
||||
assert_eq!(sanitized, "sub_example_com");
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user