refactor: implemented state machine for ingestion pipeline, improved performance

changelog

additional moving around

moved files around a bit
This commit is contained in:
Per Stark
2025-10-19 16:08:46 +02:00
parent 83d39afad4
commit 07b3e1a0e8
20 changed files with 1762 additions and 802 deletions
@@ -0,0 +1,63 @@
use common::{
error::AppError,
storage::{db::SurrealDbClient, store, types::file_info::FileInfo},
utils::config::AppConfig,
};
use super::{
audio_transcription::transcribe_audio_file, image_parsing::extract_text_from_image,
pdf_ingestion::extract_pdf_content,
};
/// Extracts the textual content of a stored file, dispatching on its MIME type.
///
/// The file is located by joining `file_info.path` onto the directory returned by
/// `store::resolve_base_dir(config)`.
///
/// Dispatch table:
/// * `application/pdf` is handed to `extract_pdf_content` together with
///   `config.pdf_ingest_mode`.
/// * `image/png` and `image/jpeg` are handed to `extract_text_from_image`.
/// * The listed `audio/*` types are handed to `transcribe_audio_file`.
/// * `text/plain`, `text/markdown`, `application/octet-stream` and `text/x-rust` are read
///   straight from disk as a string.
///
/// # Errors
///
/// * `AppError::Processing` when an image or audio path cannot be represented as UTF-8.
/// * `AppError::NotFound` (carrying the MIME type) for any MIME type not listed above.
/// * Whatever error the file read or the delegated extractor produces.
pub async fn extract_text_from_file(
    file_info: &FileInfo,
    db_client: &SurrealDbClient,
    openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
    config: &AppConfig,
) -> Result<String, AppError> {
    let absolute_path = store::resolve_base_dir(config).join(&file_info.path);

    match file_info.mime_type.as_str() {
        "application/pdf" => {
            extract_pdf_content(
                &absolute_path,
                db_client,
                openai_client,
                &config.pdf_ingest_mode,
            )
            .await
        }
        "image/png" | "image/jpeg" => {
            // The image helper takes the path as a string, so a non-UTF8 path is an error.
            let image_path = match absolute_path.to_str() {
                Some(utf8) => utf8.to_string(),
                None => {
                    return Err(AppError::Processing(format!(
                        "Encountered a non-UTF8 path while reading image {}",
                        file_info.id
                    )));
                }
            };
            Ok(extract_text_from_image(&image_path, db_client, openai_client).await?)
        }
        "audio/mpeg" | "audio/mp3" | "audio/wav" | "audio/x-wav" | "audio/webm" | "audio/mp4"
        | "audio/ogg" | "audio/flac" => {
            // Same string-path requirement as the image branch.
            let audio_path = match absolute_path.to_str() {
                Some(utf8) => utf8.to_string(),
                None => {
                    return Err(AppError::Processing(format!(
                        "Encountered a non-UTF8 path while reading audio {}",
                        file_info.id
                    )));
                }
            };
            transcribe_audio_file(&audio_path, db_client, openai_client).await
        }
        "text/plain" | "text/markdown" | "application/octet-stream" | "text/x-rust" => {
            Ok(tokio::fs::read_to_string(&absolute_path).await?)
        }
        _ => Err(AppError::NotFound(file_info.mime_type.clone())),
    }
}
@@ -0,0 +1,53 @@
use common::error::AppError;
use std::collections::HashMap;
use uuid::Uuid;
/// Lookup table that translates LLM-provided keys into the UUIDs generated for them.
#[derive(Clone)]
pub struct GraphMapper {
    /// Key -> UUID associations; each key holds the UUID most recently assigned to it.
    pub key_to_id: HashMap<String, Uuid>,
}

impl Default for GraphMapper {
    /// Builds a mapper with no keys registered.
    fn default() -> Self {
        Self {
            key_to_id: HashMap::new(),
        }
    }
}

impl GraphMapper {
    /// Creates an empty mapper.
    pub fn new() -> Self {
        Self::default()
    }

    /// Resolves `key` to a UUID.
    ///
    /// A key that is itself a valid UUID string is returned as parsed, without consulting
    /// the map. Any other key is looked up in `key_to_id`.
    ///
    /// # Errors
    ///
    /// Returns `AppError::GraphMapper` when `key` is neither a UUID nor a registered key.
    pub fn get_or_parse_id(&self, key: &str) -> Result<Uuid, AppError> {
        match Uuid::parse_str(key) {
            Ok(uuid) => Ok(uuid),
            Err(_) => match self.key_to_id.get(key) {
                Some(&mapped) => Ok(mapped),
                None => Err(AppError::GraphMapper(format!(
                    "Key '{key}' is not a valid UUID and was not found in the map."
                ))),
            },
        }
    }

    /// Generates a fresh v4 UUID, registers it under `key`, and returns it.
    ///
    /// If `key` was already registered, its previous UUID is replaced.
    pub fn assign_id(&mut self, key: &str) -> Uuid {
        let fresh = Uuid::new_v4();
        self.key_to_id.insert(key.to_owned(), fresh);
        fresh
    }

    /// Looks up the UUID registered under `key`.
    ///
    /// # Errors
    ///
    /// Returns `AppError::GraphMapper` when `key` has not been registered.
    pub fn get_id(&self, key: &str) -> Result<Uuid, AppError> {
        match self.key_to_id.get(key) {
            Some(&mapped) => Ok(mapped),
            None => Err(AppError::GraphMapper(format!(
                "Key '{key}' not found in map."
            ))),
        }
    }
}
+3 -54
View File
@@ -1,58 +1,7 @@
pub mod audio_transcription;
pub mod file_text_extraction;
pub mod graph_mapper;
pub mod image_parsing;
pub mod llm_instructions;
pub mod pdf_ingestion;
use common::error::AppError;
use std::collections::HashMap;
use uuid::Uuid;
/// Lookup table that translates LLM-provided keys into the UUIDs generated for them.
#[derive(Clone)]
pub struct GraphMapper {
    /// Key -> UUID associations; each key holds the UUID most recently assigned to it.
    pub key_to_id: HashMap<String, Uuid>,
}

impl Default for GraphMapper {
    /// Builds a mapper with no keys registered.
    fn default() -> Self {
        Self {
            key_to_id: HashMap::new(),
        }
    }
}

impl GraphMapper {
    /// Creates an empty mapper.
    pub fn new() -> Self {
        Self::default()
    }

    /// Resolves `key` to a UUID.
    ///
    /// A key that is itself a valid UUID string is returned as parsed, without consulting
    /// the map. Any other key is looked up in `key_to_id`.
    ///
    /// # Errors
    ///
    /// Returns `AppError::GraphMapper` when `key` is neither a UUID nor a registered key.
    pub fn get_or_parse_id(&self, key: &str) -> Result<Uuid, AppError> {
        match Uuid::parse_str(key) {
            Ok(uuid) => Ok(uuid),
            Err(_) => match self.key_to_id.get(key) {
                Some(&mapped) => Ok(mapped),
                None => Err(AppError::GraphMapper(format!(
                    "Key '{key}' is not a valid UUID and was not found in the map."
                ))),
            },
        }
    }

    /// Generates a fresh v4 UUID, registers it under `key`, and returns it.
    ///
    /// If `key` was already registered, its previous UUID is replaced.
    pub fn assign_id(&mut self, key: &str) -> Uuid {
        let fresh = Uuid::new_v4();
        self.key_to_id.insert(key.to_owned(), fresh);
        fresh
    }

    /// Looks up the UUID registered under `key`.
    ///
    /// # Errors
    ///
    /// Returns `AppError::GraphMapper` when `key` has not been registered.
    pub fn get_id(&self, key: &str) -> Result<Uuid, AppError> {
        match self.key_to_id.get(key) {
            Some(&mapped) => Ok(mapped),
            None => Err(AppError::GraphMapper(format!(
                "Key '{key}' not found in map."
            ))),
        }
    }
}
pub mod url_text_retrieval;
@@ -0,0 +1,174 @@
use axum::http::HeaderMap;
use axum_typed_multipart::{FieldData, FieldMetadata};
use chrono::Utc;
use common::{
error::AppError,
storage::{db::SurrealDbClient, types::file_info::FileInfo},
utils::config::AppConfig,
};
use dom_smoothie::{Article, Readability, TextMode};
use headless_chrome::Browser;
use std::{
io::{Seek, SeekFrom, Write},
net::IpAddr,
time::Instant,
};
use tempfile::NamedTempFile;
use tracing::{error, info, warn};
/// Loads `url` in a headless Chrome tab, captures a JPEG screenshot of the page, and parses
/// the page HTML into a readable [`Article`] in Markdown text mode.
///
/// The URL is parsed and passed through `ensure_ingestion_url_allowed` *before* the browser
/// is launched, so a disallowed target (unsupported scheme, localhost, restricted IP range)
/// is rejected without ever being requested.
///
/// The screenshot is written to a temporary file and handed to `FileInfo::new` under the
/// name `<sanitized-host>_screenshot_<YYYYmmddHHMMSS>.jpg` with content type `image/jpeg`.
///
/// # Errors
///
/// * `AppError::Validation` when `url` cannot be parsed or is not an allowed ingestion target.
/// * `AppError::InternalError` when the browser launch options cannot be built (only with the
///   `docker` feature).
/// * Any error converted from the browser, temp-file I/O, `FileInfo::new`, or the
///   readability parser.
pub async fn extract_text_from_url(
    url: &str,
    db: &SurrealDbClient,
    user_id: &str,
    config: &AppConfig,
) -> Result<(Article, FileInfo), AppError> {
    info!("Fetching URL: {}", url);
    let now = Instant::now();

    // Validate first: navigating to the URL is the side effect the allow-list is meant to
    // guard, so it must not happen for a rejected target.
    let parsed_url =
        url::Url::parse(url).map_err(|_| AppError::Validation("Invalid URL".to_string()))?;
    let domain = ensure_ingestion_url_allowed(&parsed_url)?;

    let browser = {
        #[cfg(feature = "docker")]
        {
            let options = headless_chrome::LaunchOptionsBuilder::default()
                .sandbox(false)
                .build()
                .map_err(|e| AppError::InternalError(e.to_string()))?;
            Browser::new(options)?
        }
        #[cfg(not(feature = "docker"))]
        {
            Browser::default()?
        }
    };

    // NOTE(review): these browser calls are not awaited and look synchronous; if they block,
    // consider running them via `tokio::task::spawn_blocking` — confirm against the crate docs.
    // NOTE(review): only the initial URL is validated; a redirect to a restricted host is not
    // detected here.
    let tab = browser.new_tab()?;
    let page = tab.navigate_to(url)?;
    let loaded_page = page.wait_until_navigated()?;
    let raw_content = loaded_page.get_content()?;
    let screenshot = loaded_page.capture_screenshot(
        headless_chrome::protocol::cdp::Page::CaptureScreenshotFormatOption::Jpeg,
        None,
        None,
        true,
    )?;

    let mut tmp_file = NamedTempFile::new()?;
    let temp_path_str = format!("{:?}", tmp_file.path());
    tmp_file.write_all(&screenshot)?;
    tmp_file.as_file().sync_all()?;
    // Rewind so the consumer of the temp file reads it from the beginning. A failure here is
    // logged but deliberately not fatal.
    if let Err(e) = tmp_file.seek(SeekFrom::Start(0)) {
        error!(
            "URL: {}. Failed to seek temp file {} to start: {:?}. Proceeding, but hashing might fail.",
            url, temp_path_str, e
        );
    }

    let timestamp = Utc::now().format("%Y%m%d%H%M%S");
    let file_name = format!("{}_{}_{}.jpg", domain, "screenshot", timestamp);
    let metadata = FieldMetadata {
        file_name: Some(file_name),
        content_type: Some("image/jpeg".to_string()),
        name: None,
        headers: HeaderMap::new(),
    };
    let field_data = FieldData {
        contents: tmp_file,
        metadata,
    };
    let file_info = FileInfo::new(field_data, db, user_id, config).await?;

    // Shadows the `AppConfig` parameter, which is no longer needed past this point.
    let config = dom_smoothie::Config {
        text_mode: TextMode::Markdown,
        ..Default::default()
    };
    let mut readability = Readability::new(raw_content, None, Some(config))?;
    let article: Article = readability.parse()?;

    let end = now.elapsed();
    info!(
        "URL: {}. Total time: {:?}. Final File ID: {}",
        url, end, file_info.id
    );
    Ok((article, file_info))
}
/// Checks that `url` is an acceptable ingestion target and returns a filename-safe host label.
///
/// Rejected with `AppError::Validation`:
/// * any scheme other than `http` / `https`;
/// * a URL without a host;
/// * `localhost` (with or without a trailing dot) and any `*.localhost` name;
/// * IP literals that are loopback, unspecified, multicast, private or link-local (IPv4),
///   or unique-local or unicast link-local (IPv6). IPv4-mapped IPv6 literals such as
///   `::ffff:127.0.0.1` are judged by their embedded IPv4 address.
///
/// On success the host string is returned with every non-alphanumeric character replaced
/// by `_` (e.g. `sub.example.com` -> `sub_example_com`).
///
/// Only the literal host is inspected; no DNS resolution happens here, so a public name that
/// resolves to an internal address is not detected by this function.
fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
    match url.scheme() {
        "http" | "https" => {}
        scheme => {
            warn!(%url, %scheme, "Rejected ingestion URL due to unsupported scheme");
            return Err(AppError::Validation(
                "Unsupported URL scheme for ingestion".to_string(),
            ));
        }
    }

    // `host_str` supplies the text used for logging and the sanitized label; `host` supplies
    // the parsed form. The parsed form is required because `host_str` renders IPv6 literals
    // inside brackets ("[::1]"), which `IpAddr::from_str` does not accept.
    let (host, typed_host) = match (url.host_str(), url.host()) {
        (Some(host), Some(typed_host)) => (host, typed_host),
        _ => {
            warn!(%url, "Rejected ingestion URL missing host");
            return Err(AppError::Validation(
                "URL is missing a host component".to_string(),
            ));
        }
    };

    let ip = match typed_host {
        url::Host::Domain(domain) => {
            // A trailing dot denotes the same name, and `*.localhost` names are reserved
            // for loopback, so both are treated as localhost.
            let name = domain.trim_end_matches('.').to_ascii_lowercase();
            if name == "localhost" || name.ends_with(".localhost") {
                warn!(%url, host, "Rejected ingestion URL to localhost");
                return Err(AppError::Validation(
                    "Ingestion URL host is not allowed".to_string(),
                ));
            }
            domain.parse::<IpAddr>().ok()
        }
        url::Host::Ipv4(v4) => Some(IpAddr::V4(v4)),
        // Unwrap IPv4-mapped addresses so the IPv4 rules apply to them.
        url::Host::Ipv6(v6) => Some(v6.to_ipv4_mapped().map_or(IpAddr::V6(v6), IpAddr::V4)),
    };

    if let Some(ip) = ip {
        let in_restricted_range = match ip {
            IpAddr::V4(v4) => v4.is_private() || v4.is_link_local(),
            IpAddr::V6(v6) => v6.is_unique_local() || v6.is_unicast_link_local(),
        };
        if ip.is_loopback() || ip.is_unspecified() || ip.is_multicast() || in_restricted_range {
            warn!(%url, host, %ip, "Rejected ingestion URL pointing to restricted network range");
            return Err(AppError::Validation(
                "Ingestion URL host is not allowed".to_string(),
            ));
        }
    }

    Ok(host.replace(|c: char| !c.is_alphanumeric(), "_"))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn rejects_unsupported_scheme() {
        let url = url::Url::parse("ftp://example.com").expect("url");
        assert!(ensure_ingestion_url_allowed(&url).is_err());
    }

    #[test]
    fn rejects_localhost() {
        let url = url::Url::parse("http://localhost/resource").expect("url");
        assert!(ensure_ingestion_url_allowed(&url).is_err());
    }

    #[test]
    fn rejects_localhost_variants() {
        for raw in ["http://localhost./", "http://LOCALHOST/", "http://app.localhost/"] {
            let url = url::Url::parse(raw).expect("url");
            assert!(ensure_ingestion_url_allowed(&url).is_err(), "{raw}");
        }
    }

    #[test]
    fn rejects_private_ipv4() {
        let url = url::Url::parse("http://192.168.1.10/index.html").expect("url");
        assert!(ensure_ingestion_url_allowed(&url).is_err());
    }

    #[test]
    fn rejects_link_local_ipv4() {
        let url = url::Url::parse("http://169.254.169.254/latest/meta-data").expect("url");
        assert!(ensure_ingestion_url_allowed(&url).is_err());
    }

    #[test]
    fn rejects_ipv6_loopback() {
        let url = url::Url::parse("http://[::1]/").expect("url");
        assert!(ensure_ingestion_url_allowed(&url).is_err());
    }

    #[test]
    fn rejects_ipv6_unique_local() {
        let url = url::Url::parse("http://[fd00::1]/").expect("url");
        assert!(ensure_ingestion_url_allowed(&url).is_err());
    }

    #[test]
    fn rejects_ipv4_mapped_ipv6_loopback() {
        let url = url::Url::parse("http://[::ffff:127.0.0.1]/").expect("url");
        assert!(ensure_ingestion_url_allowed(&url).is_err());
    }

    #[test]
    fn allows_public_domain_and_sanitizes() {
        let url = url::Url::parse("https://sub.example.com/path").expect("url");
        let sanitized = ensure_ingestion_url_allowed(&url).expect("allowed");
        assert_eq!(sanitized, "sub_example_com");
    }

    #[test]
    fn allows_public_ipv4_and_sanitizes() {
        let url = url::Url::parse("http://93.184.216.34/").expect("url");
        let sanitized = ensure_ingestion_url_allowed(&url).expect("allowed");
        assert_eq!(sanitized, "93_184_216_34");
    }
}