feat: readability parsing, screenshot of page, file serving

This commit is contained in:
Per Stark
2025-04-30 08:06:18 +02:00
parent 776a454a88
commit 02198dc21a
20 changed files with 707 additions and 309 deletions

View File

@@ -33,4 +33,6 @@ pub enum AppError {
Tiktoken(#[from] anyhow::Error),
#[error("Ingress Processing error: {0}")]
Processing(String),
#[error("DOM smoothie error: {0}")]
DomSmoothie(#[from] dom_smoothie::ReadabilityError),
}

View File

@@ -38,7 +38,8 @@ stored_object!(FileInfo, "file", {
sha256: String,
path: String,
file_name: String,
mime_type: String
mime_type: String,
user_id: String
});
impl FileInfo {
@@ -83,6 +84,7 @@ impl FileInfo {
.to_string_lossy()
.into(),
mime_type: Self::guess_mime_type(Path::new(&sanitized_file_name)),
user_id: user_id.to_string(),
};
// Store in database
@@ -258,6 +260,22 @@ impl FileInfo {
Ok(())
}
/// Looks up a single `FileInfo` record by its identifier.
///
/// # Arguments
/// * `id` - Identifier of the file record to fetch.
/// * `db_client` - The `SurrealDbClient` used to perform the lookup.
///
/// # Errors
/// * `FileError::FileNotFound` (carrying `id`) when the lookup yields no record.
/// * `FileError::SurrealError` when the database call itself fails.
pub async fn get_by_id(id: &str, db_client: &SurrealDbClient) -> Result<FileInfo, FileError> {
    db_client
        .get_item::<FileInfo>(id)
        .await
        // A failed database call is wrapped as-is.
        .map_err(FileError::SurrealError)?
        // A successful call that returned nothing is reported as "not found".
        .ok_or_else(|| FileError::FileNotFound(id.to_string()))
}
}
#[cfg(test)]
@@ -460,6 +478,7 @@ mod tests {
id: Uuid::new_v4().to_string(),
created_at: now,
updated_at: now,
user_id: "user123".to_string(),
sha256: "test_sha256_hash".to_string(),
path: "/path/to/file.txt".to_string(),
file_name: "manual_file.txt".to_string(),
@@ -517,6 +536,7 @@ mod tests {
// The file path should point to our test file
let file_info = FileInfo {
id: file_id.clone(),
user_id: "user123".to_string(),
created_at: now,
updated_at: now,
sha256: "test_sha256_hash".to_string(),
@@ -586,4 +606,72 @@ mod tests {
_ => panic!("Expected FileNotFound error"),
}
}
/// Stores a `FileInfo` directly in an in-memory database and verifies that
/// `FileInfo::get_by_id` returns the same record, field by field.
#[tokio::test]
async fn test_get_by_id() {
    // Setup in-memory database for testing (random database name per run)
    let namespace = "test_ns";
    let database = &Uuid::new_v4().to_string();
    let db = SurrealDbClient::memory(namespace, database)
        .await
        .expect("Failed to start in-memory surrealdb");

    // Create a FileInfo instance directly
    let now = Utc::now();
    let file_id = Uuid::new_v4().to_string();
    let original_file_info = FileInfo {
        id: file_id.clone(),
        user_id: "user123".to_string(),
        created_at: now,
        updated_at: now,
        sha256: "test_sha256_for_get_by_id".to_string(),
        path: "/path/to/get_by_id_test.txt".to_string(),
        file_name: "get_by_id_test.txt".to_string(),
        mime_type: "text/plain".to_string(),
    };

    // Store it in the database
    db.store_item(original_file_info.clone())
        .await
        .expect("Failed to store item for get_by_id test");

    // Retrieve it using get_by_id; a failure here reports the underlying error
    let retrieved_info = FileInfo::get_by_id(&file_id, &db)
        .await
        .expect("get_by_id should return the stored FileInfo");

    // Assert content match, including the owning user
    assert_eq!(retrieved_info.id, original_file_info.id);
    assert_eq!(retrieved_info.user_id, original_file_info.user_id);
    assert_eq!(retrieved_info.sha256, original_file_info.sha256);
    assert_eq!(retrieved_info.file_name, original_file_info.file_name);
    assert_eq!(retrieved_info.path, original_file_info.path);
    assert_eq!(retrieved_info.mime_type, original_file_info.mime_type);
    // Timestamps are intentionally not compared: the round trip through the
    // database may not preserve their full precision.
}
/// Verifies that `FileInfo::get_by_id` reports `FileError::FileNotFound`,
/// carrying the requested ID, when no record with that ID exists.
#[tokio::test]
async fn test_get_by_id_not_found() {
    // Fresh in-memory database with a random name, so it starts out empty
    let database = Uuid::new_v4().to_string();
    let db = SurrealDbClient::memory("test_ns", &database)
        .await
        .expect("Failed to start in-memory surrealdb");

    // Look up an ID that was never stored
    let missing_id = "non-existent-file-id";
    let outcome = FileInfo::get_by_id(missing_id, &db).await;

    // The lookup must fail...
    assert!(outcome.is_err());

    // ...and specifically with FileNotFound for the ID we asked about
    match outcome {
        Ok(_) => panic!("Expected an error, but got Ok"),
        Err(FileError::FileNotFound(id)) => assert_eq!(id, missing_id),
        Err(e) => panic!("Expected FileNotFound error, but got {:?}", e),
    }
}
}

View File

@@ -114,6 +114,7 @@ mod tests {
id: mock.id,
sha256: "mock-sha256".to_string(),
path: "/mock/path".to_string(),
user_id: "user123".to_string(),
file_name: "mock.txt".to_string(),
mime_type: "text/plain".to_string(),
created_at: Utc::now(),

View File

@@ -31,19 +31,7 @@ impl SystemSettings {
let settings: Option<Self> = db.get_item("current").await?;
if settings.is_none() {
let created_settings = SystemSettings {
id: "current".to_string(),
registrations_enabled: true,
require_email_verification: false,
query_model: "gpt-4o-mini".to_string(),
processing_model: "gpt-4o-mini".to_string(),
query_system_prompt:
crate::storage::types::system_prompts::DEFAULT_QUERY_SYSTEM_PROMPT.to_string(),
ingestion_system_prompt:
crate::storage::types::system_prompts::DEFAULT_INGRESS_ANALYSIS_SYSTEM_PROMPT
.to_string(),
};
let created_settings = Self::new();
let stored: Option<Self> = db.store_item(created_settings).await?;
return stored.ok_or(AppError::Validation("Failed to initialize settings".into()));
}

View File

@@ -5,10 +5,17 @@ use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};
use super::file_info::FileInfo;
/// Metadata about a URL, carried by `TextContent` in its optional `url_info` field.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct UrlInfo {
    /// The URL itself.
    pub url: String,
    /// Title associated with the URL (presumably the page title; confirm against the code that builds this).
    pub title: String,
    /// Identifier of an associated image (presumably the ID of a stored screenshot file; TODO confirm).
    pub image_id: String,
}
stored_object!(TextContent, "text_content", {
text: String,
file_info: Option<FileInfo>,
url: Option<String>,
url_info: Option<UrlInfo>,
instructions: String,
category: String,
user_id: String
@@ -20,7 +27,7 @@ impl TextContent {
instructions: String,
category: String,
file_info: Option<FileInfo>,
url: Option<String>,
url_info: Option<UrlInfo>,
user_id: String,
) -> Self {
let now = Utc::now();
@@ -30,7 +37,7 @@ impl TextContent {
updated_at: now,
text,
file_info,
url,
url_info,
instructions,
category,
user_id,
@@ -85,7 +92,7 @@ mod tests {
assert_eq!(text_content.category, category);
assert_eq!(text_content.user_id, user_id);
assert!(text_content.file_info.is_none());
assert!(text_content.url.is_none());
assert!(text_content.url_info.is_none());
assert!(!text_content.id.is_empty());
}
@@ -96,19 +103,27 @@ mod tests {
let instructions = "URL instructions".to_string();
let category = "URL category".to_string();
let user_id = "user123".to_string();
let url = Some("https://example.com/document.pdf".to_string());
let title = "page_title".to_string();
let image_id = "image12312".to_string();
let url = "https://example.com/document.pdf".to_string();
let url_info = Some(UrlInfo {
url,
title,
image_id,
});
let text_content = TextContent::new(
text.clone(),
instructions.clone(),
category.clone(),
None,
url.clone(),
url_info.clone(),
user_id.clone(),
);
// Check that the URL info field is set
assert_eq!(text_content.url, url);
assert_eq!(text_content.url_info, url_info);
}
#[tokio::test]