refactor: file_info

This commit is contained in:
Per Stark
2024-12-09 21:18:03 +01:00
parent b61dae1ea8
commit 6ad4071d63
3 changed files with 114 additions and 360 deletions

View File

@@ -1,10 +1,12 @@
use super::ingress_object::IngressObject;
use crate::storage::{db::SurrealDbClient, types::file_info::FileInfo};
use crate::storage::{
db::{get_item, SurrealDbClient},
types::file_info::FileInfo,
};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracing::info;
use url::Url;
use uuid::Uuid;
/// Struct defining the expected body when ingressing content.
#[derive(Serialize, Deserialize, Debug)]
@@ -24,6 +26,9 @@ pub enum IngressContentError {
#[error("UTF-8 conversion error: {0}")]
Utf8(#[from] std::string::FromUtf8Error),
#[error("SurrealDb error: {0}")]
SurrealDbError(#[from] surrealdb::Error),
#[error("MIME type detection failed for input: {0}")]
MimeDetection(String),
@@ -79,19 +84,15 @@ pub async fn create_ingress_objects(
// Look up FileInfo objects using the db and the submitted uuids in input.files
if let Some(file_uuids) = input.files {
for uuid_str in file_uuids {
let uuid = Uuid::parse_str(&uuid_str)?;
match FileInfo::get_by_uuid(uuid, db_client).await {
Ok(file_info) => {
object_list.push(IngressObject::File {
file_info,
instructions: input.instructions.clone(),
category: input.category.clone(),
});
}
_ => {
info!("No file with UUID: {}", uuid);
}
for uuid in file_uuids {
if let Some(file_info) = get_item::<FileInfo>(&db_client, &uuid).await? {
object_list.push(IngressObject::File {
file_info,
instructions: input.instructions.clone(),
category: input.category.clone(),
});
} else {
info!("No file with UUID: {}", uuid);
}
}
}

View File

@@ -98,3 +98,18 @@ where
{
db_client.select(T::table_name()).await
}
/// Fetches a single stored object by its ID.
///
/// The table is taken from `T::table_name()`, so `T` has to implement `StoredObject`.
///
/// # Arguments
/// * `db_client` - An initialized database client
/// * `id` - The ID of the record to select within the table of `T`
///
/// # Returns
/// * `Result<Option<T>, Error>` - The selected item (if any), or the error raised by the select
pub async fn get_item<T>(db_client: &Surreal<Client>, id: &str) -> Result<Option<T>, Error>
where
    T: for<'de> StoredObject,
{
    // A `(table, id)` pair addresses exactly one record.
    let record_key = (T::table_name(), id);
    let found: Option<T> = db_client.select(record_key).await?;
    Ok(found)
}

View File

@@ -1,282 +1,142 @@
use axum::{
http::StatusCode,
response::{IntoResponse, Response},
Json,
};
use axum_typed_multipart::FieldData;
use mime_guess::from_path;
use serde::{Deserialize, Serialize};
use serde_json::json;
use sha2::{Digest, Sha256};
use std::{
io::{BufReader, Read},
path::{Path, PathBuf},
};
use surrealdb::RecordId;
use tempfile::NamedTempFile;
use thiserror::Error;
use tracing::{debug, info};
use tracing::info;
use uuid::Uuid;
use crate::storage::db::SurrealDbClient;
use crate::{
storage::db::{store_item, SurrealDbClient},
stored_object,
};
/// Minimal deserialization target for database responses where only the record `id` is needed.
#[derive(Debug, Deserialize)]
struct Record {
// The field is never read by name, hence the `dead_code` allowance; it is only
// surfaced through the derived `Debug` output.
#[allow(dead_code)]
id: RecordId,
}
/// Represents metadata and storage information for a file.
#[derive(Debug, PartialEq, Clone, Deserialize, Serialize)]
pub struct FileInfo {
/// String form of the UUID generated for the file when it is first stored.
pub uuid: String,
/// SHA256 digest of the file contents, formatted as lowercase hex.
pub sha256: String,
/// Location the file was persisted to, stored as a (lossily converted) string.
pub path: String,
/// MIME type guessed from the file extension.
pub mime_type: String,
}
/// Errors that can occur during FileInfo operations
#[derive(Error, Debug)]
pub enum FileError {
#[error("IO error occurred: {0}")]
Io(#[from] std::io::Error),
#[error("UTF-8 conversion error: {0}")]
Utf8(#[from] std::string::FromUtf8Error),
#[error("MIME type detection failed for input: {0}")]
MimeDetection(String),
#[error("Unsupported MIME type: {0}")]
UnsupportedMime(String),
#[error("SurrealDB error: {0}")]
SurrealError(#[from] surrealdb::Error),
#[error("File not found for UUID: {0}")]
FileNotFound(String),
#[error("IO error occurred: {0}")]
Io(#[from] std::io::Error),
#[error("Duplicate file detected with SHA256: {0}")]
DuplicateFile(String),
#[error("Hash collision detected")]
HashCollision,
#[error("SurrealDB error: {0}")]
SurrealError(#[from] surrealdb::Error),
#[error("Invalid UUID format: {0}")]
InvalidUuid(String),
#[error("Failed to persist file: {0}")]
PersistError(#[from] tempfile::PersistError),
#[error("File name missing in metadata")]
MissingFileName,
#[error("Failed to persist file: {0}")]
PersistError(String),
#[error("Serialization error: {0}")]
SerializationError(String),
#[error("Deserialization error: {0}")]
DeserializationError(String),
// Add more error variants as needed.
}
impl IntoResponse for FileError {
fn into_response(self) -> Response {
let (status, error_message) = match self {
FileError::Io(_) => (StatusCode::INTERNAL_SERVER_ERROR, "Internal server error"),
FileError::Utf8(_) => (StatusCode::BAD_REQUEST, "Invalid UTF-8 data"),
FileError::MimeDetection(_) => (StatusCode::BAD_REQUEST, "MIME type detection failed"),
FileError::UnsupportedMime(_) => {
(StatusCode::UNSUPPORTED_MEDIA_TYPE, "Unsupported MIME type")
}
FileError::FileNotFound(_) => (StatusCode::NOT_FOUND, "File not found"),
FileError::DuplicateFile(_) => (StatusCode::CONFLICT, "Duplicate file detected"),
FileError::HashCollision => {
(StatusCode::INTERNAL_SERVER_ERROR, "Hash collision detected")
}
FileError::InvalidUuid(_) => (StatusCode::BAD_REQUEST, "Invalid UUID format"),
FileError::MissingFileName => {
(StatusCode::BAD_REQUEST, "Missing file name in metadata")
}
FileError::PersistError(_) => {
(StatusCode::INTERNAL_SERVER_ERROR, "Failed to persist file")
}
FileError::SerializationError(_) => {
(StatusCode::INTERNAL_SERVER_ERROR, "Serialization error")
}
FileError::DeserializationError(_) => {
(StatusCode::BAD_REQUEST, "Deserialization error")
}
FileError::SurrealError(_) => {
(StatusCode::INTERNAL_SERVER_ERROR, "Serialization error")
}
};
let body = Json(json!({
"error": error_message,
}));
(status, body).into_response()
}
}
stored_object!(FileInfo, "file", {
sha256: String,
path: String,
mime_type: String
});
impl FileInfo {
pub async fn new(
field_data: FieldData<NamedTempFile>,
db_client: &SurrealDbClient,
) -> Result<FileInfo, FileError> {
let file = field_data.contents; // NamedTempFile
let metadata = field_data.metadata;
) -> Result<Self, FileError> {
let file = field_data.contents;
let file_name = field_data
.metadata
.file_name
.ok_or(FileError::MissingFileName)?;
// Extract file name from metadata
let file_name = metadata.file_name.ok_or(FileError::MissingFileName)?;
info!("File name: {:?}", file_name);
// Calculate SHA256
let sha256 = Self::get_sha(&file).await?;
// Calculate SHA256 hash of the file
let sha = Self::get_sha(&file).await?;
info!("SHA256: {:?}", sha);
// Check if SHA exists in SurrealDB
if let Ok(file) = Self::get_by_sha(&sha, db_client).await {
info!("File already exists in database with SHA256: {}", sha);
// SHA exists: return FileInfo
return Ok(file);
// Early return if file already exists
match Self::get_by_sha(&sha256, db_client).await {
Ok(existing_file) => {
info!("File already exists with SHA256: {}", sha256);
return Ok(existing_file);
}
Err(FileError::FileNotFound(_)) => (), // Expected case for new files
Err(e) => return Err(e), // Propagate unexpected errors
}
// Generate a new UUID
// Generate UUID and prepare paths
let uuid = Uuid::new_v4();
info!("UUID: {:?}", uuid);
let sanitized_file_name = Self::sanitize_file_name(&file_name);
// Sanitize file name
let sanitized_file_name = sanitize_file_name(&file_name);
info!("Sanitized file name: {:?}", sanitized_file_name);
// Persist the file to the filesystem
let persisted_path = Self::persist_file(&uuid, file, &sanitized_file_name).await?;
// Guess the MIME type
let mime_type = Self::guess_mime_type(&persisted_path);
info!("Mime type: {:?}", mime_type);
// Construct the FileInfo object
let file_info = FileInfo {
uuid: uuid.to_string(),
sha256: sha.clone(),
path: persisted_path.to_string_lossy().to_string(),
mime_type,
// Create new FileInfo instance
let file_info = Self {
id: uuid.to_string(),
sha256,
path: Self::persist_file(&uuid, file, &sanitized_file_name)
.await?
.to_string_lossy()
.into(),
mime_type: Self::guess_mime_type(Path::new(&sanitized_file_name)),
};
// Store FileInfo in SurrealDB
Self::create_record(&file_info, db_client).await?;
// Store in database
store_item(&db_client.client, file_info.clone()).await?;
Ok(file_info)
}
/// Updates an existing file identified by UUID with new file data.
/// Guesses the MIME type based on the file extension.
///
/// # Arguments
/// * `uuid` - The UUID of the file to update.
/// * `new_field_data` - The new file data.
/// * `redis_client` - Reference to the RedisClient.
/// * `path` - The path to the file.
///
/// # Returns
/// * `Result<FileInfo, FileError>` - The updated `FileInfo` or an error.
pub async fn update(
uuid: Uuid,
new_field_data: FieldData<NamedTempFile>,
db_client: &SurrealDbClient,
) -> Result<FileInfo, FileError> {
let new_file = new_field_data.contents;
let new_metadata = new_field_data.metadata;
// Extract new file name
let new_file_name = new_metadata.file_name.ok_or(FileError::MissingFileName)?;
// Calculate SHA256 of the new file
let new_sha = Self::get_sha(&new_file).await?;
// Check if the new SHA already exists
if let Ok(file) = Self::get_by_sha(&new_sha, db_client).await {
info!("File already exists in database with SHA256: {}", new_sha);
// SHA exists: return FileInfo
return Ok(file);
}
// Sanitize new file name
let sanitized_new_file_name = sanitize_file_name(&new_file_name);
// Persist the new file
let new_persisted_path =
Self::persist_file(&uuid, new_file, &sanitized_new_file_name).await?;
// Guess the new MIME type
let new_mime_type = Self::guess_mime_type(&new_persisted_path);
// Get the existing item and remove it
let old_record = Self::get_by_uuid(uuid, db_client).await?;
Self::delete_record(&old_record.sha256, db_client).await?;
// Update FileInfo
let updated_file_info = FileInfo {
uuid: uuid.to_string(),
sha256: new_sha.clone(),
path: new_persisted_path.to_string_lossy().to_string(),
mime_type: new_mime_type,
};
// Save the new item
Self::create_record(&updated_file_info, db_client).await?;
// Optionally, delete the old file from the filesystem if it's no longer referenced
// This requires reference counting or checking if other FileInfo entries point to the same SHA
// For simplicity, this step is omitted.
Ok(updated_file_info)
/// * `String` - The guessed MIME type as a string.
fn guess_mime_type(path: &Path) -> String {
    // The guess is driven by the path's extension; unknown extensions fall back to
    // the generic binary type.
    let guess = from_path(path);
    let mime_type = guess.first_or(mime::APPLICATION_OCTET_STREAM);
    mime_type.to_string()
}
/// Deletes a file and its corresponding metadata based on UUID.
/// Calculates the SHA256 hash of the given file.
///
/// # Arguments
/// * `uuid` - The UUID of the file to delete.
/// * `redis_client` - Reference to the RedisClient.
/// * `file` - The file to hash.
///
/// # Returns
/// * `Result<(), FileError>` - Empty result or an error.
pub async fn delete(uuid: Uuid, db_client: &SurrealDbClient) -> Result<(), FileError> {
// Retrieve FileInfo to get SHA256 and path
let file_info = Self::get_by_uuid(uuid, db_client).await?;
/// * `Result<String, FileError>` - The SHA256 hash as a hex string or an error.
async fn get_sha(file: &NamedTempFile) -> Result<String, FileError> {
let mut reader = BufReader::new(file.as_file());
let mut hasher = Sha256::new();
let mut buffer = [0u8; 8192]; // 8KB buffer
// Delete the file from the filesystem
let file_path = Path::new(&file_info.path);
if file_path.exists() {
tokio::fs::remove_file(file_path)
.await
.map_err(FileError::Io)?;
info!("Deleted file at path: {}", file_info.path);
} else {
info!(
"File path does not exist, skipping deletion: {}",
file_info.path
);
}
// Delete the FileInfo from database
Self::delete_record(&file_info.sha256, db_client).await?;
// Remove the UUID directory if empty
let uuid_dir = file_path
.parent()
.ok_or(FileError::FileNotFound(uuid.to_string()))?;
if uuid_dir.exists() {
let mut entries = tokio::fs::read_dir(uuid_dir).await.map_err(FileError::Io)?;
if entries.next_entry().await?.is_none() {
tokio::fs::remove_dir(uuid_dir)
.await
.map_err(FileError::Io)?;
info!("Deleted empty UUID directory: {:?}", uuid_dir);
loop {
let n = reader.read(&mut buffer)?;
if n == 0 {
break;
}
hasher.update(&buffer[..n]);
}
Ok(())
let digest = hasher.finalize();
Ok(format!("{:x}", digest))
}
/// Sanitizes the file name to prevent security vulnerabilities like directory traversal.
///
/// Every character that is not an ASCII letter, ASCII digit, `.` or `_` is replaced with
/// an underscore, so path separators never survive.
///
/// A name that ends up empty or made only of dots (`""`, `"."`, `".."`, ...) is not a usable
/// file name: joined onto a directory it refers to a directory rather than a file. Such
/// names are turned into underscores instead (at least one), so the result is always a
/// plain file name.
///
/// # Arguments
/// * `file_name` - The client-supplied file name.
///
/// # Returns
/// * `String` - The sanitized, never-empty file name.
fn sanitize_file_name(file_name: &str) -> String {
    let sanitized: String = file_name
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || c == '.' || c == '_' {
                c
            } else {
                '_'
            }
        })
        .collect();

    // `all` is also true for the empty string, which covers the empty-name case.
    if sanitized.chars().all(|c| c == '.') {
        // The string is pure ASCII at this point, so `len()` equals the character count.
        "_".repeat(sanitized.len().max(1))
    } else {
        sanitized
    }
}
/// Persists the file to the filesystem under `./data/{uuid}/{file_name}`.
@@ -306,99 +166,13 @@ impl FileInfo {
info!("Final path: {:?}", final_path);
// Persist the temporary file to the final path
file.persist(&final_path)
.map_err(|e| FileError::PersistError(e.to_string()))?;
file.persist(&final_path)?;
info!("Persisted file to {:?}", final_path);
Ok(final_path)
}
/// Computes the SHA256 digest of a temporary file's contents.
///
/// The file is streamed through the hasher in fixed-size chunks, so it is never held
/// in memory as a whole.
///
/// # Arguments
/// * `file` - The file to hash.
///
/// # Returns
/// * `Result<String, FileError>` - The SHA256 hash as a hex string or an error.
async fn get_sha(file: &NamedTempFile) -> Result<String, FileError> {
    let mut hasher = Sha256::new();
    let mut reader = BufReader::new(file.as_file());
    let mut chunk = [0u8; 8192]; // 8KB read window

    loop {
        match reader.read(&mut chunk)? {
            // A zero-length read signals end of file.
            0 => break,
            read => hasher.update(&chunk[..read]),
        }
    }

    Ok(format!("{:x}", hasher.finalize()))
}
/// Derives a MIME type from the extension of `path`.
///
/// Falls back to `application/octet-stream` when no guess is available.
///
/// # Arguments
/// * `path` - The path to the file.
///
/// # Returns
/// * `String` - The guessed MIME type as a string.
fn guess_mime_type(path: &Path) -> String {
    let guess = from_path(path);
    let mime_type = guess.first_or(mime::APPLICATION_OCTET_STREAM);
    mime_type.to_string()
}
/// Stores the given `FileInfo` as a new record in the `file` table.
///
/// The record is keyed by the file's `uuid`.
///
/// # Arguments
/// * `file_info` - The `FileInfo` to store.
/// * `db_client` - Reference to the SurrealDbClient.
///
/// # Returns
/// * `Result<(), FileError>` - Empty result or an error.
async fn create_record(
    file_info: &FileInfo,
    db_client: &SurrealDbClient,
) -> Result<(), FileError> {
    let record_key = ("file", &file_info.uuid);

    // The response is only needed for the debug log below.
    let created: Option<Record> = db_client
        .client
        .create(record_key)
        .content(file_info.clone())
        .await?;

    debug!("{:?}", created);
    info!("Created FileInfo record with SHA256: {}", file_info.sha256);

    Ok(())
}
/// Retrieves a `FileInfo` by UUID.
///
/// Looks for rows in the `file` table whose `uuid` field equals the given UUID and returns
/// the first match.
///
/// # Arguments
/// * `uuid` - The UUID of the file to look up.
/// * `db_client` - Reference to the SurrealDbClient.
///
/// # Returns
/// * `Result<FileInfo, FileError>` - The `FileInfo`, `FileError::FileNotFound` when no row
///   matches, or a database error.
pub async fn get_by_uuid(
    uuid: Uuid,
    db_client: &SurrealDbClient,
) -> Result<FileInfo, FileError> {
    // `uuid` is a typed `Uuid`, so its textual form cannot carry quote characters into the query.
    let query = format!("SELECT * FROM file WHERE uuid = '{}'", uuid);
    let matches: Vec<FileInfo> = db_client.client.query(query).await?.take(0)?;

    matches
        .into_iter()
        .next()
        // Built lazily so the error string is only allocated when nothing was found.
        .ok_or_else(|| FileError::FileNotFound(uuid.to_string()))
}
/// Retrieves a `FileInfo` by SHA256.
///
/// # Arguments
@@ -411,45 +185,9 @@ impl FileInfo {
let query = format!("SELECT * FROM file WHERE sha256 = '{}'", &sha256);
let response: Vec<FileInfo> = db_client.client.query(query).await?.take(0)?;
debug!("{:?}", response);
response
.into_iter()
.next()
.ok_or(FileError::FileNotFound(sha256.to_string()))
}
/// Deletes a `FileInfo` record by SHA256.
///
/// Records in the `file` table are keyed by UUID (see `create_record`), so the SHA256 cannot
/// be used as the record ID; the record is matched on its `sha256` field instead.
///
/// # Arguments
/// * `sha256` - The SHA256 hash string.
/// * `db_client` - Reference to the SurrealDbClient.
///
/// # Returns
/// * `Result<(), FileError>` - Empty result or an error.
async fn delete_record(sha256: &str, db_client: &SurrealDbClient) -> Result<(), FileError> {
    db_client
        .client
        .query("DELETE file WHERE sha256 = $sha256")
        // Bound as a parameter rather than spliced into the query text.
        .bind(("sha256", sha256.to_owned()))
        .await?
        // Surface statement-level errors instead of dropping them with the response.
        .check()?;

    info!("Deleted FileInfo record with SHA256: {}", sha256);

    Ok(())
}
}
/// Sanitizes the file name to prevent security vulnerabilities like directory traversal.
///
/// Every character that is not an ASCII letter, ASCII digit, `.` or `_` is replaced with
/// an underscore, so path separators never survive.
///
/// A name that ends up empty or made only of dots (`""`, `"."`, `".."`, ...) is not a usable
/// file name: joined onto a directory it refers to a directory rather than a file. Such
/// names are turned into underscores instead (at least one), so the result is always a
/// plain file name.
///
/// # Arguments
/// * `file_name` - The client-supplied file name.
///
/// # Returns
/// * `String` - The sanitized, never-empty file name.
fn sanitize_file_name(file_name: &str) -> String {
    let sanitized: String = file_name
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || c == '.' || c == '_' {
                c
            } else {
                '_'
            }
        })
        .collect();

    // `all` is also true for the empty string, which covers the empty-name case.
    if sanitized.chars().all(|c| c == '.') {
        // The string is pure ASCII at this point, so `len()` equals the character count.
        "_".repeat(sanitized.len().max(1))
    } else {
        sanitized
    }
}