feat: pdf support

This commit is contained in:
Per Stark
2025-09-28 20:53:35 +02:00
parent 7403195df5
commit 5cb15dab45
9 changed files with 1105 additions and 30 deletions

View File

@@ -196,7 +196,7 @@ pub fn split_object_path(path: &str) -> AnyResult<(String, String)> {
#[cfg(test)]
mod tests {
use super::*;
use crate::utils::config::StorageKind;
use crate::utils::config::{PdfIngestMode::LlmFirst, StorageKind};
use bytes::Bytes;
use futures::TryStreamExt;
use uuid::Uuid;
@@ -213,6 +213,7 @@ mod tests {
http_port: 0,
openai_base_url: "..".into(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
}
}

View File

@@ -277,7 +277,7 @@ impl FileInfo {
#[cfg(test)]
mod tests {
use super::*;
use crate::utils::config::StorageKind;
use crate::utils::config::{PdfIngestMode::LlmFirst, StorageKind};
use axum::http::HeaderMap;
use axum_typed_multipart::FieldMetadata;
use std::io::Write;
@@ -332,6 +332,7 @@ mod tests {
http_port: 3000,
openai_base_url: "..".to_string(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
// Test file creation
@@ -392,6 +393,7 @@ mod tests {
http_port: 3000,
openai_base_url: "..".to_string(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
// Store the original file
@@ -448,6 +450,7 @@ mod tests {
http_port: 3000,
openai_base_url: "..".to_string(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
let file_info = FileInfo::new(field_data, &db, user_id, &config).await;
@@ -505,6 +508,7 @@ mod tests {
http_port: 3000,
openai_base_url: "..".to_string(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
let field_data1 = create_test_file(content, file_name);
@@ -669,6 +673,7 @@ mod tests {
http_port: 0,
openai_base_url: "".to_string(),
storage: crate::utils::config::StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
let temp = create_test_file(b"test content", "test_file.txt");
let file_info = FileInfo::new(temp, &db, user_id, &cfg)
@@ -723,6 +728,7 @@ mod tests {
http_port: 0,
openai_base_url: "".to_string(),
storage: crate::utils::config::StorageKind::Local,
pdf_ingest_mode: LlmFirst,
},
)
.await;
@@ -831,6 +837,7 @@ mod tests {
http_port: 3000,
openai_base_url: "..".to_string(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
// Test file creation

View File

@@ -11,6 +11,20 @@ fn default_storage_kind() -> StorageKind {
StorageKind::Local
}
/// Selects the strategy used for PDF ingestion.
///
/// Deserialized from configuration with kebab-case variant names, i.e.
/// `"classic"` and `"llm-first"`. Derives `PartialEq`/`Eq` so a configured
/// mode can be compared directly (`==`, `assert_eq!`) instead of requiring
/// a `match`/`matches!` at every call site.
#[derive(Clone, Debug, PartialEq, Eq, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum PdfIngestMode {
    /// Only rely on classic text extraction (no LLM fallbacks).
    Classic,
    /// Prefer fast text extraction, but fall back to the LLM rendering path when needed.
    LlmFirst,
}
fn default_pdf_ingest_mode() -> PdfIngestMode {
PdfIngestMode::LlmFirst
}
#[derive(Clone, Deserialize, Debug)]
pub struct AppConfig {
pub openai_api_key: String,
@@ -26,6 +40,8 @@ pub struct AppConfig {
pub openai_base_url: String,
#[serde(default = "default_storage_kind")]
pub storage: StorageKind,
#[serde(default = "default_pdf_ingest_mode")]
pub pdf_ingest_mode: PdfIngestMode,
}
fn default_data_dir() -> String {