mirror of
https://github.com/perstarkse/minne.git
synced 2026-03-22 17:40:00 +01:00
feat: pdf support
This commit is contained in:
@@ -196,7 +196,7 @@ pub fn split_object_path(path: &str) -> AnyResult<(String, String)> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::utils::config::StorageKind;
|
||||
use crate::utils::config::{PdfIngestMode::LlmFirst, StorageKind};
|
||||
use bytes::Bytes;
|
||||
use futures::TryStreamExt;
|
||||
use uuid::Uuid;
|
||||
@@ -213,6 +213,7 @@ mod tests {
|
||||
http_port: 0,
|
||||
openai_base_url: "..".into(),
|
||||
storage: StorageKind::Local,
|
||||
pdf_ingest_mode: LlmFirst,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -277,7 +277,7 @@ impl FileInfo {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::utils::config::StorageKind;
|
||||
use crate::utils::config::{PdfIngestMode::LlmFirst, StorageKind};
|
||||
use axum::http::HeaderMap;
|
||||
use axum_typed_multipart::FieldMetadata;
|
||||
use std::io::Write;
|
||||
@@ -332,6 +332,7 @@ mod tests {
|
||||
http_port: 3000,
|
||||
openai_base_url: "..".to_string(),
|
||||
storage: StorageKind::Local,
|
||||
pdf_ingest_mode: LlmFirst,
|
||||
};
|
||||
|
||||
// Test file creation
|
||||
@@ -392,6 +393,7 @@ mod tests {
|
||||
http_port: 3000,
|
||||
openai_base_url: "..".to_string(),
|
||||
storage: StorageKind::Local,
|
||||
pdf_ingest_mode: LlmFirst,
|
||||
};
|
||||
|
||||
// Store the original file
|
||||
@@ -448,6 +450,7 @@ mod tests {
|
||||
http_port: 3000,
|
||||
openai_base_url: "..".to_string(),
|
||||
storage: StorageKind::Local,
|
||||
pdf_ingest_mode: LlmFirst,
|
||||
};
|
||||
let file_info = FileInfo::new(field_data, &db, user_id, &config).await;
|
||||
|
||||
@@ -505,6 +508,7 @@ mod tests {
|
||||
http_port: 3000,
|
||||
openai_base_url: "..".to_string(),
|
||||
storage: StorageKind::Local,
|
||||
pdf_ingest_mode: LlmFirst,
|
||||
};
|
||||
|
||||
let field_data1 = create_test_file(content, file_name);
|
||||
@@ -669,6 +673,7 @@ mod tests {
|
||||
http_port: 0,
|
||||
openai_base_url: "".to_string(),
|
||||
storage: crate::utils::config::StorageKind::Local,
|
||||
pdf_ingest_mode: LlmFirst,
|
||||
};
|
||||
let temp = create_test_file(b"test content", "test_file.txt");
|
||||
let file_info = FileInfo::new(temp, &db, user_id, &cfg)
|
||||
@@ -723,6 +728,7 @@ mod tests {
|
||||
http_port: 0,
|
||||
openai_base_url: "".to_string(),
|
||||
storage: crate::utils::config::StorageKind::Local,
|
||||
pdf_ingest_mode: LlmFirst,
|
||||
},
|
||||
)
|
||||
.await;
|
||||
@@ -831,6 +837,7 @@ mod tests {
|
||||
http_port: 3000,
|
||||
openai_base_url: "..".to_string(),
|
||||
storage: StorageKind::Local,
|
||||
pdf_ingest_mode: LlmFirst,
|
||||
};
|
||||
|
||||
// Test file creation
|
||||
|
||||
@@ -11,6 +11,20 @@ fn default_storage_kind() -> StorageKind {
|
||||
StorageKind::Local
|
||||
}
|
||||
|
||||
/// Selects the strategy used for PDF ingestion.
///
/// Because of `#[serde(rename_all = "kebab-case")]`, the variants are
/// deserialized from the strings `"classic"` and `"llm-first"`.
|
||||
#[derive(Clone, Deserialize, Debug)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum PdfIngestMode {
|
||||
/// Only rely on classic text extraction (no LLM fallbacks).
|
||||
Classic,
|
||||
// NOTE(review): the variant name reads as "try the LLM first", while the doc
// comment below describes text extraction first with the LLM as a fallback.
// Confirm which behavior is intended and align the name or the comment.
/// Prefer fast text extraction, but fall back to the LLM rendering path when needed.
|
||||
LlmFirst,
|
||||
}
|
||||
|
||||
/// Returns [`PdfIngestMode::LlmFirst`].
///
/// Referenced by name from the `#[serde(default = "default_pdf_ingest_mode")]`
/// attribute on `AppConfig::pdf_ingest_mode`, so this is the value that field
/// takes when it is missing from the deserialized configuration.
fn default_pdf_ingest_mode() -> PdfIngestMode {
|
||||
PdfIngestMode::LlmFirst
|
||||
}
|
||||
|
||||
#[derive(Clone, Deserialize, Debug)]
|
||||
pub struct AppConfig {
|
||||
pub openai_api_key: String,
|
||||
@@ -26,6 +40,8 @@ pub struct AppConfig {
|
||||
pub openai_base_url: String,
|
||||
#[serde(default = "default_storage_kind")]
|
||||
pub storage: StorageKind,
|
||||
#[serde(default = "default_pdf_ingest_mode")]
|
||||
pub pdf_ingest_mode: PdfIngestMode,
|
||||
}
|
||||
|
||||
fn default_data_dir() -> String {
|
||||
|
||||
Reference in New Issue
Block a user