feat: image ingestion

This commit is contained in:
Per Stark
2025-06-17 08:26:15 +02:00
parent f567b7198b
commit 9a23c1ea1b
10 changed files with 91 additions and 6 deletions

View File

@@ -22,9 +22,9 @@ text-splitter = { workspace = true }
url = { workspace = true }
uuid = { workspace = true }
headless_chrome = { workspace = true }
base64 = { workspace = true }
common = { path = "../common" }
composite-retrieval = { path = "../composite-retrieval" }
[features]
docker = []

View File

@@ -59,7 +59,8 @@ impl IngestionPipeline {
)
.await?;
let text_content = to_text_content(task.content, &self.db, &self.config).await?;
let text_content =
to_text_content(task.content, &self.db, &self.config, &self.openai_client).await?;
match self.process(&text_content).await {
Ok(_) => {

View File

@@ -22,10 +22,13 @@ use std::io::{Seek, SeekFrom};
use tempfile::NamedTempFile;
use tracing::{error, info};
use crate::utils::image_parsing::extract_text_from_image;
pub async fn to_text_content(
ingestion_payload: IngestionPayload,
db: &SurrealDbClient,
config: &AppConfig,
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<TextContent, AppError> {
match ingestion_payload {
IngestionPayload::Url {
@@ -67,7 +70,7 @@ pub async fn to_text_content(
category,
user_id,
} => {
let text = extract_text_from_file(&file_info).await?;
let text = extract_text_from_file(&file_info, db, openai_client).await?;
Ok(TextContent::new(
text,
Some(context),
@@ -195,7 +198,11 @@ async fn fetch_article_from_url(
}
/// Extracts text from a file based on its MIME type.
async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError> {
async fn extract_text_from_file(
file_info: &FileInfo,
db_client: &SurrealDbClient,
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<String, AppError> {
match file_info.mime_type.as_str() {
"text/plain" => {
// Read the file and return its content
@@ -212,8 +219,9 @@ async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError
Err(AppError::NotFound(file_info.mime_type.clone()))
}
"image/png" | "image/jpeg" => {
// TODO: Implement OCR on image using a crate like `tesseract`
Err(AppError::NotFound(file_info.mime_type.clone()))
let content =
extract_text_from_image(&file_info.path, db_client, openai_client).await?;
Ok(content)
}
"application/octet-stream" => {
let content = tokio::fs::read_to_string(&file_info.path).await?;

View File

@@ -0,0 +1,62 @@
use async_openai::types::{
ChatCompletionRequestMessageContentPartImageArgs,
ChatCompletionRequestMessageContentPartTextArgs, ChatCompletionRequestUserMessageArgs,
CreateChatCompletionRequestArgs, ImageDetail, ImageUrlArgs,
};
use base64::{engine::general_purpose::STANDARD, Engine as _};
use common::{
error::AppError,
storage::{db::SurrealDbClient, types::system_settings::SystemSettings},
};
pub async fn extract_text_from_image(
path: &str,
db: &SurrealDbClient,
client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<String, AppError> {
let system_settings = SystemSettings::get_current(db).await?;
let image_bytes = tokio::fs::read(&path).await?;
let base64_image = STANDARD.encode(&image_bytes);
let image_url = format!("data:image/png;base64,{}", base64_image);
let request = CreateChatCompletionRequestArgs::default()
.model(system_settings.processing_model)
.max_tokens(6400_u32)
.messages([ChatCompletionRequestUserMessageArgs::default()
.content(vec![
ChatCompletionRequestMessageContentPartTextArgs::default()
.text(r#"Analyze this image and respond based on its primary content:
- If the image is mainly text (document, screenshot, sign), transcribe the text verbatim.
- If the image is mainly visual (photograph, art, landscape), provide a concise description of the scene.
- For hybrid images (diagrams, ads), briefly describe the visual, then transcribe the text under a "Text:" heading.
Respond directly with the analysis."#)
.build()?
.into(),
ChatCompletionRequestMessageContentPartImageArgs::default()
.image_url(
ImageUrlArgs::default()
.url(image_url)
.detail(ImageDetail::High)
.build()?,
)
.build()?
.into(),
])
.build()?
.into()])
.build()?;
let response = client.chat().create(request).await?;
let description = response
.choices
.get(0)
.and_then(|c| c.message.content.as_ref())
.cloned()
.unwrap_or_else(|| "No description found.".to_string());
Ok(description)
}

View File

@@ -1,3 +1,4 @@
pub mod image_parsing;
pub mod llm_instructions;
use common::error::AppError;