feat: image ingestion

2026-04-25 10:18:38 +02:00 · 2025-06-17 08:26:15 +02:00
parent f567b7198b
commit 9a23c1ea1b
10 changed files with 91 additions and 6 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2819,6 +2819,7 @@ dependencies = [
 "async-openai",
 "axum",
 "axum_typed_multipart",
 "base64 0.22.1",
 "chrono",
 "common",
 "composite-retrieval",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,6 +53,7 @@ tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
 url = { version = "2.5.2", features = ["serde"] }
 uuid = { version = "1.10.0", features = ["v4", "serde"] }
 tokio-retry = "0.3.0"
 base64 = "0.22.1"
 [profile.dist]
 inherits = "release"
--- a/html-router/templates/content/content_list.html
+++ b/html-router/templates/content/content_list.html
@@ -7,6 +7,11 @@
      <img src="/file/{{text_content.url_info.image_id}}" alt="website screenshot" />
    </figure>
    {% endif %}
    {% if text_content.file_info.mime_type == "image/png" or text_content.file_info.mime_type == "image/jpeg" %}
    <figure>
      <img src="/file/{{text_content.file_info.id}}" alt="{{text_content.file_info.file_name}}" />
    </figure>
    {% endif %}
    <div class="card-body max-w-[95vw]">
      <h2 class="card-title truncate">
        {% if text_content.url_info %}
--- a/html-router/templates/content/read_content_modal.html
+++ b/html-router/templates/content/read_content_modal.html
@@ -6,6 +6,11 @@
 {% if text_content.url_info.image_id %}
 <img class="rounded-t-md overflow-clip" src="/file/{{text_content.url_info.image_id}}" alt="Screenshot of the site" />
 {% endif %}
 {% if text_content.file_info.mime_type == "image/png" or text_content.file_info.mime_type == "image/jpeg" %}
 <figure>
  <img src="/file/{{text_content.file_info.id}}" alt="{{text_content.file_info.file_name}}" />
 </figure>
 {% endif %}
 <div id="reader-{{text_content.id}}" class="markdown-content prose" data-content="{{text_content.text | escape }}">
  {{text_content.text | escape }}
 </div>
--- a/ingestion-pipeline/Cargo.toml
+++ b/ingestion-pipeline/Cargo.toml
@@ -22,9 +22,9 @@ text-splitter = { workspace = true }
 url = { workspace = true }
 uuid = { workspace = true }
 headless_chrome = { workspace = true }
 base64 = { workspace = true }
 common = { path = "../common" }
 composite-retrieval = { path = "../composite-retrieval" }
 [features]
 docker = []
--- a/ingestion-pipeline/src/pipeline.rs
+++ b/ingestion-pipeline/src/pipeline.rs
@@ -59,7 +59,8 @@ impl IngestionPipeline {
        )
        .await?;
-        let text_content = to_text_content(task.content, &self.db, &self.config).await?;
+        let text_content =
            to_text_content(task.content, &self.db, &self.config, &self.openai_client).await?;
        match self.process(&text_content).await {
            Ok(_) => {
--- a/ingestion-pipeline/src/types/mod.rs
+++ b/ingestion-pipeline/src/types/mod.rs
@@ -22,10 +22,13 @@ use std::io::{Seek, SeekFrom};
 use tempfile::NamedTempFile;
 use tracing::{error, info};
 use crate::utils::image_parsing::extract_text_from_image;
 pub async fn to_text_content(
    ingestion_payload: IngestionPayload,
    db: &SurrealDbClient,
    config: &AppConfig,
    openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
 ) -> Result<TextContent, AppError> {
    match ingestion_payload {
        IngestionPayload::Url {
@@ -67,7 +70,7 @@ pub async fn to_text_content(
            category,
            user_id,
        } => {
-            let text = extract_text_from_file(&file_info).await?;
+            let text = extract_text_from_file(&file_info, db, openai_client).await?;
            Ok(TextContent::new(
                text,
                Some(context),
@@ -195,7 +198,11 @@ async fn fetch_article_from_url(
 }
 /// Extracts text from a file based on its MIME type.
-async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError> {
+async fn extract_text_from_file(
    file_info: &FileInfo,
    db_client: &SurrealDbClient,
    openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
 ) -> Result<String, AppError> {
    match file_info.mime_type.as_str() {
        "text/plain" => {
            // Read the file and return its content
@@ -212,8 +219,9 @@ async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError
            Err(AppError::NotFound(file_info.mime_type.clone()))
        }
        "image/png" | "image/jpeg" => {
-            // TODO: Implement OCR on image using a crate like `tesseract`
+            let content =
-            Err(AppError::NotFound(file_info.mime_type.clone()))
+                extract_text_from_image(&file_info.path, db_client, openai_client).await?;
            Ok(content)
        }
        "application/octet-stream" => {
            let content = tokio::fs::read_to_string(&file_info.path).await?;
--- a/ingestion-pipeline/src/utils/image_parsing.rs
+++ b/ingestion-pipeline/src/utils/image_parsing.rs
@@ -0,0 +1,62 @@
 use async_openai::types::{
    ChatCompletionRequestMessageContentPartImageArgs,
    ChatCompletionRequestMessageContentPartTextArgs, ChatCompletionRequestUserMessageArgs,
    CreateChatCompletionRequestArgs, ImageDetail, ImageUrlArgs,
 };
 use base64::{engine::general_purpose::STANDARD, Engine as _};
 use common::{
    error::AppError,
    storage::{db::SurrealDbClient, types::system_settings::SystemSettings},
 };
 pub async fn extract_text_from_image(
    path: &str,
    db: &SurrealDbClient,
    client: &async_openai::Client<async_openai::config::OpenAIConfig>,
 ) -> Result<String, AppError> {
    let system_settings = SystemSettings::get_current(db).await?;
    let image_bytes = tokio::fs::read(&path).await?;
    let base64_image = STANDARD.encode(&image_bytes);
    let image_url = format!("data:image/png;base64,{}", base64_image);
    let request = CreateChatCompletionRequestArgs::default()
        .model(system_settings.processing_model)
        .max_tokens(6400_u32)
        .messages([ChatCompletionRequestUserMessageArgs::default()
            .content(vec![
                ChatCompletionRequestMessageContentPartTextArgs::default()
                    .text(r#"Analyze this image and respond based on its primary content:
                            - If the image is mainly text (document, screenshot, sign), transcribe the text verbatim.
                            - If the image is mainly visual (photograph, art, landscape), provide a concise description of the scene.
                            - For hybrid images (diagrams, ads), briefly describe the visual, then transcribe the text under a "Text:" heading.
                            Respond directly with the analysis."#)
                    .build()?
                    .into(),
                ChatCompletionRequestMessageContentPartImageArgs::default()
                    .image_url(
                        ImageUrlArgs::default()
                            .url(image_url)
                            .detail(ImageDetail::High)
                            .build()?,
                    )
                    .build()?
                    .into(),
            ])
            .build()?
            .into()])
        .build()?;
    let response = client.chat().create(request).await?;
    let description = response
        .choices
        .get(0)
        .and_then(|c| c.message.content.as_ref())
        .cloned()
        .unwrap_or_else(|| "No description found.".to_string());
    Ok(description)
 }
--- a/ingestion-pipeline/src/utils/mod.rs
+++ b/ingestion-pipeline/src/utils/mod.rs
@@ -1,3 +1,4 @@
 pub mod image_parsing;
 pub mod llm_instructions;
 use common::error::AppError;
--- a/todo.md
+++ b/todo.md
@@ -1,3 +1,4 @@
 [] implement prompt and model choice for image processing?
 [x] ollama and changing of openai_base_url
 [x] allow changing of port the server listens to
 [] archive ingressed webpage, pdf would be easy