feat: image ingestion

2026-01-11 20:50:24 +01:00 · 2025-06-17 08:26:15 +02:00
parent f567b7198b
commit 9a23c1ea1b
10 changed files with 91 additions and 6 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2819,6 +2819,7 @@ dependencies = [
 "async-openai",
 "axum",
 "axum_typed_multipart",
+ "base64 0.22.1",
 "chrono",
 "common",
 "composite-retrieval",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,6 +53,7 @@ tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
 url = { version = "2.5.2", features = ["serde"] }
 uuid = { version = "1.10.0", features = ["v4", "serde"] }
 tokio-retry = "0.3.0"
+base64 = "0.22.1"

 [profile.dist]
 inherits = "release"
--- a/html-router/templates/content/content_list.html
+++ b/html-router/templates/content/content_list.html
@@ -7,6 +7,11 @@
      <img src="/file/{{text_content.url_info.image_id}}" alt="website screenshot" />
    </figure>
    {% endif %}
+    {% if text_content.file_info.mime_type == "image/png" or text_content.file_info.mime_type == "image/jpeg" %}
+    <figure>
+      <img src="/file/{{text_content.file_info.id}}" alt="{{text_content.file_info.file_name}}" />
+    </figure>
+    {% endif %}
    <div class="card-body max-w-[95vw]">
      <h2 class="card-title truncate">
        {% if text_content.url_info %}
--- a/html-router/templates/content/read_content_modal.html
+++ b/html-router/templates/content/read_content_modal.html
@@ -6,6 +6,11 @@
 {% if text_content.url_info.image_id %}
 <img class="rounded-t-md overflow-clip" src="/file/{{text_content.url_info.image_id}}" alt="Screenshot of the site" />
 {% endif %}
+{% if text_content.file_info.mime_type == "image/png" or text_content.file_info.mime_type == "image/jpeg" %}
+<figure>
+  <img src="/file/{{text_content.file_info.id}}" alt="{{text_content.file_info.file_name}}" />
+</figure>
+{% endif %}
 <div id="reader-{{text_content.id}}" class="markdown-content prose" data-content="{{text_content.text | escape }}">
  {{text_content.text | escape }}
 </div>
--- a/ingestion-pipeline/Cargo.toml
+++ b/ingestion-pipeline/Cargo.toml
@@ -22,9 +22,9 @@ text-splitter = { workspace = true }
 url = { workspace = true }
 uuid = { workspace = true }
 headless_chrome = { workspace = true }
+base64 = { workspace = true }

 common = { path = "../common" }
 composite-retrieval = { path = "../composite-retrieval" }
-
 [features]
 docker = []
--- a/ingestion-pipeline/src/pipeline.rs
+++ b/ingestion-pipeline/src/pipeline.rs
@@ -59,7 +59,8 @@ impl IngestionPipeline {
        )
        .await?;

-        let text_content = to_text_content(task.content, &self.db, &self.config).await?;
+        let text_content =
+            to_text_content(task.content, &self.db, &self.config, &self.openai_client).await?;

        match self.process(&text_content).await {
            Ok(_) => {
--- a/ingestion-pipeline/src/types/mod.rs
+++ b/ingestion-pipeline/src/types/mod.rs
@@ -22,10 +22,13 @@ use std::io::{Seek, SeekFrom};
 use tempfile::NamedTempFile;
 use tracing::{error, info};

+use crate::utils::image_parsing::extract_text_from_image;
+
 pub async fn to_text_content(
    ingestion_payload: IngestionPayload,
    db: &SurrealDbClient,
    config: &AppConfig,
+    openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
 ) -> Result<TextContent, AppError> {
    match ingestion_payload {
        IngestionPayload::Url {
@@ -67,7 +70,7 @@ pub async fn to_text_content(
            category,
            user_id,
        } => {
-            let text = extract_text_from_file(&file_info).await?;
+            let text = extract_text_from_file(&file_info, db, openai_client).await?;
            Ok(TextContent::new(
                text,
                Some(context),
@@ -195,7 +198,11 @@ async fn fetch_article_from_url(
 }

 /// Extracts text from a file based on its MIME type.
-async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError> {
+async fn extract_text_from_file(
+    file_info: &FileInfo,
+    db_client: &SurrealDbClient,
+    openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
+) -> Result<String, AppError> {
    match file_info.mime_type.as_str() {
        "text/plain" => {
            // Read the file and return its content
@@ -212,8 +219,9 @@ async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError
            Err(AppError::NotFound(file_info.mime_type.clone()))
        }
        "image/png" | "image/jpeg" => {
-            // TODO: Implement OCR on image using a crate like `tesseract`
-            Err(AppError::NotFound(file_info.mime_type.clone()))
+            let content =
+                extract_text_from_image(&file_info.path, db_client, openai_client).await?;
+            Ok(content)
        }
        "application/octet-stream" => {
            let content = tokio::fs::read_to_string(&file_info.path).await?;
--- a/ingestion-pipeline/src/utils/image_parsing.rs
+++ b/ingestion-pipeline/src/utils/image_parsing.rs
@@ -0,0 +1,62 @@
+use async_openai::types::{
+    ChatCompletionRequestMessageContentPartImageArgs,
+    ChatCompletionRequestMessageContentPartTextArgs, ChatCompletionRequestUserMessageArgs,
+    CreateChatCompletionRequestArgs, ImageDetail, ImageUrlArgs,
+};
+use base64::{engine::general_purpose::STANDARD, Engine as _};
+use common::{
+    error::AppError,
+    storage::{db::SurrealDbClient, types::system_settings::SystemSettings},
+};
+
+/// Asks the configured processing model to transcribe or describe the image at `path`.
+///
+/// The data URL's MIME type is sniffed from the magic bytes since PNG and JPEG both land
+/// here. Returns "No description found." when the reply has no content; settings, file
+/// read, request-building and API failures propagate as `AppError`.
+pub async fn extract_text_from_image(
+    path: &str,
+    db: &SurrealDbClient,
+    client: &async_openai::Client<async_openai::config::OpenAIConfig>,
+) -> Result<String, AppError> {
+    let system_settings = SystemSettings::get_current(db).await?;
+    let image_bytes = tokio::fs::read(path).await?;
+    // JPEG files start with FF D8 FF; anything else keeps the previous "image/png" label.
+    let is_jpeg = image_bytes.starts_with(&[0xFF, 0xD8, 0xFF]);
+    let mime_type = if is_jpeg { "image/jpeg" } else { "image/png" };
+    let image_url = format!("data:{};base64,{}", mime_type, STANDARD.encode(&image_bytes));
+
+    let request = CreateChatCompletionRequestArgs::default()
+        .model(system_settings.processing_model)
+        .max_tokens(6400_u32)
+        .messages([ChatCompletionRequestUserMessageArgs::default()
+            .content(vec![
+                ChatCompletionRequestMessageContentPartTextArgs::default()
+                    .text(r#"Analyze this image and respond based on its primary content:
+                            - If the image is mainly text (document, screenshot, sign), transcribe the text verbatim.
+                            - If the image is mainly visual (photograph, art, landscape), provide a concise description of the scene.
+                            - For hybrid images (diagrams, ads), briefly describe the visual, then transcribe the text under a "Text:" heading.
+                            
+                            Respond directly with the analysis."#)
+                    .build()?
+                    .into(),
+                ChatCompletionRequestMessageContentPartImageArgs::default()
+                    .image_url(
+                        ImageUrlArgs::default()
+                            .url(image_url)
+                            .detail(ImageDetail::High)
+                            .build()?,
+                    )
+                    .build()?
+                    .into(),
+            ])
+            .build()?
+            .into()])
+        .build()?;
+    let response = client.chat().create(request).await?;
+    Ok(response
+        .choices
+        .first()
+        .and_then(|choice| choice.message.content.clone())
+        .unwrap_or_else(|| "No description found.".to_string()))
+}
--- a/ingestion-pipeline/src/utils/mod.rs
+++ b/ingestion-pipeline/src/utils/mod.rs
@@ -1,3 +1,4 @@
+pub mod image_parsing;
 pub mod llm_instructions;

 use common::error::AppError;
--- a/todo.md
+++ b/todo.md
@@ -1,3 +1,4 @@
+[] implement prompt and model choice for image processing?
 [x] ollama and changing of openai_base_url
 [x] allow changing of port the server listens to
 [] archive ingressed webpage, pdf would be easy