feat: image ingestion

This commit is contained in:
Per Stark
2025-06-17 08:26:15 +02:00
parent f567b7198b
commit 9a23c1ea1b
10 changed files with 91 additions and 6 deletions

1
Cargo.lock generated
View File

@@ -2819,6 +2819,7 @@ dependencies = [
"async-openai",
"axum",
"axum_typed_multipart",
"base64 0.22.1",
"chrono",
"common",
"composite-retrieval",

View File

@@ -53,6 +53,7 @@ tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
url = { version = "2.5.2", features = ["serde"] }
uuid = { version = "1.10.0", features = ["v4", "serde"] }
tokio-retry = "0.3.0"
base64 = "0.22.1"
[profile.dist]
inherits = "release"

View File

@@ -7,6 +7,11 @@
<img src="/file/{{text_content.url_info.image_id}}" alt="website screenshot" />
</figure>
{% endif %}
{% if text_content.file_info.mime_type == "image/png" or text_content.file_info.mime_type == "image/jpeg" %}
<figure>
<img src="/file/{{text_content.file_info.id}}" alt="{{text_content.file_info.file_name}}" />
</figure>
{% endif %}
<div class="card-body max-w-[95vw]">
<h2 class="card-title truncate">
{% if text_content.url_info %}

View File

@@ -6,6 +6,11 @@
{% if text_content.url_info.image_id %}
<img class="rounded-t-md overflow-clip" src="/file/{{text_content.url_info.image_id}}" alt="Screenshot of the site" />
{% endif %}
{% if text_content.file_info.mime_type == "image/png" or text_content.file_info.mime_type == "image/jpeg" %}
<figure>
<img src="/file/{{text_content.file_info.id}}" alt="{{text_content.file_info.file_name}}" />
</figure>
{% endif %}
<div id="reader-{{text_content.id}}" class="markdown-content prose" data-content="{{text_content.text | escape }}">
{{text_content.text | escape }}
</div>

View File

@@ -22,9 +22,9 @@ text-splitter = { workspace = true }
url = { workspace = true }
uuid = { workspace = true }
headless_chrome = { workspace = true }
base64 = { workspace = true }
common = { path = "../common" }
composite-retrieval = { path = "../composite-retrieval" }
[features]
docker = []

View File

@@ -59,7 +59,8 @@ impl IngestionPipeline {
)
.await?;
let text_content = to_text_content(task.content, &self.db, &self.config).await?;
let text_content =
to_text_content(task.content, &self.db, &self.config, &self.openai_client).await?;
match self.process(&text_content).await {
Ok(_) => {

View File

@@ -22,10 +22,13 @@ use std::io::{Seek, SeekFrom};
use tempfile::NamedTempFile;
use tracing::{error, info};
use crate::utils::image_parsing::extract_text_from_image;
pub async fn to_text_content(
ingestion_payload: IngestionPayload,
db: &SurrealDbClient,
config: &AppConfig,
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<TextContent, AppError> {
match ingestion_payload {
IngestionPayload::Url {
@@ -67,7 +70,7 @@ pub async fn to_text_content(
category,
user_id,
} => {
let text = extract_text_from_file(&file_info).await?;
let text = extract_text_from_file(&file_info, db, openai_client).await?;
Ok(TextContent::new(
text,
Some(context),
@@ -195,7 +198,11 @@ async fn fetch_article_from_url(
}
/// Extracts text from a file based on its MIME type.
async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError> {
async fn extract_text_from_file(
file_info: &FileInfo,
db_client: &SurrealDbClient,
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<String, AppError> {
match file_info.mime_type.as_str() {
"text/plain" => {
// Read the file and return its content
@@ -212,8 +219,9 @@ async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError
Err(AppError::NotFound(file_info.mime_type.clone()))
}
"image/png" | "image/jpeg" => {
// TODO: Implement OCR on image using a crate like `tesseract`
Err(AppError::NotFound(file_info.mime_type.clone()))
let content =
extract_text_from_image(&file_info.path, db_client, openai_client).await?;
Ok(content)
}
"application/octet-stream" => {
let content = tokio::fs::read_to_string(&file_info.path).await?;

View File

@@ -0,0 +1,62 @@
use async_openai::types::{
ChatCompletionRequestMessageContentPartImageArgs,
ChatCompletionRequestMessageContentPartTextArgs, ChatCompletionRequestUserMessageArgs,
CreateChatCompletionRequestArgs, ImageDetail, ImageUrlArgs,
};
use base64::{engine::general_purpose::STANDARD, Engine as _};
use common::{
error::AppError,
storage::{db::SurrealDbClient, types::system_settings::SystemSettings},
};
pub async fn extract_text_from_image(
path: &str,
db: &SurrealDbClient,
client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<String, AppError> {
let system_settings = SystemSettings::get_current(db).await?;
let image_bytes = tokio::fs::read(&path).await?;
let base64_image = STANDARD.encode(&image_bytes);
let image_url = format!("data:image/png;base64,{}", base64_image);
let request = CreateChatCompletionRequestArgs::default()
.model(system_settings.processing_model)
.max_tokens(6400_u32)
.messages([ChatCompletionRequestUserMessageArgs::default()
.content(vec![
ChatCompletionRequestMessageContentPartTextArgs::default()
.text(r#"Analyze this image and respond based on its primary content:
- If the image is mainly text (document, screenshot, sign), transcribe the text verbatim.
- If the image is mainly visual (photograph, art, landscape), provide a concise description of the scene.
- For hybrid images (diagrams, ads), briefly describe the visual, then transcribe the text under a "Text:" heading.
Respond directly with the analysis."#)
.build()?
.into(),
ChatCompletionRequestMessageContentPartImageArgs::default()
.image_url(
ImageUrlArgs::default()
.url(image_url)
.detail(ImageDetail::High)
.build()?,
)
.build()?
.into(),
])
.build()?
.into()])
.build()?;
let response = client.chat().create(request).await?;
let description = response
.choices
.get(0)
.and_then(|c| c.message.content.as_ref())
.cloned()
.unwrap_or_else(|| "No description found.".to_string());
Ok(description)
}

View File

@@ -1,3 +1,4 @@
pub mod image_parsing;
pub mod llm_instructions;
use common::error::AppError;

View File

@@ -1,3 +1,4 @@
[] implement prompt and model choice for image processing?
[x] ollama and changing of openai_base_url
[x] allow changing of port the server listens to
[] archive ingressed webpage, pdf would be easy