mirror of
https://github.com/perstarkse/minne.git
synced 2026-04-25 10:18:38 +02:00
feat: image ingestion
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -2819,6 +2819,7 @@ dependencies = [
|
|||||||
"async-openai",
|
"async-openai",
|
||||||
"axum",
|
"axum",
|
||||||
"axum_typed_multipart",
|
"axum_typed_multipart",
|
||||||
|
"base64 0.22.1",
|
||||||
"chrono",
|
"chrono",
|
||||||
"common",
|
"common",
|
||||||
"composite-retrieval",
|
"composite-retrieval",
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
|
|||||||
url = { version = "2.5.2", features = ["serde"] }
|
url = { version = "2.5.2", features = ["serde"] }
|
||||||
uuid = { version = "1.10.0", features = ["v4", "serde"] }
|
uuid = { version = "1.10.0", features = ["v4", "serde"] }
|
||||||
tokio-retry = "0.3.0"
|
tokio-retry = "0.3.0"
|
||||||
|
base64 = "0.22.1"
|
||||||
|
|
||||||
[profile.dist]
|
[profile.dist]
|
||||||
inherits = "release"
|
inherits = "release"
|
||||||
|
|||||||
@@ -7,6 +7,11 @@
|
|||||||
<img src="/file/{{text_content.url_info.image_id}}" alt="website screenshot" />
|
<img src="/file/{{text_content.url_info.image_id}}" alt="website screenshot" />
|
||||||
</figure>
|
</figure>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
{% if text_content.file_info.mime_type == "image/png" or text_content.file_info.mime_type == "image/jpeg" %}
|
||||||
|
<figure>
|
||||||
|
<img src="/file/{{text_content.file_info.id}}" alt="{{text_content.file_info.file_name}}" />
|
||||||
|
</figure>
|
||||||
|
{% endif %}
|
||||||
<div class="card-body max-w-[95vw]">
|
<div class="card-body max-w-[95vw]">
|
||||||
<h2 class="card-title truncate">
|
<h2 class="card-title truncate">
|
||||||
{% if text_content.url_info %}
|
{% if text_content.url_info %}
|
||||||
|
|||||||
@@ -6,6 +6,11 @@
|
|||||||
{% if text_content.url_info.image_id %}
|
{% if text_content.url_info.image_id %}
|
||||||
<img class="rounded-t-md overflow-clip" src="/file/{{text_content.url_info.image_id}}" alt="Screenshot of the site" />
|
<img class="rounded-t-md overflow-clip" src="/file/{{text_content.url_info.image_id}}" alt="Screenshot of the site" />
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
{% if text_content.file_info.mime_type == "image/png" or text_content.file_info.mime_type == "image/jpeg" %}
|
||||||
|
<figure>
|
||||||
|
<img src="/file/{{text_content.file_info.id}}" alt="{{text_content.file_info.file_name}}" />
|
||||||
|
</figure>
|
||||||
|
{% endif %}
|
||||||
<div id="reader-{{text_content.id}}" class="markdown-content prose" data-content="{{text_content.text | escape }}">
|
<div id="reader-{{text_content.id}}" class="markdown-content prose" data-content="{{text_content.text | escape }}">
|
||||||
{{text_content.text | escape }}
|
{{text_content.text | escape }}
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -22,9 +22,9 @@ text-splitter = { workspace = true }
|
|||||||
url = { workspace = true }
|
url = { workspace = true }
|
||||||
uuid = { workspace = true }
|
uuid = { workspace = true }
|
||||||
headless_chrome = { workspace = true }
|
headless_chrome = { workspace = true }
|
||||||
|
base64 = { workspace = true }
|
||||||
|
|
||||||
common = { path = "../common" }
|
common = { path = "../common" }
|
||||||
composite-retrieval = { path = "../composite-retrieval" }
|
composite-retrieval = { path = "../composite-retrieval" }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
docker = []
|
docker = []
|
||||||
|
|||||||
@@ -59,7 +59,8 @@ impl IngestionPipeline {
|
|||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
let text_content = to_text_content(task.content, &self.db, &self.config).await?;
|
let text_content =
|
||||||
|
to_text_content(task.content, &self.db, &self.config, &self.openai_client).await?;
|
||||||
|
|
||||||
match self.process(&text_content).await {
|
match self.process(&text_content).await {
|
||||||
Ok(_) => {
|
Ok(_) => {
|
||||||
|
|||||||
@@ -22,10 +22,13 @@ use std::io::{Seek, SeekFrom};
|
|||||||
use tempfile::NamedTempFile;
|
use tempfile::NamedTempFile;
|
||||||
use tracing::{error, info};
|
use tracing::{error, info};
|
||||||
|
|
||||||
|
use crate::utils::image_parsing::extract_text_from_image;
|
||||||
|
|
||||||
pub async fn to_text_content(
|
pub async fn to_text_content(
|
||||||
ingestion_payload: IngestionPayload,
|
ingestion_payload: IngestionPayload,
|
||||||
db: &SurrealDbClient,
|
db: &SurrealDbClient,
|
||||||
config: &AppConfig,
|
config: &AppConfig,
|
||||||
|
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||||
) -> Result<TextContent, AppError> {
|
) -> Result<TextContent, AppError> {
|
||||||
match ingestion_payload {
|
match ingestion_payload {
|
||||||
IngestionPayload::Url {
|
IngestionPayload::Url {
|
||||||
@@ -67,7 +70,7 @@ pub async fn to_text_content(
|
|||||||
category,
|
category,
|
||||||
user_id,
|
user_id,
|
||||||
} => {
|
} => {
|
||||||
let text = extract_text_from_file(&file_info).await?;
|
let text = extract_text_from_file(&file_info, db, openai_client).await?;
|
||||||
Ok(TextContent::new(
|
Ok(TextContent::new(
|
||||||
text,
|
text,
|
||||||
Some(context),
|
Some(context),
|
||||||
@@ -195,7 +198,11 @@ async fn fetch_article_from_url(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Extracts text from a file based on its MIME type.
|
/// Extracts text from a file based on its MIME type.
|
||||||
async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError> {
|
async fn extract_text_from_file(
|
||||||
|
file_info: &FileInfo,
|
||||||
|
db_client: &SurrealDbClient,
|
||||||
|
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||||
|
) -> Result<String, AppError> {
|
||||||
match file_info.mime_type.as_str() {
|
match file_info.mime_type.as_str() {
|
||||||
"text/plain" => {
|
"text/plain" => {
|
||||||
// Read the file and return its content
|
// Read the file and return its content
|
||||||
@@ -212,8 +219,9 @@ async fn extract_text_from_file(file_info: &FileInfo) -> Result<String, AppError
|
|||||||
Err(AppError::NotFound(file_info.mime_type.clone()))
|
Err(AppError::NotFound(file_info.mime_type.clone()))
|
||||||
}
|
}
|
||||||
"image/png" | "image/jpeg" => {
|
"image/png" | "image/jpeg" => {
|
||||||
// TODO: Implement OCR on image using a crate like `tesseract`
|
let content =
|
||||||
Err(AppError::NotFound(file_info.mime_type.clone()))
|
extract_text_from_image(&file_info.path, db_client, openai_client).await?;
|
||||||
|
Ok(content)
|
||||||
}
|
}
|
||||||
"application/octet-stream" => {
|
"application/octet-stream" => {
|
||||||
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
let content = tokio::fs::read_to_string(&file_info.path).await?;
|
||||||
|
|||||||
62
ingestion-pipeline/src/utils/image_parsing.rs
Normal file
62
ingestion-pipeline/src/utils/image_parsing.rs
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
use async_openai::types::{
|
||||||
|
ChatCompletionRequestMessageContentPartImageArgs,
|
||||||
|
ChatCompletionRequestMessageContentPartTextArgs, ChatCompletionRequestUserMessageArgs,
|
||||||
|
CreateChatCompletionRequestArgs, ImageDetail, ImageUrlArgs,
|
||||||
|
};
|
||||||
|
use base64::{engine::general_purpose::STANDARD, Engine as _};
|
||||||
|
use common::{
|
||||||
|
error::AppError,
|
||||||
|
storage::{db::SurrealDbClient, types::system_settings::SystemSettings},
|
||||||
|
};
|
||||||
|
|
||||||
|
pub async fn extract_text_from_image(
|
||||||
|
path: &str,
|
||||||
|
db: &SurrealDbClient,
|
||||||
|
client: &async_openai::Client<async_openai::config::OpenAIConfig>,
|
||||||
|
) -> Result<String, AppError> {
|
||||||
|
let system_settings = SystemSettings::get_current(db).await?;
|
||||||
|
let image_bytes = tokio::fs::read(&path).await?;
|
||||||
|
|
||||||
|
let base64_image = STANDARD.encode(&image_bytes);
|
||||||
|
|
||||||
|
let image_url = format!("data:image/png;base64,{}", base64_image);
|
||||||
|
|
||||||
|
let request = CreateChatCompletionRequestArgs::default()
|
||||||
|
.model(system_settings.processing_model)
|
||||||
|
.max_tokens(6400_u32)
|
||||||
|
.messages([ChatCompletionRequestUserMessageArgs::default()
|
||||||
|
.content(vec![
|
||||||
|
ChatCompletionRequestMessageContentPartTextArgs::default()
|
||||||
|
.text(r#"Analyze this image and respond based on its primary content:
|
||||||
|
- If the image is mainly text (document, screenshot, sign), transcribe the text verbatim.
|
||||||
|
- If the image is mainly visual (photograph, art, landscape), provide a concise description of the scene.
|
||||||
|
- For hybrid images (diagrams, ads), briefly describe the visual, then transcribe the text under a "Text:" heading.
|
||||||
|
|
||||||
|
Respond directly with the analysis."#)
|
||||||
|
.build()?
|
||||||
|
.into(),
|
||||||
|
ChatCompletionRequestMessageContentPartImageArgs::default()
|
||||||
|
.image_url(
|
||||||
|
ImageUrlArgs::default()
|
||||||
|
.url(image_url)
|
||||||
|
.detail(ImageDetail::High)
|
||||||
|
.build()?,
|
||||||
|
)
|
||||||
|
.build()?
|
||||||
|
.into(),
|
||||||
|
])
|
||||||
|
.build()?
|
||||||
|
.into()])
|
||||||
|
.build()?;
|
||||||
|
|
||||||
|
let response = client.chat().create(request).await?;
|
||||||
|
|
||||||
|
let description = response
|
||||||
|
.choices
|
||||||
|
.get(0)
|
||||||
|
.and_then(|c| c.message.content.as_ref())
|
||||||
|
.cloned()
|
||||||
|
.unwrap_or_else(|| "No description found.".to_string());
|
||||||
|
|
||||||
|
Ok(description)
|
||||||
|
}
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
pub mod image_parsing;
|
||||||
pub mod llm_instructions;
|
pub mod llm_instructions;
|
||||||
|
|
||||||
use common::error::AppError;
|
use common::error::AppError;
|
||||||
|
|||||||
1
todo.md
1
todo.md
@@ -1,3 +1,4 @@
|
|||||||
|
[] implement prompt and model choice for image processing?
|
||||||
[x] ollama and changing of openai_base_url
|
[x] ollama and changing of openai_base_url
|
||||||
[x] allow changing of port the server listens to
|
[x] allow changing of port the server listens to
|
||||||
[] archive ingressed webpage, pdf would be easy
|
[] archive ingressed webpage, pdf would be easy
|
||||||
|
|||||||
Reference in New Issue
Block a user