mirror of
https://github.com/perstarkse/minne.git
synced 2026-06-25 03:16:26 +02:00
refactor: replace headless_chrome with lighter alternatives
This commit is contained in:
@@ -2,6 +2,10 @@
|
||||
|
||||
## Unreleased
|
||||
|
||||
- Refactor: web scraping now uses `servo-fetch` (pure-Rust Servo engine) and PDF rendering uses `pdfium-render` (direct PDFium bindings) — reduces Docker image size by ~300MB, improves startup latency by ~100× for PDF rendering, and provides more stable output
|
||||
- Fix: added `pkgs.libglvnd` to `LD_LIBRARY_PATH` in devenv so Servo engine can find `libEGL.so` at runtime
|
||||
- Fix: updated Dockerfile to add `libegl1 libegl-mesa0 libgles2 libfontconfig1 libfreetype6` runtime dependencies for servo-fetch
|
||||
- Docs: updated architecture, features, and installation docs to reflect the new web processing stack
|
||||
- Fix: added pre-commit hooks to further maintain code consistency.
|
||||
- Security: updated some deps because dependabot told me, good bot.
|
||||
- Security: bump `async-openai` to 0.41.1 (feature-gated types, transcription API rename; removes `backoff` transitive dep)
|
||||
|
||||
Generated
+6002
-218
File diff suppressed because it is too large
Load Diff
+20
-7
@@ -7,13 +7,18 @@ members = [
|
||||
"ingestion-pipeline",
|
||||
"retrieval-pipeline",
|
||||
"json-stream-parser",
|
||||
"evaluations"
|
||||
"evaluations",
|
||||
]
|
||||
resolver = "3"
|
||||
|
||||
[workspace.dependencies]
|
||||
anyhow = "1.0.94"
|
||||
async-openai = { version = "0.41.1", features = ["chat-completion", "embedding", "audio", "model"] }
|
||||
async-openai = { version = "0.41.1", features = [
|
||||
"chat-completion",
|
||||
"embedding",
|
||||
"audio",
|
||||
"model",
|
||||
] }
|
||||
async-stream = "0.3.6"
|
||||
async-trait = "0.1.88"
|
||||
axum-htmx = "0.7.0"
|
||||
@@ -27,7 +32,6 @@ chrono = { version = "0.4.39", features = ["serde"] }
|
||||
config = "0.15.4"
|
||||
dom_smoothie = "0.10.0"
|
||||
futures = "0.3.31"
|
||||
headless_chrome = "1.0.17"
|
||||
include_dir = "0.7.4"
|
||||
mime = "0.3.17"
|
||||
mime_guess = "2.0.5"
|
||||
@@ -35,7 +39,7 @@ minijinja-autoreload = "2.5.0"
|
||||
minijinja-contrib = { version = "2.6.0", features = ["datetime", "timezone"] }
|
||||
minijinja-embed = { version = "2.8.0" }
|
||||
minijinja = { version = "2.5.0", features = ["loader", "multi_template"] }
|
||||
reqwest = {version = "0.12.12", features = ["charset", "json"]}
|
||||
reqwest = { version = "0.12.12", features = ["charset", "json"] }
|
||||
serde_json = "1.0.128"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
sha2 = "0.10.8"
|
||||
@@ -61,14 +65,24 @@ bytes = "1.7.1"
|
||||
state-machines = "0.9"
|
||||
pdf-extract = "0.9"
|
||||
lopdf = "0.32"
|
||||
fastembed = { version = "5.2.0", default-features = false, features = ["hf-hub-native-tls", "ort-load-dynamic"] }
|
||||
pdfium-auto = "0.3"
|
||||
pdfium-render = "0.8"
|
||||
servo-fetch = "0.13"
|
||||
tendril = "0.4"
|
||||
image = { version = "0.25", default-features = false, features = ["png"] }
|
||||
fastembed = { version = "5.2.0", default-features = false, features = [
|
||||
"hf-hub-native-tls",
|
||||
"ort-load-dynamic",
|
||||
] }
|
||||
|
||||
[profile.dist]
|
||||
inherits = "release"
|
||||
lto = "thin"
|
||||
|
||||
[workspace.lints.rust]
|
||||
unexpected_cfgs = { level = "warn", check-cfg = ["cfg(feature, values(\"inspect\"))"] }
|
||||
unexpected_cfgs = { level = "warn", check-cfg = [
|
||||
"cfg(feature, values(\"inspect\"))",
|
||||
] }
|
||||
|
||||
[workspace.lints.clippy]
|
||||
# Performance-focused lints
|
||||
@@ -118,4 +132,3 @@ needless_raw_string_hashes = "allow"
|
||||
multiple_bound_locations = "allow"
|
||||
cargo_common_metadata = "allow"
|
||||
multiple-crate-versions = "allow"
|
||||
|
||||
|
||||
+5
-6
@@ -14,18 +14,18 @@ COPY html-router/Cargo.toml ./html-router/
|
||||
COPY ingestion-pipeline/Cargo.toml ./ingestion-pipeline/
|
||||
COPY json-stream-parser/Cargo.toml ./json-stream-parser/
|
||||
COPY main/Cargo.toml ./main/
|
||||
RUN cargo build --release --bin main --features ingestion-pipeline/docker || true
|
||||
RUN cargo build --release --bin main || true
|
||||
|
||||
# Build
|
||||
COPY . .
|
||||
RUN cargo build --release --bin main --features ingestion-pipeline/docker
|
||||
RUN cargo build --release --bin main
|
||||
|
||||
# === Runtime ===
|
||||
FROM debian:bookworm-slim
|
||||
|
||||
# Chromium + runtime deps + OpenMP for ORT
|
||||
# Servo engine (for servo-fetch web scraping) + runtime deps + OpenMP for ORT
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
chromium libnss3 libasound2 libgbm1 libxshmfence1 \
|
||||
libegl1 libegl-mesa0 libgles2 libfontconfig1 libfreetype6 \
|
||||
ca-certificates fonts-dejavu fonts-noto-color-emoji \
|
||||
libgomp1 libstdc++6 curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
@@ -39,8 +39,7 @@ RUN ORT_VERSION="${ORT_VERSION:-$(tr -d '[:space:]' < /tmp/ort-version)}" && \
|
||||
"https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-${ORT_VERSION}.tgz" && \
|
||||
tar -xzf /tmp/ort.tgz -C /opt/onnxruntime --strip-components=1 && rm /tmp/ort.tgz
|
||||
|
||||
ENV CHROME_BIN=/usr/bin/chromium \
|
||||
SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt \
|
||||
ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt \
|
||||
ORT_DYLIB_PATH=/opt/onnxruntime/lib/libonnxruntime.so
|
||||
|
||||
# Non-root
|
||||
|
||||
@@ -121,7 +121,7 @@ fastembed_cache_dir: "/var/lib/minne/fastembed" # optional override, defaults t
|
||||
- **Frontend:** HTML with HTMX and minimal JavaScript for interactivity
|
||||
- **Database:** SurrealDB (graph, document, and vector search)
|
||||
- **AI Integration:** OpenAI-compatible API with structured outputs
|
||||
- **Web Processing:** Headless Chrome for robust webpage content extraction
|
||||
- **Web Processing:** Embedded Servo engine (servo-fetch) for webpage content extraction + PDFium for PDF rendering
|
||||
|
||||
## Configuration
|
||||
|
||||
@@ -172,7 +172,7 @@ cd minne
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
The included `docker-compose.yml` handles SurrealDB and Chromium dependencies automatically.
|
||||
The included `docker-compose.yml` handles SurrealDB automatically.
|
||||
|
||||
### 2. Nix
|
||||
|
||||
@@ -180,13 +180,13 @@ The included `docker-compose.yml` handles SurrealDB and Chromium dependencies au
|
||||
nix run 'github:perstarkse/minne#main'
|
||||
```
|
||||
|
||||
This fetches Minne and all dependencies, including Chromium.
|
||||
This fetches Minne and all dependencies.
|
||||
|
||||
### 3. Pre-built Binaries
|
||||
|
||||
Download binaries for Windows, macOS, and Linux from the [GitHub Releases](https://github.com/perstarkse/minne/releases/latest).
|
||||
|
||||
**Requirements:** You'll need to provide SurrealDB and Chromium separately.
|
||||
**Requirements:** You'll need to provide SurrealDB separately.
|
||||
|
||||
### 4. Build from Source
|
||||
|
||||
@@ -196,7 +196,7 @@ cd minne
|
||||
cargo run --release --bin main
|
||||
```
|
||||
|
||||
**Requirements:** SurrealDB and Chromium must be installed and accessible in your PATH.
|
||||
**Requirements:** SurrealDB must be installed and accessible in your PATH.
|
||||
|
||||
## Application Architecture
|
||||
|
||||
|
||||
+12
@@ -41,6 +41,14 @@ in {
|
||||
pkgs.onnxruntime
|
||||
pkgs.cargo-watch
|
||||
pkgs.tailwindcss_4
|
||||
pkgs.python3
|
||||
pkgs.fontconfig
|
||||
pkgs.fontconfig.dev
|
||||
pkgs.libGL
|
||||
pkgs.libGLU
|
||||
pkgs.libclang
|
||||
pkgs.wayland
|
||||
pkgs.libxkbcommon
|
||||
];
|
||||
|
||||
languages.rust = {
|
||||
@@ -53,6 +61,10 @@ in {
|
||||
};
|
||||
|
||||
env = {
|
||||
# tikv-jemalloc-sys configure flags: -O0 + -Werror triggers glibc _FORTIFY_SOURCE warning
|
||||
NIX_CFLAGS_COMPILE = "-Wno-error=cpp";
|
||||
LIBCLANG_PATH = "${pkgs.libclang.lib}/lib";
|
||||
LD_LIBRARY_PATH = "${pkgs.wayland}/lib:${pkgs.libxkbcommon}/lib:${pkgs.pipewire}/lib:${pkgs.libglvnd}/lib";
|
||||
ORT_DYLIB_PATH = "${pkgs.onnxruntime}/lib/libonnxruntime.so";
|
||||
S3_ENDPOINT = "http://127.0.0.1:19000";
|
||||
S3_BUCKET = "minne-tests";
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
| Frontend | HTML + HTMX + minimal JS |
|
||||
| Database | SurrealDB (graph, document, vector) |
|
||||
| AI | OpenAI-compatible API |
|
||||
| Web Processing | Headless Chromium |
|
||||
| Web Processing | Servo engine (servo-fetch) + PDFium |
|
||||
|
||||
## Crate Structure
|
||||
|
||||
|
||||
+3
-1
@@ -10,7 +10,7 @@
|
||||
|
||||
Minne automatically processes saved content:
|
||||
|
||||
1. **Web scraping** extracts readable text from URLs (via headless Chrome)
|
||||
1. **Web scraping** extracts readable text from URLs (via embedded Servo engine)
|
||||
2. **Text analysis** identifies key concepts and relationships
|
||||
3. **Graph creation** builds connections between related content
|
||||
4. **Embedding generation** enables semantic search
|
||||
@@ -43,6 +43,7 @@ Optional **reranking** can rescore fused chunk lists with a cross-encoder model;
|
||||
When enabled, retrieval results are rescored with a cross-encoder model for improved relevance. Powered by [fastembed-rs](https://github.com/Anush008/fastembed-rs).
|
||||
|
||||
**Trade-offs:**
|
||||
|
||||
- Downloads ~1.1 GB of model data
|
||||
- Adds latency per query
|
||||
- Potentially improves answer quality, see [blog post](https://blog.stark.pub/posts/eval-retrieval-refactor/)
|
||||
@@ -52,6 +53,7 @@ Enable via `RERANKING_ENABLED=true`. See [Configuration](./configuration.md).
|
||||
## Multi-Format Ingestion
|
||||
|
||||
Supported content types:
|
||||
|
||||
- Plain text and notes
|
||||
- URLs (web pages)
|
||||
- PDF documents
|
||||
|
||||
@@ -12,13 +12,13 @@ cd minne
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
The included `docker-compose.yml` handles SurrealDB and Chromium automatically.
|
||||
The included `docker-compose.yml` handles SurrealDB automatically.
|
||||
|
||||
**Required:** Set your `OPENAI_API_KEY` in `docker-compose.yml` before starting.
|
||||
|
||||
## Nix
|
||||
|
||||
Run Minne directly with Nix (includes Chromium):
|
||||
Run Minne directly with Nix:
|
||||
|
||||
```bash
|
||||
nix run 'github:perstarkse/minne#main'
|
||||
@@ -31,8 +31,9 @@ Configure via environment variables or a `config.yaml` file. See [Configuration]
|
||||
Download binaries for Windows, macOS, and Linux from [GitHub Releases](https://github.com/perstarkse/minne/releases/latest).
|
||||
|
||||
**Requirements:**
|
||||
|
||||
- SurrealDB instance (local or remote)
|
||||
- Chromium (for web scraping)
|
||||
- `libEGL` + `libfontconfig` (for servo-fetch web scraping)
|
||||
|
||||
## Build from Source
|
||||
|
||||
@@ -45,9 +46,10 @@ cargo build --release --bin main
|
||||
The binary will be at `target/release/main`.
|
||||
|
||||
**Requirements:**
|
||||
|
||||
- Rust toolchain
|
||||
- SurrealDB accessible at configured address
|
||||
- Chromium in PATH
|
||||
- `libEGL` + `libfontconfig` for servo-fetch (web scraping) — bundled in Nix and Docker images
|
||||
|
||||
## Process Modes
|
||||
|
||||
|
||||
@@ -50,16 +50,16 @@
|
||||
doCheck = false;
|
||||
|
||||
nativeBuildInputs = [pkgs.pkg-config pkgs.rustfmt pkgs.makeWrapper];
|
||||
buildInputs = [pkgs.openssl pkgs.chromium pkgs.onnxruntime];
|
||||
buildInputs = [pkgs.openssl pkgs.libglvnd pkgs.onnxruntime];
|
||||
|
||||
postInstall = ''
|
||||
wrapProgram $out/bin/main \
|
||||
--set CHROME ${pkgs.chromium}/bin/chromium \
|
||||
--prefix LD_LIBRARY_PATH : ${pkgs.libglvnd}/lib \
|
||||
--set ORT_DYLIB_PATH ${pkgs.onnxruntime}/lib/libonnxruntime.${libExt}
|
||||
for b in worker server; do
|
||||
if [ -x "$out/bin/$b" ]; then
|
||||
wrapProgram $out/bin/$b \
|
||||
--set CHROME ${pkgs.chromium}/bin/chromium \
|
||||
--prefix LD_LIBRARY_PATH : ${pkgs.libglvnd}/lib \
|
||||
--set ORT_DYLIB_PATH ${pkgs.onnxruntime}/lib/libonnxruntime.${libExt}
|
||||
fi
|
||||
done
|
||||
|
||||
@@ -18,17 +18,22 @@ async-openai = { workspace = true }
|
||||
surrealdb = { workspace = true }
|
||||
dom_smoothie = { workspace = true }
|
||||
tempfile = { workspace = true }
|
||||
axum_typed_multipart = { workspace = true}
|
||||
anyhow = { workspace = true }
|
||||
axum_typed_multipart = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
reqwest = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
text-splitter = { workspace = true }
|
||||
url = { workspace = true }
|
||||
uuid = { workspace = true }
|
||||
headless_chrome = { workspace = true }
|
||||
base64 = { workspace = true }
|
||||
pdf-extract = { workspace = true }
|
||||
lopdf = { workspace = true }
|
||||
tendril = { workspace = true }
|
||||
servo-fetch = { workspace = true }
|
||||
servo-allocator = { version = "0.2", features = ["use-system-allocator"] }
|
||||
pdfium-auto = { workspace = true }
|
||||
pdfium-render = { workspace = true }
|
||||
image = { workspace = true }
|
||||
bytes = { workspace = true }
|
||||
async-trait = { workspace = true }
|
||||
state-machines = { workspace = true }
|
||||
@@ -37,7 +42,6 @@ common = { path = "../common" }
|
||||
retrieval-pipeline = { path = "../retrieval-pipeline" }
|
||||
|
||||
[features]
|
||||
docker = []
|
||||
|
||||
[dev-dependencies]
|
||||
common = { path = "../common", features = ["test-utils"] }
|
||||
|
||||
@@ -24,6 +24,6 @@ pub async fn transcribe_audio_file(
|
||||
.transcription()
|
||||
.create(request)
|
||||
.await
|
||||
.map_err(|e| AppError::Processing(format!("Audio transcription failed: {e}")))?;
|
||||
.map_err(|e| AppError::Processing(format!("audio transcription failed: {e}")))?;
|
||||
Ok(response.text)
|
||||
}
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
use common::error::AppError;
|
||||
use headless_chrome::Browser;
|
||||
|
||||
/// Launches a headless Chrome instance, honoring the `docker` feature flag
|
||||
/// (which disables the Chrome sandbox for container environments).
|
||||
///
|
||||
/// This is the single place the crate spawns a browser. If the rendering backend
|
||||
/// is ever swapped away from headless Chrome to something leaner, this function is
|
||||
/// the seam to change; callers only depend on getting back a `Browser`.
|
||||
pub(crate) fn launch_browser() -> Result<Browser, AppError> {
|
||||
#[cfg(feature = "docker")]
|
||||
{
|
||||
let options = headless_chrome::LaunchOptionsBuilder::default()
|
||||
.sandbox(false)
|
||||
.build()
|
||||
.map_err(|err| {
|
||||
AppError::Processing(format!("Failed to build headless browser options: {err}"))
|
||||
})?;
|
||||
Browser::new(options)
|
||||
.map_err(|err| AppError::Processing(format!("Failed to start headless browser: {err}")))
|
||||
}
|
||||
#[cfg(not(feature = "docker"))]
|
||||
{
|
||||
Browser::default()
|
||||
.map_err(|err| AppError::Processing(format!("Failed to start headless browser: {err}")))
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,8 @@
|
||||
pub mod audio_transcription;
|
||||
pub mod browser;
|
||||
pub mod file_text_extraction;
|
||||
pub mod graph_mapper;
|
||||
pub mod image_parsing;
|
||||
pub mod llm_instructions;
|
||||
pub mod page_fetcher;
|
||||
pub mod pdf;
|
||||
pub mod url_text_retrieval;
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
//! Page-fetching abstraction that decouples URL extraction from the underlying engine.
|
||||
//!
|
||||
//! The primary implementation uses [`servo_fetch`], a pure-Rust Servo engine that
|
||||
//! provides high extraction quality (word-F1 0.819), fast startup (~331ms), and a
|
||||
//! small memory footprint (~64MB peak).
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use common::error::AppError;
|
||||
use tracing::info;
|
||||
|
||||
/// Captured content from a single page fetch.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub(crate) struct PageCapture {
|
||||
/// Raw HTML source of the page.
|
||||
pub html: String,
|
||||
/// Readable Markdown extracted from the page content.
|
||||
pub markdown: String,
|
||||
/// JPEG/PNG screenshot bytes, or empty if not captured.
|
||||
pub screenshot: Vec<u8>,
|
||||
}
|
||||
|
||||
/// Abstraction over a page-fetching engine.
|
||||
pub(crate) trait PageFetcher: Send + Sync + std::fmt::Debug {
|
||||
/// Fetches a URL and returns the captured content (HTML, markdown, screenshot).
|
||||
fn fetch(&self, url: &str) -> Result<PageCapture, AppError>;
|
||||
}
|
||||
|
||||
/// Fetcher powered by the embedded Servo engine via `servo-fetch`.
|
||||
///
|
||||
/// Provides HTML, extracted Markdown, and a PNG screenshot.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct ServoFetchFetcher;
|
||||
|
||||
impl PageFetcher for ServoFetchFetcher {
|
||||
fn fetch(&self, url: &str) -> Result<PageCapture, AppError> {
|
||||
let page = servo_fetch::blocking::fetch(
|
||||
&servo_fetch::FetchOptions::screenshot(url, true)
|
||||
.timeout(Duration::from_secs(30))
|
||||
.settle(Duration::from_millis(3000)),
|
||||
)
|
||||
.map_err(|err| AppError::Processing(format!("servo-fetch failed for {url}: {err}")))?;
|
||||
|
||||
let html = page.html.clone();
|
||||
let markdown = page
|
||||
.markdown()
|
||||
.map_err(|err| AppError::Processing(format!("failed to extract markdown: {err}")))?;
|
||||
let screenshot = page.screenshot_png().unwrap_or_default().to_vec();
|
||||
|
||||
info!(
|
||||
url = %url,
|
||||
html_bytes = html.len(),
|
||||
md_chars = markdown.len(),
|
||||
screenshot_bytes = screenshot.len(),
|
||||
"servo-fetch completed"
|
||||
);
|
||||
|
||||
Ok(PageCapture {
|
||||
html,
|
||||
markdown,
|
||||
screenshot,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates the default page fetcher for the current configuration.
|
||||
#[allow(unreachable_pub)]
|
||||
pub(crate) fn create_fetcher() -> Box<dyn PageFetcher> {
|
||||
Box::new(ServoFetchFetcher)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_default_fetcher_constructs() {
|
||||
let fetcher = create_fetcher();
|
||||
assert!(!format!("{fetcher:?}").is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_servo_fetcher_constructs() {
|
||||
let _ = ServoFetchFetcher;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_trait_object_dispatch() {
|
||||
let fetcher: Box<dyn PageFetcher> = Box::new(ServoFetchFetcher);
|
||||
assert!(!format!("{fetcher:?}").is_empty());
|
||||
}
|
||||
|
||||
/// Smoke test: Servo engine initialises even without display server.
|
||||
/// Wrap in `catch_unwind` because child-thread panics from servo
|
||||
/// (e.g. missing wayland) would otherwise escape the test harness.
|
||||
#[test]
|
||||
fn test_servo_engine_initializes() {
|
||||
let fetcher = ServoFetchFetcher;
|
||||
let result = std::panic::catch_unwind(move || {
|
||||
let _ = fetcher.fetch("about:blank");
|
||||
});
|
||||
|
||||
if let Err(panic) = result {
|
||||
let msg = panic
|
||||
.downcast_ref::<&str>()
|
||||
.copied()
|
||||
.or_else(|| panic.downcast_ref::<String>().map(String::as_str))
|
||||
.unwrap_or("unknown panic");
|
||||
assert!(
|
||||
!(msg.contains("wayland")
|
||||
|| msg.contains("Library")
|
||||
|| msg.contains("servo-engine")),
|
||||
"Servo engine initialization failed: {msg}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,32 +1,25 @@
|
||||
//! Headless-Chrome rasterization of PDF pages into PNG screenshots.
|
||||
//! PDF page rasterization using pdfium-render via pdfium-auto.
|
||||
//!
|
||||
//! This is the only Chrome-dependent part of PDF ingestion. It depends on the
|
||||
//! browser's internal PDF-viewer shadow DOM, so it is inherently fragile across
|
||||
//! Chrome upgrades; a full-page-capture fallback guards the common failure modes.
|
||||
//! Uses direct `PDFium` bindings for reliable, pixel-perfect page rendering —
|
||||
//! starts in ~5ms, requires no display server, and produces consistent output
|
||||
//! independent of PDF reader version. Each page is rendered at a generous
|
||||
//! resolution and encoded as PNG for downstream LLM vision ingestion.
|
||||
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||
time::{SystemTime, UNIX_EPOCH},
|
||||
};
|
||||
|
||||
use base64::{engine::general_purpose::STANDARD, Engine as _};
|
||||
use headless_chrome::protocol::cdp::{Emulation, Page, DOM};
|
||||
use image::ImageFormat;
|
||||
use lopdf::Document;
|
||||
use serde_json::Value;
|
||||
use pdfium_render::prelude::PdfRenderConfig;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use common::error::AppError;
|
||||
|
||||
use crate::utils::browser::launch_browser;
|
||||
|
||||
const NAVIGATION_RETRY_INTERVAL_MS: u64 = 120;
|
||||
const NAVIGATION_RETRY_ATTEMPTS: usize = 10;
|
||||
const MIN_PAGE_IMAGE_BYTES: usize = 1_024;
|
||||
const DEFAULT_VIEWPORT_WIDTH: u32 = 1_248; // generous width to reduce horizontal clipping
|
||||
const DEFAULT_VIEWPORT_HEIGHT: u32 = 1_800; // tall enough to capture full page at fit-to-width scale
|
||||
const DEFAULT_DEVICE_SCALE_FACTOR: f64 = 1.0;
|
||||
const CANVAS_VIEWPORT_ATTEMPTS: usize = 12;
|
||||
const CANVAS_VIEWPORT_WAIT_MS: u64 = 200;
|
||||
const RENDER_TARGET_WIDTH: i32 = 1200;
|
||||
const RENDER_MAX_HEIGHT: i32 = 2000;
|
||||
const DEBUG_IMAGE_ENV_VAR: &str = "MINNE_PDF_DEBUG_DIR";
|
||||
|
||||
/// Parses the PDF structure to discover the available page numbers while keeping work off
|
||||
@@ -34,7 +27,7 @@ const DEBUG_IMAGE_ENV_VAR: &str = "MINNE_PDF_DEBUG_DIR";
|
||||
pub(super) async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, AppError> {
|
||||
let pages = tokio::task::spawn_blocking(move || -> Result<Vec<u32>, AppError> {
|
||||
let document = Document::load_mem(&pdf_bytes)
|
||||
.map_err(|err| AppError::Processing(format!("Failed to parse PDF: {err}")))?;
|
||||
.map_err(|err| AppError::Processing(format!("failed to parse PDF: {err}")))?;
|
||||
let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
|
||||
page_numbers.sort_unstable();
|
||||
Ok(page_numbers)
|
||||
@@ -44,7 +37,9 @@ pub(super) async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, Ap
|
||||
Ok(pages)
|
||||
}
|
||||
|
||||
/// Uses the existing headless Chrome dependency to rasterize the requested PDF pages into PNGs.
|
||||
/// Renders the requested PDF pages as PNG-encoded byte vectors using `PDFium`.
|
||||
///
|
||||
/// Work is offloaded to a blocking thread since `PDFium`'s C API is not async-safe.
|
||||
pub(super) async fn render_pdf_pages(
|
||||
file_path: &Path,
|
||||
pages: &[u32],
|
||||
@@ -52,8 +47,8 @@ pub(super) async fn render_pdf_pages(
|
||||
let file_path = file_path.to_path_buf();
|
||||
let pages = pages.to_vec();
|
||||
let page_numbers = pages.clone();
|
||||
let captures =
|
||||
tokio::task::spawn_blocking(move || render_pdf_pages_inner(&file_path, &pages)).await??;
|
||||
|
||||
let captures = tokio::task::spawn_blocking(move || render_inner(&file_path, &pages)).await??;
|
||||
|
||||
for (page_number, png) in page_numbers.iter().zip(captures.iter()) {
|
||||
if let Err(err) = maybe_dump_debug_image(*page_number, png).await {
|
||||
@@ -68,306 +63,65 @@ pub(super) async fn render_pdf_pages(
|
||||
Ok(captures)
|
||||
}
|
||||
|
||||
fn render_pdf_pages_inner(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
|
||||
let file_url = url::Url::from_file_path(file_path)
|
||||
.map_err(|()| AppError::Processing("Unable to construct PDF file URL".into()))?;
|
||||
/// Initializes `PDFium`, opens the file, and renders each requested page.
|
||||
fn render_inner(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
|
||||
let pdfium = pdfium_auto::bind_pdfium_silent()
|
||||
.map_err(|err| AppError::Processing(format!("failed to bind PDFium library: {err}")))?;
|
||||
|
||||
let browser = launch_browser()?;
|
||||
let tab = browser
|
||||
.new_tab()
|
||||
.map_err(|err| AppError::Processing(format!("Failed to create Chrome tab: {err}")))?;
|
||||
let doc = pdfium
|
||||
.load_pdf_from_file(file_path, None)
|
||||
.map_err(|err| AppError::Processing(format!("failed to load PDF file: {err}")))?;
|
||||
|
||||
tab.set_default_timeout(Duration::from_secs(10));
|
||||
configure_tab(&tab)?;
|
||||
set_pdf_viewport(&tab)?;
|
||||
let render_config = PdfRenderConfig::new()
|
||||
.set_target_width(RENDER_TARGET_WIDTH)
|
||||
.set_maximum_height(RENDER_MAX_HEIGHT);
|
||||
|
||||
let mut captures = Vec::with_capacity(pages.len());
|
||||
|
||||
for page in pages.iter().copied() {
|
||||
let target = format!("{file_url}#page={page}&toolbar=0&statusbar=0&zoom=page-fit");
|
||||
tab.navigate_to(&target)
|
||||
.map_err(|err| AppError::Processing(format!("Failed to navigate to PDF page: {err}")))?
|
||||
.wait_until_navigated()
|
||||
.map_err(|err| AppError::Processing(format!("Navigation to PDF page failed: {err}")))?;
|
||||
for &page_num in pages {
|
||||
let page_index = page_num.saturating_sub(1); // PDFium uses 0-based indices
|
||||
let page = doc
|
||||
.pages()
|
||||
.get(u16::try_from(page_index).unwrap_or(u16::MAX))
|
||||
.map_err(|err| {
|
||||
AppError::Processing(format!("failed to get PDF page {page_num}: {err}"))
|
||||
})?;
|
||||
|
||||
let mut loaded = false;
|
||||
for attempt in 0..NAVIGATION_RETRY_ATTEMPTS {
|
||||
if tab
|
||||
.wait_for_element("embed, canvas, body")
|
||||
.map(|_| ())
|
||||
.is_ok()
|
||||
{
|
||||
loaded = true;
|
||||
break;
|
||||
}
|
||||
if attempt < NAVIGATION_RETRY_ATTEMPTS.saturating_sub(1) {
|
||||
std::thread::sleep(Duration::from_millis(NAVIGATION_RETRY_INTERVAL_MS));
|
||||
}
|
||||
}
|
||||
let bitmap = page.render_with_config(&render_config).map_err(|err| {
|
||||
AppError::Processing(format!("failed to render PDF page {page_num}: {err}"))
|
||||
})?;
|
||||
|
||||
if !loaded {
|
||||
return Err(AppError::Processing(
|
||||
"Timed out waiting for Chrome to render PDF page".into(),
|
||||
));
|
||||
}
|
||||
let image = bitmap.as_image();
|
||||
|
||||
wait_for_pdf_ready(&tab, page)?;
|
||||
std::thread::sleep(Duration::from_millis(350));
|
||||
let mut png_bytes = Vec::new();
|
||||
image
|
||||
.write_to(&mut std::io::Cursor::new(&mut png_bytes), ImageFormat::Png)
|
||||
.map_err(|err| {
|
||||
AppError::Processing(format!(
|
||||
"failed to encode PDF page {page_num} as PNG: {err}"
|
||||
))
|
||||
})?;
|
||||
|
||||
prepare_pdf_viewer(&tab, page);
|
||||
debug!(
|
||||
page = page_num,
|
||||
bytes = png_bytes.len(),
|
||||
"Rendered PDF page via PDFium"
|
||||
);
|
||||
|
||||
let mut viewport: Option<Page::Viewport> = None;
|
||||
for attempt in 0..CANVAS_VIEWPORT_ATTEMPTS {
|
||||
match canvas_viewport_for_page(&tab, page) {
|
||||
Ok(Some(vp)) => {
|
||||
viewport = Some(vp);
|
||||
break;
|
||||
}
|
||||
Ok(None) => {
|
||||
if attempt < CANVAS_VIEWPORT_ATTEMPTS.saturating_sub(1) {
|
||||
std::thread::sleep(Duration::from_millis(CANVAS_VIEWPORT_WAIT_MS));
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
warn!(page, error = %err, "Failed to derive canvas viewport");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let png = if let Some(clip) = viewport {
|
||||
match tab.call_method(Page::CaptureScreenshot {
|
||||
format: Some(Page::CaptureScreenshotFormatOption::Png),
|
||||
quality: None,
|
||||
clip: Some(clip),
|
||||
from_surface: Some(true),
|
||||
capture_beyond_viewport: Some(true),
|
||||
optimize_for_speed: Some(false),
|
||||
}) {
|
||||
Ok(data) => match STANDARD.decode(data.data) {
|
||||
Ok(bytes) => bytes,
|
||||
Err(err) => {
|
||||
warn!(error = %err, page, "Failed to decode clipped screenshot; falling back to full page capture");
|
||||
capture_full_page_png(&tab)?
|
||||
}
|
||||
},
|
||||
Err(err) => {
|
||||
warn!(error = %err, page, "Clipped screenshot failed; falling back to full page capture");
|
||||
capture_full_page_png(&tab)?
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if png_bytes.len() < MIN_PAGE_IMAGE_BYTES {
|
||||
warn!(
|
||||
page,
|
||||
"Unable to determine canvas viewport; capturing full page"
|
||||
);
|
||||
capture_full_page_png(&tab)?
|
||||
};
|
||||
|
||||
debug!(page, bytes = png.len(), "Captured PDF page screenshot");
|
||||
|
||||
if is_suspicious_image(png.len()) {
|
||||
warn!(
|
||||
page,
|
||||
bytes = png.len(),
|
||||
"Screenshot size below threshold; check rendering output"
|
||||
page = page_num,
|
||||
bytes = png_bytes.len(),
|
||||
"Rendered page size below threshold; check PDF quality"
|
||||
);
|
||||
}
|
||||
|
||||
captures.push(png);
|
||||
captures.push(png_bytes);
|
||||
}
|
||||
|
||||
Ok(captures)
|
||||
}
|
||||
|
||||
fn configure_tab(tab: &headless_chrome::Tab) -> Result<(), AppError> {
|
||||
tab.call_method(Emulation::SetDefaultBackgroundColorOverride {
|
||||
color: Some(DOM::RGBA {
|
||||
r: 255,
|
||||
g: 255,
|
||||
b: 255,
|
||||
a: Some(1.0),
|
||||
}),
|
||||
})
|
||||
.map_err(|err| {
|
||||
AppError::Processing(format!("Failed to configure Chrome page background: {err}"))
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_pdf_viewport(tab: &headless_chrome::Tab) -> Result<(), AppError> {
|
||||
tab.call_method(Emulation::SetDeviceMetricsOverride {
|
||||
width: DEFAULT_VIEWPORT_WIDTH,
|
||||
height: DEFAULT_VIEWPORT_HEIGHT,
|
||||
device_scale_factor: DEFAULT_DEVICE_SCALE_FACTOR,
|
||||
mobile: false,
|
||||
scale: None,
|
||||
screen_width: Some(DEFAULT_VIEWPORT_WIDTH),
|
||||
screen_height: Some(DEFAULT_VIEWPORT_HEIGHT),
|
||||
position_x: None,
|
||||
position_y: None,
|
||||
dont_set_visible_size: Some(false),
|
||||
screen_orientation: None,
|
||||
viewport: None,
|
||||
display_feature: None,
|
||||
device_posture: None,
|
||||
})
|
||||
.map_err(|err| AppError::Processing(format!("Failed to configure Chrome viewport: {err}")))?;
|
||||
|
||||
tab.call_method(Emulation::SetVisibleSize {
|
||||
width: DEFAULT_VIEWPORT_WIDTH,
|
||||
height: DEFAULT_VIEWPORT_HEIGHT,
|
||||
})
|
||||
.map_err(|err| AppError::Processing(format!("Failed to apply Chrome visible size: {err}")))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn wait_for_pdf_ready(
|
||||
tab: &headless_chrome::Tab,
|
||||
page_number: u32,
|
||||
) -> Result<headless_chrome::Element<'_>, AppError> {
|
||||
let embed_selector = "embed[type='application/pdf']";
|
||||
let element = tab
|
||||
.wait_for_element_with_custom_timeout(embed_selector, Duration::from_secs(8))
|
||||
.or_else(|_| tab.wait_for_element_with_custom_timeout("embed", Duration::from_secs(8)))
|
||||
.map_err(|err| AppError::Processing(format!("Timed out waiting for PDF content: {err}")))?;
|
||||
|
||||
if let Err(err) = element.scroll_into_view() {
|
||||
debug!("Failed to scroll PDF element into view: {err}");
|
||||
}
|
||||
|
||||
debug!(page = page_number, "PDF viewer element located");
|
||||
|
||||
Ok(element)
|
||||
}
|
||||
|
||||
fn prepare_pdf_viewer(tab: &headless_chrome::Tab, page_number: u32) {
|
||||
let script = format!(
|
||||
r#"(function() {{
|
||||
const embed = document.querySelector('embed[type="application/pdf"]') || document.querySelector('embed');
|
||||
if (!embed || !embed.shadowRoot) return false;
|
||||
const viewer = embed.shadowRoot.querySelector('pdf-viewer');
|
||||
if (!viewer || !viewer.shadowRoot) return false;
|
||||
const app = viewer.shadowRoot.querySelector('viewer-app');
|
||||
if (app && app.shadowRoot) {{
|
||||
const toolbar = app.shadowRoot.querySelector('#toolbar');
|
||||
if (toolbar) {{ toolbar.style.display = 'none'; }}
|
||||
}}
|
||||
const page = viewer.shadowRoot.querySelector('viewer-page:nth-of-type({page_number})');
|
||||
if (page && page.scrollIntoView) {{
|
||||
page.scrollIntoView({{ block: 'start', inline: 'center' }});
|
||||
}}
|
||||
const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page_number}"]');
|
||||
return !!canvas;
|
||||
}})()"#
|
||||
);
|
||||
|
||||
match tab.evaluate(&script, false) {
|
||||
Ok(result) => {
|
||||
let ready = result
|
||||
.value
|
||||
.as_ref()
|
||||
.and_then(Value::as_bool)
|
||||
.unwrap_or(false);
|
||||
debug!(page = page_number, ready, "Prepared PDF viewer page");
|
||||
}
|
||||
Err(err) => {
|
||||
debug!(page = page_number, error = %err, "Unable to run PDF viewer preparation script");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn canvas_viewport_for_page(
|
||||
tab: &headless_chrome::Tab,
|
||||
page_number: u32,
|
||||
) -> Result<Option<Page::Viewport>, AppError> {
|
||||
let script = format!(
|
||||
r#"(function() {{
|
||||
const embed = document.querySelector('embed[type="application/pdf"]') || document.querySelector('embed');
|
||||
if (!embed || !embed.shadowRoot) return null;
|
||||
const viewer = embed.shadowRoot.querySelector('pdf-viewer');
|
||||
if (!viewer || !viewer.shadowRoot) return null;
|
||||
const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page_number}"]');
|
||||
if (!canvas) return null;
|
||||
const rect = canvas.getBoundingClientRect();
|
||||
return {{ x: rect.x, y: rect.y, width: rect.width, height: rect.height }};
|
||||
}})()"#
|
||||
);
|
||||
|
||||
let result = tab
|
||||
.evaluate(&script, false)
|
||||
.map_err(|err| AppError::Processing(format!("Failed to inspect PDF canvas: {err}")))?;
|
||||
|
||||
let Some(value) = result.value else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
if value.is_null() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let x = value
|
||||
.get("x")
|
||||
.and_then(Value::as_f64)
|
||||
.unwrap_or_default()
|
||||
.max(0.0);
|
||||
let y = value
|
||||
.get("y")
|
||||
.and_then(Value::as_f64)
|
||||
.unwrap_or_default()
|
||||
.max(0.0);
|
||||
let width = value
|
||||
.get("width")
|
||||
.and_then(Value::as_f64)
|
||||
.unwrap_or_default();
|
||||
let height = value
|
||||
.get("height")
|
||||
.and_then(Value::as_f64)
|
||||
.unwrap_or_default();
|
||||
|
||||
if width <= 0.0 || height <= 0.0 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
debug!(
|
||||
page = page_number,
|
||||
x, y, width, height, "Derived canvas viewport"
|
||||
);
|
||||
|
||||
Ok(Some(Page::Viewport {
|
||||
x,
|
||||
y,
|
||||
width,
|
||||
height,
|
||||
scale: 1.0,
|
||||
}))
|
||||
}
|
||||
|
||||
fn capture_full_page_png(tab: &headless_chrome::Tab) -> Result<Vec<u8>, AppError> {
|
||||
let screenshot = tab
|
||||
.call_method(Page::CaptureScreenshot {
|
||||
format: Some(Page::CaptureScreenshotFormatOption::Png),
|
||||
quality: None,
|
||||
clip: None,
|
||||
from_surface: Some(true),
|
||||
capture_beyond_viewport: Some(true),
|
||||
optimize_for_speed: Some(false),
|
||||
})
|
||||
.map_err(|err| {
|
||||
AppError::Processing(format!("Failed to capture PDF page (fallback): {err}"))
|
||||
})?;
|
||||
|
||||
STANDARD.decode(screenshot.data).map_err(|err| {
|
||||
AppError::Processing(format!("Failed to decode PDF screenshot (fallback): {err}"))
|
||||
})
|
||||
}
|
||||
|
||||
const fn is_suspicious_image(len: usize) -> bool {
|
||||
len < MIN_PAGE_IMAGE_BYTES
|
||||
}
|
||||
|
||||
fn debug_dump_directory() -> Option<PathBuf> {
|
||||
std::env::var(DEBUG_IMAGE_ENV_VAR)
|
||||
.ok()
|
||||
@@ -394,6 +148,8 @@ async fn maybe_dump_debug_image(page_index: u32, bytes: &[u8]) -> Result<(), App
|
||||
mod tests {
|
||||
use super::*;
|
||||
use anyhow::{self};
|
||||
use lopdf::dictionary;
|
||||
use lopdf::Object;
|
||||
|
||||
#[test]
|
||||
fn test_debug_dump_directory_env_var() -> anyhow::Result<()> {
|
||||
@@ -409,10 +165,108 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_suspicious_image_threshold() {
|
||||
assert!(is_suspicious_image(0));
|
||||
assert!(is_suspicious_image(MIN_PAGE_IMAGE_BYTES - 1));
|
||||
assert!(!is_suspicious_image(MIN_PAGE_IMAGE_BYTES + 1));
|
||||
#[tokio::test]
|
||||
async fn test_load_page_numbers_empty_pdf() -> anyhow::Result<()> {
|
||||
let pdf_bytes = create_minimal_pdf(0);
|
||||
let pages = load_page_numbers(pdf_bytes).await?;
|
||||
assert!(pages.is_empty());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_load_page_numbers_single_page() -> anyhow::Result<()> {
|
||||
let pdf_bytes = create_minimal_pdf(1);
|
||||
let pages = load_page_numbers(pdf_bytes).await?;
|
||||
assert_eq!(pages, vec![1u32]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_load_page_numbers_multi_page() -> anyhow::Result<()> {
|
||||
let pdf_bytes = create_minimal_pdf(5);
|
||||
let pages = load_page_numbers(pdf_bytes).await?;
|
||||
assert_eq!(pages, vec![1, 2, 3, 4, 5]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_load_page_numbers_invalid_pdf() {
|
||||
let result = load_page_numbers(b"not a pdf".to_vec()).await;
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
/// Creates a minimal valid PDF with the given number of empty pages.
|
||||
#[allow(clippy::similar_names, clippy::expect_used)]
|
||||
fn create_minimal_pdf(page_count: u32) -> Vec<u8> {
|
||||
let mut doc = Document::with_version("1.5");
|
||||
let pages_id = doc.new_object_id();
|
||||
|
||||
let mut page_ids = Vec::with_capacity(page_count as usize);
|
||||
for _ in 0..page_count {
|
||||
let page_id = doc.add_object(dictionary! {
|
||||
"Type" => "Page",
|
||||
"Parent" => pages_id,
|
||||
"MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
|
||||
});
|
||||
page_ids.push(page_id);
|
||||
}
|
||||
|
||||
let pages = dictionary! {
|
||||
"Type" => "Pages",
|
||||
"Kids" => page_ids.iter().map(|id| Object::Reference(*id)).collect::<Vec<_>>(),
|
||||
"Count" => i32::try_from(page_count).unwrap_or(i32::MAX),
|
||||
"MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
|
||||
};
|
||||
doc.objects.insert(pages_id, Object::Dictionary(pages));
|
||||
|
||||
let catalog_id = doc.add_object(dictionary! {
|
||||
"Type" => "Catalog",
|
||||
"Pages" => pages_id,
|
||||
});
|
||||
doc.trailer.set("Root", catalog_id);
|
||||
|
||||
let mut buf = Vec::new();
|
||||
doc.save_to(&mut buf).expect("failed to serialize test PDF");
|
||||
buf
|
||||
}
|
||||
|
||||
/// Renders a simple 1-page PDF and verifies the output is a valid PNG ≥ 1KB.
|
||||
/// This test skips gracefully when `PDFium` is not available (e.g., CI without internet).
|
||||
#[tokio::test]
|
||||
async fn test_render_single_page_pdfium() -> anyhow::Result<()> {
|
||||
let pdf_bytes = create_minimal_pdf(1);
|
||||
let dir = tempfile::TempDir::new()?;
|
||||
let file_path = dir.path().join("test.pdf");
|
||||
tokio::fs::write(&file_path, &pdf_bytes).await?;
|
||||
|
||||
let result = render_pdf_pages(&file_path, &[1]).await;
|
||||
match result {
|
||||
Ok(pages) => {
|
||||
assert_eq!(pages.len(), 1, "should render one page");
|
||||
#[allow(clippy::expect_used)]
|
||||
let first_page = pages.into_iter().next().expect("already asserted len == 1");
|
||||
assert!(
|
||||
first_page.len() >= MIN_PAGE_IMAGE_BYTES,
|
||||
"rendered page {} bytes is below threshold {}",
|
||||
first_page.len(),
|
||||
MIN_PAGE_IMAGE_BYTES
|
||||
);
|
||||
// Verify it's a valid PNG by checking header bytes
|
||||
let header = first_page
|
||||
.get(..4.min(first_page.len()))
|
||||
.unwrap_or(&[0u8; 0]);
|
||||
assert_eq!(header, &[0x89, 0x50, 0x4E, 0x47], "output must be PNG");
|
||||
}
|
||||
Err(e) => {
|
||||
// PDFium may not be available — that's acceptable in environments
|
||||
// without network access to download the binary.
|
||||
let msg = e.to_string();
|
||||
if !msg.contains("PDFium") && !msg.contains("library") && !msg.contains("bind") {
|
||||
anyhow::bail!("unexpected error: {e}");
|
||||
}
|
||||
eprintln!("SKIP: PDFium not available ({msg})");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
//! Fast-path PDF text extraction and Markdown reflow heuristics.
|
||||
//!
|
||||
//! These are pure (non-IO, non-Chrome) helpers used before falling back to the
|
||||
//! vision pipeline, plus the Markdown normalization applied to both paths.
|
||||
//! Pure text-extraction helpers that run before falling back to the vision pipeline,
|
||||
//! plus the Markdown normalization applied to both paths. The fast path uses
|
||||
//! `pdf-extract` to pull embedded text layers directly, avoiding the cost of
|
||||
//! page-by-page rasterization for well-structured PDFs.
|
||||
|
||||
use common::error::AppError;
|
||||
|
||||
@@ -15,7 +17,7 @@ pub(super) async fn try_fast_path(pdf_bytes: Vec<u8>) -> Result<Option<String>,
|
||||
pdf_extract::extract_text_from_mem(&pdf_bytes).map(|s| s.trim().to_string())
|
||||
})
|
||||
.await?
|
||||
.map_err(|err| AppError::Processing(format!("Failed to extract text from PDF: {err}")))?;
|
||||
.map_err(|err| AppError::Processing(format!("failed to extract text from PDF: {err}")))?;
|
||||
|
||||
if extraction.is_empty() {
|
||||
return Ok(None);
|
||||
@@ -28,7 +30,7 @@ pub(super) async fn try_fast_path(pdf_bytes: Vec<u8>) -> Result<Option<String>,
|
||||
Ok(Some(normalize_fast_text(&extraction)))
|
||||
}
|
||||
|
||||
/// Heuristic that determines whether the fast-path text looks like well-formed prose.
|
||||
/// Heuristic that determines whether the fast-path text looks like readable text.
|
||||
#[allow(clippy::cast_precision_loss)]
|
||||
fn looks_good_enough(text: &str) -> bool {
|
||||
if text.len() < FAST_PATH_MIN_LEN {
|
||||
|
||||
@@ -116,7 +116,7 @@ async fn transcribe_batch(
|
||||
);
|
||||
if attempt == last_attempt {
|
||||
return Err(AppError::Processing(
|
||||
"Vision model failed to transcribe PDF page contents".into(),
|
||||
"vision model failed to transcribe PDF page contents".into(),
|
||||
));
|
||||
}
|
||||
continue;
|
||||
@@ -126,7 +126,7 @@ async fn transcribe_batch(
|
||||
}
|
||||
|
||||
Err(AppError::Processing(
|
||||
"Vision model did not return usable Markdown".into(),
|
||||
"vision model did not return usable Markdown".into(),
|
||||
))
|
||||
}
|
||||
|
||||
|
||||
@@ -5,14 +5,18 @@ use common::{
|
||||
error::AppError,
|
||||
storage::{db::SurrealDbClient, store::StorageManager, types::file_info::FileInfo},
|
||||
};
|
||||
use dom_smoothie::{Article, Readability, TextMode};
|
||||
use dom_smoothie::Article;
|
||||
use std::{
|
||||
io::{Seek, SeekFrom, Write},
|
||||
net::IpAddr,
|
||||
time::Instant,
|
||||
};
|
||||
use tempfile::NamedTempFile;
|
||||
use tracing::{error, info, warn};
|
||||
use tendril::StrTendril;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::utils::page_fetcher::create_fetcher;
|
||||
|
||||
pub async fn extract_text_from_url(
|
||||
url: &str,
|
||||
db: &SurrealDbClient,
|
||||
@@ -22,46 +26,22 @@ pub async fn extract_text_from_url(
|
||||
info!("Fetching URL: {}", url);
|
||||
let now = Instant::now();
|
||||
|
||||
let browser = crate::utils::browser::launch_browser()?;
|
||||
|
||||
let tab = browser
|
||||
.new_tab()
|
||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
||||
let page = tab
|
||||
.navigate_to(url)
|
||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
||||
let loaded_page = page
|
||||
.wait_until_navigated()
|
||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
||||
let raw_content = loaded_page
|
||||
.get_content()
|
||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
||||
let screenshot = loaded_page
|
||||
.capture_screenshot(
|
||||
headless_chrome::protocol::cdp::Page::CaptureScreenshotFormatOption::Jpeg,
|
||||
None,
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
||||
|
||||
let mut tmp_file = NamedTempFile::new()?;
|
||||
let temp_path_str = tmp_file.path().display().to_string();
|
||||
|
||||
tmp_file.write_all(&screenshot)?;
|
||||
tmp_file.as_file().sync_all()?;
|
||||
|
||||
if let Err(e) = tmp_file.seek(SeekFrom::Start(0)) {
|
||||
error!(
|
||||
"URL: {}. Failed to seek temp file {} to start: {:?}. Proceeding, but hashing might fail.",
|
||||
url, temp_path_str, e
|
||||
);
|
||||
}
|
||||
|
||||
let parsed_url =
|
||||
url::Url::parse(url).map_err(|_| AppError::Validation("invalid URL".to_string()))?;
|
||||
|
||||
let domain = ensure_ingestion_url_allowed(&parsed_url)?;
|
||||
|
||||
let fetcher = create_fetcher();
|
||||
let capture = fetcher.fetch(url)?;
|
||||
|
||||
// Save the screenshot to storage
|
||||
let mut tmp_file = NamedTempFile::new()?;
|
||||
|
||||
if !capture.screenshot.is_empty() {
|
||||
tmp_file.write_all(&capture.screenshot)?;
|
||||
tmp_file.as_file().sync_all()?;
|
||||
tmp_file.seek(SeekFrom::Start(0))?;
|
||||
}
|
||||
|
||||
let timestamp = Utc::now().format("%Y%m%d%H%M%S");
|
||||
let file_name = format!("{}_{}_{}.jpg", domain, "screenshot", timestamp);
|
||||
|
||||
@@ -78,12 +58,25 @@ pub async fn extract_text_from_url(
|
||||
|
||||
let file_info = FileInfo::new_with_storage(field_data, db, user_id, storage).await?;
|
||||
|
||||
let config = dom_smoothie::Config {
|
||||
text_mode: TextMode::Markdown,
|
||||
..Default::default()
|
||||
// servo-fetch doesn't extract byline/site_name/metadata, so those are left empty.
|
||||
let title = extract_title_from_html(&capture.html);
|
||||
let article = Article {
|
||||
title,
|
||||
byline: None,
|
||||
content: StrTendril::from_slice(&capture.markdown),
|
||||
text_content: StrTendril::from_slice(&capture.markdown),
|
||||
length: capture.markdown.len(),
|
||||
excerpt: None,
|
||||
site_name: None,
|
||||
dir: None,
|
||||
lang: None,
|
||||
published_time: None,
|
||||
modified_time: None,
|
||||
image: None,
|
||||
favicon: None,
|
||||
url: Some(url.to_string()),
|
||||
};
|
||||
let mut readability = Readability::new(raw_content, None, Some(config))?;
|
||||
let article: Article = readability.parse()?;
|
||||
|
||||
let end = now.elapsed();
|
||||
info!(
|
||||
"URL: {}. Total time: {:?}. Final File ID: {}",
|
||||
@@ -93,13 +86,31 @@ pub async fn extract_text_from_url(
|
||||
Ok((article, file_info))
|
||||
}
|
||||
|
||||
/// Extracts a page title from raw HTML. Returns empty string when no title is found.
|
||||
fn extract_title_from_html(html: &str) -> String {
|
||||
let lower = html.to_ascii_lowercase();
|
||||
if let Some(start) = lower.find("<title>") {
|
||||
let content_start = start.saturating_add("<title>".len());
|
||||
if let Some(end) = lower[content_start..].find("</title>") {
|
||||
let title_end = content_start.saturating_add(end);
|
||||
if title_end <= html.len() {
|
||||
let title = html[content_start..title_end].trim().to_string();
|
||||
if !title.is_empty() {
|
||||
return title;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
String::new()
|
||||
}
|
||||
|
||||
fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
|
||||
match url.scheme() {
|
||||
"http" | "https" => {}
|
||||
scheme => {
|
||||
warn!(%url, %scheme, "Rejected ingestion URL due to unsupported scheme");
|
||||
return Err(AppError::Validation(
|
||||
"Unsupported URL scheme for ingestion".to_string(),
|
||||
"unsupported URL scheme for ingestion".to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -107,14 +118,14 @@ fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
|
||||
let Some(host) = url.host_str() else {
|
||||
warn!(%url, "Rejected ingestion URL missing host");
|
||||
return Err(AppError::Validation(
|
||||
"URL is missing a host component".to_string(),
|
||||
"URL missing a host component".to_string(),
|
||||
));
|
||||
};
|
||||
|
||||
if host.eq_ignore_ascii_case("localhost") {
|
||||
warn!(%url, host, "Rejected ingestion URL to localhost");
|
||||
return Err(AppError::Validation(
|
||||
"Ingestion URL host is not allowed".to_string(),
|
||||
"ingestion URL host is not allowed".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
@@ -127,7 +138,7 @@ fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
|
||||
if ip.is_loopback() || ip.is_unspecified() || ip.is_multicast() || is_disallowed {
|
||||
warn!(%url, host, %ip, "Rejected ingestion URL pointing to restricted network range");
|
||||
return Err(AppError::Validation(
|
||||
"Ingestion URL host is not allowed".to_string(),
|
||||
"ingestion URL host is not allowed".to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -168,4 +179,28 @@ mod tests {
|
||||
assert_eq!(sanitized, "sub_example_com");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_title_from_html_with_title() {
|
||||
let html = "<html><head><title>Hello World</title></head><body></body></html>";
|
||||
assert_eq!(extract_title_from_html(html), "Hello World");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_title_from_html_mixed_case() {
|
||||
let html = "<html><head><TITLE>Mixed Case</TITLE></head><body></body></html>";
|
||||
assert_eq!(extract_title_from_html(html), "Mixed Case");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_title_from_html_no_title() {
|
||||
let html = "<html><head></head><body><p>No title here</p></body></html>";
|
||||
assert_eq!(extract_title_from_html(html), "");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_title_from_html_empty_title() {
|
||||
let html = "<html><head><title></title></head><body></body></html>";
|
||||
assert_eq!(extract_title_from_html(html), "");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user