refactor: replace headless_chrome with lighter alternatives

2026-06-30 10:01:40 +02:00 · 2026-06-21 18:15:54 +02:00
parent 87e6fa14b2
commit 588e616baf
19 changed files with 6440 additions and 639 deletions
@@ -2,6 +2,10 @@
 ## Unreleased
 - Refactor: web scraping now uses `servo-fetch` (pure-Rust Servo engine) and PDF rendering uses `pdfium-render` (direct PDFium bindings) — reduces Docker image size by ~300MB, improves startup latency by ~100× for PDF rendering, and provides more stable output
 - Fix: added `pkgs.libglvnd` to `LD_LIBRARY_PATH` in devenv so Servo engine can find `libEGL.so` at runtime
 - Fix: updated Dockerfile to add `libegl1 libegl-mesa0 libgles2 libfontconfig1 libfreetype6` runtime dependencies for servo-fetch
 - Docs: updated architecture, features, and installation docs to reflect the new web processing stack
 - Fix: added pre-commit hooks to further maintain code consistency.
 - Security: updated some deps because dependabot told me, good bot.
 - Security: bump `async-openai` to 0.41.1 (feature-gated types, transcription API rename; removes `backoff` transitive dep)
@@ -7,13 +7,18 @@ members = [
  "ingestion-pipeline",
  "retrieval-pipeline",
  "json-stream-parser",
-  "evaluations"
+  "evaluations",
 ]
 resolver = "3"
 [workspace.dependencies]
 anyhow = "1.0.94"
-async-openai = { version = "0.41.1", features = ["chat-completion", "embedding", "audio", "model"] }
+async-openai = { version = "0.41.1", features = [
  "chat-completion",
  "embedding",
  "audio",
  "model",
 ] }
 async-stream = "0.3.6"
 async-trait = "0.1.88"
 axum-htmx = "0.7.0"
@@ -27,7 +32,6 @@ chrono = { version = "0.4.39", features = ["serde"] }
 config = "0.15.4"
 dom_smoothie = "0.10.0"
 futures = "0.3.31"
 headless_chrome = "1.0.17"
 include_dir = "0.7.4"
 mime = "0.3.17"
 mime_guess = "2.0.5"
@@ -35,7 +39,7 @@ minijinja-autoreload = "2.5.0"
 minijinja-contrib = { version = "2.6.0", features = ["datetime", "timezone"] }
 minijinja-embed = { version = "2.8.0" }
 minijinja = { version = "2.5.0", features = ["loader", "multi_template"] }
-reqwest = {version = "0.12.12", features = ["charset", "json"]}
+reqwest = { version = "0.12.12", features = ["charset", "json"] }
 serde_json = "1.0.128"
 serde = { version = "1", features = ["derive"] }
 sha2 = "0.10.8"
@@ -61,14 +65,24 @@ bytes = "1.7.1"
 state-machines = "0.9"
 pdf-extract = "0.9"
 lopdf = "0.32"
-fastembed = { version = "5.2.0", default-features = false, features = ["hf-hub-native-tls", "ort-load-dynamic"] }
+pdfium-auto = "0.3"
 pdfium-render = "0.8"
 servo-fetch = "0.13"
 tendril = "0.4"
 image = { version = "0.25", default-features = false, features = ["png"] }
 fastembed = { version = "5.2.0", default-features = false, features = [
  "hf-hub-native-tls",
  "ort-load-dynamic",
 ] }
 [profile.dist]
 inherits = "release"
 lto = "thin"
 [workspace.lints.rust]
-unexpected_cfgs = { level = "warn", check-cfg = ["cfg(feature, values(\"inspect\"))"] }
+unexpected_cfgs = { level = "warn", check-cfg = [
  "cfg(feature, values(\"inspect\"))",
 ] }
 [workspace.lints.clippy]
 # Performance-focused lints
@@ -118,4 +132,3 @@ needless_raw_string_hashes = "allow"
 multiple_bound_locations = "allow"
 cargo_common_metadata = "allow"
 multiple-crate-versions = "allow"
@@ -14,18 +14,18 @@ COPY html-router/Cargo.toml ./html-router/
 COPY ingestion-pipeline/Cargo.toml ./ingestion-pipeline/
 COPY json-stream-parser/Cargo.toml ./json-stream-parser/
 COPY main/Cargo.toml ./main/
-RUN cargo build --release --bin main --features ingestion-pipeline/docker || true
+RUN cargo build --release --bin main || true
 # Build
 COPY . .
-RUN cargo build --release --bin main --features ingestion-pipeline/docker
+RUN cargo build --release --bin main
 # === Runtime ===
 FROM debian:bookworm-slim
-# Chromium + runtime deps + OpenMP for ORT
+# Servo engine (for servo-fetch web scraping) + runtime deps + OpenMP for ORT
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    chromium libnss3 libasound2 libgbm1 libxshmfence1 \
+    libegl1 libegl-mesa0 libgles2 libfontconfig1 libfreetype6 \
    ca-certificates fonts-dejavu fonts-noto-color-emoji \
    libgomp1 libstdc++6 curl \
  && rm -rf /var/lib/apt/lists/*
@@ -39,8 +39,7 @@ RUN ORT_VERSION="${ORT_VERSION:-$(tr -d '[:space:]' < /tmp/ort-version)}" && \
      "https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-${ORT_VERSION}.tgz" && \
    tar -xzf /tmp/ort.tgz -C /opt/onnxruntime --strip-components=1 && rm /tmp/ort.tgz
-ENV CHROME_BIN=/usr/bin/chromium \
+ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt \
    SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt \
    ORT_DYLIB_PATH=/opt/onnxruntime/lib/libonnxruntime.so
 # Non-root
@@ -121,7 +121,7 @@ fastembed_cache_dir: "/var/lib/minne/fastembed"  # optional override, defaults t
 - **Frontend:** HTML with HTMX and minimal JavaScript for interactivity
 - **Database:** SurrealDB (graph, document, and vector search)
 - **AI Integration:** OpenAI-compatible API with structured outputs
- **Web Processing:** Headless Chrome for robust webpage content extraction
+- **Web Processing:** Embedded Servo engine (servo-fetch) for webpage content extraction + PDFium for PDF rendering
 ## Configuration
@@ -172,7 +172,7 @@ cd minne
 docker compose up -d
 ```
-The included `docker-compose.yml` handles SurrealDB and Chromium dependencies automatically.
+The included `docker-compose.yml` handles SurrealDB automatically.
 ### 2. Nix
@@ -180,13 +180,13 @@ The included `docker-compose.yml` handles SurrealDB and Chromium dependencies au
 nix run 'github:perstarkse/minne#main'
 ```
-This fetches Minne and all dependencies, including Chromium.
+This fetches Minne and all dependencies.
 ### 3. Pre-built Binaries
 Download binaries for Windows, macOS, and Linux from the [GitHub Releases](https://github.com/perstarkse/minne/releases/latest).
-**Requirements:** You'll need to provide SurrealDB and Chromium separately.
+**Requirements:** You'll need to provide SurrealDB separately.
 ### 4. Build from Source
@@ -196,7 +196,7 @@ cd minne
 cargo run --release --bin main
 ```
-**Requirements:** SurrealDB and Chromium must be installed and accessible in your PATH.
+**Requirements:** SurrealDB must be installed and accessible in your PATH.
 ## Application Architecture
@@ -41,6 +41,14 @@ in {
    pkgs.onnxruntime
    pkgs.cargo-watch
    pkgs.tailwindcss_4
    pkgs.python3
    pkgs.fontconfig
    pkgs.fontconfig.dev
    pkgs.libGL
    pkgs.libGLU
    pkgs.libclang
    pkgs.wayland
    pkgs.libxkbcommon
  ];
  languages.rust = {
@@ -53,6 +61,10 @@ in {
  };
  env = {
    # tikv-jemalloc-sys configure flags: -O0 + -Werror triggers glibc _FORTIFY_SOURCE warning
    NIX_CFLAGS_COMPILE = "-Wno-error=cpp";
    LIBCLANG_PATH = "${pkgs.libclang.lib}/lib";
    LD_LIBRARY_PATH = "${pkgs.wayland}/lib:${pkgs.libxkbcommon}/lib:${pkgs.pipewire}/lib:${pkgs.libglvnd}/lib";
    ORT_DYLIB_PATH = "${pkgs.onnxruntime}/lib/libonnxruntime.so";
    S3_ENDPOINT = "http://127.0.0.1:19000";
    S3_BUCKET = "minne-tests";
@@ -8,7 +8,7 @@
 | Frontend | HTML + HTMX + minimal JS |
 | Database | SurrealDB (graph, document, vector) |
 | AI | OpenAI-compatible API |
-| Web Processing | Headless Chromium |
+| Web Processing | Servo engine (servo-fetch) + PDFium |
 ## Crate Structure
@@ -10,7 +10,7 @@
 Minne automatically processes saved content:
-1. **Web scraping** extracts readable text from URLs (via headless Chrome)
+1. **Web scraping** extracts readable text from URLs (via embedded Servo engine)
 2. **Text analysis** identifies key concepts and relationships
 3. **Graph creation** builds connections between related content
 4. **Embedding generation** enables semantic search
@@ -43,6 +43,7 @@ Optional **reranking** can rescore fused chunk lists with a cross-encoder model;
 When enabled, retrieval results are rescored with a cross-encoder model for improved relevance. Powered by [fastembed-rs](https://github.com/Anush008/fastembed-rs).
 **Trade-offs:**
 - Downloads ~1.1 GB of model data
 - Adds latency per query
 - Potentially improves answer quality, see [blog post](https://blog.stark.pub/posts/eval-retrieval-refactor/)
@@ -52,6 +53,7 @@ Enable via `RERANKING_ENABLED=true`. See [Configuration](./configuration.md).
 ## Multi-Format Ingestion
 Supported content types:
 - Plain text and notes
 - URLs (web pages)
 - PDF documents
@@ -12,13 +12,13 @@ cd minne
 docker compose up -d
 ```
-The included `docker-compose.yml` handles SurrealDB and Chromium automatically.
+The included `docker-compose.yml` handles SurrealDB automatically.
 **Required:** Set your `OPENAI_API_KEY` in `docker-compose.yml` before starting.
 ## Nix
-Run Minne directly with Nix (includes Chromium):
+Run Minne directly with Nix:
 ```bash
 nix run 'github:perstarkse/minne#main'
@@ -31,8 +31,9 @@ Configure via environment variables or a `config.yaml` file. See [Configuration]
 Download binaries for Windows, macOS, and Linux from [GitHub Releases](https://github.com/perstarkse/minne/releases/latest).
 **Requirements:**
 - SurrealDB instance (local or remote)
- Chromium (for web scraping)
+- `libEGL` + `libfontconfig` (for servo-fetch web scraping)
 ## Build from Source
@@ -45,9 +46,10 @@ cargo build --release --bin main
 The binary will be at `target/release/main`.
 **Requirements:**
 - Rust toolchain
 - SurrealDB accessible at configured address
- Chromium in PATH
+- `libEGL` + `libfontconfig` for servo-fetch (web scraping) — bundled in Nix and Docker images
 ## Process Modes
@@ -50,16 +50,16 @@
        doCheck = false;
        nativeBuildInputs = [pkgs.pkg-config pkgs.rustfmt pkgs.makeWrapper];
-        buildInputs = [pkgs.openssl pkgs.chromium pkgs.onnxruntime];
+        buildInputs = [pkgs.openssl pkgs.libglvnd pkgs.onnxruntime];
        postInstall = ''
          wrapProgram $out/bin/main \
-            --set CHROME ${pkgs.chromium}/bin/chromium \
+            --prefix LD_LIBRARY_PATH : ${pkgs.libglvnd}/lib \
            --set ORT_DYLIB_PATH ${pkgs.onnxruntime}/lib/libonnxruntime.${libExt}
          for b in worker server; do
            if [ -x "$out/bin/$b" ]; then
              wrapProgram $out/bin/$b \
-                --set CHROME ${pkgs.chromium}/bin/chromium \
+                --prefix LD_LIBRARY_PATH : ${pkgs.libglvnd}/lib \
                --set ORT_DYLIB_PATH ${pkgs.onnxruntime}/lib/libonnxruntime.${libExt}
            fi
          done
@@ -18,17 +18,22 @@ async-openai = { workspace = true }
 surrealdb = { workspace = true }
 dom_smoothie = { workspace = true }
 tempfile = { workspace = true }
-axum_typed_multipart = { workspace = true}
+axum_typed_multipart = { workspace = true }
-anyhow = { workspace = true } 
+anyhow = { workspace = true }
 reqwest = { workspace = true }
 chrono = { workspace = true }
 text-splitter = { workspace = true }
 url = { workspace = true }
 uuid = { workspace = true }
 headless_chrome = { workspace = true }
 base64 = { workspace = true }
 pdf-extract = { workspace = true }
 lopdf = { workspace = true }
 tendril = { workspace = true }
 servo-fetch = { workspace = true }
 servo-allocator = { version = "0.2", features = ["use-system-allocator"] }
 pdfium-auto = { workspace = true }
 pdfium-render = { workspace = true }
 image = { workspace = true }
 bytes = { workspace = true }
 async-trait = { workspace = true }
 state-machines = { workspace = true }
@@ -37,7 +42,6 @@ common = { path = "../common" }
 retrieval-pipeline = { path = "../retrieval-pipeline" }
 [features]
 docker = []
 [dev-dependencies]
 common = { path = "../common", features = ["test-utils"] }
@@ -24,6 +24,6 @@ pub async fn transcribe_audio_file(
        .transcription()
        .create(request)
        .await
-        .map_err(|e| AppError::Processing(format!("Audio transcription failed: {e}")))?;
+        .map_err(|e| AppError::Processing(format!("audio transcription failed: {e}")))?;
    Ok(response.text)
 }
@@ -1,27 +0,0 @@
 use common::error::AppError;
 use headless_chrome::Browser;
 /// Starts a headless Chrome process and returns its [`Browser`] handle.
 ///
 /// With the `docker` feature enabled, the browser is launched from explicit
 /// options that switch the Chrome sandbox off (intended for container
 /// environments); without the feature, `Browser::default()` is used as-is.
 ///
 /// This is the crate's single browser-spawning entry point, so it is the seam
 /// to change if the rendering backend is ever replaced; callers only depend on
 /// receiving a `Browser`.
 ///
 /// # Errors
 ///
 /// Returns [`AppError::Processing`] when the launch options cannot be built
 /// (`docker` builds only) or when the browser fails to start.
 pub(crate) fn launch_browser() -> Result<Browser, AppError> {
    // Exactly one of the two `launched` bindings survives conditional compilation.
    #[cfg(feature = "docker")]
    let launched = {
        let options = headless_chrome::LaunchOptionsBuilder::default()
            .sandbox(false)
            .build()
            .map_err(|err| {
                AppError::Processing(format!("Failed to build headless browser options: {err}"))
            })?;
        Browser::new(options)
    };
    #[cfg(not(feature = "docker"))]
    let launched = Browser::default();
    // Both launch paths report start-up failures with the same message.
    launched.map_err(|err| AppError::Processing(format!("Failed to start headless browser: {err}")))
 }
@@ -1,8 +1,8 @@
 pub mod audio_transcription;
 pub mod browser;
 pub mod file_text_extraction;
 pub mod graph_mapper;
 pub mod image_parsing;
 pub mod llm_instructions;
 pub mod page_fetcher;
 pub mod pdf;
 pub mod url_text_retrieval;
@@ -0,0 +1,117 @@
 //! Page-fetching abstraction that decouples URL extraction from the underlying engine.
 //!
 //! The primary implementation uses [`servo_fetch`], a pure-Rust Servo engine that
 //! provides high extraction quality (word-F1 0.819), fast startup (~331ms), and a
 //! small memory footprint (~64MB peak).
 use std::time::Duration;
 use common::error::AppError;
 use tracing::info;
 /// Captured content from a single page fetch.
 ///
 /// Bundles the three artefacts a [`PageFetcher`] returns for one URL.
 #[derive(Debug, Clone, PartialEq)]
 pub(crate) struct PageCapture {
    /// Raw HTML source of the page.
    pub html: String,
    /// Readable Markdown extracted from the page content.
    pub markdown: String,
    /// Screenshot image bytes, or empty if not captured.
    ///
    /// The Servo-backed fetcher in this module fills this with PNG data.
    pub screenshot: Vec<u8>,
 }
 /// Abstraction over a page-fetching engine.
 ///
 /// The supertraits require implementors to be `Send`, `Sync`, and `Debug`;
 /// `create_fetcher` hands implementations out as `Box<dyn PageFetcher>`.
 pub(crate) trait PageFetcher: Send + Sync + std::fmt::Debug {
    /// Fetches a URL and returns the captured content (HTML, markdown, screenshot).
    ///
    /// The call is synchronous.
    /// NOTE(review): presumably async callers should run this on a blocking
    /// thread — confirm at the call sites.
    ///
    /// # Errors
    ///
    /// Returns an [`AppError`] when the page cannot be fetched or processed.
    fn fetch(&self, url: &str) -> Result<PageCapture, AppError>;
 }
 /// [`PageFetcher`] implementation backed by the `servo-fetch` crate.
 ///
 /// A fetch yields the page HTML, Markdown extracted from it, and a PNG
 /// screenshot (empty when none is available).
 #[derive(Debug)]
 pub(crate) struct ServoFetchFetcher;
 impl PageFetcher for ServoFetchFetcher {
    /// Fetches `url` through `servo_fetch::blocking::fetch` with screenshot
    /// capture requested, a 30 second timeout and a 3000 ms settle period.
    ///
    /// # Errors
    ///
    /// Returns [`AppError::Processing`] if the fetch itself fails or if
    /// Markdown extraction fails. A missing screenshot is not an error; the
    /// `screenshot` field is simply left empty.
    fn fetch(&self, url: &str) -> Result<PageCapture, AppError> {
        let fetch_timeout = Duration::from_secs(30);
        let settle_period = Duration::from_millis(3000);
        let fetched = servo_fetch::blocking::fetch(
            &servo_fetch::FetchOptions::screenshot(url, true)
                .timeout(fetch_timeout)
                .settle(settle_period),
        );
        let page = match fetched {
            Ok(page) => page,
            Err(err) => {
                return Err(AppError::Processing(format!(
                    "servo-fetch failed for {url}: {err}"
                )))
            }
        };
        let html = page.html.clone();
        let markdown = match page.markdown() {
            Ok(markdown) => markdown,
            Err(err) => {
                return Err(AppError::Processing(format!(
                    "failed to extract markdown: {err}"
                )))
            }
        };
        // No screenshot available -> fall back to an empty byte vector.
        let screenshot = page.screenshot_png().unwrap_or_default().to_vec();
        info!(
            url = %url,
            html_bytes = html.len(),
            md_chars = markdown.len(),
            screenshot_bytes = screenshot.len(),
            "servo-fetch completed"
        );
        let capture = PageCapture {
            html,
            markdown,
            screenshot,
        };
        Ok(capture)
    }
 }
 /// Creates the default page fetcher for the current configuration.
 ///
 /// Always returns a boxed [`ServoFetchFetcher`]; callers only see the
 /// [`PageFetcher`] trait object.
 pub(crate) fn create_fetcher() -> Box<dyn PageFetcher> {
    Box::new(ServoFetchFetcher)
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// `create_fetcher` returns a trait object whose `Debug` output is non-empty.
    #[test]
    fn test_default_fetcher_constructs() {
        let fetcher = create_fetcher();
        assert!(!format!("{fetcher:?}").is_empty());
    }
    /// The unit struct can be constructed directly.
    #[test]
    fn test_servo_fetcher_constructs() {
        let _ = ServoFetchFetcher;
    }
    /// A `ServoFetchFetcher` can be boxed as `dyn PageFetcher` and formatted via `Debug`.
    #[test]
    fn test_trait_object_dispatch() {
        let fetcher: Box<dyn PageFetcher> = Box::new(ServoFetchFetcher);
        assert!(!format!("{fetcher:?}").is_empty());
    }
    /// Smoke test: runs a real `fetch` of `about:blank` inside `catch_unwind`.
    ///
    /// The test fails only when the call panics with a message mentioning
    /// `wayland`, `Library`, or `servo-engine`. Any other panic, and any `Err`
    /// returned by `fetch`, is tolerated.
    ///
    /// NOTE(review): `catch_unwind` only intercepts panics that unwind on the
    /// calling thread; catching servo child-thread panics presumably relies on
    /// servo-fetch re-raising them on the caller — confirm.
    #[test]
    fn test_servo_engine_initializes() {
        let fetcher = ServoFetchFetcher;
        let result = std::panic::catch_unwind(move || {
            let _ = fetcher.fetch("about:blank");
        });
        if let Err(panic) = result {
            // Panic payloads are usually `&str` or `String`; anything else is reported generically.
            let msg = panic
                .downcast_ref::<&str>()
                .copied()
                .or_else(|| panic.downcast_ref::<String>().map(String::as_str))
                .unwrap_or("unknown panic");
            assert!(
                !(msg.contains("wayland")
                    || msg.contains("Library")
                    || msg.contains("servo-engine")),
                "Servo engine initialization failed: {msg}"
            );
        }
    }
 }
@@ -1,32 +1,25 @@
-//! Headless-Chrome rasterization of PDF pages into PNG screenshots.
+//! PDF page rasterization using pdfium-render via pdfium-auto.
 //!
-//! This is the only Chrome-dependent part of PDF ingestion. It depends on the
+//! Uses direct `PDFium` bindings for reliable, pixel-perfect page rendering —
-//! browser's internal PDF-viewer shadow DOM, so it is inherently fragile across
+//! starts in ~5ms, requires no display server, and produces consistent output
-//! Chrome upgrades; a full-page-capture fallback guards the common failure modes.
+//! independent of PDF reader version. Each page is rendered at a generous
 //! resolution and encoded as PNG for downstream LLM vision ingestion.
 use std::{
    path::{Path, PathBuf},
-    time::{Duration, SystemTime, UNIX_EPOCH},
+    time::{SystemTime, UNIX_EPOCH},
 };
-use base64::{engine::general_purpose::STANDARD, Engine as _};
+use image::ImageFormat;
 use headless_chrome::protocol::cdp::{Emulation, Page, DOM};
 use lopdf::Document;
-use serde_json::Value;
+use pdfium_render::prelude::PdfRenderConfig;
 use tracing::{debug, warn};
 use common::error::AppError;
 use crate::utils::browser::launch_browser;
 const NAVIGATION_RETRY_INTERVAL_MS: u64 = 120;
 const NAVIGATION_RETRY_ATTEMPTS: usize = 10;
 const MIN_PAGE_IMAGE_BYTES: usize = 1_024;
-const DEFAULT_VIEWPORT_WIDTH: u32 = 1_248; // generous width to reduce horizontal clipping
+const RENDER_TARGET_WIDTH: i32 = 1200;
-const DEFAULT_VIEWPORT_HEIGHT: u32 = 1_800; // tall enough to capture full page at fit-to-width scale
+const RENDER_MAX_HEIGHT: i32 = 2000;
 const DEFAULT_DEVICE_SCALE_FACTOR: f64 = 1.0;
 const CANVAS_VIEWPORT_ATTEMPTS: usize = 12;
 const CANVAS_VIEWPORT_WAIT_MS: u64 = 200;
 const DEBUG_IMAGE_ENV_VAR: &str = "MINNE_PDF_DEBUG_DIR";
 /// Parses the PDF structure to discover the available page numbers while keeping work off
@@ -34,7 +27,7 @@ const DEBUG_IMAGE_ENV_VAR: &str = "MINNE_PDF_DEBUG_DIR";
 pub(super) async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, AppError> {
    let pages = tokio::task::spawn_blocking(move || -> Result<Vec<u32>, AppError> {
        let document = Document::load_mem(&pdf_bytes)
-            .map_err(|err| AppError::Processing(format!("Failed to parse PDF: {err}")))?;
+            .map_err(|err| AppError::Processing(format!("failed to parse PDF: {err}")))?;
        let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
        page_numbers.sort_unstable();
        Ok(page_numbers)
@@ -44,7 +37,9 @@ pub(super) async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, Ap
    Ok(pages)
 }
-/// Uses the existing headless Chrome dependency to rasterize the requested PDF pages into PNGs.
+/// Renders the requested PDF pages as PNG-encoded byte vectors using `PDFium`.
 ///
 /// Work is offloaded to a blocking thread since `PDFium`'s C API is not async-safe.
 pub(super) async fn render_pdf_pages(
    file_path: &Path,
    pages: &[u32],
@@ -52,8 +47,8 @@ pub(super) async fn render_pdf_pages(
    let file_path = file_path.to_path_buf();
    let pages = pages.to_vec();
    let page_numbers = pages.clone();
-    let captures =
+
-        tokio::task::spawn_blocking(move || render_pdf_pages_inner(&file_path, &pages)).await??;
+    let captures = tokio::task::spawn_blocking(move || render_inner(&file_path, &pages)).await??;
    for (page_number, png) in page_numbers.iter().zip(captures.iter()) {
        if let Err(err) = maybe_dump_debug_image(*page_number, png).await {
@@ -68,306 +63,65 @@ pub(super) async fn render_pdf_pages(
    Ok(captures)
 }
-fn render_pdf_pages_inner(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
+/// Initializes `PDFium`, opens the file, and renders each requested page.
-    let file_url = url::Url::from_file_path(file_path)
+fn render_inner(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
-        .map_err(|()| AppError::Processing("Unable to construct PDF file URL".into()))?;
+    let pdfium = pdfium_auto::bind_pdfium_silent()
        .map_err(|err| AppError::Processing(format!("failed to bind PDFium library: {err}")))?;
-    let browser = launch_browser()?;
+    let doc = pdfium
-    let tab = browser
+        .load_pdf_from_file(file_path, None)
-        .new_tab()
+        .map_err(|err| AppError::Processing(format!("failed to load PDF file: {err}")))?;
        .map_err(|err| AppError::Processing(format!("Failed to create Chrome tab: {err}")))?;
-    tab.set_default_timeout(Duration::from_secs(10));
+    let render_config = PdfRenderConfig::new()
-    configure_tab(&tab)?;
+        .set_target_width(RENDER_TARGET_WIDTH)
-    set_pdf_viewport(&tab)?;
+        .set_maximum_height(RENDER_MAX_HEIGHT);
    let mut captures = Vec::with_capacity(pages.len());
-    for page in pages.iter().copied() {
+    for &page_num in pages {
-        let target = format!("{file_url}#page={page}&toolbar=0&statusbar=0&zoom=page-fit");
+        let page_index = page_num.saturating_sub(1); // PDFium uses 0-based indices
-        tab.navigate_to(&target)
+        let page = doc
-            .map_err(|err| AppError::Processing(format!("Failed to navigate to PDF page: {err}")))?
+            .pages()
-            .wait_until_navigated()
+            .get(u16::try_from(page_index).unwrap_or(u16::MAX))
-            .map_err(|err| AppError::Processing(format!("Navigation to PDF page failed: {err}")))?;
+            .map_err(|err| {
                AppError::Processing(format!("failed to get PDF page {page_num}: {err}"))
            })?;
-        let mut loaded = false;
+        let bitmap = page.render_with_config(&render_config).map_err(|err| {
-        for attempt in 0..NAVIGATION_RETRY_ATTEMPTS {
+            AppError::Processing(format!("failed to render PDF page {page_num}: {err}"))
-            if tab
+        })?;
                .wait_for_element("embed, canvas, body")
                .map(|_| ())
                .is_ok()
            {
                loaded = true;
                break;
            }
            if attempt < NAVIGATION_RETRY_ATTEMPTS.saturating_sub(1) {
                std::thread::sleep(Duration::from_millis(NAVIGATION_RETRY_INTERVAL_MS));
            }
        }
-        if !loaded {
+        let image = bitmap.as_image();
            return Err(AppError::Processing(
                "Timed out waiting for Chrome to render PDF page".into(),
            ));
        }
-        wait_for_pdf_ready(&tab, page)?;
+        let mut png_bytes = Vec::new();
-        std::thread::sleep(Duration::from_millis(350));
+        image
            .write_to(&mut std::io::Cursor::new(&mut png_bytes), ImageFormat::Png)
            .map_err(|err| {
                AppError::Processing(format!(
                    "failed to encode PDF page {page_num} as PNG: {err}"
                ))
            })?;
-        prepare_pdf_viewer(&tab, page);
+        debug!(
            page = page_num,
            bytes = png_bytes.len(),
            "Rendered PDF page via PDFium"
        );
-        let mut viewport: Option<Page::Viewport> = None;
+        if png_bytes.len() < MIN_PAGE_IMAGE_BYTES {
        for attempt in 0..CANVAS_VIEWPORT_ATTEMPTS {
            match canvas_viewport_for_page(&tab, page) {
                Ok(Some(vp)) => {
                    viewport = Some(vp);
                    break;
                }
                Ok(None) => {
                    if attempt < CANVAS_VIEWPORT_ATTEMPTS.saturating_sub(1) {
                        std::thread::sleep(Duration::from_millis(CANVAS_VIEWPORT_WAIT_MS));
                    }
                }
                Err(err) => {
                    warn!(page, error = %err, "Failed to derive canvas viewport");
                    break;
                }
            }
        }
        let png = if let Some(clip) = viewport {
            match tab.call_method(Page::CaptureScreenshot {
                format: Some(Page::CaptureScreenshotFormatOption::Png),
                quality: None,
                clip: Some(clip),
                from_surface: Some(true),
                capture_beyond_viewport: Some(true),
                optimize_for_speed: Some(false),
            }) {
                Ok(data) => match STANDARD.decode(data.data) {
                    Ok(bytes) => bytes,
                    Err(err) => {
                        warn!(error = %err, page, "Failed to decode clipped screenshot; falling back to full page capture");
                        capture_full_page_png(&tab)?
                    }
                },
                Err(err) => {
                    warn!(error = %err, page, "Clipped screenshot failed; falling back to full page capture");
                    capture_full_page_png(&tab)?
                }
            }
        } else {
            warn!(
-                page,
+                page = page_num,
-                "Unable to determine canvas viewport; capturing full page"
+                bytes = png_bytes.len(),
-            );
+                "Rendered page size below threshold; check PDF quality"
            capture_full_page_png(&tab)?
        };
        debug!(page, bytes = png.len(), "Captured PDF page screenshot");
        if is_suspicious_image(png.len()) {
            warn!(
                page,
                bytes = png.len(),
                "Screenshot size below threshold; check rendering output"
            );
        }
-        captures.push(png);
+        captures.push(png_bytes);
    }
    Ok(captures)
 }
 /// Overrides the tab's default background colour with opaque white.
 ///
 /// # Errors
 ///
 /// Returns [`AppError::Processing`] if Chrome rejects the override request.
 fn configure_tab(tab: &headless_chrome::Tab) -> Result<(), AppError> {
    let opaque_white = DOM::RGBA {
        r: 255,
        g: 255,
        b: 255,
        a: Some(1.0),
    };
    let background_override = Emulation::SetDefaultBackgroundColorOverride {
        color: Some(opaque_white),
    };
    match tab.call_method(background_override) {
        Ok(_) => Ok(()),
        Err(err) => Err(AppError::Processing(format!(
            "Failed to configure Chrome page background: {err}"
        ))),
    }
 }
 /// Pins the tab to the fixed PDF capture viewport.
 ///
 /// Issues a `SetDeviceMetricsOverride` followed by a `SetVisibleSize`, both
 /// sized `DEFAULT_VIEWPORT_WIDTH` x `DEFAULT_VIEWPORT_HEIGHT`; the metrics
 /// override additionally applies `DEFAULT_DEVICE_SCALE_FACTOR`.
 ///
 /// # Errors
 ///
 /// Returns [`AppError::Processing`] if either request fails; the visible-size
 /// request is not sent when the metrics override fails.
 fn set_pdf_viewport(tab: &headless_chrome::Tab) -> Result<(), AppError> {
    let (width, height) = (DEFAULT_VIEWPORT_WIDTH, DEFAULT_VIEWPORT_HEIGHT);
    let metrics_override = Emulation::SetDeviceMetricsOverride {
        width,
        height,
        device_scale_factor: DEFAULT_DEVICE_SCALE_FACTOR,
        mobile: false,
        scale: None,
        screen_width: Some(width),
        screen_height: Some(height),
        position_x: None,
        position_y: None,
        dont_set_visible_size: Some(false),
        screen_orientation: None,
        viewport: None,
        display_feature: None,
        device_posture: None,
    };
    if let Err(err) = tab.call_method(metrics_override) {
        return Err(AppError::Processing(format!(
            "Failed to configure Chrome viewport: {err}"
        )));
    }
    let visible_size = Emulation::SetVisibleSize { width, height };
    match tab.call_method(visible_size) {
        Ok(_) => Ok(()),
        Err(err) => Err(AppError::Processing(format!(
            "Failed to apply Chrome visible size: {err}"
        ))),
    }
 }
 /// Waits for the PDF `<embed>` element to appear in the tab and returns it.
 ///
 /// First waits up to 8 seconds for `embed[type='application/pdf']`; if that
 /// fails, waits up to another 8 seconds for any `embed` element. Scrolling the
 /// element into view is best-effort: a failure is only logged at debug level.
 ///
 /// # Errors
 ///
 /// Returns [`AppError::Processing`] when neither selector matches in time.
 fn wait_for_pdf_ready(
    tab: &headless_chrome::Tab,
    page_number: u32,
 ) -> Result<headless_chrome::Element<'_>, AppError> {
    let per_selector_timeout = Duration::from_secs(8);
    let located = match tab
        .wait_for_element_with_custom_timeout("embed[type='application/pdf']", per_selector_timeout)
    {
        Ok(found) => Ok(found),
        // Fall back to the looser selector; only its error is reported.
        Err(_) => tab.wait_for_element_with_custom_timeout("embed", per_selector_timeout),
    };
    let element = match located {
        Ok(element) => element,
        Err(err) => {
            return Err(AppError::Processing(format!(
                "Timed out waiting for PDF content: {err}"
            )))
        }
    };
    if let Err(err) = element.scroll_into_view() {
        debug!("Failed to scroll PDF element into view: {err}");
    }
    debug!(page = page_number, "PDF viewer element located");
    Ok(element)
 }
 /// Runs a best-effort script that readies the embedded PDF viewer for capture.
 ///
 /// The injected JavaScript walks the viewer's shadow roots, hides the
 /// `#toolbar` element when present, scrolls the requested `viewer-page` into
 /// view, and evaluates to whether a canvas labelled `Page {page_number}`
 /// exists. Nothing is returned to the caller: the outcome (or the evaluation
 /// error) is only logged at debug level.
 fn prepare_pdf_viewer(tab: &headless_chrome::Tab, page_number: u32) {
    let script = format!(
        r#"(function() {{
            const embed = document.querySelector('embed[type="application/pdf"]') || document.querySelector('embed');
            if (!embed || !embed.shadowRoot) return false;
            const viewer = embed.shadowRoot.querySelector('pdf-viewer');
            if (!viewer || !viewer.shadowRoot) return false;
            const app = viewer.shadowRoot.querySelector('viewer-app');
            if (app && app.shadowRoot) {{
                const toolbar = app.shadowRoot.querySelector('#toolbar');
                if (toolbar) {{ toolbar.style.display = 'none'; }}
            }}
            const page = viewer.shadowRoot.querySelector('viewer-page:nth-of-type({page_number})');
            if (page && page.scrollIntoView) {{
                page.scrollIntoView({{ block: 'start', inline: 'center' }});
            }}
            const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page_number}"]');
            return !!canvas;
        }})()"#
    );
    let evaluated = match tab.evaluate(&script, false) {
        Ok(evaluated) => evaluated,
        Err(err) => {
            debug!(page = page_number, error = %err, "Unable to run PDF viewer preparation script");
            return;
        }
    };
    // A missing or non-boolean script result counts as "not ready".
    let ready = evaluated.value.as_ref().and_then(Value::as_bool) == Some(true);
    debug!(page = page_number, ready, "Prepared PDF viewer page");
 }
 /// Derives a screenshot clip rectangle from the viewer canvas of one PDF page.
 ///
 /// The injected JavaScript locates the canvas labelled `Page {page_number}`
 /// inside the viewer's shadow roots and returns its bounding client rect.
 ///
 /// Returns `Ok(None)` when the script produces no value or `null`, or when the
 /// reported width or height is not positive. Missing or non-numeric rect fields
 /// are read as `0.0`, and negative `x`/`y` are clamped to `0.0`.
 ///
 /// # Errors
 ///
 /// Returns [`AppError::Processing`] if the script cannot be evaluated.
 fn canvas_viewport_for_page(
    tab: &headless_chrome::Tab,
    page_number: u32,
 ) -> Result<Option<Page::Viewport>, AppError> {
    let script = format!(
        r#"(function() {{
            const embed = document.querySelector('embed[type="application/pdf"]') || document.querySelector('embed');
            if (!embed || !embed.shadowRoot) return null;
            const viewer = embed.shadowRoot.querySelector('pdf-viewer');
            if (!viewer || !viewer.shadowRoot) return null;
            const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page_number}"]');
            if (!canvas) return null;
            const rect = canvas.getBoundingClientRect();
            return {{ x: rect.x, y: rect.y, width: rect.width, height: rect.height }};
        }})()"#
    );
    let evaluated = match tab.evaluate(&script, false) {
        Ok(evaluated) => evaluated,
        Err(err) => {
            return Err(AppError::Processing(format!(
                "Failed to inspect PDF canvas: {err}"
            )))
        }
    };
    // Absent and JSON-null results both mean the canvas was not found.
    let Some(rect) = evaluated.value.filter(|candidate| !candidate.is_null()) else {
        return Ok(None);
    };
    let field = |key: &str| rect.get(key).and_then(Value::as_f64).unwrap_or_default();
    let x = field("x").max(0.0);
    let y = field("y").max(0.0);
    let width = field("width");
    let height = field("height");
    if width <= 0.0 || height <= 0.0 {
        return Ok(None);
    }
    debug!(
        page = page_number,
        x, y, width, height, "Derived canvas viewport"
    );
    let clip = Page::Viewport {
        x,
        y,
        width,
        height,
        scale: 1.0,
    };
    Ok(Some(clip))
 }
 /// Captures an unclipped PNG screenshot of the tab and returns the decoded bytes.
 ///
 /// The screenshot is requested without a clip rectangle, from the surface and
 /// beyond the viewport; the payload returned by Chrome is base64-decoded with
 /// the `STANDARD` engine.
 ///
 /// # Errors
 ///
 /// Returns [`AppError::Processing`] if the capture request fails or the
 /// returned payload cannot be decoded.
 fn capture_full_page_png(tab: &headless_chrome::Tab) -> Result<Vec<u8>, AppError> {
    let request = Page::CaptureScreenshot {
        format: Some(Page::CaptureScreenshotFormatOption::Png),
        quality: None,
        clip: None,
        from_surface: Some(true),
        capture_beyond_viewport: Some(true),
        optimize_for_speed: Some(false),
    };
    let encoded = match tab.call_method(request) {
        Ok(screenshot) => screenshot.data,
        Err(err) => {
            return Err(AppError::Processing(format!(
                "Failed to capture PDF page (fallback): {err}"
            )))
        }
    };
    match STANDARD.decode(encoded) {
        Ok(png) => Ok(png),
        Err(err) => Err(AppError::Processing(format!(
            "Failed to decode PDF screenshot (fallback): {err}"
        ))),
    }
 }
 /// Reports whether an encoded image of `len` bytes is smaller than
 /// `MIN_PAGE_IMAGE_BYTES`; a length exactly at the threshold is not suspicious.
 const fn is_suspicious_image(len: usize) -> bool {
    MIN_PAGE_IMAGE_BYTES > len
 }
 fn debug_dump_directory() -> Option<PathBuf> {
    std::env::var(DEBUG_IMAGE_ENV_VAR)
        .ok()
@@ -394,6 +148,8 @@ async fn maybe_dump_debug_image(page_index: u32, bytes: &[u8]) -> Result<(), App
 mod tests {
    use super::*;
    use anyhow::{self};
    use lopdf::dictionary;
    use lopdf::Object;
    #[test]
    fn test_debug_dump_directory_env_var() -> anyhow::Result<()> {
@@ -409,10 +165,108 @@ mod tests {
        Ok(())
    }
-    #[test]
+    #[tokio::test]
-    fn test_is_suspicious_image_threshold() {
+    async fn test_load_page_numbers_empty_pdf() -> anyhow::Result<()> {
-        assert!(is_suspicious_image(0));
+        let pdf_bytes = create_minimal_pdf(0);
-        assert!(is_suspicious_image(MIN_PAGE_IMAGE_BYTES - 1));
+        let pages = load_page_numbers(pdf_bytes).await?;
-        assert!(!is_suspicious_image(MIN_PAGE_IMAGE_BYTES + 1));
+        assert!(pages.is_empty());
        Ok(())
    }
    /// A one-page document must yield exactly the page number 1.
    #[tokio::test]
    async fn test_load_page_numbers_single_page() -> anyhow::Result<()> {
        let page_numbers = load_page_numbers(create_minimal_pdf(1)).await?;
        assert_eq!(page_numbers, vec![1u32]);
        Ok(())
    }
    /// A five-page document must yield the page numbers 1 through 5 in order.
    #[tokio::test]
    async fn test_load_page_numbers_multi_page() -> anyhow::Result<()> {
        let page_numbers = load_page_numbers(create_minimal_pdf(5)).await?;
        assert_eq!(page_numbers, vec![1, 2, 3, 4, 5]);
        Ok(())
    }
    /// Bytes that do not form a PDF must produce an error rather than a page list.
    #[tokio::test]
    async fn test_load_page_numbers_invalid_pdf() {
        let garbage = b"not a pdf".to_vec();
        assert!(load_page_numbers(garbage).await.is_err());
    }
    /// Creates a minimal valid PDF with the given number of empty pages.
    #[allow(clippy::similar_names, clippy::expect_used)]
    fn create_minimal_pdf(page_count: u32) -> Vec<u8> {
        let mut doc = Document::with_version("1.5");
        let pages_id = doc.new_object_id();
        let mut page_ids = Vec::with_capacity(page_count as usize);
        for _ in 0..page_count {
            let page_id = doc.add_object(dictionary! {
                "Type" => "Page",
                "Parent" => pages_id,
                "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            });
            page_ids.push(page_id);
        }
        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => page_ids.iter().map(|id| Object::Reference(*id)).collect::<Vec<_>>(),
            "Count" => i32::try_from(page_count).unwrap_or(i32::MAX),
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));
        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);
        let mut buf = Vec::new();
        doc.save_to(&mut buf).expect("failed to serialize test PDF");
        buf
    }
    /// Renders a one-page PDF and verifies the output is a PNG of at least
    /// `MIN_PAGE_IMAGE_BYTES` bytes.
    ///
    /// Render errors whose message mentions `PDFium`, "library" or "bind" are
    /// treated as the `PDFium` binary being unavailable (e.g., CI without
    /// internet) and the test is skipped; any other error fails the test.
    #[tokio::test]
    async fn test_render_single_page_pdfium() -> anyhow::Result<()> {
        // First four bytes of the PNG file signature.
        const PNG_MAGIC: [u8; 4] = [0x89, 0x50, 0x4E, 0x47];

        let pdf_bytes = create_minimal_pdf(1);
        // `dir` must outlive rendering: dropping it deletes the file on disk.
        let dir = tempfile::TempDir::new()?;
        let file_path = dir.path().join("test.pdf");
        tokio::fs::write(&file_path, &pdf_bytes).await?;

        let pages = match render_pdf_pages(&file_path, &[1]).await {
            Ok(pages) => pages,
            Err(e) => {
                // PDFium may not be available — that's acceptable in environments
                // without network access to download the binary.
                let msg = e.to_string();
                if !msg.contains("PDFium") && !msg.contains("library") && !msg.contains("bind") {
                    anyhow::bail!("unexpected error: {e}");
                }
                eprintln!("SKIP: PDFium not available ({msg})");
                return Ok(());
            }
        };

        assert_eq!(pages.len(), 1, "should render one page");
        #[allow(clippy::expect_used)]
        let first_page = pages.into_iter().next().expect("already asserted len == 1");
        assert!(
            first_page.len() >= MIN_PAGE_IMAGE_BYTES,
            "rendered page {} bytes is below threshold {}",
            first_page.len(),
            MIN_PAGE_IMAGE_BYTES
        );
        // `starts_with` is false for inputs shorter than the signature, so no
        // manual length clamping is needed.
        assert!(first_page.starts_with(&PNG_MAGIC), "output must be PNG");
        Ok(())
    }
 }
@@ -1,7 +1,9 @@
 //! Fast-path PDF text extraction and Markdown reflow heuristics.
 //!
-//! These are pure (non-IO, non-Chrome) helpers used before falling back to the
+//! Pure text-extraction helpers that run before falling back to the vision pipeline,
-//! vision pipeline, plus the Markdown normalization applied to both paths.
+//! plus the Markdown normalization applied to both paths. The fast path uses
 //! `pdf-extract` to pull embedded text layers directly, avoiding the cost of
 //! page-by-page rasterization for well-structured PDFs.
 use common::error::AppError;
@@ -15,7 +17,7 @@ pub(super) async fn try_fast_path(pdf_bytes: Vec<u8>) -> Result<Option<String>,
        pdf_extract::extract_text_from_mem(&pdf_bytes).map(|s| s.trim().to_string())
    })
    .await?
-    .map_err(|err| AppError::Processing(format!("Failed to extract text from PDF: {err}")))?;
+    .map_err(|err| AppError::Processing(format!("failed to extract text from PDF: {err}")))?;
    if extraction.is_empty() {
        return Ok(None);
@@ -28,7 +30,7 @@ pub(super) async fn try_fast_path(pdf_bytes: Vec<u8>) -> Result<Option<String>,
    Ok(Some(normalize_fast_text(&extraction)))
 }
-/// Heuristic that determines whether the fast-path text looks like well-formed prose.
+/// Heuristic that determines whether the fast-path text looks like readable text.
 #[allow(clippy::cast_precision_loss)]
 fn looks_good_enough(text: &str) -> bool {
    if text.len() < FAST_PATH_MIN_LEN {
@@ -116,7 +116,7 @@ async fn transcribe_batch(
            );
            if attempt == last_attempt {
                return Err(AppError::Processing(
-                    "Vision model failed to transcribe PDF page contents".into(),
+                    "vision model failed to transcribe PDF page contents".into(),
                ));
            }
            continue;
@@ -126,7 +126,7 @@ async fn transcribe_batch(
    }
    Err(AppError::Processing(
-        "Vision model did not return usable Markdown".into(),
+        "vision model did not return usable Markdown".into(),
    ))
 }
@@ -5,14 +5,18 @@ use common::{
    error::AppError,
    storage::{db::SurrealDbClient, store::StorageManager, types::file_info::FileInfo},
 };
-use dom_smoothie::{Article, Readability, TextMode};
+use dom_smoothie::Article;
 use std::{
    io::{Seek, SeekFrom, Write},
    net::IpAddr,
    time::Instant,
 };
 use tempfile::NamedTempFile;
-use tracing::{error, info, warn};
+use tendril::StrTendril;
 use tracing::{info, warn};
 use crate::utils::page_fetcher::create_fetcher;
 pub async fn extract_text_from_url(
    url: &str,
    db: &SurrealDbClient,
@@ -22,46 +26,22 @@ pub async fn extract_text_from_url(
    info!("Fetching URL: {}", url);
    let now = Instant::now();
    let browser = crate::utils::browser::launch_browser()?;
    let tab = browser
        .new_tab()
        .map_err(|e| AppError::InternalError(e.to_string()))?;
    let page = tab
        .navigate_to(url)
        .map_err(|e| AppError::InternalError(e.to_string()))?;
    let loaded_page = page
        .wait_until_navigated()
        .map_err(|e| AppError::InternalError(e.to_string()))?;
    let raw_content = loaded_page
        .get_content()
        .map_err(|e| AppError::InternalError(e.to_string()))?;
    let screenshot = loaded_page
        .capture_screenshot(
            headless_chrome::protocol::cdp::Page::CaptureScreenshotFormatOption::Jpeg,
            None,
            None,
            true,
        )
        .map_err(|e| AppError::InternalError(e.to_string()))?;
    let mut tmp_file = NamedTempFile::new()?;
    let temp_path_str = tmp_file.path().display().to_string();
    tmp_file.write_all(&screenshot)?;
    tmp_file.as_file().sync_all()?;
    if let Err(e) = tmp_file.seek(SeekFrom::Start(0)) {
        error!(
            "URL: {}. Failed to seek temp file {} to start: {:?}. Proceeding, but hashing might fail.",
            url, temp_path_str, e
        );
    }
    let parsed_url =
        url::Url::parse(url).map_err(|_| AppError::Validation("invalid URL".to_string()))?;
    let domain = ensure_ingestion_url_allowed(&parsed_url)?;
    let fetcher = create_fetcher();
    let capture = fetcher.fetch(url)?;
    // Save the screenshot to storage
    let mut tmp_file = NamedTempFile::new()?;
    if !capture.screenshot.is_empty() {
        tmp_file.write_all(&capture.screenshot)?;
        tmp_file.as_file().sync_all()?;
        tmp_file.seek(SeekFrom::Start(0))?;
    }
    let timestamp = Utc::now().format("%Y%m%d%H%M%S");
    let file_name = format!("{}_{}_{}.jpg", domain, "screenshot", timestamp);
@@ -78,12 +58,25 @@ pub async fn extract_text_from_url(
    let file_info = FileInfo::new_with_storage(field_data, db, user_id, storage).await?;
-    let config = dom_smoothie::Config {
+    // servo-fetch doesn't extract byline/site_name/metadata, so those are left empty.
-        text_mode: TextMode::Markdown,
+    let title = extract_title_from_html(&capture.html);
-        ..Default::default()
+    let article = Article {
        title,
        byline: None,
        content: StrTendril::from_slice(&capture.markdown),
        text_content: StrTendril::from_slice(&capture.markdown),
        length: capture.markdown.len(),
        excerpt: None,
        site_name: None,
        dir: None,
        lang: None,
        published_time: None,
        modified_time: None,
        image: None,
        favicon: None,
        url: Some(url.to_string()),
    };
-    let mut readability = Readability::new(raw_content, None, Some(config))?;
+
    let article: Article = readability.parse()?;
    let end = now.elapsed();
    info!(
        "URL: {}. Total time: {:?}. Final File ID: {}",
@@ -93,13 +86,31 @@ pub async fn extract_text_from_url(
    Ok((article, file_info))
 }
 /// Extracts the text of the first `<title>` element from raw HTML.
 ///
 /// Tag matching is ASCII case-insensitive and tolerates attributes on the
 /// opening tag (e.g. `<title lang="en">`); longer tag names such as
 /// `<titlebar>` are not mistaken for a title. Surrounding whitespace is
 /// trimmed. Character entities are not decoded.
 ///
 /// Returns an empty string when no title element is found, when it is not
 /// closed by `</title>`, or when its text is blank.
 fn extract_title_from_html(html: &str) -> String {
    const OPEN_TAG: &str = "<title";
    const CLOSE_TAG: &str = "</title>";

    // ASCII lowercasing never changes byte lengths, so offsets found in `lower`
    // are valid (and fall on char boundaries) in `html` as well.
    let lower = html.to_ascii_lowercase();
    let mut cursor = 0usize;

    while let Some(found) = lower.get(cursor..).and_then(|rest| rest.find(OPEN_TAG)) {
        let after_name = cursor.saturating_add(found).saturating_add(OPEN_TAG.len());
        // Always advance so that a rejected candidate cannot be matched again.
        cursor = after_name;

        // The tag name must end here; otherwise this is e.g. `<titlebar>`.
        let is_title_tag = lower
            .as_bytes()
            .get(after_name)
            .is_some_and(|byte| *byte == b'>' || byte.is_ascii_whitespace());
        if !is_title_tag {
            continue;
        }

        // Skip over any attributes up to the end of the opening tag.
        let Some(tag_end) = lower.get(after_name..).and_then(|rest| rest.find('>')) else {
            break;
        };
        let content_start = after_name.saturating_add(tag_end).saturating_add(1);
        let Some(content_len) = lower
            .get(content_start..)
            .and_then(|rest| rest.find(CLOSE_TAG))
        else {
            break;
        };
        let content_end = content_start.saturating_add(content_len);
        return html
            .get(content_start..content_end)
            .map(|title| title.trim().to_string())
            .unwrap_or_default();
    }
    String::new()
 }
 fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
    match url.scheme() {
        "http" | "https" => {}
        scheme => {
            warn!(%url, %scheme, "Rejected ingestion URL due to unsupported scheme");
            return Err(AppError::Validation(
-                "Unsupported URL scheme for ingestion".to_string(),
+                "unsupported URL scheme for ingestion".to_string(),
            ));
        }
    }
@@ -107,14 +118,14 @@ fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
    let Some(host) = url.host_str() else {
        warn!(%url, "Rejected ingestion URL missing host");
        return Err(AppError::Validation(
-            "URL is missing a host component".to_string(),
+            "URL missing a host component".to_string(),
        ));
    };
    if host.eq_ignore_ascii_case("localhost") {
        warn!(%url, host, "Rejected ingestion URL to localhost");
        return Err(AppError::Validation(
-            "Ingestion URL host is not allowed".to_string(),
+            "ingestion URL host is not allowed".to_string(),
        ));
    }
@@ -127,7 +138,7 @@ fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
        if ip.is_loopback() || ip.is_unspecified() || ip.is_multicast() || is_disallowed {
            warn!(%url, host, %ip, "Rejected ingestion URL pointing to restricted network range");
            return Err(AppError::Validation(
-                "Ingestion URL host is not allowed".to_string(),
+                "ingestion URL host is not allowed".to_string(),
            ));
        }
    }
@@ -168,4 +179,28 @@ mod tests {
        assert_eq!(sanitized, "sub_example_com");
        Ok(())
    }
    /// A lowercase `<title>` element yields its text.
    #[test]
    fn test_extract_title_from_html_with_title() {
        let document = "<html><head><title>Hello World</title></head><body></body></html>";
        let title = extract_title_from_html(document);
        assert_eq!(title, "Hello World");
    }
    /// Uppercase tag names are matched and the text keeps its original casing.
    #[test]
    fn test_extract_title_from_html_mixed_case() {
        let document = "<html><head><TITLE>Mixed Case</TITLE></head><body></body></html>";
        let title = extract_title_from_html(document);
        assert_eq!(title, "Mixed Case");
    }
    /// A document without a title element yields an empty string.
    #[test]
    fn test_extract_title_from_html_no_title() {
        let document = "<html><head></head><body><p>No title here</p></body></html>";
        let title = extract_title_from_html(document);
        assert_eq!(title, "");
    }
    /// An empty title element yields an empty string.
    #[test]
    fn test_extract_title_from_html_empty_title() {
        let document = "<html><head><title></title></head><body></body></html>";
        let title = extract_title_from_html(document);
        assert_eq!(title, "");
    }
 }