mirror of
https://github.com/perstarkse/minne.git
synced 2026-06-30 10:01:40 +02:00
refactor: replace headless_chrome with lighter alternatives
This commit is contained in:
@@ -2,6 +2,10 @@
|
|||||||
|
|
||||||
## Unreleased
|
## Unreleased
|
||||||
|
|
||||||
|
- Refactor: web scraping now uses `servo-fetch` (pure-Rust Servo engine) and PDF rendering uses `pdfium-render` (direct PDFium bindings) — reduces Docker image size by ~300MB, improves startup latency by ~100× for PDF rendering, and provides more stable output
|
||||||
|
- Fix: added `pkgs.libglvnd` to `LD_LIBRARY_PATH` in devenv so Servo engine can find `libEGL.so` at runtime
|
||||||
|
- Fix: updated Dockerfile to add `libegl1 libegl-mesa0 libgles2 libfontconfig1 libfreetype6` runtime dependencies for servo-fetch
|
||||||
|
- Docs: updated architecture, features, and installation docs to reflect the new web processing stack
|
||||||
- Fix: added pre-commit hooks to further maintain code consistency.
|
- Fix: added pre-commit hooks to further maintain code consistency.
|
||||||
- Security: updated some deps because dependabot told me, good bot.
|
- Security: updated some deps because dependabot told me, good bot.
|
||||||
- Security: bump `async-openai` to 0.41.1 (feature-gated types, transcription API rename; removes `backoff` transitive dep)
|
- Security: bump `async-openai` to 0.41.1 (feature-gated types, transcription API rename; removes `backoff` transitive dep)
|
||||||
|
|||||||
Generated
+6002
-218
File diff suppressed because it is too large
Load Diff
+20
-7
@@ -7,13 +7,18 @@ members = [
|
|||||||
"ingestion-pipeline",
|
"ingestion-pipeline",
|
||||||
"retrieval-pipeline",
|
"retrieval-pipeline",
|
||||||
"json-stream-parser",
|
"json-stream-parser",
|
||||||
"evaluations"
|
"evaluations",
|
||||||
]
|
]
|
||||||
resolver = "3"
|
resolver = "3"
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
anyhow = "1.0.94"
|
anyhow = "1.0.94"
|
||||||
async-openai = { version = "0.41.1", features = ["chat-completion", "embedding", "audio", "model"] }
|
async-openai = { version = "0.41.1", features = [
|
||||||
|
"chat-completion",
|
||||||
|
"embedding",
|
||||||
|
"audio",
|
||||||
|
"model",
|
||||||
|
] }
|
||||||
async-stream = "0.3.6"
|
async-stream = "0.3.6"
|
||||||
async-trait = "0.1.88"
|
async-trait = "0.1.88"
|
||||||
axum-htmx = "0.7.0"
|
axum-htmx = "0.7.0"
|
||||||
@@ -27,7 +32,6 @@ chrono = { version = "0.4.39", features = ["serde"] }
|
|||||||
config = "0.15.4"
|
config = "0.15.4"
|
||||||
dom_smoothie = "0.10.0"
|
dom_smoothie = "0.10.0"
|
||||||
futures = "0.3.31"
|
futures = "0.3.31"
|
||||||
headless_chrome = "1.0.17"
|
|
||||||
include_dir = "0.7.4"
|
include_dir = "0.7.4"
|
||||||
mime = "0.3.17"
|
mime = "0.3.17"
|
||||||
mime_guess = "2.0.5"
|
mime_guess = "2.0.5"
|
||||||
@@ -35,7 +39,7 @@ minijinja-autoreload = "2.5.0"
|
|||||||
minijinja-contrib = { version = "2.6.0", features = ["datetime", "timezone"] }
|
minijinja-contrib = { version = "2.6.0", features = ["datetime", "timezone"] }
|
||||||
minijinja-embed = { version = "2.8.0" }
|
minijinja-embed = { version = "2.8.0" }
|
||||||
minijinja = { version = "2.5.0", features = ["loader", "multi_template"] }
|
minijinja = { version = "2.5.0", features = ["loader", "multi_template"] }
|
||||||
reqwest = {version = "0.12.12", features = ["charset", "json"]}
|
reqwest = { version = "0.12.12", features = ["charset", "json"] }
|
||||||
serde_json = "1.0.128"
|
serde_json = "1.0.128"
|
||||||
serde = { version = "1", features = ["derive"] }
|
serde = { version = "1", features = ["derive"] }
|
||||||
sha2 = "0.10.8"
|
sha2 = "0.10.8"
|
||||||
@@ -61,14 +65,24 @@ bytes = "1.7.1"
|
|||||||
state-machines = "0.9"
|
state-machines = "0.9"
|
||||||
pdf-extract = "0.9"
|
pdf-extract = "0.9"
|
||||||
lopdf = "0.32"
|
lopdf = "0.32"
|
||||||
fastembed = { version = "5.2.0", default-features = false, features = ["hf-hub-native-tls", "ort-load-dynamic"] }
|
pdfium-auto = "0.3"
|
||||||
|
pdfium-render = "0.8"
|
||||||
|
servo-fetch = "0.13"
|
||||||
|
tendril = "0.4"
|
||||||
|
image = { version = "0.25", default-features = false, features = ["png"] }
|
||||||
|
fastembed = { version = "5.2.0", default-features = false, features = [
|
||||||
|
"hf-hub-native-tls",
|
||||||
|
"ort-load-dynamic",
|
||||||
|
] }
|
||||||
|
|
||||||
[profile.dist]
|
[profile.dist]
|
||||||
inherits = "release"
|
inherits = "release"
|
||||||
lto = "thin"
|
lto = "thin"
|
||||||
|
|
||||||
[workspace.lints.rust]
|
[workspace.lints.rust]
|
||||||
unexpected_cfgs = { level = "warn", check-cfg = ["cfg(feature, values(\"inspect\"))"] }
|
unexpected_cfgs = { level = "warn", check-cfg = [
|
||||||
|
"cfg(feature, values(\"inspect\"))",
|
||||||
|
] }
|
||||||
|
|
||||||
[workspace.lints.clippy]
|
[workspace.lints.clippy]
|
||||||
# Performance-focused lints
|
# Performance-focused lints
|
||||||
@@ -118,4 +132,3 @@ needless_raw_string_hashes = "allow"
|
|||||||
multiple_bound_locations = "allow"
|
multiple_bound_locations = "allow"
|
||||||
cargo_common_metadata = "allow"
|
cargo_common_metadata = "allow"
|
||||||
multiple-crate-versions = "allow"
|
multiple-crate-versions = "allow"
|
||||||
|
|
||||||
|
|||||||
+5
-6
@@ -14,18 +14,18 @@ COPY html-router/Cargo.toml ./html-router/
|
|||||||
COPY ingestion-pipeline/Cargo.toml ./ingestion-pipeline/
|
COPY ingestion-pipeline/Cargo.toml ./ingestion-pipeline/
|
||||||
COPY json-stream-parser/Cargo.toml ./json-stream-parser/
|
COPY json-stream-parser/Cargo.toml ./json-stream-parser/
|
||||||
COPY main/Cargo.toml ./main/
|
COPY main/Cargo.toml ./main/
|
||||||
RUN cargo build --release --bin main --features ingestion-pipeline/docker || true
|
RUN cargo build --release --bin main || true
|
||||||
|
|
||||||
# Build
|
# Build
|
||||||
COPY . .
|
COPY . .
|
||||||
RUN cargo build --release --bin main --features ingestion-pipeline/docker
|
RUN cargo build --release --bin main
|
||||||
|
|
||||||
# === Runtime ===
|
# === Runtime ===
|
||||||
FROM debian:bookworm-slim
|
FROM debian:bookworm-slim
|
||||||
|
|
||||||
# Chromium + runtime deps + OpenMP for ORT
|
# Servo engine (for servo-fetch web scraping) + runtime deps + OpenMP for ORT
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
chromium libnss3 libasound2 libgbm1 libxshmfence1 \
|
libegl1 libegl-mesa0 libgles2 libfontconfig1 libfreetype6 \
|
||||||
ca-certificates fonts-dejavu fonts-noto-color-emoji \
|
ca-certificates fonts-dejavu fonts-noto-color-emoji \
|
||||||
libgomp1 libstdc++6 curl \
|
libgomp1 libstdc++6 curl \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
@@ -39,8 +39,7 @@ RUN ORT_VERSION="${ORT_VERSION:-$(tr -d '[:space:]' < /tmp/ort-version)}" && \
|
|||||||
"https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-${ORT_VERSION}.tgz" && \
|
"https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-${ORT_VERSION}.tgz" && \
|
||||||
tar -xzf /tmp/ort.tgz -C /opt/onnxruntime --strip-components=1 && rm /tmp/ort.tgz
|
tar -xzf /tmp/ort.tgz -C /opt/onnxruntime --strip-components=1 && rm /tmp/ort.tgz
|
||||||
|
|
||||||
ENV CHROME_BIN=/usr/bin/chromium \
|
ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt \
|
||||||
SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt \
|
|
||||||
ORT_DYLIB_PATH=/opt/onnxruntime/lib/libonnxruntime.so
|
ORT_DYLIB_PATH=/opt/onnxruntime/lib/libonnxruntime.so
|
||||||
|
|
||||||
# Non-root
|
# Non-root
|
||||||
|
|||||||
@@ -121,7 +121,7 @@ fastembed_cache_dir: "/var/lib/minne/fastembed" # optional override, defaults t
|
|||||||
- **Frontend:** HTML with HTMX and minimal JavaScript for interactivity
|
- **Frontend:** HTML with HTMX and minimal JavaScript for interactivity
|
||||||
- **Database:** SurrealDB (graph, document, and vector search)
|
- **Database:** SurrealDB (graph, document, and vector search)
|
||||||
- **AI Integration:** OpenAI-compatible API with structured outputs
|
- **AI Integration:** OpenAI-compatible API with structured outputs
|
||||||
- **Web Processing:** Headless Chrome for robust webpage content extraction
|
- **Web Processing:** Embedded Servo engine (servo-fetch) for webpage content extraction + PDFium for PDF rendering
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
@@ -172,7 +172,7 @@ cd minne
|
|||||||
docker compose up -d
|
docker compose up -d
|
||||||
```
|
```
|
||||||
|
|
||||||
The included `docker-compose.yml` handles SurrealDB and Chromium dependencies automatically.
|
The included `docker-compose.yml` handles SurrealDB automatically.
|
||||||
|
|
||||||
### 2. Nix
|
### 2. Nix
|
||||||
|
|
||||||
@@ -180,13 +180,13 @@ The included `docker-compose.yml` handles SurrealDB and Chromium dependencies au
|
|||||||
nix run 'github:perstarkse/minne#main'
|
nix run 'github:perstarkse/minne#main'
|
||||||
```
|
```
|
||||||
|
|
||||||
This fetches Minne and all dependencies, including Chromium.
|
This fetches Minne and all dependencies.
|
||||||
|
|
||||||
### 3. Pre-built Binaries
|
### 3. Pre-built Binaries
|
||||||
|
|
||||||
Download binaries for Windows, macOS, and Linux from the [GitHub Releases](https://github.com/perstarkse/minne/releases/latest).
|
Download binaries for Windows, macOS, and Linux from the [GitHub Releases](https://github.com/perstarkse/minne/releases/latest).
|
||||||
|
|
||||||
**Requirements:** You'll need to provide SurrealDB and Chromium separately.
|
**Requirements:** You'll need to provide SurrealDB separately.
|
||||||
|
|
||||||
### 4. Build from Source
|
### 4. Build from Source
|
||||||
|
|
||||||
@@ -196,7 +196,7 @@ cd minne
|
|||||||
cargo run --release --bin main
|
cargo run --release --bin main
|
||||||
```
|
```
|
||||||
|
|
||||||
**Requirements:** SurrealDB and Chromium must be installed and accessible in your PATH.
|
**Requirements:** SurrealDB must be installed and accessible in your PATH.
|
||||||
|
|
||||||
## Application Architecture
|
## Application Architecture
|
||||||
|
|
||||||
|
|||||||
+12
@@ -41,6 +41,14 @@ in {
|
|||||||
pkgs.onnxruntime
|
pkgs.onnxruntime
|
||||||
pkgs.cargo-watch
|
pkgs.cargo-watch
|
||||||
pkgs.tailwindcss_4
|
pkgs.tailwindcss_4
|
||||||
|
pkgs.python3
|
||||||
|
pkgs.fontconfig
|
||||||
|
pkgs.fontconfig.dev
|
||||||
|
pkgs.libGL
|
||||||
|
pkgs.libGLU
|
||||||
|
pkgs.libclang
|
||||||
|
pkgs.wayland
|
||||||
|
pkgs.libxkbcommon
|
||||||
];
|
];
|
||||||
|
|
||||||
languages.rust = {
|
languages.rust = {
|
||||||
@@ -53,6 +61,10 @@ in {
|
|||||||
};
|
};
|
||||||
|
|
||||||
env = {
|
env = {
|
||||||
|
# tikv-jemalloc-sys configure flags: -O0 + -Werror triggers glibc _FORTIFY_SOURCE warning
|
||||||
|
NIX_CFLAGS_COMPILE = "-Wno-error=cpp";
|
||||||
|
LIBCLANG_PATH = "${pkgs.libclang.lib}/lib";
|
||||||
|
LD_LIBRARY_PATH = "${pkgs.wayland}/lib:${pkgs.libxkbcommon}/lib:${pkgs.pipewire}/lib:${pkgs.libglvnd}/lib";
|
||||||
ORT_DYLIB_PATH = "${pkgs.onnxruntime}/lib/libonnxruntime.so";
|
ORT_DYLIB_PATH = "${pkgs.onnxruntime}/lib/libonnxruntime.so";
|
||||||
S3_ENDPOINT = "http://127.0.0.1:19000";
|
S3_ENDPOINT = "http://127.0.0.1:19000";
|
||||||
S3_BUCKET = "minne-tests";
|
S3_BUCKET = "minne-tests";
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
| Frontend | HTML + HTMX + minimal JS |
|
| Frontend | HTML + HTMX + minimal JS |
|
||||||
| Database | SurrealDB (graph, document, vector) |
|
| Database | SurrealDB (graph, document, vector) |
|
||||||
| AI | OpenAI-compatible API |
|
| AI | OpenAI-compatible API |
|
||||||
| Web Processing | Headless Chromium |
|
| Web Processing | Servo engine (servo-fetch) + PDFium |
|
||||||
|
|
||||||
## Crate Structure
|
## Crate Structure
|
||||||
|
|
||||||
|
|||||||
+3
-1
@@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
Minne automatically processes saved content:
|
Minne automatically processes saved content:
|
||||||
|
|
||||||
1. **Web scraping** extracts readable text from URLs (via headless Chrome)
|
1. **Web scraping** extracts readable text from URLs (via embedded Servo engine)
|
||||||
2. **Text analysis** identifies key concepts and relationships
|
2. **Text analysis** identifies key concepts and relationships
|
||||||
3. **Graph creation** builds connections between related content
|
3. **Graph creation** builds connections between related content
|
||||||
4. **Embedding generation** enables semantic search
|
4. **Embedding generation** enables semantic search
|
||||||
@@ -43,6 +43,7 @@ Optional **reranking** can rescore fused chunk lists with a cross-encoder model;
|
|||||||
When enabled, retrieval results are rescored with a cross-encoder model for improved relevance. Powered by [fastembed-rs](https://github.com/Anush008/fastembed-rs).
|
When enabled, retrieval results are rescored with a cross-encoder model for improved relevance. Powered by [fastembed-rs](https://github.com/Anush008/fastembed-rs).
|
||||||
|
|
||||||
**Trade-offs:**
|
**Trade-offs:**
|
||||||
|
|
||||||
- Downloads ~1.1 GB of model data
|
- Downloads ~1.1 GB of model data
|
||||||
- Adds latency per query
|
- Adds latency per query
|
||||||
- Potentially improves answer quality, see [blog post](https://blog.stark.pub/posts/eval-retrieval-refactor/)
|
- Potentially improves answer quality, see [blog post](https://blog.stark.pub/posts/eval-retrieval-refactor/)
|
||||||
@@ -52,6 +53,7 @@ Enable via `RERANKING_ENABLED=true`. See [Configuration](./configuration.md).
|
|||||||
## Multi-Format Ingestion
|
## Multi-Format Ingestion
|
||||||
|
|
||||||
Supported content types:
|
Supported content types:
|
||||||
|
|
||||||
- Plain text and notes
|
- Plain text and notes
|
||||||
- URLs (web pages)
|
- URLs (web pages)
|
||||||
- PDF documents
|
- PDF documents
|
||||||
|
|||||||
@@ -12,13 +12,13 @@ cd minne
|
|||||||
docker compose up -d
|
docker compose up -d
|
||||||
```
|
```
|
||||||
|
|
||||||
The included `docker-compose.yml` handles SurrealDB and Chromium automatically.
|
The included `docker-compose.yml` handles SurrealDB automatically.
|
||||||
|
|
||||||
**Required:** Set your `OPENAI_API_KEY` in `docker-compose.yml` before starting.
|
**Required:** Set your `OPENAI_API_KEY` in `docker-compose.yml` before starting.
|
||||||
|
|
||||||
## Nix
|
## Nix
|
||||||
|
|
||||||
Run Minne directly with Nix (includes Chromium):
|
Run Minne directly with Nix:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
nix run 'github:perstarkse/minne#main'
|
nix run 'github:perstarkse/minne#main'
|
||||||
@@ -31,8 +31,9 @@ Configure via environment variables or a `config.yaml` file. See [Configuration]
|
|||||||
Download binaries for Windows, macOS, and Linux from [GitHub Releases](https://github.com/perstarkse/minne/releases/latest).
|
Download binaries for Windows, macOS, and Linux from [GitHub Releases](https://github.com/perstarkse/minne/releases/latest).
|
||||||
|
|
||||||
**Requirements:**
|
**Requirements:**
|
||||||
|
|
||||||
- SurrealDB instance (local or remote)
|
- SurrealDB instance (local or remote)
|
||||||
- Chromium (for web scraping)
|
- `libEGL` + `libfontconfig` (for servo-fetch web scraping)
|
||||||
|
|
||||||
## Build from Source
|
## Build from Source
|
||||||
|
|
||||||
@@ -45,9 +46,10 @@ cargo build --release --bin main
|
|||||||
The binary will be at `target/release/main`.
|
The binary will be at `target/release/main`.
|
||||||
|
|
||||||
**Requirements:**
|
**Requirements:**
|
||||||
|
|
||||||
- Rust toolchain
|
- Rust toolchain
|
||||||
- SurrealDB accessible at configured address
|
- SurrealDB accessible at configured address
|
||||||
- Chromium in PATH
|
- `libEGL` + `libfontconfig` for servo-fetch (web scraping) — bundled in Nix and Docker images
|
||||||
|
|
||||||
## Process Modes
|
## Process Modes
|
||||||
|
|
||||||
|
|||||||
@@ -50,16 +50,16 @@
|
|||||||
doCheck = false;
|
doCheck = false;
|
||||||
|
|
||||||
nativeBuildInputs = [pkgs.pkg-config pkgs.rustfmt pkgs.makeWrapper];
|
nativeBuildInputs = [pkgs.pkg-config pkgs.rustfmt pkgs.makeWrapper];
|
||||||
buildInputs = [pkgs.openssl pkgs.chromium pkgs.onnxruntime];
|
buildInputs = [pkgs.openssl pkgs.libglvnd pkgs.onnxruntime];
|
||||||
|
|
||||||
postInstall = ''
|
postInstall = ''
|
||||||
wrapProgram $out/bin/main \
|
wrapProgram $out/bin/main \
|
||||||
--set CHROME ${pkgs.chromium}/bin/chromium \
|
--prefix LD_LIBRARY_PATH : ${pkgs.libglvnd}/lib \
|
||||||
--set ORT_DYLIB_PATH ${pkgs.onnxruntime}/lib/libonnxruntime.${libExt}
|
--set ORT_DYLIB_PATH ${pkgs.onnxruntime}/lib/libonnxruntime.${libExt}
|
||||||
for b in worker server; do
|
for b in worker server; do
|
||||||
if [ -x "$out/bin/$b" ]; then
|
if [ -x "$out/bin/$b" ]; then
|
||||||
wrapProgram $out/bin/$b \
|
wrapProgram $out/bin/$b \
|
||||||
--set CHROME ${pkgs.chromium}/bin/chromium \
|
--prefix LD_LIBRARY_PATH : ${pkgs.libglvnd}/lib \
|
||||||
--set ORT_DYLIB_PATH ${pkgs.onnxruntime}/lib/libonnxruntime.${libExt}
|
--set ORT_DYLIB_PATH ${pkgs.onnxruntime}/lib/libonnxruntime.${libExt}
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|||||||
@@ -18,17 +18,22 @@ async-openai = { workspace = true }
|
|||||||
surrealdb = { workspace = true }
|
surrealdb = { workspace = true }
|
||||||
dom_smoothie = { workspace = true }
|
dom_smoothie = { workspace = true }
|
||||||
tempfile = { workspace = true }
|
tempfile = { workspace = true }
|
||||||
axum_typed_multipart = { workspace = true}
|
axum_typed_multipart = { workspace = true }
|
||||||
anyhow = { workspace = true }
|
anyhow = { workspace = true }
|
||||||
reqwest = { workspace = true }
|
reqwest = { workspace = true }
|
||||||
chrono = { workspace = true }
|
chrono = { workspace = true }
|
||||||
text-splitter = { workspace = true }
|
text-splitter = { workspace = true }
|
||||||
url = { workspace = true }
|
url = { workspace = true }
|
||||||
uuid = { workspace = true }
|
uuid = { workspace = true }
|
||||||
headless_chrome = { workspace = true }
|
|
||||||
base64 = { workspace = true }
|
base64 = { workspace = true }
|
||||||
pdf-extract = { workspace = true }
|
pdf-extract = { workspace = true }
|
||||||
lopdf = { workspace = true }
|
lopdf = { workspace = true }
|
||||||
|
tendril = { workspace = true }
|
||||||
|
servo-fetch = { workspace = true }
|
||||||
|
servo-allocator = { version = "0.2", features = ["use-system-allocator"] }
|
||||||
|
pdfium-auto = { workspace = true }
|
||||||
|
pdfium-render = { workspace = true }
|
||||||
|
image = { workspace = true }
|
||||||
bytes = { workspace = true }
|
bytes = { workspace = true }
|
||||||
async-trait = { workspace = true }
|
async-trait = { workspace = true }
|
||||||
state-machines = { workspace = true }
|
state-machines = { workspace = true }
|
||||||
@@ -37,7 +42,6 @@ common = { path = "../common" }
|
|||||||
retrieval-pipeline = { path = "../retrieval-pipeline" }
|
retrieval-pipeline = { path = "../retrieval-pipeline" }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
docker = []
|
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
common = { path = "../common", features = ["test-utils"] }
|
common = { path = "../common", features = ["test-utils"] }
|
||||||
|
|||||||
@@ -24,6 +24,6 @@ pub async fn transcribe_audio_file(
|
|||||||
.transcription()
|
.transcription()
|
||||||
.create(request)
|
.create(request)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| AppError::Processing(format!("Audio transcription failed: {e}")))?;
|
.map_err(|e| AppError::Processing(format!("audio transcription failed: {e}")))?;
|
||||||
Ok(response.text)
|
Ok(response.text)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,27 +0,0 @@
|
|||||||
use common::error::AppError;
|
|
||||||
use headless_chrome::Browser;
|
|
||||||
|
|
||||||
/// Launches a headless Chrome instance, honoring the `docker` feature flag
|
|
||||||
/// (which disables the Chrome sandbox for container environments).
|
|
||||||
///
|
|
||||||
/// This is the single place the crate spawns a browser. If the rendering backend
|
|
||||||
/// is ever swapped away from headless Chrome to something leaner, this function is
|
|
||||||
/// the seam to change; callers only depend on getting back a `Browser`.
|
|
||||||
pub(crate) fn launch_browser() -> Result<Browser, AppError> {
|
|
||||||
#[cfg(feature = "docker")]
|
|
||||||
{
|
|
||||||
let options = headless_chrome::LaunchOptionsBuilder::default()
|
|
||||||
.sandbox(false)
|
|
||||||
.build()
|
|
||||||
.map_err(|err| {
|
|
||||||
AppError::Processing(format!("Failed to build headless browser options: {err}"))
|
|
||||||
})?;
|
|
||||||
Browser::new(options)
|
|
||||||
.map_err(|err| AppError::Processing(format!("Failed to start headless browser: {err}")))
|
|
||||||
}
|
|
||||||
#[cfg(not(feature = "docker"))]
|
|
||||||
{
|
|
||||||
Browser::default()
|
|
||||||
.map_err(|err| AppError::Processing(format!("Failed to start headless browser: {err}")))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,8 +1,8 @@
|
|||||||
pub mod audio_transcription;
|
pub mod audio_transcription;
|
||||||
pub mod browser;
|
|
||||||
pub mod file_text_extraction;
|
pub mod file_text_extraction;
|
||||||
pub mod graph_mapper;
|
pub mod graph_mapper;
|
||||||
pub mod image_parsing;
|
pub mod image_parsing;
|
||||||
pub mod llm_instructions;
|
pub mod llm_instructions;
|
||||||
|
pub mod page_fetcher;
|
||||||
pub mod pdf;
|
pub mod pdf;
|
||||||
pub mod url_text_retrieval;
|
pub mod url_text_retrieval;
|
||||||
|
|||||||
@@ -0,0 +1,117 @@
|
|||||||
|
//! Page-fetching abstraction that decouples URL extraction from the underlying engine.
|
||||||
|
//!
|
||||||
|
//! The primary implementation uses [`servo_fetch`], a pure-Rust Servo engine that
|
||||||
|
//! provides high extraction quality (word-F1 0.819), fast startup (~331ms), and a
|
||||||
|
//! small memory footprint (~64MB peak).
|
||||||
|
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use common::error::AppError;
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
|
/// Captured content from a single page fetch.
|
||||||
|
#[derive(Debug, Clone, PartialEq)]
|
||||||
|
pub(crate) struct PageCapture {
|
||||||
|
/// Raw HTML source of the page.
|
||||||
|
pub html: String,
|
||||||
|
/// Readable Markdown extracted from the page content.
|
||||||
|
pub markdown: String,
|
||||||
|
/// JPEG/PNG screenshot bytes, or empty if not captured.
|
||||||
|
pub screenshot: Vec<u8>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Abstraction over a page-fetching engine.
|
||||||
|
pub(crate) trait PageFetcher: Send + Sync + std::fmt::Debug {
|
||||||
|
/// Fetches a URL and returns the captured content (HTML, markdown, screenshot).
|
||||||
|
fn fetch(&self, url: &str) -> Result<PageCapture, AppError>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetcher powered by the embedded Servo engine via `servo-fetch`.
|
||||||
|
///
|
||||||
|
/// Provides HTML, extracted Markdown, and a PNG screenshot.
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub(crate) struct ServoFetchFetcher;
|
||||||
|
|
||||||
|
impl PageFetcher for ServoFetchFetcher {
|
||||||
|
fn fetch(&self, url: &str) -> Result<PageCapture, AppError> {
|
||||||
|
let page = servo_fetch::blocking::fetch(
|
||||||
|
&servo_fetch::FetchOptions::screenshot(url, true)
|
||||||
|
.timeout(Duration::from_secs(30))
|
||||||
|
.settle(Duration::from_millis(3000)),
|
||||||
|
)
|
||||||
|
.map_err(|err| AppError::Processing(format!("servo-fetch failed for {url}: {err}")))?;
|
||||||
|
|
||||||
|
let html = page.html.clone();
|
||||||
|
let markdown = page
|
||||||
|
.markdown()
|
||||||
|
.map_err(|err| AppError::Processing(format!("failed to extract markdown: {err}")))?;
|
||||||
|
let screenshot = page.screenshot_png().unwrap_or_default().to_vec();
|
||||||
|
|
||||||
|
info!(
|
||||||
|
url = %url,
|
||||||
|
html_bytes = html.len(),
|
||||||
|
md_chars = markdown.len(),
|
||||||
|
screenshot_bytes = screenshot.len(),
|
||||||
|
"servo-fetch completed"
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(PageCapture {
|
||||||
|
html,
|
||||||
|
markdown,
|
||||||
|
screenshot,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates the default page fetcher for the current configuration.
|
||||||
|
#[allow(unreachable_pub)]
|
||||||
|
pub(crate) fn create_fetcher() -> Box<dyn PageFetcher> {
|
||||||
|
Box::new(ServoFetchFetcher)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_default_fetcher_constructs() {
|
||||||
|
let fetcher = create_fetcher();
|
||||||
|
assert!(!format!("{fetcher:?}").is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_servo_fetcher_constructs() {
|
||||||
|
let _ = ServoFetchFetcher;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_trait_object_dispatch() {
|
||||||
|
let fetcher: Box<dyn PageFetcher> = Box::new(ServoFetchFetcher);
|
||||||
|
assert!(!format!("{fetcher:?}").is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Smoke test: Servo engine initialises even without display server.
|
||||||
|
/// Wrap in `catch_unwind` because child-thread panics from servo
|
||||||
|
/// (e.g. missing wayland) would otherwise escape the test harness.
|
||||||
|
#[test]
|
||||||
|
fn test_servo_engine_initializes() {
|
||||||
|
let fetcher = ServoFetchFetcher;
|
||||||
|
let result = std::panic::catch_unwind(move || {
|
||||||
|
let _ = fetcher.fetch("about:blank");
|
||||||
|
});
|
||||||
|
|
||||||
|
if let Err(panic) = result {
|
||||||
|
let msg = panic
|
||||||
|
.downcast_ref::<&str>()
|
||||||
|
.copied()
|
||||||
|
.or_else(|| panic.downcast_ref::<String>().map(String::as_str))
|
||||||
|
.unwrap_or("unknown panic");
|
||||||
|
assert!(
|
||||||
|
!(msg.contains("wayland")
|
||||||
|
|| msg.contains("Library")
|
||||||
|
|| msg.contains("servo-engine")),
|
||||||
|
"Servo engine initialization failed: {msg}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,32 +1,25 @@
|
|||||||
//! Headless-Chrome rasterization of PDF pages into PNG screenshots.
|
//! PDF page rasterization using pdfium-render via pdfium-auto.
|
||||||
//!
|
//!
|
||||||
//! This is the only Chrome-dependent part of PDF ingestion. It depends on the
|
//! Uses direct `PDFium` bindings for reliable, pixel-perfect page rendering —
|
||||||
//! browser's internal PDF-viewer shadow DOM, so it is inherently fragile across
|
//! starts in ~5ms, requires no display server, and produces consistent output
|
||||||
//! Chrome upgrades; a full-page-capture fallback guards the common failure modes.
|
//! independent of PDF reader version. Each page is rendered at a generous
|
||||||
|
//! resolution and encoded as PNG for downstream LLM vision ingestion.
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
path::{Path, PathBuf},
|
path::{Path, PathBuf},
|
||||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
time::{SystemTime, UNIX_EPOCH},
|
||||||
};
|
};
|
||||||
|
|
||||||
use base64::{engine::general_purpose::STANDARD, Engine as _};
|
use image::ImageFormat;
|
||||||
use headless_chrome::protocol::cdp::{Emulation, Page, DOM};
|
|
||||||
use lopdf::Document;
|
use lopdf::Document;
|
||||||
use serde_json::Value;
|
use pdfium_render::prelude::PdfRenderConfig;
|
||||||
use tracing::{debug, warn};
|
use tracing::{debug, warn};
|
||||||
|
|
||||||
use common::error::AppError;
|
use common::error::AppError;
|
||||||
|
|
||||||
use crate::utils::browser::launch_browser;
|
|
||||||
|
|
||||||
const NAVIGATION_RETRY_INTERVAL_MS: u64 = 120;
|
|
||||||
const NAVIGATION_RETRY_ATTEMPTS: usize = 10;
|
|
||||||
const MIN_PAGE_IMAGE_BYTES: usize = 1_024;
|
const MIN_PAGE_IMAGE_BYTES: usize = 1_024;
|
||||||
const DEFAULT_VIEWPORT_WIDTH: u32 = 1_248; // generous width to reduce horizontal clipping
|
const RENDER_TARGET_WIDTH: i32 = 1200;
|
||||||
const DEFAULT_VIEWPORT_HEIGHT: u32 = 1_800; // tall enough to capture full page at fit-to-width scale
|
const RENDER_MAX_HEIGHT: i32 = 2000;
|
||||||
const DEFAULT_DEVICE_SCALE_FACTOR: f64 = 1.0;
|
|
||||||
const CANVAS_VIEWPORT_ATTEMPTS: usize = 12;
|
|
||||||
const CANVAS_VIEWPORT_WAIT_MS: u64 = 200;
|
|
||||||
const DEBUG_IMAGE_ENV_VAR: &str = "MINNE_PDF_DEBUG_DIR";
|
const DEBUG_IMAGE_ENV_VAR: &str = "MINNE_PDF_DEBUG_DIR";
|
||||||
|
|
||||||
/// Parses the PDF structure to discover the available page numbers while keeping work off
|
/// Parses the PDF structure to discover the available page numbers while keeping work off
|
||||||
@@ -34,7 +27,7 @@ const DEBUG_IMAGE_ENV_VAR: &str = "MINNE_PDF_DEBUG_DIR";
|
|||||||
pub(super) async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, AppError> {
|
pub(super) async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, AppError> {
|
||||||
let pages = tokio::task::spawn_blocking(move || -> Result<Vec<u32>, AppError> {
|
let pages = tokio::task::spawn_blocking(move || -> Result<Vec<u32>, AppError> {
|
||||||
let document = Document::load_mem(&pdf_bytes)
|
let document = Document::load_mem(&pdf_bytes)
|
||||||
.map_err(|err| AppError::Processing(format!("Failed to parse PDF: {err}")))?;
|
.map_err(|err| AppError::Processing(format!("failed to parse PDF: {err}")))?;
|
||||||
let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
|
let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
|
||||||
page_numbers.sort_unstable();
|
page_numbers.sort_unstable();
|
||||||
Ok(page_numbers)
|
Ok(page_numbers)
|
||||||
@@ -44,7 +37,9 @@ pub(super) async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, Ap
|
|||||||
Ok(pages)
|
Ok(pages)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Uses the existing headless Chrome dependency to rasterize the requested PDF pages into PNGs.
|
/// Renders the requested PDF pages as PNG-encoded byte vectors using `PDFium`.
|
||||||
|
///
|
||||||
|
/// Work is offloaded to a blocking thread since `PDFium`'s C API is not async-safe.
|
||||||
pub(super) async fn render_pdf_pages(
|
pub(super) async fn render_pdf_pages(
|
||||||
file_path: &Path,
|
file_path: &Path,
|
||||||
pages: &[u32],
|
pages: &[u32],
|
||||||
@@ -52,8 +47,8 @@ pub(super) async fn render_pdf_pages(
|
|||||||
let file_path = file_path.to_path_buf();
|
let file_path = file_path.to_path_buf();
|
||||||
let pages = pages.to_vec();
|
let pages = pages.to_vec();
|
||||||
let page_numbers = pages.clone();
|
let page_numbers = pages.clone();
|
||||||
let captures =
|
|
||||||
tokio::task::spawn_blocking(move || render_pdf_pages_inner(&file_path, &pages)).await??;
|
let captures = tokio::task::spawn_blocking(move || render_inner(&file_path, &pages)).await??;
|
||||||
|
|
||||||
for (page_number, png) in page_numbers.iter().zip(captures.iter()) {
|
for (page_number, png) in page_numbers.iter().zip(captures.iter()) {
|
||||||
if let Err(err) = maybe_dump_debug_image(*page_number, png).await {
|
if let Err(err) = maybe_dump_debug_image(*page_number, png).await {
|
||||||
@@ -68,306 +63,65 @@ pub(super) async fn render_pdf_pages(
|
|||||||
Ok(captures)
|
Ok(captures)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn render_pdf_pages_inner(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
|
/// Initializes `PDFium`, opens the file, and renders each requested page.
|
||||||
let file_url = url::Url::from_file_path(file_path)
|
fn render_inner(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
|
||||||
.map_err(|()| AppError::Processing("Unable to construct PDF file URL".into()))?;
|
let pdfium = pdfium_auto::bind_pdfium_silent()
|
||||||
|
.map_err(|err| AppError::Processing(format!("failed to bind PDFium library: {err}")))?;
|
||||||
|
|
||||||
let browser = launch_browser()?;
|
let doc = pdfium
|
||||||
let tab = browser
|
.load_pdf_from_file(file_path, None)
|
||||||
.new_tab()
|
.map_err(|err| AppError::Processing(format!("failed to load PDF file: {err}")))?;
|
||||||
.map_err(|err| AppError::Processing(format!("Failed to create Chrome tab: {err}")))?;
|
|
||||||
|
|
||||||
tab.set_default_timeout(Duration::from_secs(10));
|
let render_config = PdfRenderConfig::new()
|
||||||
configure_tab(&tab)?;
|
.set_target_width(RENDER_TARGET_WIDTH)
|
||||||
set_pdf_viewport(&tab)?;
|
.set_maximum_height(RENDER_MAX_HEIGHT);
|
||||||
|
|
||||||
let mut captures = Vec::with_capacity(pages.len());
|
let mut captures = Vec::with_capacity(pages.len());
|
||||||
|
|
||||||
for page in pages.iter().copied() {
|
for &page_num in pages {
|
||||||
let target = format!("{file_url}#page={page}&toolbar=0&statusbar=0&zoom=page-fit");
|
let page_index = page_num.saturating_sub(1); // PDFium uses 0-based indices
|
||||||
tab.navigate_to(&target)
|
let page = doc
|
||||||
.map_err(|err| AppError::Processing(format!("Failed to navigate to PDF page: {err}")))?
|
.pages()
|
||||||
.wait_until_navigated()
|
.get(u16::try_from(page_index).unwrap_or(u16::MAX))
|
||||||
.map_err(|err| AppError::Processing(format!("Navigation to PDF page failed: {err}")))?;
|
.map_err(|err| {
|
||||||
|
AppError::Processing(format!("failed to get PDF page {page_num}: {err}"))
|
||||||
|
})?;
|
||||||
|
|
||||||
let mut loaded = false;
|
let bitmap = page.render_with_config(&render_config).map_err(|err| {
|
||||||
for attempt in 0..NAVIGATION_RETRY_ATTEMPTS {
|
AppError::Processing(format!("failed to render PDF page {page_num}: {err}"))
|
||||||
if tab
|
})?;
|
||||||
.wait_for_element("embed, canvas, body")
|
|
||||||
.map(|_| ())
|
|
||||||
.is_ok()
|
|
||||||
{
|
|
||||||
loaded = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if attempt < NAVIGATION_RETRY_ATTEMPTS.saturating_sub(1) {
|
|
||||||
std::thread::sleep(Duration::from_millis(NAVIGATION_RETRY_INTERVAL_MS));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !loaded {
|
let image = bitmap.as_image();
|
||||||
return Err(AppError::Processing(
|
|
||||||
"Timed out waiting for Chrome to render PDF page".into(),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
wait_for_pdf_ready(&tab, page)?;
|
let mut png_bytes = Vec::new();
|
||||||
std::thread::sleep(Duration::from_millis(350));
|
image
|
||||||
|
.write_to(&mut std::io::Cursor::new(&mut png_bytes), ImageFormat::Png)
|
||||||
|
.map_err(|err| {
|
||||||
|
AppError::Processing(format!(
|
||||||
|
"failed to encode PDF page {page_num} as PNG: {err}"
|
||||||
|
))
|
||||||
|
})?;
|
||||||
|
|
||||||
prepare_pdf_viewer(&tab, page);
|
debug!(
|
||||||
|
page = page_num,
|
||||||
|
bytes = png_bytes.len(),
|
||||||
|
"Rendered PDF page via PDFium"
|
||||||
|
);
|
||||||
|
|
||||||
let mut viewport: Option<Page::Viewport> = None;
|
if png_bytes.len() < MIN_PAGE_IMAGE_BYTES {
|
||||||
for attempt in 0..CANVAS_VIEWPORT_ATTEMPTS {
|
|
||||||
match canvas_viewport_for_page(&tab, page) {
|
|
||||||
Ok(Some(vp)) => {
|
|
||||||
viewport = Some(vp);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
Ok(None) => {
|
|
||||||
if attempt < CANVAS_VIEWPORT_ATTEMPTS.saturating_sub(1) {
|
|
||||||
std::thread::sleep(Duration::from_millis(CANVAS_VIEWPORT_WAIT_MS));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(err) => {
|
|
||||||
warn!(page, error = %err, "Failed to derive canvas viewport");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let png = if let Some(clip) = viewport {
|
|
||||||
match tab.call_method(Page::CaptureScreenshot {
|
|
||||||
format: Some(Page::CaptureScreenshotFormatOption::Png),
|
|
||||||
quality: None,
|
|
||||||
clip: Some(clip),
|
|
||||||
from_surface: Some(true),
|
|
||||||
capture_beyond_viewport: Some(true),
|
|
||||||
optimize_for_speed: Some(false),
|
|
||||||
}) {
|
|
||||||
Ok(data) => match STANDARD.decode(data.data) {
|
|
||||||
Ok(bytes) => bytes,
|
|
||||||
Err(err) => {
|
|
||||||
warn!(error = %err, page, "Failed to decode clipped screenshot; falling back to full page capture");
|
|
||||||
capture_full_page_png(&tab)?
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Err(err) => {
|
|
||||||
warn!(error = %err, page, "Clipped screenshot failed; falling back to full page capture");
|
|
||||||
capture_full_page_png(&tab)?
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
warn!(
|
warn!(
|
||||||
page,
|
page = page_num,
|
||||||
"Unable to determine canvas viewport; capturing full page"
|
bytes = png_bytes.len(),
|
||||||
);
|
"Rendered page size below threshold; check PDF quality"
|
||||||
capture_full_page_png(&tab)?
|
|
||||||
};
|
|
||||||
|
|
||||||
debug!(page, bytes = png.len(), "Captured PDF page screenshot");
|
|
||||||
|
|
||||||
if is_suspicious_image(png.len()) {
|
|
||||||
warn!(
|
|
||||||
page,
|
|
||||||
bytes = png.len(),
|
|
||||||
"Screenshot size below threshold; check rendering output"
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
captures.push(png);
|
captures.push(png_bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(captures)
|
Ok(captures)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn configure_tab(tab: &headless_chrome::Tab) -> Result<(), AppError> {
|
|
||||||
tab.call_method(Emulation::SetDefaultBackgroundColorOverride {
|
|
||||||
color: Some(DOM::RGBA {
|
|
||||||
r: 255,
|
|
||||||
g: 255,
|
|
||||||
b: 255,
|
|
||||||
a: Some(1.0),
|
|
||||||
}),
|
|
||||||
})
|
|
||||||
.map_err(|err| {
|
|
||||||
AppError::Processing(format!("Failed to configure Chrome page background: {err}"))
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn set_pdf_viewport(tab: &headless_chrome::Tab) -> Result<(), AppError> {
|
|
||||||
tab.call_method(Emulation::SetDeviceMetricsOverride {
|
|
||||||
width: DEFAULT_VIEWPORT_WIDTH,
|
|
||||||
height: DEFAULT_VIEWPORT_HEIGHT,
|
|
||||||
device_scale_factor: DEFAULT_DEVICE_SCALE_FACTOR,
|
|
||||||
mobile: false,
|
|
||||||
scale: None,
|
|
||||||
screen_width: Some(DEFAULT_VIEWPORT_WIDTH),
|
|
||||||
screen_height: Some(DEFAULT_VIEWPORT_HEIGHT),
|
|
||||||
position_x: None,
|
|
||||||
position_y: None,
|
|
||||||
dont_set_visible_size: Some(false),
|
|
||||||
screen_orientation: None,
|
|
||||||
viewport: None,
|
|
||||||
display_feature: None,
|
|
||||||
device_posture: None,
|
|
||||||
})
|
|
||||||
.map_err(|err| AppError::Processing(format!("Failed to configure Chrome viewport: {err}")))?;
|
|
||||||
|
|
||||||
tab.call_method(Emulation::SetVisibleSize {
|
|
||||||
width: DEFAULT_VIEWPORT_WIDTH,
|
|
||||||
height: DEFAULT_VIEWPORT_HEIGHT,
|
|
||||||
})
|
|
||||||
.map_err(|err| AppError::Processing(format!("Failed to apply Chrome visible size: {err}")))?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn wait_for_pdf_ready(
|
|
||||||
tab: &headless_chrome::Tab,
|
|
||||||
page_number: u32,
|
|
||||||
) -> Result<headless_chrome::Element<'_>, AppError> {
|
|
||||||
let embed_selector = "embed[type='application/pdf']";
|
|
||||||
let element = tab
|
|
||||||
.wait_for_element_with_custom_timeout(embed_selector, Duration::from_secs(8))
|
|
||||||
.or_else(|_| tab.wait_for_element_with_custom_timeout("embed", Duration::from_secs(8)))
|
|
||||||
.map_err(|err| AppError::Processing(format!("Timed out waiting for PDF content: {err}")))?;
|
|
||||||
|
|
||||||
if let Err(err) = element.scroll_into_view() {
|
|
||||||
debug!("Failed to scroll PDF element into view: {err}");
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!(page = page_number, "PDF viewer element located");
|
|
||||||
|
|
||||||
Ok(element)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn prepare_pdf_viewer(tab: &headless_chrome::Tab, page_number: u32) {
|
|
||||||
let script = format!(
|
|
||||||
r#"(function() {{
|
|
||||||
const embed = document.querySelector('embed[type="application/pdf"]') || document.querySelector('embed');
|
|
||||||
if (!embed || !embed.shadowRoot) return false;
|
|
||||||
const viewer = embed.shadowRoot.querySelector('pdf-viewer');
|
|
||||||
if (!viewer || !viewer.shadowRoot) return false;
|
|
||||||
const app = viewer.shadowRoot.querySelector('viewer-app');
|
|
||||||
if (app && app.shadowRoot) {{
|
|
||||||
const toolbar = app.shadowRoot.querySelector('#toolbar');
|
|
||||||
if (toolbar) {{ toolbar.style.display = 'none'; }}
|
|
||||||
}}
|
|
||||||
const page = viewer.shadowRoot.querySelector('viewer-page:nth-of-type({page_number})');
|
|
||||||
if (page && page.scrollIntoView) {{
|
|
||||||
page.scrollIntoView({{ block: 'start', inline: 'center' }});
|
|
||||||
}}
|
|
||||||
const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page_number}"]');
|
|
||||||
return !!canvas;
|
|
||||||
}})()"#
|
|
||||||
);
|
|
||||||
|
|
||||||
match tab.evaluate(&script, false) {
|
|
||||||
Ok(result) => {
|
|
||||||
let ready = result
|
|
||||||
.value
|
|
||||||
.as_ref()
|
|
||||||
.and_then(Value::as_bool)
|
|
||||||
.unwrap_or(false);
|
|
||||||
debug!(page = page_number, ready, "Prepared PDF viewer page");
|
|
||||||
}
|
|
||||||
Err(err) => {
|
|
||||||
debug!(page = page_number, error = %err, "Unable to run PDF viewer preparation script");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn canvas_viewport_for_page(
|
|
||||||
tab: &headless_chrome::Tab,
|
|
||||||
page_number: u32,
|
|
||||||
) -> Result<Option<Page::Viewport>, AppError> {
|
|
||||||
let script = format!(
|
|
||||||
r#"(function() {{
|
|
||||||
const embed = document.querySelector('embed[type="application/pdf"]') || document.querySelector('embed');
|
|
||||||
if (!embed || !embed.shadowRoot) return null;
|
|
||||||
const viewer = embed.shadowRoot.querySelector('pdf-viewer');
|
|
||||||
if (!viewer || !viewer.shadowRoot) return null;
|
|
||||||
const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page_number}"]');
|
|
||||||
if (!canvas) return null;
|
|
||||||
const rect = canvas.getBoundingClientRect();
|
|
||||||
return {{ x: rect.x, y: rect.y, width: rect.width, height: rect.height }};
|
|
||||||
}})()"#
|
|
||||||
);
|
|
||||||
|
|
||||||
let result = tab
|
|
||||||
.evaluate(&script, false)
|
|
||||||
.map_err(|err| AppError::Processing(format!("Failed to inspect PDF canvas: {err}")))?;
|
|
||||||
|
|
||||||
let Some(value) = result.value else {
|
|
||||||
return Ok(None);
|
|
||||||
};
|
|
||||||
|
|
||||||
if value.is_null() {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
let x = value
|
|
||||||
.get("x")
|
|
||||||
.and_then(Value::as_f64)
|
|
||||||
.unwrap_or_default()
|
|
||||||
.max(0.0);
|
|
||||||
let y = value
|
|
||||||
.get("y")
|
|
||||||
.and_then(Value::as_f64)
|
|
||||||
.unwrap_or_default()
|
|
||||||
.max(0.0);
|
|
||||||
let width = value
|
|
||||||
.get("width")
|
|
||||||
.and_then(Value::as_f64)
|
|
||||||
.unwrap_or_default();
|
|
||||||
let height = value
|
|
||||||
.get("height")
|
|
||||||
.and_then(Value::as_f64)
|
|
||||||
.unwrap_or_default();
|
|
||||||
|
|
||||||
if width <= 0.0 || height <= 0.0 {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!(
|
|
||||||
page = page_number,
|
|
||||||
x, y, width, height, "Derived canvas viewport"
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(Some(Page::Viewport {
|
|
||||||
x,
|
|
||||||
y,
|
|
||||||
width,
|
|
||||||
height,
|
|
||||||
scale: 1.0,
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn capture_full_page_png(tab: &headless_chrome::Tab) -> Result<Vec<u8>, AppError> {
|
|
||||||
let screenshot = tab
|
|
||||||
.call_method(Page::CaptureScreenshot {
|
|
||||||
format: Some(Page::CaptureScreenshotFormatOption::Png),
|
|
||||||
quality: None,
|
|
||||||
clip: None,
|
|
||||||
from_surface: Some(true),
|
|
||||||
capture_beyond_viewport: Some(true),
|
|
||||||
optimize_for_speed: Some(false),
|
|
||||||
})
|
|
||||||
.map_err(|err| {
|
|
||||||
AppError::Processing(format!("Failed to capture PDF page (fallback): {err}"))
|
|
||||||
})?;
|
|
||||||
|
|
||||||
STANDARD.decode(screenshot.data).map_err(|err| {
|
|
||||||
AppError::Processing(format!("Failed to decode PDF screenshot (fallback): {err}"))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
const fn is_suspicious_image(len: usize) -> bool {
|
|
||||||
len < MIN_PAGE_IMAGE_BYTES
|
|
||||||
}
|
|
||||||
|
|
||||||
fn debug_dump_directory() -> Option<PathBuf> {
|
fn debug_dump_directory() -> Option<PathBuf> {
|
||||||
std::env::var(DEBUG_IMAGE_ENV_VAR)
|
std::env::var(DEBUG_IMAGE_ENV_VAR)
|
||||||
.ok()
|
.ok()
|
||||||
@@ -394,6 +148,8 @@ async fn maybe_dump_debug_image(page_index: u32, bytes: &[u8]) -> Result<(), App
|
|||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use anyhow::{self};
|
use anyhow::{self};
|
||||||
|
use lopdf::dictionary;
|
||||||
|
use lopdf::Object;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_debug_dump_directory_env_var() -> anyhow::Result<()> {
|
fn test_debug_dump_directory_env_var() -> anyhow::Result<()> {
|
||||||
@@ -409,10 +165,108 @@ mod tests {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[tokio::test]
|
||||||
fn test_is_suspicious_image_threshold() {
|
async fn test_load_page_numbers_empty_pdf() -> anyhow::Result<()> {
|
||||||
assert!(is_suspicious_image(0));
|
let pdf_bytes = create_minimal_pdf(0);
|
||||||
assert!(is_suspicious_image(MIN_PAGE_IMAGE_BYTES - 1));
|
let pages = load_page_numbers(pdf_bytes).await?;
|
||||||
assert!(!is_suspicious_image(MIN_PAGE_IMAGE_BYTES + 1));
|
assert!(pages.is_empty());
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_load_page_numbers_single_page() -> anyhow::Result<()> {
|
||||||
|
let pdf_bytes = create_minimal_pdf(1);
|
||||||
|
let pages = load_page_numbers(pdf_bytes).await?;
|
||||||
|
assert_eq!(pages, vec![1u32]);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_load_page_numbers_multi_page() -> anyhow::Result<()> {
|
||||||
|
let pdf_bytes = create_minimal_pdf(5);
|
||||||
|
let pages = load_page_numbers(pdf_bytes).await?;
|
||||||
|
assert_eq!(pages, vec![1, 2, 3, 4, 5]);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_load_page_numbers_invalid_pdf() {
|
||||||
|
let result = load_page_numbers(b"not a pdf".to_vec()).await;
|
||||||
|
assert!(result.is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a minimal valid PDF with the given number of empty pages.
|
||||||
|
#[allow(clippy::similar_names, clippy::expect_used)]
|
||||||
|
fn create_minimal_pdf(page_count: u32) -> Vec<u8> {
|
||||||
|
let mut doc = Document::with_version("1.5");
|
||||||
|
let pages_id = doc.new_object_id();
|
||||||
|
|
||||||
|
let mut page_ids = Vec::with_capacity(page_count as usize);
|
||||||
|
for _ in 0..page_count {
|
||||||
|
let page_id = doc.add_object(dictionary! {
|
||||||
|
"Type" => "Page",
|
||||||
|
"Parent" => pages_id,
|
||||||
|
"MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
|
||||||
|
});
|
||||||
|
page_ids.push(page_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
let pages = dictionary! {
|
||||||
|
"Type" => "Pages",
|
||||||
|
"Kids" => page_ids.iter().map(|id| Object::Reference(*id)).collect::<Vec<_>>(),
|
||||||
|
"Count" => i32::try_from(page_count).unwrap_or(i32::MAX),
|
||||||
|
"MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
|
||||||
|
};
|
||||||
|
doc.objects.insert(pages_id, Object::Dictionary(pages));
|
||||||
|
|
||||||
|
let catalog_id = doc.add_object(dictionary! {
|
||||||
|
"Type" => "Catalog",
|
||||||
|
"Pages" => pages_id,
|
||||||
|
});
|
||||||
|
doc.trailer.set("Root", catalog_id);
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
doc.save_to(&mut buf).expect("failed to serialize test PDF");
|
||||||
|
buf
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Renders a simple 1-page PDF and verifies the output is a valid PNG ≥ 1KB.
|
||||||
|
/// This test skips gracefully when `PDFium` is not available (e.g., CI without internet).
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_render_single_page_pdfium() -> anyhow::Result<()> {
|
||||||
|
let pdf_bytes = create_minimal_pdf(1);
|
||||||
|
let dir = tempfile::TempDir::new()?;
|
||||||
|
let file_path = dir.path().join("test.pdf");
|
||||||
|
tokio::fs::write(&file_path, &pdf_bytes).await?;
|
||||||
|
|
||||||
|
let result = render_pdf_pages(&file_path, &[1]).await;
|
||||||
|
match result {
|
||||||
|
Ok(pages) => {
|
||||||
|
assert_eq!(pages.len(), 1, "should render one page");
|
||||||
|
#[allow(clippy::expect_used)]
|
||||||
|
let first_page = pages.into_iter().next().expect("already asserted len == 1");
|
||||||
|
assert!(
|
||||||
|
first_page.len() >= MIN_PAGE_IMAGE_BYTES,
|
||||||
|
"rendered page {} bytes is below threshold {}",
|
||||||
|
first_page.len(),
|
||||||
|
MIN_PAGE_IMAGE_BYTES
|
||||||
|
);
|
||||||
|
// Verify it's a valid PNG by checking header bytes
|
||||||
|
let header = first_page
|
||||||
|
.get(..4.min(first_page.len()))
|
||||||
|
.unwrap_or(&[0u8; 0]);
|
||||||
|
assert_eq!(header, &[0x89, 0x50, 0x4E, 0x47], "output must be PNG");
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// PDFium may not be available — that's acceptable in environments
|
||||||
|
// without network access to download the binary.
|
||||||
|
let msg = e.to_string();
|
||||||
|
if !msg.contains("PDFium") && !msg.contains("library") && !msg.contains("bind") {
|
||||||
|
anyhow::bail!("unexpected error: {e}");
|
||||||
|
}
|
||||||
|
eprintln!("SKIP: PDFium not available ({msg})");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
//! Fast-path PDF text extraction and Markdown reflow heuristics.
|
//! Fast-path PDF text extraction and Markdown reflow heuristics.
|
||||||
//!
|
//!
|
||||||
//! These are pure (non-IO, non-Chrome) helpers used before falling back to the
|
//! Pure text-extraction helpers that run before falling back to the vision pipeline,
|
||||||
//! vision pipeline, plus the Markdown normalization applied to both paths.
|
//! plus the Markdown normalization applied to both paths. The fast path uses
|
||||||
|
//! `pdf-extract` to pull embedded text layers directly, avoiding the cost of
|
||||||
|
//! page-by-page rasterization for well-structured PDFs.
|
||||||
|
|
||||||
use common::error::AppError;
|
use common::error::AppError;
|
||||||
|
|
||||||
@@ -15,7 +17,7 @@ pub(super) async fn try_fast_path(pdf_bytes: Vec<u8>) -> Result<Option<String>,
|
|||||||
pdf_extract::extract_text_from_mem(&pdf_bytes).map(|s| s.trim().to_string())
|
pdf_extract::extract_text_from_mem(&pdf_bytes).map(|s| s.trim().to_string())
|
||||||
})
|
})
|
||||||
.await?
|
.await?
|
||||||
.map_err(|err| AppError::Processing(format!("Failed to extract text from PDF: {err}")))?;
|
.map_err(|err| AppError::Processing(format!("failed to extract text from PDF: {err}")))?;
|
||||||
|
|
||||||
if extraction.is_empty() {
|
if extraction.is_empty() {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
@@ -28,7 +30,7 @@ pub(super) async fn try_fast_path(pdf_bytes: Vec<u8>) -> Result<Option<String>,
|
|||||||
Ok(Some(normalize_fast_text(&extraction)))
|
Ok(Some(normalize_fast_text(&extraction)))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Heuristic that determines whether the fast-path text looks like well-formed prose.
|
/// Heuristic that determines whether the fast-path text looks like readable text.
|
||||||
#[allow(clippy::cast_precision_loss)]
|
#[allow(clippy::cast_precision_loss)]
|
||||||
fn looks_good_enough(text: &str) -> bool {
|
fn looks_good_enough(text: &str) -> bool {
|
||||||
if text.len() < FAST_PATH_MIN_LEN {
|
if text.len() < FAST_PATH_MIN_LEN {
|
||||||
|
|||||||
@@ -116,7 +116,7 @@ async fn transcribe_batch(
|
|||||||
);
|
);
|
||||||
if attempt == last_attempt {
|
if attempt == last_attempt {
|
||||||
return Err(AppError::Processing(
|
return Err(AppError::Processing(
|
||||||
"Vision model failed to transcribe PDF page contents".into(),
|
"vision model failed to transcribe PDF page contents".into(),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
@@ -126,7 +126,7 @@ async fn transcribe_batch(
|
|||||||
}
|
}
|
||||||
|
|
||||||
Err(AppError::Processing(
|
Err(AppError::Processing(
|
||||||
"Vision model did not return usable Markdown".into(),
|
"vision model did not return usable Markdown".into(),
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,14 +5,18 @@ use common::{
|
|||||||
error::AppError,
|
error::AppError,
|
||||||
storage::{db::SurrealDbClient, store::StorageManager, types::file_info::FileInfo},
|
storage::{db::SurrealDbClient, store::StorageManager, types::file_info::FileInfo},
|
||||||
};
|
};
|
||||||
use dom_smoothie::{Article, Readability, TextMode};
|
use dom_smoothie::Article;
|
||||||
use std::{
|
use std::{
|
||||||
io::{Seek, SeekFrom, Write},
|
io::{Seek, SeekFrom, Write},
|
||||||
net::IpAddr,
|
net::IpAddr,
|
||||||
time::Instant,
|
time::Instant,
|
||||||
};
|
};
|
||||||
use tempfile::NamedTempFile;
|
use tempfile::NamedTempFile;
|
||||||
use tracing::{error, info, warn};
|
use tendril::StrTendril;
|
||||||
|
use tracing::{info, warn};
|
||||||
|
|
||||||
|
use crate::utils::page_fetcher::create_fetcher;
|
||||||
|
|
||||||
pub async fn extract_text_from_url(
|
pub async fn extract_text_from_url(
|
||||||
url: &str,
|
url: &str,
|
||||||
db: &SurrealDbClient,
|
db: &SurrealDbClient,
|
||||||
@@ -22,46 +26,22 @@ pub async fn extract_text_from_url(
|
|||||||
info!("Fetching URL: {}", url);
|
info!("Fetching URL: {}", url);
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
|
|
||||||
let browser = crate::utils::browser::launch_browser()?;
|
|
||||||
|
|
||||||
let tab = browser
|
|
||||||
.new_tab()
|
|
||||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
|
||||||
let page = tab
|
|
||||||
.navigate_to(url)
|
|
||||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
|
||||||
let loaded_page = page
|
|
||||||
.wait_until_navigated()
|
|
||||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
|
||||||
let raw_content = loaded_page
|
|
||||||
.get_content()
|
|
||||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
|
||||||
let screenshot = loaded_page
|
|
||||||
.capture_screenshot(
|
|
||||||
headless_chrome::protocol::cdp::Page::CaptureScreenshotFormatOption::Jpeg,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
true,
|
|
||||||
)
|
|
||||||
.map_err(|e| AppError::InternalError(e.to_string()))?;
|
|
||||||
|
|
||||||
let mut tmp_file = NamedTempFile::new()?;
|
|
||||||
let temp_path_str = tmp_file.path().display().to_string();
|
|
||||||
|
|
||||||
tmp_file.write_all(&screenshot)?;
|
|
||||||
tmp_file.as_file().sync_all()?;
|
|
||||||
|
|
||||||
if let Err(e) = tmp_file.seek(SeekFrom::Start(0)) {
|
|
||||||
error!(
|
|
||||||
"URL: {}. Failed to seek temp file {} to start: {:?}. Proceeding, but hashing might fail.",
|
|
||||||
url, temp_path_str, e
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let parsed_url =
|
let parsed_url =
|
||||||
url::Url::parse(url).map_err(|_| AppError::Validation("invalid URL".to_string()))?;
|
url::Url::parse(url).map_err(|_| AppError::Validation("invalid URL".to_string()))?;
|
||||||
|
|
||||||
let domain = ensure_ingestion_url_allowed(&parsed_url)?;
|
let domain = ensure_ingestion_url_allowed(&parsed_url)?;
|
||||||
|
|
||||||
|
let fetcher = create_fetcher();
|
||||||
|
let capture = fetcher.fetch(url)?;
|
||||||
|
|
||||||
|
// Save the screenshot to storage
|
||||||
|
let mut tmp_file = NamedTempFile::new()?;
|
||||||
|
|
||||||
|
if !capture.screenshot.is_empty() {
|
||||||
|
tmp_file.write_all(&capture.screenshot)?;
|
||||||
|
tmp_file.as_file().sync_all()?;
|
||||||
|
tmp_file.seek(SeekFrom::Start(0))?;
|
||||||
|
}
|
||||||
|
|
||||||
let timestamp = Utc::now().format("%Y%m%d%H%M%S");
|
let timestamp = Utc::now().format("%Y%m%d%H%M%S");
|
||||||
let file_name = format!("{}_{}_{}.jpg", domain, "screenshot", timestamp);
|
let file_name = format!("{}_{}_{}.jpg", domain, "screenshot", timestamp);
|
||||||
|
|
||||||
@@ -78,12 +58,25 @@ pub async fn extract_text_from_url(
|
|||||||
|
|
||||||
let file_info = FileInfo::new_with_storage(field_data, db, user_id, storage).await?;
|
let file_info = FileInfo::new_with_storage(field_data, db, user_id, storage).await?;
|
||||||
|
|
||||||
let config = dom_smoothie::Config {
|
// servo-fetch doesn't extract byline/site_name/metadata, so those are left empty.
|
||||||
text_mode: TextMode::Markdown,
|
let title = extract_title_from_html(&capture.html);
|
||||||
..Default::default()
|
let article = Article {
|
||||||
|
title,
|
||||||
|
byline: None,
|
||||||
|
content: StrTendril::from_slice(&capture.markdown),
|
||||||
|
text_content: StrTendril::from_slice(&capture.markdown),
|
||||||
|
length: capture.markdown.len(),
|
||||||
|
excerpt: None,
|
||||||
|
site_name: None,
|
||||||
|
dir: None,
|
||||||
|
lang: None,
|
||||||
|
published_time: None,
|
||||||
|
modified_time: None,
|
||||||
|
image: None,
|
||||||
|
favicon: None,
|
||||||
|
url: Some(url.to_string()),
|
||||||
};
|
};
|
||||||
let mut readability = Readability::new(raw_content, None, Some(config))?;
|
|
||||||
let article: Article = readability.parse()?;
|
|
||||||
let end = now.elapsed();
|
let end = now.elapsed();
|
||||||
info!(
|
info!(
|
||||||
"URL: {}. Total time: {:?}. Final File ID: {}",
|
"URL: {}. Total time: {:?}. Final File ID: {}",
|
||||||
@@ -93,13 +86,31 @@ pub async fn extract_text_from_url(
|
|||||||
Ok((article, file_info))
|
Ok((article, file_info))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extracts a page title from raw HTML. Returns empty string when no title is found.
|
||||||
|
fn extract_title_from_html(html: &str) -> String {
|
||||||
|
let lower = html.to_ascii_lowercase();
|
||||||
|
if let Some(start) = lower.find("<title>") {
|
||||||
|
let content_start = start.saturating_add("<title>".len());
|
||||||
|
if let Some(end) = lower[content_start..].find("</title>") {
|
||||||
|
let title_end = content_start.saturating_add(end);
|
||||||
|
if title_end <= html.len() {
|
||||||
|
let title = html[content_start..title_end].trim().to_string();
|
||||||
|
if !title.is_empty() {
|
||||||
|
return title;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
String::new()
|
||||||
|
}
|
||||||
|
|
||||||
fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
|
fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
|
||||||
match url.scheme() {
|
match url.scheme() {
|
||||||
"http" | "https" => {}
|
"http" | "https" => {}
|
||||||
scheme => {
|
scheme => {
|
||||||
warn!(%url, %scheme, "Rejected ingestion URL due to unsupported scheme");
|
warn!(%url, %scheme, "Rejected ingestion URL due to unsupported scheme");
|
||||||
return Err(AppError::Validation(
|
return Err(AppError::Validation(
|
||||||
"Unsupported URL scheme for ingestion".to_string(),
|
"unsupported URL scheme for ingestion".to_string(),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -107,14 +118,14 @@ fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
|
|||||||
let Some(host) = url.host_str() else {
|
let Some(host) = url.host_str() else {
|
||||||
warn!(%url, "Rejected ingestion URL missing host");
|
warn!(%url, "Rejected ingestion URL missing host");
|
||||||
return Err(AppError::Validation(
|
return Err(AppError::Validation(
|
||||||
"URL is missing a host component".to_string(),
|
"URL missing a host component".to_string(),
|
||||||
));
|
));
|
||||||
};
|
};
|
||||||
|
|
||||||
if host.eq_ignore_ascii_case("localhost") {
|
if host.eq_ignore_ascii_case("localhost") {
|
||||||
warn!(%url, host, "Rejected ingestion URL to localhost");
|
warn!(%url, host, "Rejected ingestion URL to localhost");
|
||||||
return Err(AppError::Validation(
|
return Err(AppError::Validation(
|
||||||
"Ingestion URL host is not allowed".to_string(),
|
"ingestion URL host is not allowed".to_string(),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -127,7 +138,7 @@ fn ensure_ingestion_url_allowed(url: &url::Url) -> Result<String, AppError> {
|
|||||||
if ip.is_loopback() || ip.is_unspecified() || ip.is_multicast() || is_disallowed {
|
if ip.is_loopback() || ip.is_unspecified() || ip.is_multicast() || is_disallowed {
|
||||||
warn!(%url, host, %ip, "Rejected ingestion URL pointing to restricted network range");
|
warn!(%url, host, %ip, "Rejected ingestion URL pointing to restricted network range");
|
||||||
return Err(AppError::Validation(
|
return Err(AppError::Validation(
|
||||||
"Ingestion URL host is not allowed".to_string(),
|
"ingestion URL host is not allowed".to_string(),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -168,4 +179,28 @@ mod tests {
|
|||||||
assert_eq!(sanitized, "sub_example_com");
|
assert_eq!(sanitized, "sub_example_com");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_title_from_html_with_title() {
|
||||||
|
let html = "<html><head><title>Hello World</title></head><body></body></html>";
|
||||||
|
assert_eq!(extract_title_from_html(html), "Hello World");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_title_from_html_mixed_case() {
|
||||||
|
let html = "<html><head><TITLE>Mixed Case</TITLE></head><body></body></html>";
|
||||||
|
assert_eq!(extract_title_from_html(html), "Mixed Case");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_title_from_html_no_title() {
|
||||||
|
let html = "<html><head></head><body><p>No title here</p></body></html>";
|
||||||
|
assert_eq!(extract_title_from_html(html), "");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_extract_title_from_html_empty_title() {
|
||||||
|
let html = "<html><head><title></title></head><body></body></html>";
|
||||||
|
assert_eq!(extract_title_from_html(html), "");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user