14 Commits

Author | SHA1 | Message | Date
Per Stark | c6a6080e1c | release: 0.2.2 | 2025-10-07 11:51:33 +02:00
Per Stark | 1159712724 | fix: convert to surrealdb datetime before conversion | 2025-10-03 15:33:28 +02:00
Per Stark | e5e1414f54 | chore: clippy magic | 2025-10-01 15:39:45 +02:00
Per Stark | fcc49b1954 | design: new icons to match new theme | 2025-10-01 10:17:43 +02:00
Per Stark | 022f4d8575 | fix: compliant with gpt-5 models | 2025-10-01 10:17:31 +02:00
Per Stark | 945a2b7f37 | fix: do not log config here | 2025-09-30 15:22:14 +02:00
Per Stark | ff4ea55cd5 | fix: user guard on knowledge relationship deletion | 2025-09-30 11:15:53 +02:00
Per Stark | c4c76efe92 | test: startup smoke test | 2025-09-29 21:15:34 +02:00
Per Stark | c0fcad5952 | fix: deletion of items, shared files etc | 2025-09-29 20:28:06 +02:00
Per Stark | b0ed69330d | fix: improved concurrency | 2025-09-28 22:08:08 +02:00
Per Stark | 5cb15dab45 | feat: pdf support | 2025-09-28 20:53:51 +02:00
Per Stark | 7403195df5 | release: 0.2.1 (chore: remove stale todo; chore: version bump) | 2025-09-24 10:25:56 +02:00
Per Stark | 9faef31387 | fix: json response in api to work with ios shortcuts (fix corrected lockfile) | 2025-09-23 22:01:58 +02:00
Per Stark | 110f7b8a8f | no graph screenshot in readme, too much image | 2025-09-23 09:08:17 +02:00
35 changed files with 1611 additions and 216 deletions

CHANGELOG.md (new file, 43 lines)
View File

@@ -0,0 +1,43 @@
# Changelog
## Unreleased
## Version 0.2.2 (2025-10-07)
- Added support for ingesting PDF files
- Improved ingestion speed
- Fixed deletion of items so it works as expected
- Fixed support for GPT-5 models via the OpenAI API
## Version 0.2.1 (2025-09-24)
- Fixed API JSON responses so iOS Shortcuts integrations keep working.
## Version 0.2.0 (2025-09-23)
- Revamped the UI with a neobrutalist theme, better dark mode, and a D3-based knowledge graph.
- Added pagination for entities and content plus new observability metrics on the dashboard.
- Enabled audio ingestion and merged the new storage backend.
- Improved performance, request filtering, and journalctl/systemd compatibility.
## Version 0.1.4 (2025-07-01)
- Added image ingestion with configurable system settings and updated Docker Compose docs.
- Hardened admin flows by fixing concurrent API/database calls and normalizing task statuses.
## Version 0.1.3 (2025-06-08)
- Added support for AI providers beyond OpenAI.
- Made the HTTP port configurable for deployments.
- Smoothed graph mapper failures, long content tiles, and refreshed project documentation.
## Version 0.1.2 (2025-05-26)
- Introduced full-text search across indexed knowledge.
- Polished the UI with consistent titles, icon fallbacks, and improved markdown scrolling.
- Fixed search result links and SurrealDB vector formatting glitches.
## Version 0.1.1 (2025-05-13)
- Added streaming feedback to ingestion tasks for clearer progress updates.
- Made the data storage path configurable.
- Improved release tooling with Chromium-enabled Nix flakes, Docker builds, and migration/template fixes.
## Version 0.1.0 (2025-05-06)
- Initial release with a SurrealDB-backed ingestion pipeline, job queue, vector search, and knowledge graph storage.
- Delivered a chat experience featuring streaming responses, conversation history, markdown rendering, and customizable system prompts.
- Introduced an admin console with analytics, registration and timezone controls, and job monitoring.
- Shipped a Tailwind/daisyUI web UI with responsive layouts, modals, content viewers, and editing flows.
- Provided readability-based content ingestion, API/HTML ingress routes, and Docker/Docker Compose tooling.

Cargo.lock (generated, 270 lines changed)
View File

@@ -36,6 +36,15 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]]
name = "adobe-cmap-parser"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3"
dependencies = [
"pom",
]
[[package]]
name = "aead"
version = "0.5.2"
@@ -326,15 +335,6 @@ dependencies = [
"tokio",
]
[[package]]
name = "async-convert"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d416feee97712e43152cd42874de162b8f9b77295b1c85e5d92725cc8310bae"
dependencies = [
"async-trait",
]
[[package]]
name = "async-executor"
version = "1.13.2"
@@ -422,30 +422,41 @@ dependencies = [
[[package]]
name = "async-openai"
version = "0.24.1"
version = "0.29.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6db3286b4f52b6556ac5208fb575d035eca61a2bf40d7e75d1db2733ffc599f"
checksum = "d4fc47ec9e669d562e0755f59e1976d157546910e403f3c2da856d0a4d3cdc07"
dependencies = [
"async-convert",
"async-openai-macros",
"backoff",
"base64 0.22.1",
"bytes",
"derive_builder",
"eventsource-stream",
"futures",
"rand 0.8.5",
"rand 0.9.1",
"reqwest",
"reqwest-eventsource",
"secrecy",
"serde",
"serde_json",
"thiserror 1.0.69",
"thiserror 2.0.12",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
]
[[package]]
name = "async-openai-macros"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0289cba6d5143bfe8251d57b4a8cac036adf158525a76533a7082ba65ec76398"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.101",
]
[[package]]
name = "async-recursion"
version = "1.1.1"
@@ -881,6 +892,15 @@ dependencies = [
"generic-array",
]
[[package]]
name = "block-padding"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
dependencies = [
"generic-array",
]
[[package]]
name = "blowfish"
version = "0.9.1"
@@ -963,6 +983,12 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "bytecount"
version = "0.6.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e"
[[package]]
name = "bytemuck"
version = "1.23.0"
@@ -993,6 +1019,15 @@ dependencies = [
"rustversion",
]
[[package]]
name = "cbc"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
dependencies = [
"cipher",
]
[[package]]
name = "cc"
version = "1.2.21"
@@ -1061,6 +1096,12 @@ dependencies = [
"unicode-security",
]
[[package]]
name = "cff-parser"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31f5b6e9141c036f3ff4ce7b2f7e432b0f00dee416ddcd4f17741d189ddc2e9d"
[[package]]
name = "cfg-if"
version = "1.0.0"
@@ -1840,6 +1881,15 @@ dependencies = [
"num-traits",
]
[[package]]
name = "ecb"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a8bfa975b1aec2145850fcaa1c6fe269a16578c44705a532ae3edc92b8881c7"
dependencies = [
"cipher",
]
[[package]]
name = "either"
version = "1.15.0"
@@ -1892,6 +1942,15 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "euclid"
version = "0.20.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad"
dependencies = [
"num-traits",
]
[[package]]
name = "event-listener"
version = "5.4.0"
@@ -2866,6 +2925,8 @@ dependencies = [
"dom_smoothie",
"futures",
"headless_chrome",
"lopdf 0.32.0",
"pdf-extract",
"reqwest",
"serde",
"serde_json",
@@ -2904,6 +2965,7 @@ version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
dependencies = [
"block-padding",
"generic-array",
]
@@ -3133,6 +3195,12 @@ dependencies = [
"thiserror 1.0.69",
]
[[package]]
name = "linked-hash-map"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
[[package]]
name = "linux-raw-sys"
version = "0.9.4"
@@ -3161,6 +3229,51 @@ version = "0.4.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "lopdf"
version = "0.32.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e775e4ee264e8a87d50a9efef7b67b4aa988cf94e75630859875fc347e6c872b"
dependencies = [
"chrono",
"encoding_rs",
"flate2",
"itoa",
"linked-hash-map",
"log",
"md5",
"nom 7.1.3",
"rayon",
"time",
"weezl",
]
[[package]]
name = "lopdf"
version = "0.36.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59fa2559e99ba0f26a12458aabc754432c805bbb8cba516c427825a997af1fb7"
dependencies = [
"aes",
"bitflags 2.9.0",
"cbc",
"ecb",
"encoding_rs",
"flate2",
"indexmap 2.9.0",
"itoa",
"log",
"md-5",
"nom 8.0.0",
"nom_locate",
"rand 0.9.1",
"rangemap",
"sha2",
"stringprep",
"thiserror 2.0.12",
"weezl",
]
[[package]]
name = "lru"
version = "0.12.5"
@@ -3178,28 +3291,22 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "main"
version = "0.1.4"
version = "0.2.1"
dependencies = [
"anyhow",
"api-router",
"async-openai",
"axum",
"axum_session",
"axum_session_surreal",
"common",
"cookie",
"futures",
"headless_chrome",
"html-router",
"ingestion-pipeline",
"reqwest",
"serde",
"serde_json",
"serial_test",
"surrealdb",
"tempfile",
"thiserror 1.0.69",
"tokio",
"tower",
"tracing",
"tracing-subscriber",
"uuid",
@@ -3282,6 +3389,12 @@ dependencies = [
"digest",
]
[[package]]
name = "md5"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "memchr"
version = "2.7.4"
@@ -3514,6 +3627,17 @@ dependencies = [
"memchr",
]
[[package]]
name = "nom_locate"
version = "5.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d"
dependencies = [
"bytecount",
"memchr",
"nom 8.0.0",
]
[[package]]
name = "nonempty"
version = "0.7.0"
@@ -3836,6 +3960,23 @@ dependencies = [
"sha2",
]
[[package]]
name = "pdf-extract"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c2f44c6c642e359e2fe7f662bf5438db3811b6b4be60afc6de04b619ce51e1a"
dependencies = [
"adobe-cmap-parser",
"cff-parser",
"encoding_rs",
"euclid",
"log",
"lopdf 0.36.0",
"postscript",
"type1-encoding-parser",
"unicode-normalization",
]
[[package]]
name = "pem"
version = "3.0.5"
@@ -4027,6 +4168,18 @@ dependencies = [
"universal-hash",
]
[[package]]
name = "pom"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6"
[[package]]
name = "postscript"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306"
[[package]]
name = "powerfmt"
version = "0.2.0"
@@ -4292,6 +4445,12 @@ dependencies = [
"getrandom 0.3.2",
]
[[package]]
name = "rangemap"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f93e7e49bb0bf967717f7bd674458b3d6b0c5f48ec7e3038166026a69fc22223"
[[package]]
name = "rawpointer"
version = "0.2.1"
@@ -4854,9 +5013,9 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
[[package]]
name = "secrecy"
version = "0.8.0"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9bd1c54ea06cfd2f6b63219704de0b9b4f72dcc2b8fdef820be6cd799780e91e"
checksum = "e891af845473308773346dc847b2c23ee78fe442e0472ac50e22a18a93d3ae5a"
dependencies = [
"serde",
"zeroize",
@@ -5053,31 +5212,6 @@ dependencies = [
"syn 2.0.101",
]
[[package]]
name = "serial_test"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e56dd856803e253c8f298af3f4d7eb0ae5e23a737252cd90bb4f3b435033b2d"
dependencies = [
"dashmap 5.5.3",
"futures",
"lazy_static",
"log",
"parking_lot",
"serial_test_derive",
]
[[package]]
name = "serial_test_derive"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91d129178576168c589c9ec973feedf7d3126c01ac2bf08795109aa35b69fb8f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.101",
]
[[package]]
name = "servo_arc"
version = "0.4.0"
@@ -5309,6 +5443,17 @@ dependencies = [
"quote",
]
[[package]]
name = "stringprep"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1"
dependencies = [
"unicode-bidi",
"unicode-normalization",
"unicode-properties",
]
[[package]]
name = "strsim"
version = "0.11.1"
@@ -6141,6 +6286,15 @@ dependencies = [
"utf-8",
]
[[package]]
name = "type1-encoding-parser"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3d6cc09e1a99c7e01f2afe4953789311a1c50baebbdac5b477ecf78e2e92a5b"
dependencies = [
"pom",
]
[[package]]
name = "typenum"
version = "1.18.0"
@@ -6176,6 +6330,12 @@ version = "2.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539"
[[package]]
name = "unicode-bidi"
version = "0.3.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5"
[[package]]
name = "unicode-ident"
version = "1.0.18"
@@ -6191,6 +6351,12 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-properties"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0"
[[package]]
name = "unicode-script"
version = "0.5.7"
@@ -6520,6 +6686,12 @@ dependencies = [
"rustls-pki-types",
]
[[package]]
name = "weezl"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a751b3277700db47d3e574514de2eced5e54dc8a5436a3bf7a0b248b2cee16f3"
[[package]]
name = "which"
version = "7.0.3"

View File

@@ -12,7 +12,7 @@ resolver = "2"
[workspace.dependencies]
anyhow = "1.0.94"
async-openai = "0.24.1"
async-openai = "0.29.3"
async-stream = "0.3.6"
async-trait = "0.1.88"
axum-htmx = "0.7.0"

View File

@@ -7,7 +7,6 @@
[![Latest Release](https://img.shields.io/github/v/release/perstarkse/minne?sort=semver)](https://github.com/perstarkse/minne/releases/latest)
![Screenshot](screenshot-dashboard.webp)
![Graph screenshot](screenshot-graph.webp)
## Demo deployment
@@ -23,11 +22,11 @@ While developing Minne, I discovered [KaraKeep](https://karakeep.com/) (formerly
Minne is designed to make it incredibly easy to save snippets of text, URLs, and other content (limited, pending demand). Simply send content along with a category tag. Minne then ingests this, leveraging AI to create relevant nodes and relationships within its graph database, alongside your manual categorization. This graph backend allows for discoverable connections between your pieces of knowledge.
You can converse with your knowledge base through an LLM-powered chat interface (via OpenAI compatible API, like Ollama or others). For those who like to see the bigger picture, Minne also includes an **experimental feature to visually explore your knowledge graph.**
You can converse with your knowledge base through an LLM-powered chat interface (via an OpenAI-compatible API, such as Ollama or others). For those who like to see the bigger picture, Minne also includes a feature to visually explore your knowledge graph.
You may switch and choose between models used, and have the possiblity to change the prompts to your liking. There is since release **0.1.3** the option to change embeddings length, making it easy to test another embedding model.
You may choose between the models used, and you have the possibility to change the prompts to your liking. There is also the option to change the embedding length, making it easy to test another embedding model.
The application is built for speed and efficiency using Rust with a Server-Side Rendered (SSR) frontend (HTMX and minimal JavaScript). It's fully responsive, offering a complete mobile interface for reading, editing, and managing your content, including the graph database itself. **PWA (Progressive Web App) support** means you can "install" Minne to your device for a native-like experience. For quick capture on the go on iOS, a [**Shortcut**](https://www.icloud.com/shortcuts/9aa960600ec14329837ba4169f57a166) makes sending content to your Minne instance a breeze.
The application is built for speed and efficiency using Rust with a Server-Side Rendered (SSR) frontend (HTMX and minimal JavaScript). It's fully responsive, offering a complete mobile interface for reading, editing, and managing your content, including the graph database itself. **PWA (Progressive Web App) support** means you can "install" Minne to your device for a native-like experience. For quick capture on the go on iOS, a [**Shortcut**](https://www.icloud.com/shortcuts/e433fbd7602f4e2eaa70dca162323477) makes sending content to your Minne instance a breeze.
Minne is open source (AGPL), self-hostable, and can be deployed flexibly: via Nix, Docker Compose, pre-built binaries, or by building from source. It can run as a single `main` binary or as separate `server` and `worker` processes for optimized resource allocation.
@@ -191,7 +190,7 @@ Minne can be configured using environment variables or a `config.yaml` file plac
- `SURREALDB_PASSWORD`: Password for SurrealDB (e.g., `root_password`).
- `SURREALDB_DATABASE`: Database name in SurrealDB (e.g., `minne_db`).
- `SURREALDB_NAMESPACE`: Namespace in SurrealDB (e.g., `minne_ns`).
- `OPENAI_API_KEY`: Your API key for OpenAI (e.g., `sk-YourActualOpenAIKeyGoesHere`).
- `OPENAI_API_KEY`: Your API key for an OpenAI-compatible endpoint (e.g., `sk-YourActualOpenAIKeyGoesHere`).
- `HTTP_PORT`: Port for the Minne server to listen on (Default: `3000`).
**Optional Configuration:**
@@ -210,8 +209,8 @@ surrealdb_database: "minne_db"
surrealdb_namespace: "minne_ns"
openai_api_key: "sk-YourActualOpenAIKeyGoesHere"
data_dir: "./minne_app_data"
http_port: 3000
# rust_log: "info"
# http_port: 3000
```
## Application Architecture (Binaries)
@@ -258,8 +257,8 @@ Once you have configured the `OPENAI_BASE_URL` to point to your desired provider
I've developed Minne primarily for my own use, but having been in the self-hosted space for a long time and benefited from the efforts of others, I thought I'd share it with the community. Feature requests are welcome.
The roadmap as of now is:
- Handle uploaded images wisely.
- An updated explorer of the graph database.
- ~~Handle uploaded images wisely.~~
- ~~An updated explorer of the graph database.~~
- A TUI frontend which opens your system default editor for improved writing and document management.
## Contributing

View File

@@ -1,4 +1,4 @@
use axum::{extract::State, http::StatusCode, response::IntoResponse, Extension};
use axum::{extract::State, http::StatusCode, response::IntoResponse, Extension, Json};
use axum_typed_multipart::{FieldData, TryFromMultipart, TypedMultipart};
use common::{
error::AppError,
@@ -8,6 +8,7 @@ use common::{
},
};
use futures::{future::try_join_all, TryFutureExt};
use serde_json::json;
use tempfile::NamedTempFile;
use tracing::info;
@@ -52,5 +53,5 @@ pub async fn ingest_data(
try_join_all(futures).await?;
Ok(StatusCode::OK)
Ok((StatusCode::OK, Json(json!({ "status": "success" }))))
}
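With this change the ingest endpoint returns a JSON body instead of an empty 200, which is what iOS Shortcuts needs to parse the response as a dictionary. A minimal client-side sketch of the new contract, assuming `reqwest` with the `multipart` and `json` features; the route path, form field names, and auth scheme are hypothetical and not taken from this diff:

```rust
use reqwest::multipart;

async fn ingest(base_url: &str, api_key: &str) -> anyhow::Result<()> {
    // Hypothetical field names; the handler accepts typed multipart form data.
    let form = multipart::Form::new()
        .text("content", "https://example.com/article")
        .text("category", "reading");
    let response = reqwest::Client::new()
        .post(format!("{base_url}/api/ingest")) // hypothetical route
        .bearer_auth(api_key) // hypothetical auth scheme
        .multipart(form)
        .send()
        .await?
        .error_for_status()?;
    // The endpoint now responds with `{"status": "success"}`.
    let body: serde_json::Value = response.json().await?;
    anyhow::ensure!(body["status"] == "success");
    Ok(())
}
```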

View File

@@ -196,7 +196,7 @@ pub fn split_object_path(path: &str) -> AnyResult<(String, String)> {
#[cfg(test)]
mod tests {
use super::*;
use crate::utils::config::StorageKind;
use crate::utils::config::{PdfIngestMode::LlmFirst, StorageKind};
use bytes::Bytes;
use futures::TryStreamExt;
use uuid::Uuid;
@@ -213,6 +213,7 @@ mod tests {
http_port: 0,
openai_base_url: "..".into(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
}
}

View File

@@ -57,7 +57,6 @@ impl FileInfo {
user_id: &str,
config: &AppConfig,
) -> Result<Self, FileError> {
info!("Data_dir: {:?}", config);
let file = field_data.contents;
let file_name = field_data
.metadata
@@ -230,14 +229,8 @@ impl FileInfo {
config: &AppConfig,
) -> Result<(), AppError> {
// Get the FileInfo from the database
let file_info = match db_client.get_item::<FileInfo>(id).await? {
Some(info) => info,
None => {
return Err(AppError::from(FileError::FileNotFound(format!(
"File with id {} was not found",
id
))))
}
let Some(file_info) = db_client.get_item::<FileInfo>(id).await? else {
return Ok(());
};
// Remove the object's parent prefix in the object store
@@ -277,7 +270,7 @@ impl FileInfo {
#[cfg(test)]
mod tests {
use super::*;
use crate::utils::config::StorageKind;
use crate::utils::config::{PdfIngestMode::LlmFirst, StorageKind};
use axum::http::HeaderMap;
use axum_typed_multipart::FieldMetadata;
use std::io::Write;
@@ -332,6 +325,7 @@ mod tests {
http_port: 3000,
openai_base_url: "..".to_string(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
// Test file creation
@@ -392,6 +386,7 @@ mod tests {
http_port: 3000,
openai_base_url: "..".to_string(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
// Store the original file
@@ -448,6 +443,7 @@ mod tests {
http_port: 3000,
openai_base_url: "..".to_string(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
let file_info = FileInfo::new(field_data, &db, user_id, &config).await;
@@ -505,6 +501,7 @@ mod tests {
http_port: 3000,
openai_base_url: "..".to_string(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
let field_data1 = create_test_file(content, file_name);
@@ -669,6 +666,7 @@ mod tests {
http_port: 0,
openai_base_url: "".to_string(),
storage: crate::utils::config::StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
let temp = create_test_file(b"test content", "test_file.txt");
let file_info = FileInfo::new(temp, &db, user_id, &cfg)
@@ -723,18 +721,13 @@ mod tests {
http_port: 0,
openai_base_url: "".to_string(),
storage: crate::utils::config::StorageKind::Local,
pdf_ingest_mode: LlmFirst,
},
)
.await;
// Should fail with FileNotFound error
assert!(result.is_err());
match result {
Err(AppError::File(_)) => {
// Expected error
}
_ => panic!("Expected FileNotFound error"),
}
// Should succeed even if the file record does not exist
assert!(result.is_ok());
}
#[tokio::test]
async fn test_get_by_id() {
@@ -831,6 +824,7 @@ mod tests {
http_port: 3000,
openai_base_url: "..".to_string(),
storage: StorageKind::Local,
pdf_ingest_mode: LlmFirst,
};
// Test file creation

View File

@@ -75,13 +75,36 @@ impl KnowledgeRelationship {
pub async fn delete_relationship_by_id(
id: &str,
user_id: &str,
db_client: &SurrealDbClient,
) -> Result<(), AppError> {
let query = format!("DELETE relates_to:`{}`", id);
let mut authorized_result = db_client
.query(format!(
"SELECT * FROM relates_to WHERE id = relates_to:`{}` AND metadata.user_id = '{}'",
id, user_id
))
.await?;
let authorized: Vec<KnowledgeRelationship> = authorized_result.take(0).unwrap_or_default();
db_client.query(query).await?;
if authorized.is_empty() {
let mut exists_result = db_client
.query(format!("SELECT * FROM relates_to:`{}`", id))
.await?;
let existing: Option<KnowledgeRelationship> = exists_result.take(0)?;
Ok(())
if existing.is_some() {
Err(AppError::Auth(
"Not authorized to delete relationship".into(),
))
} else {
Err(AppError::NotFound(format!("Relationship {} not found", id)))
}
} else {
db_client
.query(format!("DELETE relates_to:`{}`", id))
.await?;
Ok(())
}
}
}
@@ -161,7 +184,7 @@ mod tests {
let relationship = KnowledgeRelationship::new(
entity1_id.clone(),
entity2_id.clone(),
user_id,
user_id.clone(),
source_id.clone(),
relationship_type,
);
@@ -209,7 +232,7 @@ mod tests {
let relationship = KnowledgeRelationship::new(
entity1_id.clone(),
entity2_id.clone(),
user_id,
user_id.clone(),
source_id.clone(),
relationship_type,
);
@@ -220,20 +243,107 @@ mod tests {
.await
.expect("Failed to store relationship");
// Ensure relationship exists before deletion attempt
let mut existing_before_delete = db
.query(format!(
"SELECT * FROM relates_to WHERE metadata.user_id = '{}' AND metadata.source_id = '{}'",
user_id, source_id
))
.await
.expect("Query failed");
let before_results: Vec<KnowledgeRelationship> =
existing_before_delete.take(0).unwrap_or_default();
assert!(
!before_results.is_empty(),
"Relationship should exist before deletion"
);
// Delete the relationship by ID
KnowledgeRelationship::delete_relationship_by_id(&relationship.id, &db)
KnowledgeRelationship::delete_relationship_by_id(&relationship.id, &user_id, &db)
.await
.expect("Failed to delete relationship by ID");
// Query to verify the relationship was deleted
let query = format!("SELECT * FROM relates_to WHERE id = '{}'", relationship.id);
let mut result = db.query(query).await.expect("Query failed");
let mut result = db
.query(format!(
"SELECT * FROM relates_to WHERE metadata.user_id = '{}' AND metadata.source_id = '{}'",
user_id, source_id
))
.await
.expect("Query failed");
let results: Vec<KnowledgeRelationship> = result.take(0).unwrap_or_default();
// Verify the relationship no longer exists
assert!(results.is_empty(), "Relationship should be deleted");
}
#[tokio::test]
async fn test_delete_relationship_by_id_unauthorized() {
let namespace = "test_ns";
let database = &Uuid::new_v4().to_string();
let db = SurrealDbClient::memory(namespace, database)
.await
.expect("Failed to start in-memory surrealdb");
let entity1_id = create_test_entity("Entity 1", &db).await;
let entity2_id = create_test_entity("Entity 2", &db).await;
let owner_user_id = "owner-user".to_string();
let source_id = "source123".to_string();
let relationship = KnowledgeRelationship::new(
entity1_id.clone(),
entity2_id.clone(),
owner_user_id.clone(),
source_id,
"references".to_string(),
);
relationship
.store_relationship(&db)
.await
.expect("Failed to store relationship");
let mut before_attempt = db
.query(format!(
"SELECT * FROM relates_to WHERE metadata.user_id = '{}'",
owner_user_id
))
.await
.expect("Query failed");
let before_results: Vec<KnowledgeRelationship> = before_attempt.take(0).unwrap_or_default();
assert!(
!before_results.is_empty(),
"Relationship should exist before unauthorized delete attempt"
);
let result = KnowledgeRelationship::delete_relationship_by_id(
&relationship.id,
"different-user",
&db,
)
.await;
match result {
Err(AppError::Auth(_)) => {}
_ => panic!("Expected authorization error when deleting someone else's relationship"),
}
let mut after_attempt = db
.query(format!(
"SELECT * FROM relates_to WHERE metadata.user_id = '{}'",
owner_user_id
))
.await
.expect("Query failed");
let results: Vec<KnowledgeRelationship> = after_attempt.take(0).unwrap_or_default();
assert!(
!results.is_empty(),
"Relationship should still exist after unauthorized delete attempt"
);
}
#[tokio::test]
async fn test_delete_relationships_by_source_id() {
// Setup in-memory database for testing

View File

@@ -110,6 +110,26 @@ impl TextContent {
Ok(())
}
pub async fn has_other_with_file(
file_id: &str,
exclude_id: &str,
db: &SurrealDbClient,
) -> Result<bool, AppError> {
let mut response = db
.client
.query(
"SELECT VALUE id FROM type::table($table_name) WHERE file_info.id = $file_id AND id != type::thing($table_name, $exclude_id) LIMIT 1",
)
.bind(("table_name", TextContent::table_name()))
.bind(("file_id", file_id.to_owned()))
.bind(("exclude_id", exclude_id.to_owned()))
.await?;
let existing: Option<surrealdb::sql::Thing> = response.take(0)?;
Ok(existing.is_some())
}
pub async fn search(
db: &SurrealDbClient,
search_terms: &str,
@@ -276,4 +296,64 @@ mod tests {
assert_eq!(updated_content.text, new_text);
assert!(updated_content.updated_at > text_content.updated_at);
}
#[tokio::test]
async fn test_has_other_with_file_detects_shared_usage() {
let namespace = "test_ns";
let database = &Uuid::new_v4().to_string();
let db = SurrealDbClient::memory(namespace, database)
.await
.expect("Failed to start in-memory surrealdb");
let user_id = "user123".to_string();
let file_info = FileInfo {
id: "file-1".to_string(),
created_at: chrono::Utc::now(),
updated_at: chrono::Utc::now(),
sha256: "sha-test".to_string(),
path: "user123/file-1/test.txt".to_string(),
file_name: "test.txt".to_string(),
mime_type: "text/plain".to_string(),
user_id: user_id.clone(),
};
let content_a = TextContent::new(
"First".to_string(),
Some("ctx-a".to_string()),
"category".to_string(),
Some(file_info.clone()),
None,
user_id.clone(),
);
let content_b = TextContent::new(
"Second".to_string(),
Some("ctx-b".to_string()),
"category".to_string(),
Some(file_info.clone()),
None,
user_id.clone(),
);
db.store_item(content_a.clone())
.await
.expect("Failed to store first content");
db.store_item(content_b.clone())
.await
.expect("Failed to store second content");
let has_other = TextContent::has_other_with_file(&file_info.id, &content_a.id, &db)
.await
.expect("Failed to check for shared file usage");
assert!(has_other);
let _removed: Option<TextContent> = db
.delete_item(&content_b.id)
.await
.expect("Failed to delete second content");
let has_other_after = TextContent::has_other_with_file(&file_info.id, &content_a.id, &db)
.await
.expect("Failed to check shared usage after delete");
assert!(!has_other_after);
}
}

View File

@@ -109,7 +109,7 @@ impl User {
)
.bind(("table", T::table_name()))
.bind(("user_id", user_id.to_string()))
.bind(("since", since))
.bind(("since", surrealdb::Datetime::from(since)))
.await?
.take(0)?;
Ok(result.map(|r| r.count).unwrap_or(0))
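This is the `fix: convert to surrealdb datetime before conversion` commit in action: binding the `chrono` value directly can end up compared as a string rather than a native datetime, so the `$since` filter silently misbehaves. Wrapping it in `surrealdb::Datetime` uses the `From` impl the SDK provides. A minimal sketch of the conversion:

```rust
// `since` arrives as a chrono timestamp from the caller...
let since: chrono::DateTime<chrono::Utc> =
    chrono::Utc::now() - chrono::Duration::days(7);
// ...and is wrapped so SurrealDB receives a native datetime, not a string.
let native: surrealdb::Datetime = surrealdb::Datetime::from(since);
```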

View File

@@ -11,6 +11,20 @@ fn default_storage_kind() -> StorageKind {
StorageKind::Local
}
/// Selects the strategy used for PDF ingestion.
#[derive(Clone, Deserialize, Debug)]
#[serde(rename_all = "kebab-case")]
pub enum PdfIngestMode {
/// Only rely on classic text extraction (no LLM fallbacks).
Classic,
/// Prefer fast text extraction, but fall back to the LLM rendering path when needed.
LlmFirst,
}
fn default_pdf_ingest_mode() -> PdfIngestMode {
PdfIngestMode::LlmFirst
}
#[derive(Clone, Deserialize, Debug)]
pub struct AppConfig {
pub openai_api_key: String,
@@ -26,6 +40,8 @@ pub struct AppConfig {
pub openai_base_url: String,
#[serde(default = "default_storage_kind")]
pub storage: StorageKind,
#[serde(default = "default_pdf_ingest_mode")]
pub pdf_ingest_mode: PdfIngestMode,
}
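With the `kebab-case` rename, a config file can set `pdf_ingest_mode` to `classic` or `llm-first`, and the field defaults to `llm-first` when omitted. A small sketch of the mapping, assuming `serde_yaml` is available as a test dependency (it is not shown in this diff):

```rust
// Unit variants deserialize from their kebab-case names.
let classic: PdfIngestMode = serde_yaml::from_str("classic").unwrap();
let fallback: PdfIngestMode = serde_yaml::from_str("llm-first").unwrap();
```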
fn default_data_dir() -> String {

View File

@@ -154,8 +154,6 @@ pub fn create_chat_request(
CreateChatCompletionRequestArgs::default()
.model(&settings.query_model)
.temperature(0.2)
.max_tokens(3048u32)
.messages([
ChatCompletionRequestSystemMessage::from(settings.query_system_prompt.clone()).into(),
ChatCompletionRequestUserMessage::from(user_message).into(),
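Dropping `.max_tokens(3048u32)` matches the `fix: compliant with gpt-5 models` commit: newer OpenAI models reject the legacy `max_tokens` parameter on chat completions. If a cap is still wanted, a hedged alternative is the replacement parameter, assuming async-openai 0.29 exposes it (not verified against this codebase):

```rust
let request = CreateChatCompletionRequestArgs::default()
    .model(&settings.query_model)
    .temperature(0.2)
    // `max_completion_tokens` is the successor to the removed `max_tokens` cap.
    .max_completion_tokens(3048u32)
    .messages([
        ChatCompletionRequestSystemMessage::from(settings.query_system_prompt.clone()).into(),
        ChatCompletionRequestUserMessage::from(user_message).into(),
    ])
    .build()?;
```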

View File

@@ -1,5 +1,4 @@
use surrealdb::Error;
use tracing::debug;
use common::storage::{db::SurrealDbClient, types::knowledge_entity::KnowledgeEntity};
@@ -57,8 +56,6 @@ pub async fn find_entities_by_relationship_by_id(
entity_id
);
debug!("{}", query);
db.query(query).await?.take(0)
}

Binary file not shown (before: 47 KiB, after: 28 KiB)

Binary file not shown (before: 252 KiB, after: 140 KiB)

Binary file not shown (before: 42 KiB, after: 25 KiB)

Binary file not shown (before: 790 B, after: 963 B)

Binary file not shown (before: 2.2 KiB, after: 1.9 KiB)

Binary file not shown (before: 15 KiB, after: 15 KiB)

View File

@@ -185,8 +185,13 @@ pub async fn delete_text_content(
let text_content = User::get_and_validate_text_content(&id, &user.id, &state.db).await?;
// If it has file info, delete that too
if let Some(file_info) = &text_content.file_info {
FileInfo::delete_by_id(&file_info.id, &state.db, &state.config).await?;
if let Some(file_info) = text_content.file_info.as_ref() {
let file_in_use =
TextContent::has_other_with_file(&file_info.id, &text_content.id, &state.db).await?;
if !file_in_use {
FileInfo::delete_by_id(&file_info.id, &state.db, &state.config).await?;
}
}
// Delete related knowledge entities and text chunks

View File

@@ -6,7 +6,6 @@ use axum::{
};
use futures::try_join;
use serde::Serialize;
use tokio::join;
use crate::{
html_state::HtmlState,
@@ -68,7 +67,7 @@ pub async fn index_handler(
#[derive(Serialize)]
pub struct LatestTextContentData {
latest_text_contents: Vec<TextContent>,
text_contents: Vec<TextContent>,
user: User,
}
@@ -80,31 +79,35 @@ pub async fn delete_text_content(
// Get and validate TextContent
let text_content = get_and_validate_text_content(&state, &id, &user).await?;
// Perform concurrent deletions
let (_res1, _res2, _res3, _res4, _res5) = join!(
async {
if let Some(file_info) = text_content.file_info {
FileInfo::delete_by_id(&file_info.id, &state.db, &state.config).await
} else {
Ok(())
}
},
state.db.delete_item::<TextContent>(&text_content.id),
TextChunk::delete_by_source_id(&text_content.id, &state.db),
KnowledgeEntity::delete_by_source_id(&text_content.id, &state.db),
KnowledgeRelationship::delete_relationships_by_source_id(&text_content.id, &state.db)
);
// Remove stored assets before deleting the text content record
if let Some(file_info) = text_content.file_info.as_ref() {
let file_in_use =
TextContent::has_other_with_file(&file_info.id, &text_content.id, &state.db).await?;
if !file_in_use {
FileInfo::delete_by_id(&file_info.id, &state.db, &state.config).await?;
}
}
// Delete the text content and any related data
TextChunk::delete_by_source_id(&text_content.id, &state.db).await?;
KnowledgeEntity::delete_by_source_id(&text_content.id, &state.db).await?;
KnowledgeRelationship::delete_relationships_by_source_id(&text_content.id, &state.db).await?;
state
.db
.delete_item::<TextContent>(&text_content.id)
.await?;
// Render updated content
let latest_text_contents =
let text_contents =
truncate_text_contents(User::get_latest_text_contents(&user.id, &state.db).await?);
Ok(TemplateResponse::new_partial(
"index/signed_in/recent_content.html",
"dashboard/recent_content.html",
"latest_content_section",
LatestTextContentData {
user: user.to_owned(),
latest_text_contents,
text_contents,
},
))
}

View File

@@ -385,9 +385,7 @@ pub async fn delete_knowledge_relationship(
RequireUser(user): RequireUser,
Path(id): Path<String>,
) -> Result<impl IntoResponse, HtmlError> {
// GOTTA ADD AUTH VALIDATION
KnowledgeRelationship::delete_relationship_by_id(&id, &state.db).await?;
KnowledgeRelationship::delete_relationship_by_id(&id, &user.id, &state.db).await?;
let entities = User::get_knowledge_entities(&user.id, &state.db).await?;

View File

@@ -3,14 +3,10 @@ use common::storage::types::text_content::TextContent;
const TEXT_PREVIEW_LENGTH: usize = 50;
fn maybe_truncate(value: &str) -> Option<String> {
let mut char_count = 0;
for (idx, _) in value.char_indices() {
for (char_count, (idx, _)) in value.char_indices().enumerate() {
if char_count == TEXT_PREVIEW_LENGTH {
return Some(value[..idx].to_string());
}
char_count += 1;
}
None
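The clippy-driven rewrite keeps the behavior identical: strings of 50 characters or fewer pass through as `None`, while longer strings are cut at the 50-character boundary. A quick sketch of the expected behavior (assumed assertions, not tests from this diff):

```rust
assert_eq!(maybe_truncate(&"a".repeat(50)), None);
assert_eq!(maybe_truncate(&"a".repeat(51)), Some("a".repeat(50)));
```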

View File

@@ -1,6 +1,6 @@
{% block latest_content_section %}
<div id="latest_content_section" class="list">
<h2 class="text-2xl mb-2 font-extrabold">Recent content</h2>
{% include "content/content_list.html" %}
{% include "dashboard/recent_content_list.html" %}
</div>
{% endblock %}

View File

@@ -0,0 +1,65 @@
<div id="latest_text_content_cards" class="space-y-6">
{% if text_contents|length > 0 %}
<div class="nb-masonry w-full">
{% for text_content in text_contents %}
<article class="nb-card cursor-pointer mx-auto mb-4 w-full max-w-[92vw] space-y-3 sm:max-w-none"
hx-get="/content/{{ text_content.id }}/read" hx-target="#modal" hx-swap="innerHTML">
{% if text_content.url_info %}
<figure class="-mx-4 -mt-4 border-b-2 border-neutral bg-base-200">
<img class="w-full h-auto" src="/file/{{ text_content.url_info.image_id }}" alt="website screenshot" />
</figure>
{% endif %}
{% if text_content.file_info and (text_content.file_info.mime_type == "image/png" or text_content.file_info.mime_type == "image/jpeg") %}
<figure class="-mx-4 -mt-4 border-b-2 border-neutral bg-base-200">
<img class="w-full h-auto" src="/file/{{ text_content.file_info.id }}" alt="{{ text_content.file_info.file_name }}" />
</figure>
{% endif %}
<div class="space-y-3 break-words">
<h2 class="text-lg font-extrabold tracking-tight truncate">
{% if text_content.url_info %}
{{ text_content.url_info.title }}
{% elif text_content.file_info %}
{{ text_content.file_info.file_name }}
{% else %}
{{ text_content.text }}
{% endif %}
</h2>
<div class="flex flex-wrap items-center justify-between gap-3">
<p class="text-xs opacity-60 shrink-0">
{{ text_content.created_at | datetimeformat(format="short", tz=user.timezone) }}
</p>
<span class="nb-badge">{{ text_content.category }}</span>
<div class="flex gap-2" hx-on:click="event.stopPropagation()">
{% if text_content.url_info %}
<a href="{{ text_content.url_info.url }}" target="_blank" rel="noopener noreferrer"
class="nb-btn btn-square btn-sm" aria-label="Open source link">
{% include "icons/link_icon.html" %}
</a>
{% endif %}
<button hx-get="/content/{{ text_content.id }}/read" hx-target="#modal" hx-swap="innerHTML"
class="nb-btn btn-square btn-sm" aria-label="Read content">
{% include "icons/read_icon.html" %}
</button>
<button hx-get="/content/{{ text_content.id }}" hx-target="#modal" hx-swap="innerHTML"
class="nb-btn btn-square btn-sm" aria-label="Edit content">
{% include "icons/edit_icon.html" %}
</button>
<button hx-delete="/text-content/{{ text_content.id }}" hx-target="#latest_content_section"
hx-swap="outerHTML" class="nb-btn btn-square btn-sm" aria-label="Delete content">
{% include "icons/delete_icon.html" %}
</button>
</div>
</div>
<p class="text-sm leading-relaxed">
{{ text_content.instructions }}
</p>
</div>
</article>
{% endfor %}
</div>
{% else %}
<div class="nb-card p-8 text-center text-sm opacity-70">
No content found.
</div>
{% endif %}
</div>

View File

@@ -23,6 +23,8 @@ url = { workspace = true }
uuid = { workspace = true }
headless_chrome = { workspace = true }
base64 = { workspace = true }
pdf-extract = "0.9"
lopdf = "0.32"
common = { path = "../common" }
composite-retrieval = { path = "../composite-retrieval" }

View File

@@ -110,8 +110,6 @@ impl IngestionEnricher {
let request = CreateChatCompletionRequestArgs::default()
.model(&settings.processing_model)
.temperature(0.2)
.max_tokens(6048u32)
.messages([
ChatCompletionRequestSystemMessage::from(INGRESS_ANALYSIS_SYSTEM_MESSAGE).into(),
ChatCompletionRequestUserMessage::from(user_message).into(),

View File

@@ -1,9 +1,9 @@
use std::{sync::Arc, time::Instant};
use chrono::Utc;
use futures::future::try_join_all;
use text_splitter::TextSplitter;
use tracing::info;
use tokio::time::{sleep, Duration};
use tracing::{info, warn};
use common::{
error::AppError,
@@ -135,17 +135,73 @@ impl IngestionPipeline {
entities: Vec<KnowledgeEntity>,
relationships: Vec<KnowledgeRelationship>,
) -> Result<(), AppError> {
let entities = Arc::new(entities);
let relationships = Arc::new(relationships);
let entity_count = entities.len();
let relationship_count = relationships.len();
let entity_futures = entities
.iter()
.map(|entitity| self.db.store_item(entitity.to_owned()));
const STORE_GRAPH_MUTATION: &str = r#"
BEGIN TRANSACTION;
LET $entities = $entities;
LET $relationships = $relationships;
try_join_all(entity_futures).await?;
FOR $entity IN $entities {
CREATE type::thing('knowledge_entity', $entity.id) CONTENT $entity;
};
for relationship in &relationships {
relationship.store_relationship(&self.db).await?;
FOR $relationship IN $relationships {
LET $in_node = type::thing('knowledge_entity', $relationship.in);
LET $out_node = type::thing('knowledge_entity', $relationship.out);
RELATE $in_node->relates_to->$out_node CONTENT {
id: type::thing('relates_to', $relationship.id),
metadata: $relationship.metadata
};
};
COMMIT TRANSACTION;
"#;
const MAX_ATTEMPTS: usize = 3;
const INITIAL_BACKOFF_MS: u64 = 50;
const MAX_BACKOFF_MS: u64 = 800;
let mut backoff_ms = INITIAL_BACKOFF_MS;
let mut success = false;
for attempt in 0..MAX_ATTEMPTS {
let result = self
.db
.client
.query(STORE_GRAPH_MUTATION)
.bind(("entities", entities.clone()))
.bind(("relationships", relationships.clone()))
.await;
match result {
Ok(_) => {
success = true;
break;
}
Err(err) => {
if Self::is_retryable_conflict(&err) && attempt + 1 < MAX_ATTEMPTS {
warn!(
attempt = attempt + 1,
"Transient SurrealDB conflict while storing graph data; retrying"
);
sleep(Duration::from_millis(backoff_ms)).await;
backoff_ms = (backoff_ms * 2).min(MAX_BACKOFF_MS);
continue;
}
return Err(AppError::from(err));
}
}
}
if !success {
return Err(AppError::InternalError(
"Failed to store graph entities after retries".to_string(),
));
}
info!(
@@ -173,4 +229,10 @@ impl IngestionPipeline {
Ok(())
}
fn is_retryable_conflict(error: &surrealdb::Error) -> bool {
error
.to_string()
.contains("Failed to commit transaction due to a read or write conflict")
}
}

View File

@@ -9,10 +9,13 @@ use chrono::Utc;
use common::storage::db::SurrealDbClient;
use common::{
error::AppError,
storage::types::{
file_info::FileInfo,
ingestion_payload::IngestionPayload,
text_content::{TextContent, UrlInfo},
storage::{
store,
types::{
file_info::FileInfo,
ingestion_payload::IngestionPayload,
text_content::{TextContent, UrlInfo},
},
},
utils::config::AppConfig,
};
@@ -24,6 +27,7 @@ use tracing::{error, info};
use crate::utils::{
audio_transcription::transcribe_audio_file, image_parsing::extract_text_from_image,
pdf_ingestion::extract_pdf_content,
};
pub async fn to_text_content(
@@ -72,7 +76,7 @@ pub async fn to_text_content(
category,
user_id,
} => {
let text = extract_text_from_file(&file_info, db, openai_client).await?;
let text = extract_text_from_file(&file_info, db, openai_client, config).await?;
Ok(TextContent::new(
text,
Some(context),
@@ -199,43 +203,55 @@ async fn fetch_article_from_url(
Ok((article, file_info))
}
/// Extracts text from a file based on its MIME type.
/// Extracts text from a stored file by MIME type.
async fn extract_text_from_file(
file_info: &FileInfo,
db_client: &SurrealDbClient,
openai_client: &async_openai::Client<async_openai::config::OpenAIConfig>,
config: &AppConfig,
) -> Result<String, AppError> {
let base_path = store::resolve_base_dir(config);
let absolute_path = base_path.join(&file_info.path);
match file_info.mime_type.as_str() {
"text/plain" => {
// Read the file and return its content
let content = tokio::fs::read_to_string(&file_info.path).await?;
Ok(content)
}
"text/markdown" => {
// Read the file and return its content
let content = tokio::fs::read_to_string(&file_info.path).await?;
"text/plain" | "text/markdown" | "application/octet-stream" | "text/x-rust" => {
let content = tokio::fs::read_to_string(&absolute_path).await?;
Ok(content)
}
"application/pdf" => {
// TODO: Implement PDF text extraction using a crate like `pdf-extract` or `lopdf`
Err(AppError::NotFound(file_info.mime_type.clone()))
extract_pdf_content(
&absolute_path,
db_client,
openai_client,
&config.pdf_ingest_mode,
)
.await
}
"image/png" | "image/jpeg" => {
let content =
extract_text_from_image(&file_info.path, db_client, openai_client).await?;
Ok(content)
}
"application/octet-stream" => {
let content = tokio::fs::read_to_string(&file_info.path).await?;
Ok(content)
}
"text/x-rust" => {
let content = tokio::fs::read_to_string(&file_info.path).await?;
let path_str = absolute_path
.to_str()
.ok_or_else(|| {
AppError::Processing(format!(
"Encountered a non-UTF8 path while reading image {}",
file_info.id
))
})?
.to_string();
let content = extract_text_from_image(&path_str, db_client, openai_client).await?;
Ok(content)
}
"audio/mpeg" | "audio/mp3" | "audio/wav" | "audio/x-wav" | "audio/webm" | "audio/mp4"
| "audio/ogg" | "audio/flac" => {
transcribe_audio_file(&file_info.path, db_client, openai_client).await
let path_str = absolute_path
.to_str()
.ok_or_else(|| {
AppError::Processing(format!(
"Encountered a non-UTF8 path while reading audio {}",
file_info.id
))
})?
.to_string();
transcribe_audio_file(&path_str, db_client, openai_client).await
}
// Handle other MIME types as needed
_ => Err(AppError::NotFound(file_info.mime_type.clone())),

View File

@@ -23,7 +23,6 @@ pub async fn extract_text_from_image(
let request = CreateChatCompletionRequestArgs::default()
.model(system_settings.image_processing_model)
.max_tokens(6400_u32)
.messages([ChatCompletionRequestUserMessageArgs::default()
.content(vec![
ChatCompletionRequestMessageContentPartTextArgs::default()

View File

@@ -1,6 +1,7 @@
pub mod audio_transcription;
pub mod image_parsing;
pub mod llm_instructions;
pub mod pdf_ingestion;
use common::error::AppError;
use std::collections::HashMap;

View File

@@ -0,0 +1,793 @@
use std::{
path::{Path, PathBuf},
time::{Duration, SystemTime, UNIX_EPOCH},
};
use async_openai::types::{
ChatCompletionRequestMessageContentPartImageArgs,
ChatCompletionRequestMessageContentPartTextArgs, ChatCompletionRequestUserMessageArgs,
CreateChatCompletionRequestArgs, ImageDetail, ImageUrlArgs,
};
use base64::{engine::general_purpose::STANDARD, Engine as _};
use headless_chrome::{
protocol::cdp::{Emulation, Page, DOM},
Browser,
};
use lopdf::Document;
use serde_json::Value;
use tokio::time::sleep;
use tracing::{debug, warn};
use common::{
error::AppError,
storage::{db::SurrealDbClient, types::system_settings::SystemSettings},
utils::config::PdfIngestMode,
};
const FAST_PATH_MIN_LEN: usize = 150;
const FAST_PATH_MIN_ASCII_RATIO: f64 = 0.7;
const MAX_VISION_PAGES: usize = 50;
const PAGES_PER_VISION_CHUNK: usize = 4;
const MAX_VISION_ATTEMPTS: usize = 2;
const PDF_MARKDOWN_PROMPT: &str = "Convert these PDF pages to clean Markdown. Preserve headings, lists, tables, blockquotes, code fences, and inline formatting. Keep the original reading order, avoid commentary, and do NOT wrap the entire response in a Markdown code block.";
const PDF_MARKDOWN_PROMPT_RETRY: &str = "You must transcribe the provided PDF page images into accurate Markdown. The images are already supplied, so do not respond that you cannot view them. Extract all visible text, tables, and structure, and do NOT wrap the overall response in a Markdown code block.";
const NAVIGATION_RETRY_INTERVAL_MS: u64 = 120;
const NAVIGATION_RETRY_ATTEMPTS: usize = 10;
const MIN_PAGE_IMAGE_BYTES: usize = 1_024;
const DEFAULT_VIEWPORT_WIDTH: u32 = 1_248; // generous width to reduce horizontal clipping
const DEFAULT_VIEWPORT_HEIGHT: u32 = 1_800; // tall enough to capture full page at fit-to-width scale
const DEFAULT_DEVICE_SCALE_FACTOR: f64 = 1.0;
const CANVAS_VIEWPORT_ATTEMPTS: usize = 12;
const CANVAS_VIEWPORT_WAIT_MS: u64 = 200;
const DEBUG_IMAGE_ENV_VAR: &str = "MINNE_PDF_DEBUG_DIR";
/// Attempts to extract PDF content, using a fast text layer first and falling back to
/// rendering the document for a vision-enabled LLM when needed.
pub async fn extract_pdf_content(
file_path: &Path,
db: &SurrealDbClient,
client: &async_openai::Client<async_openai::config::OpenAIConfig>,
mode: &PdfIngestMode,
) -> Result<String, AppError> {
let pdf_bytes = tokio::fs::read(file_path).await?;
if let Some(candidate) = try_fast_path(pdf_bytes.clone()).await? {
return Ok(candidate);
}
if matches!(mode, PdfIngestMode::Classic) {
return Err(AppError::Processing(
"PDF text extraction failed and LLM-first mode is disabled".into(),
));
}
let page_numbers = load_page_numbers(pdf_bytes.clone()).await?;
if page_numbers.is_empty() {
return Err(AppError::Processing("PDF appears to have no pages".into()));
}
if page_numbers.len() > MAX_VISION_PAGES {
return Err(AppError::Processing(format!(
"PDF has {} pages which exceeds the configured vision processing limit of {}",
page_numbers.len(),
MAX_VISION_PAGES
)));
}
let rendered_pages = render_pdf_pages(file_path, &page_numbers).await?;
let combined_markdown = vision_markdown(rendered_pages, db, client).await?;
Ok(post_process(&combined_markdown))
}
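A hypothetical call-site sketch matching the signature above; `db` and `openai_client` are assumed to come from the ingestion context shown elsewhere in this PR, and the path is a placeholder:

```rust
let markdown = extract_pdf_content(
    Path::new("/tmp/example.pdf"),
    &db,
    &openai_client,
    &PdfIngestMode::LlmFirst,
)
.await?;
```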
/// Runs `pdf-extract` on the PDF bytes and validates the result with simple heuristics.
/// Returns `Ok(None)` when the text layer is missing or too noisy.
async fn try_fast_path(pdf_bytes: Vec<u8>) -> Result<Option<String>, AppError> {
let extraction = tokio::task::spawn_blocking(move || {
pdf_extract::extract_text_from_mem(&pdf_bytes).map(|s| s.trim().to_string())
})
.await?
.map_err(|err| AppError::Processing(format!("Failed to extract text from PDF: {err}")))?;
if extraction.is_empty() {
return Ok(None);
}
if !looks_good_enough(&extraction) {
return Ok(None);
}
Ok(Some(normalize_fast_text(&extraction)))
}
/// Parses the PDF structure to discover the available page numbers while keeping work off
/// the async executor.
async fn load_page_numbers(pdf_bytes: Vec<u8>) -> Result<Vec<u32>, AppError> {
let pages = tokio::task::spawn_blocking(move || -> Result<Vec<u32>, AppError> {
let document = Document::load_mem(&pdf_bytes)
.map_err(|err| AppError::Processing(format!("Failed to parse PDF: {err}")))?;
let mut page_numbers: Vec<u32> = document.get_pages().keys().copied().collect();
page_numbers.sort_unstable();
Ok(page_numbers)
})
.await??;
Ok(pages)
}
/// Uses the existing headless Chrome dependency to rasterize the requested PDF pages into PNGs.
async fn render_pdf_pages(file_path: &Path, pages: &[u32]) -> Result<Vec<Vec<u8>>, AppError> {
let file_url = url::Url::from_file_path(file_path)
.map_err(|_| AppError::Processing("Unable to construct PDF file URL".into()))?;
let browser = create_browser()?;
let tab = browser
.new_tab()
.map_err(|err| AppError::Processing(format!("Failed to create Chrome tab: {err}")))?;
tab.set_default_timeout(Duration::from_secs(10));
configure_tab(&tab)?;
set_pdf_viewport(&tab)?;
let mut captures = Vec::with_capacity(pages.len());
for (idx, page) in pages.iter().enumerate() {
let target = format!(
"{}#page={}&toolbar=0&statusbar=0&zoom=page-fit",
file_url, page
);
tab.navigate_to(&target)
.map_err(|err| AppError::Processing(format!("Failed to navigate to PDF page: {err}")))?
.wait_until_navigated()
.map_err(|err| AppError::Processing(format!("Navigation to PDF page failed: {err}")))?;
let mut loaded = false;
for attempt in 0..NAVIGATION_RETRY_ATTEMPTS {
if tab
.wait_for_element("embed, canvas, body")
.map(|_| ())
.is_ok()
{
loaded = true;
break;
}
if attempt + 1 < NAVIGATION_RETRY_ATTEMPTS {
sleep(Duration::from_millis(NAVIGATION_RETRY_INTERVAL_MS)).await;
}
}
if !loaded {
return Err(AppError::Processing(
"Timed out waiting for Chrome to render PDF page".into(),
));
}
wait_for_pdf_ready(&tab, *page)?;
tokio::time::sleep(Duration::from_millis(350)).await;
prepare_pdf_viewer(&tab, *page);
let mut viewport: Option<Page::Viewport> = None;
for attempt in 0..CANVAS_VIEWPORT_ATTEMPTS {
match canvas_viewport_for_page(&tab, *page) {
Ok(Some(vp)) => {
viewport = Some(vp);
break;
}
Ok(None) => {
if attempt + 1 < CANVAS_VIEWPORT_ATTEMPTS {
tokio::time::sleep(Duration::from_millis(CANVAS_VIEWPORT_WAIT_MS)).await;
}
}
Err(err) => {
warn!(page = *page, error = %err, "Failed to derive canvas viewport");
break;
}
}
}
let png = if let Some(clip) = viewport {
match tab.call_method(Page::CaptureScreenshot {
format: Some(Page::CaptureScreenshotFormatOption::Png),
quality: None,
clip: Some(clip),
from_surface: Some(true),
capture_beyond_viewport: Some(true),
optimize_for_speed: Some(false),
}) {
Ok(data) => match STANDARD.decode(data.data) {
Ok(bytes) => bytes,
Err(err) => {
warn!(error = %err, page = *page, "Failed to decode clipped screenshot; falling back to full page capture");
capture_full_page_png(&tab)?
}
},
Err(err) => {
warn!(error = %err, page = *page, "Clipped screenshot failed; falling back to full page capture");
capture_full_page_png(&tab)?
}
}
} else {
warn!(
page = *page,
"Unable to determine canvas viewport; capturing full page"
);
capture_full_page_png(&tab)?
};
debug!(
page = *page,
bytes = png.len(),
page_index = idx,
"Captured PDF page screenshot"
);
if is_suspicious_image(png.len()) {
warn!(
page = *page,
bytes = png.len(),
"Screenshot size below threshold; check rendering output"
);
}
if let Err(err) = maybe_dump_debug_image(*page, &png).await {
warn!(
page = *page,
error = %err,
"Failed to write debug screenshot to disk"
);
}
captures.push(png);
}
Ok(captures)
}
/// Launches a headless Chrome instance that respects the existing feature flags.
fn create_browser() -> Result<Browser, AppError> {
#[cfg(feature = "docker")]
{
let options = headless_chrome::LaunchOptionsBuilder::default()
.sandbox(false)
.build()
.map_err(|err| AppError::Processing(format!("Failed to launch Chrome: {err}")))?;
Browser::new(options)
.map_err(|err| AppError::Processing(format!("Failed to start Chrome: {err}")))
}
#[cfg(not(feature = "docker"))]
{
Browser::default()
.map_err(|err| AppError::Processing(format!("Failed to start Chrome: {err}")))
}
}
/// Sends one or more rendered pages to the configured multimodal model and stitches the resulting Markdown chunks together.
async fn vision_markdown(
rendered_pages: Vec<Vec<u8>>,
db: &SurrealDbClient,
client: &async_openai::Client<async_openai::config::OpenAIConfig>,
) -> Result<String, AppError> {
let settings = SystemSettings::get_current(db).await?;
let prompt = PDF_MARKDOWN_PROMPT;
debug!(
pages = rendered_pages.len(),
"Preparing vision batches for PDF conversion"
);
let mut markdown_sections = Vec::with_capacity(rendered_pages.len());
for (batch_idx, chunk) in rendered_pages.chunks(PAGES_PER_VISION_CHUNK).enumerate() {
let total_image_bytes: usize = chunk.iter().map(|bytes| bytes.len()).sum();
debug!(
batch = batch_idx,
pages = chunk.len(),
bytes = total_image_bytes,
"Encoding PDF images for vision batch"
);
let encoded_images: Vec<String> = chunk
.iter()
.enumerate()
.map(|(idx, png_bytes)| {
let encoded = STANDARD.encode(png_bytes);
if encoded.len() < 80 {
warn!(
batch = batch_idx,
page_index = idx,
encoded_bytes = encoded.len(),
"Encoded PDF image payload unusually small"
);
}
encoded
})
.collect();
let mut batch_markdown: Option<String> = None;
for attempt in 0..MAX_VISION_ATTEMPTS {
let prompt_text = prompt_for_attempt(attempt, prompt);
let mut content_parts = Vec::with_capacity(encoded_images.len() + 1);
content_parts.push(
ChatCompletionRequestMessageContentPartTextArgs::default()
.text(prompt_text)
.build()?
.into(),
);
for encoded in &encoded_images {
let image_url = format!("data:image/png;base64,{}", encoded);
content_parts.push(
ChatCompletionRequestMessageContentPartImageArgs::default()
.image_url(
ImageUrlArgs::default()
.url(image_url)
.detail(ImageDetail::High)
.build()?,
)
.build()?
.into(),
);
}
let request = CreateChatCompletionRequestArgs::default()
.model(settings.image_processing_model.clone())
.messages([ChatCompletionRequestUserMessageArgs::default()
.content(content_parts)
.build()?
.into()])
.build()?;
let response = client.chat().create(request).await?;
let Some(choice) = response.choices.first() else {
warn!(
batch = batch_idx,
attempt, "Vision response contained zero choices"
);
continue;
};
let Some(content) = choice.message.content.as_ref() else {
warn!(
batch = batch_idx,
attempt, "Vision response missing content field"
);
continue;
};
debug!(
batch = batch_idx,
attempt,
response_chars = content.len(),
"Received Markdown response for PDF batch"
);
let preview: String = if content.len() > 500 {
let mut snippet = content.chars().take(500).collect::<String>();
snippet.push('…');
snippet
} else {
content.clone()
};
debug!(batch = batch_idx, attempt, preview = %preview, "Vision response content preview");
if is_low_quality_response(content) {
warn!(
batch = batch_idx,
attempt, "Vision model returned low quality response"
);
if attempt + 1 == MAX_VISION_ATTEMPTS {
return Err(AppError::Processing(
"Vision model failed to transcribe PDF page contents".into(),
));
}
continue;
}
batch_markdown = Some(content.trim().to_string());
break;
}
if let Some(markdown) = batch_markdown {
markdown_sections.push(markdown);
} else {
return Err(AppError::Processing(
"Vision model did not return usable Markdown".into(),
));
}
}
Ok(markdown_sections.join("\n\n"))
}
/// Heuristic that determines whether the fast-path text looks like well-formed prose.
fn looks_good_enough(text: &str) -> bool {
if text.len() < FAST_PATH_MIN_LEN {
return false;
}
let total_chars = text.chars().count() as f64;
if total_chars == 0.0 {
return false;
}
let ascii_chars = text.chars().filter(|c| c.is_ascii()).count() as f64;
let ascii_ratio = ascii_chars / total_chars;
if ascii_ratio < FAST_PATH_MIN_ASCII_RATIO {
return false;
}
let letters = text.chars().filter(|c| c.is_alphabetic()).count() as f64;
let letter_ratio = letters / total_chars;
letter_ratio > 0.3
}
/// Normalizes fast-path output so downstream consumers see consistent Markdown.
fn normalize_fast_text(text: &str) -> String {
reflow_markdown(text)
}
/// Cleans, trims, and reflows Markdown created by the LLM path.
fn post_process(markdown: &str) -> String {
let cleaned = markdown.replace('\r', "");
let trimmed = cleaned.trim();
reflow_markdown(trimmed)
}
/// Joins hard-wrapped paragraph text while preserving structural Markdown lines.
fn reflow_markdown(input: &str) -> String {
let mut paragraphs = Vec::new();
let mut buffer: Vec<String> = Vec::new();
for line in input.lines() {
let trimmed = line.trim();
if trimmed.is_empty() {
if !buffer.is_empty() {
paragraphs.push(buffer.join(" "));
buffer.clear();
}
continue;
}
if is_structural_line(trimmed) {
if !buffer.is_empty() {
paragraphs.push(buffer.join(" "));
buffer.clear();
}
paragraphs.push(trimmed.to_string());
continue;
}
buffer.push(trimmed.to_string());
}
if !buffer.is_empty() {
paragraphs.push(buffer.join(" "));
}
paragraphs.join("\n\n")
}
/// Detects whether a line is structural Markdown that should remain on its own.
fn is_structural_line(line: &str) -> bool {
let lowered = line.to_ascii_lowercase();
line.starts_with('#')
|| line.starts_with('-')
|| line.starts_with('*')
|| line.starts_with('>')
|| line.starts_with("```")
|| line.starts_with('~')
|| line.starts_with("| ")
|| line.starts_with("+-")
|| (lowered
.chars()
.next()
.map(|c| c.is_ascii_digit())
.unwrap_or(false)
&& lowered.contains('.'))
}
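/// Resolves the optional debug screenshot directory from the environment.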
fn debug_dump_directory() -> Option<PathBuf> {
std::env::var(DEBUG_IMAGE_ENV_VAR)
.ok()
.map(|value| value.trim().to_string())
.filter(|value| !value.is_empty())
.map(PathBuf::from)
}
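/// Forces an opaque white page background so captures do not inherit transparency.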
fn configure_tab(tab: &headless_chrome::Tab) -> Result<(), AppError> {
tab.call_method(Emulation::SetDefaultBackgroundColorOverride {
color: Some(DOM::RGBA {
r: 255,
g: 255,
b: 255,
a: Some(1.0),
}),
})
.map_err(|err| {
AppError::Processing(format!("Failed to configure Chrome page background: {err}"))
})?;
Ok(())
}
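/// Applies fixed device metrics and visible size so pages render at a predictable resolution.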
fn set_pdf_viewport(tab: &headless_chrome::Tab) -> Result<(), AppError> {
tab.call_method(Emulation::SetDeviceMetricsOverride {
width: DEFAULT_VIEWPORT_WIDTH,
height: DEFAULT_VIEWPORT_HEIGHT,
device_scale_factor: DEFAULT_DEVICE_SCALE_FACTOR,
mobile: false,
scale: None,
screen_width: Some(DEFAULT_VIEWPORT_WIDTH),
screen_height: Some(DEFAULT_VIEWPORT_HEIGHT),
position_x: None,
position_y: None,
dont_set_visible_size: Some(false),
screen_orientation: None,
viewport: None,
display_feature: None,
device_posture: None,
})
.map_err(|err| AppError::Processing(format!("Failed to configure Chrome viewport: {err}")))?;
tab.call_method(Emulation::SetVisibleSize {
width: DEFAULT_VIEWPORT_WIDTH,
height: DEFAULT_VIEWPORT_HEIGHT,
})
.map_err(|err| AppError::Processing(format!("Failed to apply Chrome visible size: {err}")))?;
Ok(())
}
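/// Waits for the embedded PDF viewer element to appear and scrolls it into view.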
fn wait_for_pdf_ready(
tab: &headless_chrome::Tab,
page_number: u32,
) -> Result<headless_chrome::Element<'_>, AppError> {
let embed_selector = "embed[type='application/pdf']";
let element = tab
.wait_for_element_with_custom_timeout(embed_selector, Duration::from_secs(8))
.or_else(|_| tab.wait_for_element_with_custom_timeout("embed", Duration::from_secs(8)))
.map_err(|err| AppError::Processing(format!("Timed out waiting for PDF content: {err}")))?;
if let Err(err) = element.scroll_into_view() {
debug!("Failed to scroll PDF element into view: {err}");
}
debug!(page = page_number, "PDF viewer element located");
Ok(element)
}
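/// Hides the viewer toolbar and scrolls the requested page into view through the viewer's shadow DOM.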
fn prepare_pdf_viewer(tab: &headless_chrome::Tab, page_number: u32) {
let script = format!(
r#"(function() {{
const embed = document.querySelector('embed[type="application/pdf"]') || document.querySelector('embed');
if (!embed || !embed.shadowRoot) return false;
const viewer = embed.shadowRoot.querySelector('pdf-viewer');
if (!viewer || !viewer.shadowRoot) return false;
const app = viewer.shadowRoot.querySelector('viewer-app');
if (app && app.shadowRoot) {{
const toolbar = app.shadowRoot.querySelector('#toolbar');
if (toolbar) {{ toolbar.style.display = 'none'; }}
}}
const page = viewer.shadowRoot.querySelector('viewer-page:nth-of-type({page})');
if (page && page.scrollIntoView) {{
page.scrollIntoView({{ block: 'start', inline: 'center' }});
}}
const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page}"]');
return !!canvas;
}})()"#,
page = page_number
);
match tab.evaluate(&script, false) {
Ok(result) => {
let ready = result
.value
.as_ref()
.and_then(Value::as_bool)
.unwrap_or(false);
debug!(page = page_number, ready, "Prepared PDF viewer page");
}
Err(err) => {
debug!(page = page_number, error = %err, "Unable to run PDF viewer preparation script");
}
}
}
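/// Derives a screenshot clip from the bounding rect of the canvas backing the requested page.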
fn canvas_viewport_for_page(
tab: &headless_chrome::Tab,
page_number: u32,
) -> Result<Option<Page::Viewport>, AppError> {
let script = format!(
r#"(function() {{
const embed = document.querySelector('embed[type="application/pdf"]') || document.querySelector('embed');
if (!embed || !embed.shadowRoot) return null;
const viewer = embed.shadowRoot.querySelector('pdf-viewer');
if (!viewer || !viewer.shadowRoot) return null;
const canvas = viewer.shadowRoot.querySelector('canvas[aria-label="Page {page}"]');
if (!canvas) return null;
const rect = canvas.getBoundingClientRect();
return {{ x: rect.x, y: rect.y, width: rect.width, height: rect.height }};
}})()"#,
page = page_number
);
let result = tab
.evaluate(&script, false)
.map_err(|err| AppError::Processing(format!("Failed to inspect PDF canvas: {err}")))?;
let Some(value) = result.value else {
return Ok(None);
};
if value.is_null() {
return Ok(None);
}
let x = value
.get("x")
.and_then(Value::as_f64)
.unwrap_or_default()
.max(0.0);
let y = value
.get("y")
.and_then(Value::as_f64)
.unwrap_or_default()
.max(0.0);
let width = value
.get("width")
.and_then(Value::as_f64)
.unwrap_or_default();
let height = value
.get("height")
.and_then(Value::as_f64)
.unwrap_or_default();
if width <= 0.0 || height <= 0.0 {
return Ok(None);
}
debug!(
page = page_number,
x, y, width, height, "Derived canvas viewport"
);
Ok(Some(Page::Viewport {
x,
y,
width,
height,
scale: 1.0,
}))
}
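/// Captures the whole page as PNG, used as a fallback when no canvas clip is available.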
fn capture_full_page_png(tab: &headless_chrome::Tab) -> Result<Vec<u8>, AppError> {
let screenshot = tab
.call_method(Page::CaptureScreenshot {
format: Some(Page::CaptureScreenshotFormatOption::Png),
quality: None,
clip: None,
from_surface: Some(true),
capture_beyond_viewport: Some(true),
optimize_for_speed: Some(false),
})
.map_err(|err| {
AppError::Processing(format!("Failed to capture PDF page (fallback): {err}"))
})?;
STANDARD.decode(screenshot.data).map_err(|err| {
AppError::Processing(format!("Failed to decode PDF screenshot (fallback): {err}"))
})
}
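/// Flags captures small enough to suggest a blank or failed render.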
fn is_suspicious_image(len: usize) -> bool {
len < MIN_PAGE_IMAGE_BYTES
}
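/// Writes the rendered page image to the debug directory when one is configured.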
async fn maybe_dump_debug_image(page_index: u32, bytes: &[u8]) -> Result<(), AppError> {
if let Some(dir) = debug_dump_directory() {
tokio::fs::create_dir_all(&dir).await?;
let timestamp = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_millis();
let file_path = dir.join(format!("page-{page_index:04}-{timestamp}.png"));
tokio::fs::write(&file_path, bytes).await?;
debug!(?file_path, size = bytes.len(), "Wrote PDF debug screenshot");
}
Ok(())
}
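/// Heuristic that treats empty output or refusal phrasing ("unable to", "cannot") as low quality.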
fn is_low_quality_response(content: &str) -> bool {
let trimmed = content.trim();
if trimmed.is_empty() {
return true;
}
let lowered = trimmed.to_ascii_lowercase();
lowered.contains("unable to") || lowered.contains("cannot")
}
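/// Selects the base prompt for the first attempt and the stricter retry prompt afterwards.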
fn prompt_for_attempt(attempt: usize, base_prompt: &str) -> &str {
if attempt == 0 {
base_prompt
} else {
PDF_MARKDOWN_PROMPT_RETRY
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_looks_good_enough_short_text() {
assert!(!looks_good_enough("too short"));
}
#[test]
fn test_looks_good_enough_ascii_text() {
let text = "This is a reasonably long ASCII text that should pass the heuristic. \
It contains multiple sentences and a decent amount of letters to satisfy the threshold.";
assert!(looks_good_enough(text));
}
#[test]
fn test_reflow_markdown_preserves_lists() {
let input = "Item one\nItem two\n\n- Bullet\n- Another";
let output = reflow_markdown(input);
assert!(output.contains("Item one Item two"));
assert!(output.contains("- Bullet"));
}
#[test]
fn test_debug_dump_directory_env_var() {
std::env::remove_var(DEBUG_IMAGE_ENV_VAR);
assert!(debug_dump_directory().is_none());
std::env::set_var(DEBUG_IMAGE_ENV_VAR, "/tmp/minne_pdf_debug");
let dir = debug_dump_directory().expect("expected debug directory");
assert_eq!(dir, PathBuf::from("/tmp/minne_pdf_debug"));
std::env::remove_var(DEBUG_IMAGE_ENV_VAR);
}
#[test]
fn test_is_suspicious_image_threshold() {
assert!(is_suspicious_image(0));
assert!(is_suspicious_image(MIN_PAGE_IMAGE_BYTES - 1));
assert!(!is_suspicious_image(MIN_PAGE_IMAGE_BYTES + 1));
}
#[test]
fn test_is_low_quality_response_detection() {
assert!(is_low_quality_response(""));
assert!(is_low_quality_response("I'm unable to help."));
assert!(is_low_quality_response("I cannot read this."));
assert!(!is_low_quality_response("# Heading\nValid content"));
}
#[test]
fn test_prompt_for_attempt_variants() {
assert_eq!(
prompt_for_attempt(0, PDF_MARKDOWN_PROMPT),
PDF_MARKDOWN_PROMPT
);
assert_eq!(
prompt_for_attempt(1, PDF_MARKDOWN_PROMPT),
PDF_MARKDOWN_PROMPT_RETRY
);
assert_eq!(
prompt_for_attempt(5, PDF_MARKDOWN_PROMPT),
PDF_MARKDOWN_PROMPT_RETRY
);
}
#[test]
fn test_markdown_prompts_discourage_code_blocks() {
assert!(!PDF_MARKDOWN_PROMPT.contains("```"));
assert!(!PDF_MARKDOWN_PROMPT_RETRY.contains("```"));
}
}


@@ -1,6 +1,6 @@
 [package]
 name = "main"
-version = "0.2.0"
+version = "0.2.2"
 edition = "2021"
 repository = "https://github.com/perstarkse/minne"
 license = "AGPL-3.0-or-later"
@@ -23,6 +23,10 @@ api-router = { path = "../api-router" }
 html-router = { path = "../html-router" }
 common = { path = "../common" }
 
+[dev-dependencies]
+tower = "0.5"
+uuid = { workspace = true }
+
 [[bin]]
 name = "server"
 path = "src/server.rs"
@@ -34,4 +38,3 @@ path = "src/worker.rs"
 [[bin]]
 name = "main"
 path = "src/main.rs"
-

@@ -129,3 +129,98 @@ struct AppState {
api_state: ApiState,
html_state: HtmlState,
}
#[cfg(test)]
mod tests {
use super::*;
use axum::{body::Body, http::Request, http::StatusCode, Router};
use common::utils::config::{AppConfig, PdfIngestMode, StorageKind};
use std::{path::Path, sync::Arc};
use tower::ServiceExt;
use uuid::Uuid;
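// Minimal configuration pointing at an in-memory SurrealDB and a throwaway data directory.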
fn smoke_test_config(namespace: &str, database: &str, data_dir: &Path) -> AppConfig {
AppConfig {
openai_api_key: "test-key".into(),
surrealdb_address: "mem://".into(),
surrealdb_username: "root".into(),
surrealdb_password: "root".into(),
surrealdb_namespace: namespace.into(),
surrealdb_database: database.into(),
data_dir: data_dir.to_string_lossy().into_owned(),
http_port: 0,
openai_base_url: "https://example.com".into(),
storage: StorageKind::Local,
pdf_ingest_mode: PdfIngestMode::LlmFirst,
}
}
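// Boots the full router against an in-memory SurrealDB and probes the live and ready endpoints.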
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn smoke_startup_with_in_memory_surrealdb() {
let namespace = "test_ns";
let database = format!("test_db_{}", Uuid::new_v4());
let data_dir = std::env::temp_dir().join(format!("minne_smoke_{}", Uuid::new_v4()));
tokio::fs::create_dir_all(&data_dir)
.await
.expect("failed to create temp data directory");
let config = smoke_test_config(namespace, &database, &data_dir);
let db = Arc::new(
SurrealDbClient::memory(namespace, &database)
.await
.expect("failed to start in-memory surrealdb"),
);
db.apply_migrations()
.await
.expect("failed to apply migrations");
let session_store = Arc::new(db.create_session_store().await.expect("session store"));
let openai_client = Arc::new(async_openai::Client::with_config(
async_openai::config::OpenAIConfig::new()
.with_api_key(&config.openai_api_key)
.with_api_base(&config.openai_base_url),
));
let html_state =
HtmlState::new_with_resources(db.clone(), openai_client, session_store, config.clone())
.expect("failed to build html state");
let api_state = ApiState {
db: html_state.db.clone(),
config: config.clone(),
};
let app = Router::new()
.nest("/api/v1", api_routes_v1(&api_state))
.merge(html_routes(&html_state))
.with_state(AppState {
api_state,
html_state,
});
let response = app
.clone()
.oneshot(
Request::builder()
.uri("/api/v1/live")
.body(Body::empty())
.expect("request"),
)
.await
.expect("router response");
assert_eq!(response.status(), StatusCode::OK);
let ready_response = app
.oneshot(
Request::builder()
.uri("/api/v1/ready")
.body(Body::empty())
.expect("request"),
)
.await
.expect("ready response");
assert_eq!(ready_response.status(), StatusCode::OK);
tokio::fs::remove_dir_all(&data_dir).await.ok();
}
}

todo.md

@@ -1,52 +0,0 @@
[] implement prompt and model choice for image processing?
[x] ollama and changing of openai_base_url
[x] allow changing of port the server listens to
[] archive ingressed webpage, pdf would be easy
[] embed surrealdb for the main binary
[] three js graph explorer
[x] add user_id to ingress objects
[x] admin controls re registration
[x] allow setting of data storage folder, via envs and config
[x] build docker container on release plan
[x] change to smoothie dom
[x] chat functionality
[x] chat history
[x] chat styling overhaul
[x] configs primarily get envs
[x] debug vector search
[x] debug why automatic retrieval of the chrome binary does not work
[x] filtering on categories
[x] fix card image in content
[x] fix patch_text_content
[x] fix redirect for non hx
[x] full text search
[x] html ingression
[x] hx-redirect
[x] implement migrations
[x] integrate assets folder in release build
[x] integrate templates in release build
[x] ios shortcut generation
[x] job queue
[x] link to ingressed urls or archives
[x] macro for pagedata?
[x] make sure error messages render correctly
[x] markdown rendering in client
[x] marked loading conditions
[x] on updates of knowledgeentity create new embeddings
[x] openai api key in config
[x] option to set models, query and processing
[x] page screenshot?
[x] redirects
[x] rename ingestion instructions to context
[x] restrict retrieval to users own objects
[x] scroll reading window to top
[x] smoothie_dom test
[x] sse ingestion updates
[x] store page title
[x] template customization?
[x] templating
[x] testing core functions
[x] user id to fileinfo and data path?
[x] view content
[x] view graph map
[x] view latest