From 322d1ec318ba94a47ab8e550fd7035c047b57f0f Mon Sep 17 00:00:00 2001 From: Per Stark Date: Mon, 5 May 2025 13:44:57 +0200 Subject: [PATCH] feat: docker & docker-compose example --- .dockerignore | 40 +++++ Cargo.lock | 269 +--------------------------- Dockerfile | 53 ++++++ docker-compose.yml | 44 +++++ ingestion-pipeline/Cargo.toml | 6 +- ingestion-pipeline/src/types/mod.rs | 21 ++- result | 1 - 7 files changed, 163 insertions(+), 271 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 docker-compose.yml delete mode 120000 result diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..245604d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,40 @@ +# Git stuff +.git/ +.gitignore +.github + +# Node build artifacts +**/node_modules/ + +# Nix/Devenv environment files +.direnv/ +.devenv/ +devenv.lock +devenv.nix +devenv.yaml +docker-compose.yml +.envrc +.devenv.flake.nix +flake.lock +flake.nix + +# Rust build artifacts (crucial for multi-stage builds) +**/target/ + +# Runtime data directories +data/ +database/ + +# Local environment config (sensitive) +.env + +# IDE specific +.vscode/ +.idea/ + +# OS specific +.DS_Store +Thumbs.db + +# Logs / Temporary files +*.log diff --git a/Cargo.lock b/Cargo.lock index 86f8811..3707f74 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,15 +247,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "arbitrary" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" -dependencies = [ - "derive_arbitrary", -] - [[package]] name = "argon2" version = "0.5.3" @@ -517,7 +508,7 @@ dependencies = [ "quote", "serde", "serde_json", - "ureq 2.12.1", + "ureq", ] [[package]] @@ -953,25 +944,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bzip2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" -dependencies = [ - "cc", - "pkg-config", -] - [[package]] name = "castaway" version = "0.2.3" @@ -987,8 +959,6 @@ version = "1.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8691782945451c1c383942c4874dbe63814f61cb57ef773cda2972682b7bb3c0" dependencies = [ - "jobserver", - "libc", "shlex", ] @@ -1414,21 +1384,6 @@ dependencies = [ "libc", ] -[[package]] -name = "crc" -version = "3.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636" -dependencies = [ - "crc-catalog", -] - -[[package]] -name = "crc-catalog" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" - [[package]] name = "crc32fast" version = "1.4.2" @@ -1614,12 +1569,6 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" -[[package]] -name = "deflate64" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da692b8d1080ea3045efaab14434d40468c3d8657e42abddfffca87b428f4c1b" - [[package]] name = "deranged" version = "0.4.0" @@ -1630,17 +1579,6 @@ dependencies = [ "serde", ] -[[package]] -name = "derive_arbitrary" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.101", -] - [[package]] name = "derive_builder" version = "0.20.2" @@ -1720,15 +1658,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "directories" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f5094c54661b38d03bd7e50df373292118db60b585c08a411c6d840017fe7d" -dependencies = [ - "dirs-sys", -] - [[package]] name = "dirs-next" version = "2.0.0" @@ -1739,18 +1668,6 @@ dependencies = [ "dirs-sys-next", ] -[[package]] -name = "dirs-sys" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" -dependencies = [ - "libc", - "option-ext", - "redox_users 0.5.0", - "windows-sys 0.59.0", -] - [[package]] name = "dirs-sys-next" version = "0.1.2" @@ -1758,7 +1675,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" dependencies = [ "libc", - "redox_users 0.4.6", + "redox_users", "winapi", ] @@ -2410,13 +2327,13 @@ dependencies = [ [[package]] name = "headless_chrome" version = "1.0.17" -source = "git+https://github.com/rust-headless-chrome/rust-headless-chrome#8b66992826245cbf60377d619fc780f8c45abf8e" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c268ea01c2902b2acb382c1fae26818113dd661e0dba036a893f0ba40f00cdd8" dependencies = [ "anyhow", "auto_generate_cdp", "base64 0.22.1", "derive_builder", - "directories", "log", "rand 0.9.1", "regex", @@ -2425,12 +2342,9 @@ dependencies = [ "tempfile", "thiserror 2.0.12", "tungstenite 0.26.2", - "ureq 3.0.11", "url", - "walkdir", "which", "winreg", - "zip", ] [[package]] @@ -3014,16 +2928,6 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" -[[package]] -name = "jobserver" -version = "0.1.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" -dependencies = [ - "getrandom 0.3.2", - "libc", -] - [[package]] name = "js-sys" version = "0.3.77" @@ -3214,27 +3118,6 @@ dependencies = [ "hashbrown 0.15.3", ] -[[package]] -name = "lzma-rs" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" -dependencies = [ - "byteorder", - "crc", -] - -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "mac" version = "0.1.1" @@ -3771,12 +3654,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "option-ext" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" - [[package]] name = "ordered-multimap" version = "0.7.3" @@ -4416,17 +4293,6 @@ dependencies = [ "thiserror 1.0.69", ] -[[package]] -name = "redox_users" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b" -dependencies = [ - "getrandom 0.2.16", - "libredox", - "thiserror 2.0.12", -] - [[package]] name = "ref-cast" version = "1.0.24" @@ -5245,12 +5111,6 @@ dependencies = [ "libc", ] -[[package]] -name = "simd-adler32" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" - [[package]] name = "simdutf8" version = "0.1.5" @@ -6348,36 +6208,6 @@ dependencies = [ "webpki-roots", ] -[[package]] -name = "ureq" -version = "3.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7a3e9af6113ecd57b8c63d3cd76a385b2e3881365f1f489e54f49801d0c83ea" -dependencies = [ - "base64 0.22.1", - "flate2", - "log", - "percent-encoding", - "rustls", - "rustls-pemfile", - "rustls-pki-types", - "ureq-proto", - "utf-8", - "webpki-roots", -] - -[[package]] -name = "ureq-proto" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fadf18427d33828c311234884b7ba2afb57143e6e7e69fda7ee883b624661e36" -dependencies = [ - "base64 0.22.1", - "http", - "httparse", - "log", -] - [[package]] name = "url" version = "2.5.4" @@ -7041,15 +6871,6 @@ version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a62ce76d9b56901b19a74f19431b0d8b3bc7ca4ad685a746dfd78ca8f4fc6bda" -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - [[package]] name = "yaml-rust2" version = "0.10.1" @@ -7151,20 +6972,6 @@ name = "zeroize" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" -dependencies = [ - "zeroize_derive", -] - -[[package]] -name = "zeroize_derive" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.101", -] [[package]] name = "zerovec" @@ -7187,71 +6994,3 @@ dependencies = [ "quote", "syn 2.0.101", ] - -[[package]] -name = "zip" -version = "2.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dcb24d0152526ae49b9b96c1dcf71850ca1e0b882e4e28ed898a93c41334744" -dependencies = [ - "aes", - "arbitrary", - "bzip2", - "constant_time_eq", - "crc32fast", - "crossbeam-utils", - "deflate64", - "flate2", - "getrandom 0.3.2", - "hmac", - "indexmap 2.9.0", - "lzma-rs", - "memchr", - "pbkdf2", - "sha1", - "time", - "xz2", - "zeroize", - "zopfli", - "zstd", -] - -[[package]] -name = "zopfli" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edfc5ee405f504cd4984ecc6f14d02d55cfda60fa4b689434ef4102aae150cd7" -dependencies = [ - "bumpalo", - "crc32fast", - "log", - "simd-adler32", -] - -[[package]] -name = "zstd" -version = "0.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "7.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" -dependencies = [ - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "2.0.15+zstd.1.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" -dependencies = [ - "cc", - "pkg-config", -] diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..aea80ca --- /dev/null +++ b/Dockerfile @@ -0,0 +1,53 @@ +# === Builder Stage === +FROM clux/muslrust:1.86.0-stable as builder + +WORKDIR /usr/src/minne +COPY Cargo.toml Cargo.lock ./ +RUN mkdir -p api-router common composite-retrieval html-router ingestion-pipeline json-stream-parser main worker +COPY api-router/Cargo.toml ./api-router/ +COPY common/Cargo.toml ./common/ +COPY composite-retrieval/Cargo.toml ./composite-retrieval/ +COPY html-router/Cargo.toml ./html-router/ +COPY ingestion-pipeline/Cargo.toml ./ingestion-pipeline/ +COPY json-stream-parser/Cargo.toml ./json-stream-parser/ +COPY main/Cargo.toml ./main/ + +# Build with the MUSL target +RUN cargo build --release --target x86_64-unknown-linux-musl --bin main --features ingestion-pipeline/docker || true + +# Copy the rest of the source code +COPY . . + +# Build the final application binary with the MUSL target +RUN cargo build --release --target x86_64-unknown-linux-musl --bin main --features ingestion-pipeline/docker + +# === Runtime Stage === +FROM alpine:latest + +RUN apk update && apk add --no-cache \ + chromium \ + nss \ + freetype \ + harfbuzz \ + ca-certificates \ + ttf-freefont \ + font-noto-emoji \ + && \ + rm -rf /var/cache/apk/* + +ENV CHROME_BIN=/usr/bin/chromium-browser \ + CHROME_PATH=/usr/lib/chromium/ \ + SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt + +# Create a non-root user to run the application +RUN adduser -D -h /home/appuser appuser +WORKDIR /home/appuser +USER appuser + +# Copy the compiled binary from the builder stage (note the target path) +COPY --from=builder /usr/src/minne/target/x86_64-unknown-linux-musl/release/main /usr/local/bin/main + +EXPOSE 3000 +# EXPOSE 8000-9000 + +CMD ["main"] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..7fae1fb --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,44 @@ +version: '3.8' + +services: + minne: + build: . + container_name: minne_app + ports: + - "3000:3000" + environment: + SURREALDB_ADDRESS: "ws://surrealdb:8000" + SURREALDB_USERNAME: "root_user" + SURREALDB_PASSWORD: "root_password" + SURREALDB_DATABASE: "test" + SURREALDB_NAMESPACE: "test" + OPENAI_API_KEY: "sk-key" + # RUST_LOG: "info" + depends_on: + - surrealdb + networks: + - minne-net + command: ["sh", "-c", "echo 'Waiting for SurrealDB to start...' && sleep 10 && echo 'Starting application...' && /usr/local/bin/main"] + + surrealdb: + image: surrealdb/surrealdb:latest + container_name: minne_surrealdb + ports: + - "8000:8000" + volumes: + - ./database:/database # Mounts a 'database' folder from your project directory + command: > + start + --log debug + --user root_user + --pass root_password + rocksdb:./database/database.db + networks: + - minne-net + +volumes: + surrealdb_data: + +networks: + minne-net: + driver: bridge diff --git a/ingestion-pipeline/Cargo.toml b/ingestion-pipeline/Cargo.toml index 7db40ae..0414766 100644 --- a/ingestion-pipeline/Cargo.toml +++ b/ingestion-pipeline/Cargo.toml @@ -22,8 +22,10 @@ chrono = { version = "0.4.39", features = ["serde"] } text-splitter = "0.18.1" url = { version = "2.5.2", features = ["serde"] } uuid = { version = "1.10.0", features = ["v4", "serde"] } - -headless_chrome = { git = "https://github.com/rust-headless-chrome/rust-headless-chrome", features = ["fetch"] } +headless_chrome = "1.0.17" common = { path = "../common" } composite-retrieval = { path = "../composite-retrieval" } + +[features] +docker = [] diff --git a/ingestion-pipeline/src/types/mod.rs b/ingestion-pipeline/src/types/mod.rs index 381e7d8..d418d78 100644 --- a/ingestion-pipeline/src/types/mod.rs +++ b/ingestion-pipeline/src/types/mod.rs @@ -16,7 +16,8 @@ use common::{ }, }; use dom_smoothie::{Article, Readability, TextMode}; -use headless_chrome::Browser; +use headless_chrome::{Browser, LaunchOptionsBuilder}; +use std::io::{Seek, SeekFrom}; use tempfile::NamedTempFile; use tracing::{error, info}; @@ -76,7 +77,6 @@ pub async fn to_text_content( } } } -use std::io::{Seek, SeekFrom}; // <-- Add Seek and SeekFrom /// Fetches web content from a URL, extracts the main article text as Markdown, /// captures a screenshot, and stores the screenshot returning [`FileInfo`]. @@ -106,7 +106,22 @@ async fn fetch_article_from_url( // Instantiate timer let now = Instant::now(); // Setup browser, navigate and wait - let browser = Browser::default()?; + let browser = { + #[cfg(feature = "docker")] + { + // Use this when compiling for docker + let options = LaunchOptionsBuilder::default() + .sandbox(false) + .build() + .map_err(|e| AppError::InternalError(e.to_string()))?; + Browser::new(options)? + } + #[cfg(not(feature = "docker"))] + { + // Use this otherwise + Browser::default()? + } + }; let tab = browser.new_tab()?; let page = tab.navigate_to(url)?; let loaded_page = page.wait_until_navigated()?; diff --git a/result b/result deleted file mode 120000 index b86213c..0000000 --- a/result +++ /dev/null @@ -1 +0,0 @@ -/nix/store/d4n7jbj3j7ql8v90kn4zlf4kj2zq81sy-minne \ No newline at end of file