From 02198dc21aa6bca73e9e7ac8385db0fda84f5c32 Mon Sep 17 00:00:00 2001 From: Per Stark Date: Wed, 30 Apr 2025 08:06:18 +0200 Subject: [PATCH] feat: readability parsing, screenshot of page, file serving --- Cargo.lock | 507 ++++++++++++++---- Cargo.toml | 1 + common/Cargo.toml | 1 + common/src/error.rs | 2 + common/src/storage/types/file_info.rs | 90 +++- common/src/storage/types/ingestion_payload.rs | 1 + common/src/storage/types/system_settings.rs | 14 +- common/src/storage/types/text_content.rs | 29 +- html-router/Cargo.toml | 1 + html-router/src/routes/index/handlers.rs | 63 ++- html-router/src/routes/index/mod.rs | 3 +- html-router/templates/auth/admin_panel.html | 14 +- .../templates/content/content_list.html | 1 + .../index/signed_in/recent_content.html | 6 +- html-router/templates/sidebar.html | 4 + ingestion-pipeline/Cargo.toml | 9 +- ingestion-pipeline/src/enricher.rs | 2 +- ingestion-pipeline/src/pipeline.rs | 2 +- ingestion-pipeline/src/types/mod.rs | 264 ++++----- main/src/main.rs | 2 +- 20 files changed, 707 insertions(+), 309 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 44f1626..a970824 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -201,6 +201,9 @@ name = "arbitrary" version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" +dependencies = [ + "derive_arbitrary", +] [[package]] name = "argon2" @@ -451,6 +454,20 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "auto_generate_cdp" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6e1961a0d5d77969057eba90d448e610d3c439024d135d9dbd98e33ec973520" +dependencies = [ + "convert_case 0.4.0", + "proc-macro2", + "quote", + "serde", + "serde_json", + "ureq 2.12.1", +] + [[package]] name = "autocfg" version = "1.4.0" @@ -835,17 +852,6 @@ dependencies = [ "syn 2.0.100", ] -[[package]] -name = "bstr" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" -dependencies = [ - "memchr", - "regex-automata 0.4.9", - "serde", -] - [[package]] name = "bumpalo" version = "3.17.0" @@ -895,6 +901,25 @@ dependencies = [ "serde", ] +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "castaway" version = "0.2.3" @@ -910,6 +935,8 @@ version = "1.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" dependencies = [ + "jobserver", + "libc", "shlex", ] @@ -1070,6 +1097,7 @@ dependencies = [ "chrono", "chrono-tz", "config", + "dom_smoothie", "futures", "mime", "mime_guess", @@ -1124,7 +1152,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "595aae20e65c3be792d05818e8c63025294ac3cb7e200f11459063a352a6ef80" dependencies = [ "async-trait", - "convert_case", + "convert_case 0.6.0", "json5", "pathdiff", "ron", @@ -1162,6 +1190,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + [[package]] name = "convert_case" version = "0.6.0" @@ -1221,6 +1255,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.4.2" @@ -1385,6 +1434,12 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" +[[package]] +name = "deflate64" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da692b8d1080ea3045efaab14434d40468c3d8657e42abddfffca87b428f4c1b" + [[package]] name = "deranged" version = "0.4.0" @@ -1395,6 +1450,17 @@ dependencies = [ "serde", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "derive_builder" version = "0.20.2" @@ -1465,6 +1531,15 @@ dependencies = [ "subtle", ] +[[package]] +name = "directories" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16f5094c54661b38d03bd7e50df373292118db60b585c08a411c6d840017fe7d" +dependencies = [ + "dirs-sys", +] + [[package]] name = "dirs-next" version = "2.0.0" @@ -1475,6 +1550,18 @@ dependencies = [ "dirs-sys-next", ] +[[package]] +name = "dirs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" +dependencies = [ + "libc", + "option-ext", + "redox_users 0.5.0", + "windows-sys 0.59.0", +] + [[package]] name = "dirs-sys-next" version = "0.1.2" @@ -1482,7 +1569,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" dependencies = [ "libc", - "redox_users", + "redox_users 0.4.6", "winapi", ] @@ -1588,12 +1675,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "ego-tree" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" - [[package]] name = "either" version = "1.15.0" @@ -1624,6 +1705,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" +[[package]] +name = "env_home" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f84e12ccf0a7ddc17a6c41c93326024c42920d7ee630d04950e6926645c0fe" + [[package]] name = "equivalent" version = "1.0.2" @@ -1695,17 +1782,6 @@ dependencies = [ "tempfile", ] -[[package]] -name = "fancy-regex" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" -dependencies = [ - "bit-set 0.5.3", - "regex-automata 0.4.9", - "regex-syntax 0.8.5", -] - [[package]] name = "fastrand" version = "2.3.0" @@ -1736,6 +1812,16 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7ac824320a75a52197e8f2d787f6a38b6718bb6897a35142d749af3c0e8f4fe" +[[package]] +name = "flate2" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "float_next_after" version = "1.0.0" @@ -1995,15 +2081,6 @@ dependencies = [ "libm", ] -[[package]] -name = "getopts" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" -dependencies = [ - "unicode-width", -] - [[package]] name = "getrandom" version = "0.2.16" @@ -2126,6 +2203,32 @@ dependencies = [ "hashbrown 0.15.2", ] +[[package]] +name = "headless_chrome" +version = "1.0.17" +source = "git+https://github.com/rust-headless-chrome/rust-headless-chrome#8b66992826245cbf60377d619fc780f8c45abf8e" +dependencies = [ + "anyhow", + "auto_generate_cdp", + "base64 0.22.1", + "derive_builder", + "directories", + "log", + "rand 0.9.1", + "regex", + "serde", + "serde_json", + "tempfile", + "thiserror 2.0.12", + "tungstenite 0.26.2", + "ureq 3.0.11", + "url", + "walkdir", + "which", + "winreg", + "zip", +] + [[package]] name = "heapless" version = "0.8.0" @@ -2207,23 +2310,12 @@ dependencies = [ "tempfile", "thiserror 1.0.69", "tokio", + "tokio-util", "tower-http", "tower-serve-static", "tracing", ] -[[package]] -name = "html5ever" -version = "0.29.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c" -dependencies = [ - "log", - "mac", - "markup5ever 0.14.1", - "match_token", -] - [[package]] name = "html5ever" version = "0.30.0" @@ -2607,20 +2699,22 @@ version = "0.1.0" dependencies = [ "async-openai", "axum", + "axum_typed_multipart", "chrono", "common", "composite-retrieval", "dom_smoothie", "futures", + "headless_chrome", "reqwest", - "scraper", "serde", "serde_json", "surrealdb", + "tempfile", "text-splitter", - "tiktoken-rs", "tokio", "tracing", + "url", "uuid", ] @@ -2701,6 +2795,16 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "jobserver" +version = "0.1.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" +dependencies = [ + "getrandom 0.3.2", + "libc", +] + [[package]] name = "js-sys" version = "0.3.77" @@ -2882,6 +2986,27 @@ dependencies = [ "hashbrown 0.15.2", ] +[[package]] +name = "lzma-rs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" +dependencies = [ + "byteorder", + "crc", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "mac" version = "0.1.1" @@ -2915,20 +3040,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" -[[package]] -name = "markup5ever" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18" -dependencies = [ - "log", - "phf", - "phf_codegen", - "string_cache", - "string_cache_codegen", - "tendril", -] - [[package]] name = "markup5ever" version = "0.15.0" @@ -3422,6 +3533,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "ordered-multimap" version = "0.7.3" @@ -3869,7 +3986,7 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash 2.1.1", + "rustc-hash", "rustls", "socket2", "thiserror 2.0.12", @@ -3888,7 +4005,7 @@ dependencies = [ "getrandom 0.3.2", "rand 0.9.1", "ring", - "rustc-hash 2.1.1", + "rustc-hash", "rustls", "rustls-pki-types", "slab", @@ -4055,6 +4172,17 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "redox_users" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b" +dependencies = [ + "getrandom 0.2.16", + "libredox", + "thiserror 2.0.12", +] + [[package]] name = "ref-cast" version = "1.0.24" @@ -4284,7 +4412,7 @@ dependencies = [ "proc-macro2", "quote", "rinja_parser", - "rustc-hash 2.1.1", + "rustc-hash", "serde", "syn 2.0.100", ] @@ -4444,12 +4572,6 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustc-hash" version = "2.1.1" @@ -4588,21 +4710,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "scraper" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc3d051b884f40e309de6c149734eab57aa8cc1347992710dc80bcc1c2194c15" -dependencies = [ - "cssparser 0.34.0", - "ego-tree", - "getopts", - "html5ever 0.29.1", - "precomputed-hash", - "selectors", - "tendril", -] - [[package]] name = "scrypt" version = "0.11.0" @@ -4888,6 +4995,12 @@ dependencies = [ "libc", ] +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "simdutf8" version = "0.1.5" @@ -4974,6 +5087,17 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "socks" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" +dependencies = [ + "byteorder", + "libc", + "winapi", +] + [[package]] name = "spade" version = "2.13.1" @@ -5427,22 +5551,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "tiktoken-rs" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6" -dependencies = [ - "anyhow", - "base64 0.21.7", - "bstr", - "fancy-regex", - "lazy_static", - "parking_lot", - "regex", - "rustc-hash 1.1.0", -] - [[package]] name = "time" version = "0.3.41" @@ -5597,7 +5705,7 @@ dependencies = [ "rustls-pki-types", "tokio", "tokio-rustls", - "tungstenite", + "tungstenite 0.23.0", "webpki-roots", ] @@ -5829,6 +5937,23 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4793cb5e56680ecbb1d843515b23b6de9a75eb04b66643e256a396d43be33c13" +dependencies = [ + "bytes", + "data-encoding", + "http", + "httparse", + "log", + "rand 0.9.1", + "sha1", + "thiserror 2.0.12", + "utf-8", +] + [[package]] name = "typeid" version = "1.0.3" @@ -5935,6 +6060,53 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "ureq" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" +dependencies = [ + "base64 0.22.1", + "flate2", + "log", + "once_cell", + "rustls", + "rustls-pki-types", + "socks", + "url", + "webpki-roots", +] + +[[package]] +name = "ureq" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7a3e9af6113ecd57b8c63d3cd76a385b2e3881365f1f489e54f49801d0c83ea" +dependencies = [ + "base64 0.22.1", + "flate2", + "log", + "percent-encoding", + "rustls", + "rustls-pemfile", + "rustls-pki-types", + "ureq-proto", + "utf-8", + "webpki-roots", +] + +[[package]] +name = "ureq-proto" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fadf18427d33828c311234884b7ba2afb57143e6e7e69fda7ee883b624661e36" +dependencies = [ + "base64 0.22.1", + "http", + "httparse", + "log", +] + [[package]] name = "url" version = "2.5.4" @@ -6191,6 +6363,18 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "which" +version = "7.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d643ce3fd3e5b54854602a080f34fb10ab75e0b813ee32d00ca2b44fa74762" +dependencies = [ + "either", + "env_home", + "rustix", + "winsafe", +] + [[package]] name = "winapi" version = "0.3.9" @@ -6509,6 +6693,22 @@ dependencies = [ "memchr", ] +[[package]] +name = "winreg" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb5a765337c50e9ec252c2069be9bf91c7df47afb103b642ba3a53bf8101be97" +dependencies = [ + "cfg-if", + "windows-sys 0.59.0", +] + +[[package]] +name = "winsafe" +version = "0.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904" + [[package]] name = "wit-bindgen-rt" version = "0.39.0" @@ -6564,6 +6764,15 @@ version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a62ce76d9b56901b19a74f19431b0d8b3bc7ca4ad685a746dfd78ca8f4fc6bda" +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yaml-rust2" version = "0.10.1" @@ -6665,6 +6874,20 @@ name = "zeroize" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] [[package]] name = "zerovec" @@ -6687,3 +6910,71 @@ dependencies = [ "quote", "syn 2.0.100", ] + +[[package]] +name = "zip" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dcb24d0152526ae49b9b96c1dcf71850ca1e0b882e4e28ed898a93c41334744" +dependencies = [ + "aes", + "arbitrary", + "bzip2", + "constant_time_eq", + "crc32fast", + "crossbeam-utils", + "deflate64", + "flate2", + "getrandom 0.3.2", + "hmac", + "indexmap 2.9.0", + "lzma-rs", + "memchr", + "pbkdf2", + "sha1", + "time", + "xz2", + "zeroize", + "zopfli", + "zstd", +] + +[[package]] +name = "zopfli" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edfc5ee405f504cd4984ecc6f14d02d55cfda60fa4b689434ef4102aae150cd7" +dependencies = [ + "bumpalo", + "crc32fast", + "log", + "simd-adler32", +] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.15+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index fa0b50d..9cbff4d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,3 +26,4 @@ axum_session_auth = "0.16" axum_session_surreal = "0.4" axum_typed_multipart = "0.16" tempfile = "3.12.0" +dom_smoothie = "0.10.0" diff --git a/common/Cargo.toml b/common/Cargo.toml index 640abaf..91064e9 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -16,6 +16,7 @@ surrealdb = { workspace = true, features = ["kv-mem"] } async-openai = { workspace = true } futures = { workspace = true } tempfile = { workspace = true } +dom_smoothie = { workspace = true } async-trait = "0.1.88" axum_session = { workspace = true } diff --git a/common/src/error.rs b/common/src/error.rs index b1e97c8..1b90b64 100644 --- a/common/src/error.rs +++ b/common/src/error.rs @@ -33,4 +33,6 @@ pub enum AppError { Tiktoken(#[from] anyhow::Error), #[error("Ingress Processing error: {0}")] Processing(String), + #[error("DOM smoothie error: {0}")] + DomSmoothie(#[from] dom_smoothie::ReadabilityError), } diff --git a/common/src/storage/types/file_info.rs b/common/src/storage/types/file_info.rs index 7cdff9a..b0e5aef 100644 --- a/common/src/storage/types/file_info.rs +++ b/common/src/storage/types/file_info.rs @@ -38,7 +38,8 @@ stored_object!(FileInfo, "file", { sha256: String, path: String, file_name: String, - mime_type: String + mime_type: String, + user_id: String }); impl FileInfo { @@ -83,6 +84,7 @@ impl FileInfo { .to_string_lossy() .into(), mime_type: Self::guess_mime_type(Path::new(&sanitized_file_name)), + user_id: user_id.to_string(), }; // Store in database @@ -258,6 +260,22 @@ impl FileInfo { Ok(()) } + + /// Retrieves a `FileInfo` by its ID. + /// + /// # Arguments + /// * `id` - The ID string of the file. + /// * `db_client` - Reference to the SurrealDbClient. + /// + /// # Returns + /// * `Result` - The `FileInfo` or an error if not found or on DB issues. + pub async fn get_by_id(id: &str, db_client: &SurrealDbClient) -> Result { + match db_client.get_item::(id).await { + Ok(Some(file_info)) => Ok(file_info), + Ok(None) => Err(FileError::FileNotFound(id.to_string())), + Err(e) => Err(FileError::SurrealError(e)), + } + } } #[cfg(test)] @@ -460,6 +478,7 @@ mod tests { id: Uuid::new_v4().to_string(), created_at: now, updated_at: now, + user_id: "user123".to_string(), sha256: "test_sha256_hash".to_string(), path: "/path/to/file.txt".to_string(), file_name: "manual_file.txt".to_string(), @@ -517,6 +536,7 @@ mod tests { // The file path should point to our test file let file_info = FileInfo { id: file_id.clone(), + user_id: "user123".to_string(), created_at: now, updated_at: now, sha256: "test_sha256_hash".to_string(), @@ -586,4 +606,72 @@ mod tests { _ => panic!("Expected FileNotFound error"), } } + #[tokio::test] + async fn test_get_by_id() { + // Setup in-memory database for testing + let namespace = "test_ns"; + let database = &Uuid::new_v4().to_string(); + let db = SurrealDbClient::memory(namespace, database) + .await + .expect("Failed to start in-memory surrealdb"); + + // Create a FileInfo instance directly + let now = Utc::now(); + let file_id = Uuid::new_v4().to_string(); + let original_file_info = FileInfo { + id: file_id.clone(), + user_id: "user123".to_string(), + created_at: now, + updated_at: now, + sha256: "test_sha256_for_get_by_id".to_string(), + path: "/path/to/get_by_id_test.txt".to_string(), + file_name: "get_by_id_test.txt".to_string(), + mime_type: "text/plain".to_string(), + }; + + // Store it in the database + db.store_item(original_file_info.clone()) + .await + .expect("Failed to store item for get_by_id test"); + + // Retrieve it using get_by_id + let result = FileInfo::get_by_id(&file_id, &db).await; + + // Assert success and content match + assert!(result.is_ok()); + let retrieved_info = result.unwrap(); + assert_eq!(retrieved_info.id, original_file_info.id); + assert_eq!(retrieved_info.sha256, original_file_info.sha256); + assert_eq!(retrieved_info.file_name, original_file_info.file_name); + assert_eq!(retrieved_info.path, original_file_info.path); + assert_eq!(retrieved_info.mime_type, original_file_info.mime_type); + // Optionally compare timestamps if precision isn't an issue + // assert_eq!(retrieved_info.created_at, original_file_info.created_at); + } + + #[tokio::test] + async fn test_get_by_id_not_found() { + // Setup in-memory database for testing + let namespace = "test_ns"; + let database = &Uuid::new_v4().to_string(); + let db = SurrealDbClient::memory(namespace, database) + .await + .expect("Failed to start in-memory surrealdb"); + + // Try to retrieve a non-existent ID + let non_existent_id = "non-existent-file-id"; + let result = FileInfo::get_by_id(non_existent_id, &db).await; + + // Assert failure + assert!(result.is_err()); + + // Assert the specific error type is FileNotFound + match result { + Err(FileError::FileNotFound(id)) => { + assert_eq!(id, non_existent_id); + } + Err(e) => panic!("Expected FileNotFound error, but got {:?}", e), + Ok(_) => panic!("Expected an error, but got Ok"), + } + } } diff --git a/common/src/storage/types/ingestion_payload.rs b/common/src/storage/types/ingestion_payload.rs index 85757ce..c37532e 100644 --- a/common/src/storage/types/ingestion_payload.rs +++ b/common/src/storage/types/ingestion_payload.rs @@ -114,6 +114,7 @@ mod tests { id: mock.id, sha256: "mock-sha256".to_string(), path: "/mock/path".to_string(), + user_id: "user123".to_string(), file_name: "mock.txt".to_string(), mime_type: "text/plain".to_string(), created_at: Utc::now(), diff --git a/common/src/storage/types/system_settings.rs b/common/src/storage/types/system_settings.rs index 7d2b195..4aa20c3 100644 --- a/common/src/storage/types/system_settings.rs +++ b/common/src/storage/types/system_settings.rs @@ -31,19 +31,7 @@ impl SystemSettings { let settings: Option = db.get_item("current").await?; if settings.is_none() { - let created_settings = SystemSettings { - id: "current".to_string(), - registrations_enabled: true, - require_email_verification: false, - query_model: "gpt-4o-mini".to_string(), - processing_model: "gpt-4o-mini".to_string(), - query_system_prompt: - crate::storage::types::system_prompts::DEFAULT_QUERY_SYSTEM_PROMPT.to_string(), - ingestion_system_prompt: - crate::storage::types::system_prompts::DEFAULT_INGRESS_ANALYSIS_SYSTEM_PROMPT - .to_string(), - }; - + let created_settings = Self::new(); let stored: Option = db.store_item(created_settings).await?; return stored.ok_or(AppError::Validation("Failed to initialize settings".into())); } diff --git a/common/src/storage/types/text_content.rs b/common/src/storage/types/text_content.rs index 790cdae..273daa2 100644 --- a/common/src/storage/types/text_content.rs +++ b/common/src/storage/types/text_content.rs @@ -5,10 +5,17 @@ use crate::{error::AppError, storage::db::SurrealDbClient, stored_object}; use super::file_info::FileInfo; +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct UrlInfo { + pub url: String, + pub title: String, + pub image_id: String, +} + stored_object!(TextContent, "text_content", { text: String, file_info: Option, - url: Option, + url_info: Option, instructions: String, category: String, user_id: String @@ -20,7 +27,7 @@ impl TextContent { instructions: String, category: String, file_info: Option, - url: Option, + url_info: Option, user_id: String, ) -> Self { let now = Utc::now(); @@ -30,7 +37,7 @@ impl TextContent { updated_at: now, text, file_info, - url, + url_info, instructions, category, user_id, @@ -85,7 +92,7 @@ mod tests { assert_eq!(text_content.category, category); assert_eq!(text_content.user_id, user_id); assert!(text_content.file_info.is_none()); - assert!(text_content.url.is_none()); + assert!(text_content.url_info.is_none()); assert!(!text_content.id.is_empty()); } @@ -96,19 +103,27 @@ mod tests { let instructions = "URL instructions".to_string(); let category = "URL category".to_string(); let user_id = "user123".to_string(); - let url = Some("https://example.com/document.pdf".to_string()); + let title = "page_title".to_string(); + let image_id = "image12312".to_string(); + let url = "https://example.com/document.pdf".to_string(); + + let url_info = Some(UrlInfo { + url, + title, + image_id, + }); let text_content = TextContent::new( text.clone(), instructions.clone(), category.clone(), None, - url.clone(), + url_info.clone(), user_id.clone(), ); // Check URL field is set - assert_eq!(text_content.url, url); + assert_eq!(text_content.url_info, url_info); } #[tokio::test] diff --git a/html-router/Cargo.toml b/html-router/Cargo.toml index 6d120f5..2f26152 100644 --- a/html-router/Cargo.toml +++ b/html-router/Cargo.toml @@ -31,6 +31,7 @@ tower-http = { version = "0.6.2", features = ["fs"] } chrono-tz = "0.10.1" tower-serve-static = "0.1.1" include_dir = "0.7.4" +tokio-util = { version = "0.7.15", features = ["io"] } common = { path = "../common" } composite-retrieval = { path = "../composite-retrieval" } diff --git a/html-router/src/routes/index/handlers.rs b/html-router/src/routes/index/handlers.rs index 33f5ce5..cf742ee 100644 --- a/html-router/src/routes/index/handlers.rs +++ b/html-router/src/routes/index/handlers.rs @@ -1,9 +1,12 @@ use axum::{ + body::Body, extract::{Path, State}, + http::{header, HeaderMap, HeaderValue, StatusCode}, response::IntoResponse, }; use serde::Serialize; -use tokio::join; +use tokio::{fs::File, join}; +use tokio_util::io::ReaderStream; use crate::{ middlewares::{ @@ -15,9 +18,15 @@ use crate::{ use common::{ error::AppError, storage::types::{ - conversation::Conversation, file_info::FileInfo, ingestion_task::IngestionTask, - knowledge_entity::KnowledgeEntity, knowledge_relationship::KnowledgeRelationship, - text_chunk::TextChunk, text_content::TextContent, user::User, + conversation::Conversation, + file_info::{FileError, FileInfo}, + ingestion_task::IngestionTask, + knowledge_entity::KnowledgeEntity, + knowledge_relationship::KnowledgeRelationship, + text_chunk::TextChunk, + text_content::TextContent, + user::User, + StoredObject, }, }; @@ -167,3 +176,49 @@ pub async fn show_active_jobs( }, )) } + +pub async fn serve_file( + State(state): State, + RequireUser(user): RequireUser, + Path(file_id): Path, +) -> Result { + let file_info = match FileInfo::get_by_id(&file_id, &state.db).await { + Ok(info) => info, + _ => return Ok(TemplateResponse::not_found().into_response()), + }; + + if file_info.user_id != user.id { + return Ok(TemplateResponse::unauthorized().into_response()); + } + + // 3. Open the file asynchronously from the stored path + let path = std::path::Path::new(&file_info.path); + + let file = match File::open(path).await { + Ok(f) => f, + Err(e) => return Ok(TemplateResponse::server_error().into_response()), + }; + + let stream = ReaderStream::new(file); + let body = Body::from_stream(stream); + + let mut headers = HeaderMap::new(); + headers.insert( + header::CONTENT_TYPE, + HeaderValue::from_str(&file_info.mime_type) + .unwrap_or_else(|_| HeaderValue::from_static("application/octet-stream")), + ); + let Ok(disposition_value) = + HeaderValue::from_str(&format!("attachment; filename=\"{}\"", file_info.file_name)) + else { + headers.insert( + header::CONTENT_DISPOSITION, + HeaderValue::from_static("attachment"), + ); + return Ok((StatusCode::OK, headers, body).into_response()); + }; + headers.insert(header::CONTENT_DISPOSITION, disposition_value); + + // 5. Return the response + Ok((StatusCode::OK, headers, body).into_response()) +} diff --git a/html-router/src/routes/index/mod.rs b/html-router/src/routes/index/mod.rs index af32086..a85dde8 100644 --- a/html-router/src/routes/index/mod.rs +++ b/html-router/src/routes/index/mod.rs @@ -5,7 +5,7 @@ use axum::{ routing::{delete, get}, Router, }; -use handlers::{delete_job, delete_text_content, index_handler, show_active_jobs}; +use handlers::{delete_job, delete_text_content, index_handler, serve_file, show_active_jobs}; use crate::html_state::HtmlState; @@ -26,4 +26,5 @@ where .route("/jobs/{job_id}", delete(delete_job)) .route("/active-jobs", get(show_active_jobs)) .route("/text-content/{id}", delete(delete_text_content)) + .route("/file/{id}", get(serve_file)) } diff --git a/html-router/templates/auth/admin_panel.html b/html-router/templates/auth/admin_panel.html index fadaea7..a5276bc 100644 --- a/html-router/templates/auth/admin_panel.html +++ b/html-router/templates/auth/admin_panel.html @@ -54,9 +54,9 @@

Model used for answering user queries

@@ -66,11 +66,11 @@ Processing Model

Model used for content processing and ingestion

diff --git a/html-router/templates/content/content_list.html b/html-router/templates/content/content_list.html index e24bc0b..f4df509 100644 --- a/html-router/templates/content/content_list.html +++ b/html-router/templates/content/content_list.html @@ -1,6 +1,7 @@
{% for text_content in text_contents %}
+

diff --git a/html-router/templates/index/signed_in/recent_content.html b/html-router/templates/index/signed_in/recent_content.html index d2fa1a5..e6af068 100644 --- a/html-router/templates/index/signed_in/recent_content.html +++ b/html-router/templates/index/signed_in/recent_content.html @@ -4,7 +4,7 @@ {% for item in latest_text_contents %}
  • - {% if item.url %} + {% if item.url_info %} {% include "icons/globe_icon.html" %} {% elif item.file_info %} {% include "icons/document_icon.html" %} @@ -14,8 +14,8 @@
    - {% if item.url %} - {{item.url}} + {% if item.url_info %} + {{item.url_info.title}} {% elif item.file_info%} {{item.file_info.file_name}} {% else %} diff --git a/html-router/templates/sidebar.html b/html-router/templates/sidebar.html index 208b900..bea392f 100644 --- a/html-router/templates/sidebar.html +++ b/html-router/templates/sidebar.html @@ -31,6 +31,10 @@
  • {% endfor %} +
  • + +
  • diff --git a/ingestion-pipeline/Cargo.toml b/ingestion-pipeline/Cargo.toml index 0ce3c17..7db40ae 100644 --- a/ingestion-pipeline/Cargo.toml +++ b/ingestion-pipeline/Cargo.toml @@ -13,14 +13,17 @@ serde_json = { workspace = true } futures = { workspace = true } async-openai = { workspace = true } surrealdb = { workspace = true } +dom_smoothie = { workspace = true } +tempfile = { workspace = true } +axum_typed_multipart = { workspace = true} -tiktoken-rs = "0.6.0" reqwest = {version = "0.12.12", features = ["charset", "json"]} -scraper = "0.22.0" chrono = { version = "0.4.39", features = ["serde"] } text-splitter = "0.18.1" +url = { version = "2.5.2", features = ["serde"] } uuid = { version = "1.10.0", features = ["v4", "serde"] } -dom_smoothie = "0.10.0" + +headless_chrome = { git = "https://github.com/rust-headless-chrome/rust-headless-chrome", features = ["fetch"] } common = { path = "../common" } composite-retrieval = { path = "../composite-retrieval" } diff --git a/ingestion-pipeline/src/enricher.rs b/ingestion-pipeline/src/enricher.rs index c2dee13..a7274b1 100644 --- a/ingestion-pipeline/src/enricher.rs +++ b/ingestion-pipeline/src/enricher.rs @@ -111,7 +111,7 @@ impl IngestionEnricher { let request = CreateChatCompletionRequestArgs::default() .model(&settings.processing_model) .temperature(0.2) - .max_tokens(3048u32) + .max_tokens(6048u32) .messages([ ChatCompletionRequestSystemMessage::from(INGRESS_ANALYSIS_SYSTEM_MESSAGE).into(), ChatCompletionRequestUserMessage::from(user_message).into(), diff --git a/ingestion-pipeline/src/pipeline.rs b/ingestion-pipeline/src/pipeline.rs index 26db066..dea01d0 100644 --- a/ingestion-pipeline/src/pipeline.rs +++ b/ingestion-pipeline/src/pipeline.rs @@ -53,7 +53,7 @@ impl IngestionPipeline { ) .await?; - let text_content = to_text_content(task.content, &self.openai_client, &self.db).await?; + let text_content = to_text_content(task.content, &self.db).await?; match self.process(&text_content).await { Ok(_) => { diff --git a/ingestion-pipeline/src/types/mod.rs b/ingestion-pipeline/src/types/mod.rs index cd0ccdf..381e7d8 100644 --- a/ingestion-pipeline/src/types/mod.rs +++ b/ingestion-pipeline/src/types/mod.rs @@ -1,29 +1,28 @@ pub mod llm_enrichment_result; -use std::{sync::Arc, time::Duration}; +use std::io::Write; +use std::time::Instant; -use async_openai::types::{ - ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage, - CreateChatCompletionRequestArgs, -}; +use axum::http::HeaderMap; +use axum_typed_multipart::{FieldData, FieldMetadata}; +use chrono::Utc; use common::storage::db::SurrealDbClient; use common::{ error::AppError, storage::types::{ - file_info::FileInfo, ingestion_payload::IngestionPayload, system_settings::SystemSettings, - text_content::TextContent, + file_info::FileInfo, + ingestion_payload::IngestionPayload, + text_content::{TextContent, UrlInfo}, }, }; -use dom_smoothie::TextMode; -use reqwest; -use scraper::{Html, Selector}; -use std::fmt::Write; -use tiktoken_rs::{o200k_base, CoreBPE}; +use dom_smoothie::{Article, Readability, TextMode}; +use headless_chrome::Browser; +use tempfile::NamedTempFile; +use tracing::{error, info}; pub async fn to_text_content( ingestion_payload: IngestionPayload, - openai_client: &Arc>, - db_client: &Arc, + db: &SurrealDbClient, ) -> Result { match ingestion_payload { IngestionPayload::Url { @@ -32,13 +31,17 @@ pub async fn to_text_content( category, user_id, } => { - let text = fetch_text_from_url(&url, openai_client, db_client).await?; + let (article, file_info) = fetch_article_from_url(&url, db, &user_id).await?; Ok(TextContent::new( - text, + article.text_content.into(), instructions, category, None, - Some(url), + Some(UrlInfo { + url, + title: article.title, + image_id: file_info.id, + }), user_id, )) } @@ -73,161 +76,104 @@ pub async fn to_text_content( } } } +use std::io::{Seek, SeekFrom}; // <-- Add Seek and SeekFrom -/// Get text from url, will return it as a markdown formatted string -async fn fetch_text_from_url( +/// Fetches web content from a URL, extracts the main article text as Markdown, +/// captures a screenshot, and stores the screenshot returning [`FileInfo`]. +/// +/// This function handles browser automation, content extraction via Readability, +/// screenshot capture, temporary file handling, and persisting the screenshot +/// details (including deduplication based on content hash via [`FileInfo::new`]). +/// +/// # Arguments +/// +/// * `url` - The URL of the web page to fetch. +/// * `db` - A reference to the database client (`SurrealDbClient`). +/// * `user_id` - The ID of the user performing the action, used for associating the file. +/// +/// # Returns +/// +/// A `Result` containing: +/// * Ok: A tuple `(Article, FileInfo)` where `Article` contains the parsed markdown +/// content and metadata, and `FileInfo` contains the details of the stored screenshot. +/// * Err: An `AppError` if any step fails (navigation, screenshot, file handling, DB operation). +async fn fetch_article_from_url( url: &str, - openai_client: &Arc>, - db_client: &Arc, -) -> Result { - // Use a client with timeouts and reuse - let client = reqwest::ClientBuilder::new() - .timeout(Duration::from_secs(30)) - .build()?; - let response = client.get(url).send().await?.text().await?; + db: &SurrealDbClient, + user_id: &str, +) -> Result<(Article, FileInfo), AppError> { + info!("Fetching URL: {}", url); + // Instantiate timer + let now = Instant::now(); + // Setup browser, navigate and wait + let browser = Browser::default()?; + let tab = browser.new_tab()?; + let page = tab.navigate_to(url)?; + let loaded_page = page.wait_until_navigated()?; + // Get content + let raw_content = loaded_page.get_content()?; + // Get screenshot + let screenshot = loaded_page.capture_screenshot( + headless_chrome::protocol::cdp::Page::CaptureScreenshotFormatOption::Jpeg, + None, + None, + true, + )?; - // Preallocate string with capacity - let mut structured_content = String::with_capacity(response.len() / 2); + // Create temp file + let mut tmp_file = NamedTempFile::new()?; + let temp_path_str = format!("{:?}", tmp_file.path()); - let document = Html::parse_document(&response); - let main_selectors = Selector::parse( - "article, main, .article-content, .post-content, .entry-content, [role='main']", - ) - .unwrap(); + // Write screenshot TO the temp file + tmp_file.write_all(&screenshot)?; - let content_element = document - .select(&main_selectors) - .next() - .or_else(|| document.select(&Selector::parse("body").unwrap()).next()) - .ok_or(AppError::NotFound("No content found".into()))?; + // Ensure the OS buffer is written to the file system _before_ we proceed. + tmp_file.as_file().sync_all()?; - // Compile selectors once - let heading_selector = Selector::parse("h1, h2, h3").unwrap(); - let paragraph_selector = Selector::parse("p").unwrap(); - - // Process content in one pass - for element in content_element.select(&heading_selector) { - let _ = writeln!( - structured_content, - "{}", - element.text().collect::().trim() - ); - } - for element in content_element.select(¶graph_selector) { - let _ = writeln!( - structured_content, - "{}", - element.text().collect::().trim() - ); + // Ensure the file handle's read cursor is at the beginning before hashing occurs. + if let Err(e) = tmp_file.seek(SeekFrom::Start(0)) { + error!("URL: {}. Failed to seek temp file {} to start: {:?}. Proceeding, but hashing might fail.", url, temp_path_str, e); } - let content = structured_content - .replace(|c: char| c.is_control(), " ") - .replace(" ", " "); + // Prepare file metadata + let parsed_url = + url::Url::parse(url).map_err(|_| AppError::Processing("Invalid URL".to_string()))?; + let domain = parsed_url + .host_str() + .unwrap_or("unknown") + .replace(|c: char| !c.is_alphanumeric(), "_"); + let timestamp = Utc::now().format("%Y%m%d%H%M%S"); + let file_name = format!("{}_{}_{}.jpg", domain, "screenshot", timestamp); - process_web_content(content, openai_client, db_client).await - - // let config = dom_smoothie::Config { - // text_mode: TextMode::Markdown, - // ..Default::default() - // }; - // panic!("YOU SHALL NOT PASS"); -} - -pub async fn process_web_content( - content: String, - openai_client: &Arc>, - db_client: &Arc, -) -> Result { - const MAX_TOKENS: usize = 122000; - const SYSTEM_PROMPT: &str = r#" - You are a precise content extractor for web pages. Your task: - - 1. Extract ONLY the main article/content from the provided text - 2. Maintain the original content - do not summarize or modify the core information - 3. Ignore peripheral content such as: - - Navigation elements - - Error messages (e.g., "JavaScript required") - - Related articles sections - - Comments - - Social media links - - Advertisement text - - FORMAT: - - Convert tags to markdown headings (#, ##, ###) - - Convert tags to markdown paragraphs - - Preserve quotes and important formatting - - Remove duplicate content - - Remove any metadata or technical artifacts - - OUTPUT RULES: - - Output ONLY the cleaned content in markdown - - Do not add any explanations or meta-commentary - - Do not add summaries or conclusions - - Do not use any XML/HTML tags in the output - "#; - - let bpe = o200k_base()?; - let settings = SystemSettings::get_current(db_client).await?; - - // Process content in chunks if needed - let truncated_content = if bpe.encode_with_special_tokens(&content).len() > MAX_TOKENS { - truncate_content(&content, MAX_TOKENS, &bpe)? - } else { - content + // Construct FieldData and FieldMetadata + let metadata = FieldMetadata { + file_name: Some(file_name), + content_type: Some("image/jpeg".to_string()), + name: None, + headers: HeaderMap::new(), + }; + let field_data = FieldData { + contents: tmp_file, + metadata, }; - let request = CreateChatCompletionRequestArgs::default() - .model(&settings.processing_model) - .temperature(0.0) - .max_tokens(16200u32) - .messages([ - ChatCompletionRequestSystemMessage::from(SYSTEM_PROMPT).into(), - ChatCompletionRequestUserMessage::from(truncated_content).into(), - ]) - .build()?; + // Store screenshot + let file_info = FileInfo::new(field_data, db, user_id).await?; - let response = openai_client.chat().create(request).await?; + // Parse content... + let config = dom_smoothie::Config { + text_mode: TextMode::Markdown, + ..Default::default() + }; + let mut readability = Readability::new(raw_content, None, Some(config))?; + let article: Article = readability.parse()?; + let end = now.elapsed(); + info!( + "URL: {}. Total time: {:?}. Final File ID: {}", + url, end, file_info.id + ); - // Extract and return the content - response - .choices - .first() - .and_then(|choice| choice.message.content.clone()) - .ok_or(AppError::LLMParsing( - "No content found in LLM response".into(), - )) -} - -fn truncate_content( - content: &str, - max_tokens: usize, - tokenizer: &CoreBPE, -) -> Result { - // Pre-allocate with estimated size - let mut result = String::with_capacity(content.len() / 2); - let mut current_tokens = 0; - - // Process content by paragraph to maintain context - for paragraph in content.split("\n\n") { - let tokens = tokenizer.encode_with_special_tokens(paragraph).len(); - - // Check if adding paragraph exceeds limit - if current_tokens + tokens > max_tokens { - break; - } - - result.push_str(paragraph); - result.push_str("\n\n"); - current_tokens += tokens; - } - - // Ensure we return valid content - if result.is_empty() { - return Err(AppError::Processing("Content exceeds token limit".into())); - } - - Ok(result.trim_end().to_string()) + Ok((article, file_info)) } /// Extracts text from a file based on its MIME type. diff --git a/main/src/main.rs b/main/src/main.rs index 803a356..720073f 100644 --- a/main/src/main.rs +++ b/main/src/main.rs @@ -50,7 +50,7 @@ async fn main() -> Result<(), Box> { // Create Axum router let app = Router::new() .nest("/api/v1", api_routes_v1(&api_state)) - .nest("/", html_routes(&html_state)) + .merge(html_routes(&html_state)) .with_state(AppState { api_state, html_state,