feat: readability parsing, screenshot of page, file serving

This commit is contained in:
Per Stark
2025-04-30 08:06:18 +02:00
parent 776a454a88
commit 02198dc21a
20 changed files with 707 additions and 309 deletions

507
Cargo.lock generated
View File

@@ -201,6 +201,9 @@ name = "arbitrary"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223"
dependencies = [
"derive_arbitrary",
]
[[package]]
name = "argon2"
@@ -451,6 +454,20 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "auto_generate_cdp"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6e1961a0d5d77969057eba90d448e610d3c439024d135d9dbd98e33ec973520"
dependencies = [
"convert_case 0.4.0",
"proc-macro2",
"quote",
"serde",
"serde_json",
"ureq 2.12.1",
]
[[package]]
name = "autocfg"
version = "1.4.0"
@@ -835,17 +852,6 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "bstr"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4"
dependencies = [
"memchr",
"regex-automata 0.4.9",
"serde",
]
[[package]]
name = "bumpalo"
version = "3.17.0"
@@ -895,6 +901,25 @@ dependencies = [
"serde",
]
[[package]]
name = "bzip2"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47"
dependencies = [
"bzip2-sys",
]
[[package]]
name = "bzip2-sys"
version = "0.1.13+1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
dependencies = [
"cc",
"pkg-config",
]
[[package]]
name = "castaway"
version = "0.2.3"
@@ -910,6 +935,8 @@ version = "1.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362"
dependencies = [
"jobserver",
"libc",
"shlex",
]
@@ -1070,6 +1097,7 @@ dependencies = [
"chrono",
"chrono-tz",
"config",
"dom_smoothie",
"futures",
"mime",
"mime_guess",
@@ -1124,7 +1152,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "595aae20e65c3be792d05818e8c63025294ac3cb7e200f11459063a352a6ef80"
dependencies = [
"async-trait",
"convert_case",
"convert_case 0.6.0",
"json5",
"pathdiff",
"ron",
@@ -1162,6 +1190,12 @@ version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
[[package]]
name = "convert_case"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]]
name = "convert_case"
version = "0.6.0"
@@ -1221,6 +1255,21 @@ dependencies = [
"libc",
]
[[package]]
name = "crc"
version = "3.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636"
dependencies = [
"crc-catalog",
]
[[package]]
name = "crc-catalog"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
[[package]]
name = "crc32fast"
version = "1.4.2"
@@ -1385,6 +1434,12 @@ version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476"
[[package]]
name = "deflate64"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da692b8d1080ea3045efaab14434d40468c3d8657e42abddfffca87b428f4c1b"
[[package]]
name = "deranged"
version = "0.4.0"
@@ -1395,6 +1450,17 @@ dependencies = [
"serde",
]
[[package]]
name = "derive_arbitrary"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "derive_builder"
version = "0.20.2"
@@ -1465,6 +1531,15 @@ dependencies = [
"subtle",
]
[[package]]
name = "directories"
version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16f5094c54661b38d03bd7e50df373292118db60b585c08a411c6d840017fe7d"
dependencies = [
"dirs-sys",
]
[[package]]
name = "dirs-next"
version = "2.0.0"
@@ -1475,6 +1550,18 @@ dependencies = [
"dirs-sys-next",
]
[[package]]
name = "dirs-sys"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab"
dependencies = [
"libc",
"option-ext",
"redox_users 0.5.0",
"windows-sys 0.59.0",
]
[[package]]
name = "dirs-sys-next"
version = "0.1.2"
@@ -1482,7 +1569,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
dependencies = [
"libc",
"redox_users",
"redox_users 0.4.6",
"winapi",
]
@@ -1588,12 +1675,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "ego-tree"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
[[package]]
name = "either"
version = "1.15.0"
@@ -1624,6 +1705,12 @@ version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
[[package]]
name = "env_home"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7f84e12ccf0a7ddc17a6c41c93326024c42920d7ee630d04950e6926645c0fe"
[[package]]
name = "equivalent"
version = "1.0.2"
@@ -1695,17 +1782,6 @@ dependencies = [
"tempfile",
]
[[package]]
name = "fancy-regex"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
dependencies = [
"bit-set 0.5.3",
"regex-automata 0.4.9",
"regex-syntax 0.8.5",
]
[[package]]
name = "fastrand"
version = "2.3.0"
@@ -1736,6 +1812,16 @@ version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7ac824320a75a52197e8f2d787f6a38b6718bb6897a35142d749af3c0e8f4fe"
[[package]]
name = "flate2"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "float_next_after"
version = "1.0.0"
@@ -1995,15 +2081,6 @@ dependencies = [
"libm",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
]
[[package]]
name = "getrandom"
version = "0.2.16"
@@ -2126,6 +2203,32 @@ dependencies = [
"hashbrown 0.15.2",
]
[[package]]
name = "headless_chrome"
version = "1.0.17"
source = "git+https://github.com/rust-headless-chrome/rust-headless-chrome#8b66992826245cbf60377d619fc780f8c45abf8e"
dependencies = [
"anyhow",
"auto_generate_cdp",
"base64 0.22.1",
"derive_builder",
"directories",
"log",
"rand 0.9.1",
"regex",
"serde",
"serde_json",
"tempfile",
"thiserror 2.0.12",
"tungstenite 0.26.2",
"ureq 3.0.11",
"url",
"walkdir",
"which",
"winreg",
"zip",
]
[[package]]
name = "heapless"
version = "0.8.0"
@@ -2207,23 +2310,12 @@ dependencies = [
"tempfile",
"thiserror 1.0.69",
"tokio",
"tokio-util",
"tower-http",
"tower-serve-static",
"tracing",
]
[[package]]
name = "html5ever"
version = "0.29.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c"
dependencies = [
"log",
"mac",
"markup5ever 0.14.1",
"match_token",
]
[[package]]
name = "html5ever"
version = "0.30.0"
@@ -2607,20 +2699,22 @@ version = "0.1.0"
dependencies = [
"async-openai",
"axum",
"axum_typed_multipart",
"chrono",
"common",
"composite-retrieval",
"dom_smoothie",
"futures",
"headless_chrome",
"reqwest",
"scraper",
"serde",
"serde_json",
"surrealdb",
"tempfile",
"text-splitter",
"tiktoken-rs",
"tokio",
"tracing",
"url",
"uuid",
]
@@ -2701,6 +2795,16 @@ version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "jobserver"
version = "0.1.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a"
dependencies = [
"getrandom 0.3.2",
"libc",
]
[[package]]
name = "js-sys"
version = "0.3.77"
@@ -2882,6 +2986,27 @@ dependencies = [
"hashbrown 0.15.2",
]
[[package]]
name = "lzma-rs"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e"
dependencies = [
"byteorder",
"crc",
]
[[package]]
name = "lzma-sys"
version = "0.1.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
dependencies = [
"cc",
"libc",
"pkg-config",
]
[[package]]
name = "mac"
version = "0.1.1"
@@ -2915,20 +3040,6 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
name = "markup5ever"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18"
dependencies = [
"log",
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "markup5ever"
version = "0.15.0"
@@ -3422,6 +3533,12 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "option-ext"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
[[package]]
name = "ordered-multimap"
version = "0.7.3"
@@ -3869,7 +3986,7 @@ dependencies = [
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash 2.1.1",
"rustc-hash",
"rustls",
"socket2",
"thiserror 2.0.12",
@@ -3888,7 +4005,7 @@ dependencies = [
"getrandom 0.3.2",
"rand 0.9.1",
"ring",
"rustc-hash 2.1.1",
"rustc-hash",
"rustls",
"rustls-pki-types",
"slab",
@@ -4055,6 +4172,17 @@ dependencies = [
"thiserror 1.0.69",
]
[[package]]
name = "redox_users"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b"
dependencies = [
"getrandom 0.2.16",
"libredox",
"thiserror 2.0.12",
]
[[package]]
name = "ref-cast"
version = "1.0.24"
@@ -4284,7 +4412,7 @@ dependencies = [
"proc-macro2",
"quote",
"rinja_parser",
"rustc-hash 2.1.1",
"rustc-hash",
"serde",
"syn 2.0.100",
]
@@ -4444,12 +4572,6 @@ version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc-hash"
version = "2.1.1"
@@ -4588,21 +4710,6 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc3d051b884f40e309de6c149734eab57aa8cc1347992710dc80bcc1c2194c15"
dependencies = [
"cssparser 0.34.0",
"ego-tree",
"getopts",
"html5ever 0.29.1",
"precomputed-hash",
"selectors",
"tendril",
]
[[package]]
name = "scrypt"
version = "0.11.0"
@@ -4888,6 +4995,12 @@ dependencies = [
"libc",
]
[[package]]
name = "simd-adler32"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
[[package]]
name = "simdutf8"
version = "0.1.5"
@@ -4974,6 +5087,17 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "socks"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b"
dependencies = [
"byteorder",
"libc",
"winapi",
]
[[package]]
name = "spade"
version = "2.13.1"
@@ -5427,22 +5551,6 @@ dependencies = [
"once_cell",
]
[[package]]
name = "tiktoken-rs"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6"
dependencies = [
"anyhow",
"base64 0.21.7",
"bstr",
"fancy-regex",
"lazy_static",
"parking_lot",
"regex",
"rustc-hash 1.1.0",
]
[[package]]
name = "time"
version = "0.3.41"
@@ -5597,7 +5705,7 @@ dependencies = [
"rustls-pki-types",
"tokio",
"tokio-rustls",
"tungstenite",
"tungstenite 0.23.0",
"webpki-roots",
]
@@ -5829,6 +5937,23 @@ dependencies = [
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.26.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4793cb5e56680ecbb1d843515b23b6de9a75eb04b66643e256a396d43be33c13"
dependencies = [
"bytes",
"data-encoding",
"http",
"httparse",
"log",
"rand 0.9.1",
"sha1",
"thiserror 2.0.12",
"utf-8",
]
[[package]]
name = "typeid"
version = "1.0.3"
@@ -5935,6 +6060,53 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "ureq"
version = "2.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d"
dependencies = [
"base64 0.22.1",
"flate2",
"log",
"once_cell",
"rustls",
"rustls-pki-types",
"socks",
"url",
"webpki-roots",
]
[[package]]
name = "ureq"
version = "3.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7a3e9af6113ecd57b8c63d3cd76a385b2e3881365f1f489e54f49801d0c83ea"
dependencies = [
"base64 0.22.1",
"flate2",
"log",
"percent-encoding",
"rustls",
"rustls-pemfile",
"rustls-pki-types",
"ureq-proto",
"utf-8",
"webpki-roots",
]
[[package]]
name = "ureq-proto"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fadf18427d33828c311234884b7ba2afb57143e6e7e69fda7ee883b624661e36"
dependencies = [
"base64 0.22.1",
"http",
"httparse",
"log",
]
[[package]]
name = "url"
version = "2.5.4"
@@ -6191,6 +6363,18 @@ dependencies = [
"rustls-pki-types",
]
[[package]]
name = "which"
version = "7.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d643ce3fd3e5b54854602a080f34fb10ab75e0b813ee32d00ca2b44fa74762"
dependencies = [
"either",
"env_home",
"rustix",
"winsafe",
]
[[package]]
name = "winapi"
version = "0.3.9"
@@ -6509,6 +6693,22 @@ dependencies = [
"memchr",
]
[[package]]
name = "winreg"
version = "0.55.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb5a765337c50e9ec252c2069be9bf91c7df47afb103b642ba3a53bf8101be97"
dependencies = [
"cfg-if",
"windows-sys 0.59.0",
]
[[package]]
name = "winsafe"
version = "0.0.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904"
[[package]]
name = "wit-bindgen-rt"
version = "0.39.0"
@@ -6564,6 +6764,15 @@ version = "0.8.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a62ce76d9b56901b19a74f19431b0d8b3bc7ca4ad685a746dfd78ca8f4fc6bda"
[[package]]
name = "xz2"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
dependencies = [
"lzma-sys",
]
[[package]]
name = "yaml-rust2"
version = "0.10.1"
@@ -6665,6 +6874,20 @@ name = "zeroize"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
dependencies = [
"zeroize_derive",
]
[[package]]
name = "zeroize_derive"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "zerovec"
@@ -6687,3 +6910,71 @@ dependencies = [
"quote",
"syn 2.0.100",
]
[[package]]
name = "zip"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dcb24d0152526ae49b9b96c1dcf71850ca1e0b882e4e28ed898a93c41334744"
dependencies = [
"aes",
"arbitrary",
"bzip2",
"constant_time_eq",
"crc32fast",
"crossbeam-utils",
"deflate64",
"flate2",
"getrandom 0.3.2",
"hmac",
"indexmap 2.9.0",
"lzma-rs",
"memchr",
"pbkdf2",
"sha1",
"time",
"xz2",
"zeroize",
"zopfli",
"zstd",
]
[[package]]
name = "zopfli"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edfc5ee405f504cd4984ecc6f14d02d55cfda60fa4b689434ef4102aae150cd7"
dependencies = [
"bumpalo",
"crc32fast",
"log",
"simd-adler32",
]
[[package]]
name = "zstd"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
dependencies = [
"zstd-safe",
]
[[package]]
name = "zstd-safe"
version = "7.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
dependencies = [
"zstd-sys",
]
[[package]]
name = "zstd-sys"
version = "2.0.15+zstd.1.5.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237"
dependencies = [
"cc",
"pkg-config",
]

View File

@@ -26,3 +26,4 @@ axum_session_auth = "0.16"
axum_session_surreal = "0.4"
axum_typed_multipart = "0.16"
tempfile = "3.12.0"
dom_smoothie = "0.10.0"

View File

@@ -16,6 +16,7 @@ surrealdb = { workspace = true, features = ["kv-mem"] }
async-openai = { workspace = true }
futures = { workspace = true }
tempfile = { workspace = true }
dom_smoothie = { workspace = true }
async-trait = "0.1.88"
axum_session = { workspace = true }

View File

@@ -33,4 +33,6 @@ pub enum AppError {
Tiktoken(#[from] anyhow::Error),
#[error("Ingress Processing error: {0}")]
Processing(String),
#[error("DOM smoothie error: {0}")]
DomSmoothie(#[from] dom_smoothie::ReadabilityError),
}

View File

@@ -38,7 +38,8 @@ stored_object!(FileInfo, "file", {
sha256: String,
path: String,
file_name: String,
mime_type: String
mime_type: String,
user_id: String
});
impl FileInfo {
@@ -83,6 +84,7 @@ impl FileInfo {
.to_string_lossy()
.into(),
mime_type: Self::guess_mime_type(Path::new(&sanitized_file_name)),
user_id: user_id.to_string(),
};
// Store in database
@@ -258,6 +260,22 @@ impl FileInfo {
Ok(())
}
/// Looks up a single `FileInfo` record by its ID.
///
/// # Arguments
/// * `id` - The ID string of the file record to fetch.
/// * `db_client` - Reference to the SurrealDbClient used for the lookup.
///
/// # Returns
/// * `Ok(FileInfo)` when a record with that ID exists.
///
/// # Errors
/// * `FileError::FileNotFound` (carrying `id`) when the lookup succeeds but yields no record.
/// * `FileError::SurrealError` when the database call itself fails.
pub async fn get_by_id(id: &str, db_client: &SurrealDbClient) -> Result<FileInfo, FileError> {
    // Database failures are surfaced first; an absent record is handled afterwards.
    let maybe_file = db_client
        .get_item::<FileInfo>(id)
        .await
        .map_err(FileError::SurrealError)?;

    maybe_file.ok_or_else(|| FileError::FileNotFound(id.to_string()))
}
}
#[cfg(test)]
@@ -460,6 +478,7 @@ mod tests {
id: Uuid::new_v4().to_string(),
created_at: now,
updated_at: now,
user_id: "user123".to_string(),
sha256: "test_sha256_hash".to_string(),
path: "/path/to/file.txt".to_string(),
file_name: "manual_file.txt".to_string(),
@@ -517,6 +536,7 @@ mod tests {
// The file path should point to our test file
let file_info = FileInfo {
id: file_id.clone(),
user_id: "user123".to_string(),
created_at: now,
updated_at: now,
sha256: "test_sha256_hash".to_string(),
@@ -586,4 +606,72 @@ mod tests {
_ => panic!("Expected FileNotFound error"),
}
}
/// Stores a `FileInfo` directly in an in-memory database and verifies that
/// `FileInfo::get_by_id` returns it with every non-timestamp field intact,
/// including the owning `user_id`.
#[tokio::test]
async fn test_get_by_id() {
    // Setup in-memory database for testing; a random database name isolates this test.
    let namespace = "test_ns";
    let database = &Uuid::new_v4().to_string();
    let db = SurrealDbClient::memory(namespace, database)
        .await
        .expect("Failed to start in-memory surrealdb");

    // Create a FileInfo instance directly
    let now = Utc::now();
    let file_id = Uuid::new_v4().to_string();
    let original_file_info = FileInfo {
        id: file_id.clone(),
        user_id: "user123".to_string(),
        created_at: now,
        updated_at: now,
        sha256: "test_sha256_for_get_by_id".to_string(),
        path: "/path/to/get_by_id_test.txt".to_string(),
        file_name: "get_by_id_test.txt".to_string(),
        mime_type: "text/plain".to_string(),
    };

    // Store it in the database
    db.store_item(original_file_info.clone())
        .await
        .expect("Failed to store item for get_by_id test");

    // Retrieve it using get_by_id; `expect` reports the underlying error on failure.
    let retrieved_info = FileInfo::get_by_id(&file_id, &db)
        .await
        .expect("get_by_id should return the stored FileInfo");

    // Assert content match, including the owner recorded on the file.
    assert_eq!(retrieved_info.id, original_file_info.id);
    assert_eq!(retrieved_info.user_id, original_file_info.user_id);
    assert_eq!(retrieved_info.sha256, original_file_info.sha256);
    assert_eq!(retrieved_info.file_name, original_file_info.file_name);
    assert_eq!(retrieved_info.path, original_file_info.path);
    assert_eq!(retrieved_info.mime_type, original_file_info.mime_type);
    // Timestamps are intentionally not compared: the storage round-trip may
    // change their precision.
}
/// Verifies that `FileInfo::get_by_id` reports `FileError::FileNotFound`,
/// carrying the requested ID, when no record with that ID exists.
#[tokio::test]
async fn test_get_by_id_not_found() {
    // Fresh in-memory database with a random name so nothing is pre-populated.
    let db_name = Uuid::new_v4().to_string();
    let db = SurrealDbClient::memory("test_ns", &db_name)
        .await
        .expect("Failed to start in-memory surrealdb");

    // Look up an ID that was never stored.
    let missing_id = "non-existent-file-id";
    let outcome = FileInfo::get_by_id(missing_id, &db).await;

    // The lookup must fail...
    assert!(outcome.is_err());

    // ...and specifically with FileNotFound for the ID we asked for.
    match outcome {
        Err(FileError::FileNotFound(reported_id)) => assert_eq!(reported_id, missing_id),
        Err(other) => panic!("Expected FileNotFound error, but got {:?}", other),
        Ok(_) => panic!("Expected an error, but got Ok"),
    }
}
}

View File

@@ -114,6 +114,7 @@ mod tests {
id: mock.id,
sha256: "mock-sha256".to_string(),
path: "/mock/path".to_string(),
user_id: "user123".to_string(),
file_name: "mock.txt".to_string(),
mime_type: "text/plain".to_string(),
created_at: Utc::now(),

View File

@@ -31,19 +31,7 @@ impl SystemSettings {
let settings: Option<Self> = db.get_item("current").await?;
if settings.is_none() {
let created_settings = SystemSettings {
id: "current".to_string(),
registrations_enabled: true,
require_email_verification: false,
query_model: "gpt-4o-mini".to_string(),
processing_model: "gpt-4o-mini".to_string(),
query_system_prompt:
crate::storage::types::system_prompts::DEFAULT_QUERY_SYSTEM_PROMPT.to_string(),
ingestion_system_prompt:
crate::storage::types::system_prompts::DEFAULT_INGRESS_ANALYSIS_SYSTEM_PROMPT
.to_string(),
};
let created_settings = Self::new();
let stored: Option<Self> = db.store_item(created_settings).await?;
return stored.ok_or(AppError::Validation("Failed to initialize settings".into()));
}

View File

@@ -5,10 +5,17 @@ use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};
use super::file_info::FileInfo;
/// Metadata about a URL attached to a `TextContent` (stored there as
/// `Option<UrlInfo>`).
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct UrlInfo {
/// The URL string itself.
pub url: String,
/// Title associated with the URL — presumably the page title; confirm against the ingestion code.
pub title: String,
/// Identifier of an image for this URL; templates build a `/file/<image_id>` link from it.
/// NOTE(review): presumably the ID of a stored `FileInfo` — confirm.
pub image_id: String,
}
stored_object!(TextContent, "text_content", {
text: String,
file_info: Option<FileInfo>,
url: Option<String>,
url_info: Option<UrlInfo>,
instructions: String,
category: String,
user_id: String
@@ -20,7 +27,7 @@ impl TextContent {
instructions: String,
category: String,
file_info: Option<FileInfo>,
url: Option<String>,
url_info: Option<UrlInfo>,
user_id: String,
) -> Self {
let now = Utc::now();
@@ -30,7 +37,7 @@ impl TextContent {
updated_at: now,
text,
file_info,
url,
url_info,
instructions,
category,
user_id,
@@ -85,7 +92,7 @@ mod tests {
assert_eq!(text_content.category, category);
assert_eq!(text_content.user_id, user_id);
assert!(text_content.file_info.is_none());
assert!(text_content.url.is_none());
assert!(text_content.url_info.is_none());
assert!(!text_content.id.is_empty());
}
@@ -96,19 +103,27 @@ mod tests {
let instructions = "URL instructions".to_string();
let category = "URL category".to_string();
let user_id = "user123".to_string();
let url = Some("https://example.com/document.pdf".to_string());
let title = "page_title".to_string();
let image_id = "image12312".to_string();
let url = "https://example.com/document.pdf".to_string();
let url_info = Some(UrlInfo {
url,
title,
image_id,
});
let text_content = TextContent::new(
text.clone(),
instructions.clone(),
category.clone(),
None,
url.clone(),
url_info.clone(),
user_id.clone(),
);
// Check URL field is set
assert_eq!(text_content.url, url);
assert_eq!(text_content.url_info, url_info);
}
#[tokio::test]

View File

@@ -31,6 +31,7 @@ tower-http = { version = "0.6.2", features = ["fs"] }
chrono-tz = "0.10.1"
tower-serve-static = "0.1.1"
include_dir = "0.7.4"
tokio-util = { version = "0.7.15", features = ["io"] }
common = { path = "../common" }
composite-retrieval = { path = "../composite-retrieval" }

View File

@@ -1,9 +1,12 @@
use axum::{
body::Body,
extract::{Path, State},
http::{header, HeaderMap, HeaderValue, StatusCode},
response::IntoResponse,
};
use serde::Serialize;
use tokio::join;
use tokio::{fs::File, join};
use tokio_util::io::ReaderStream;
use crate::{
middlewares::{
@@ -15,9 +18,15 @@ use crate::{
use common::{
error::AppError,
storage::types::{
conversation::Conversation, file_info::FileInfo, ingestion_task::IngestionTask,
knowledge_entity::KnowledgeEntity, knowledge_relationship::KnowledgeRelationship,
text_chunk::TextChunk, text_content::TextContent, user::User,
conversation::Conversation,
file_info::{FileError, FileInfo},
ingestion_task::IngestionTask,
knowledge_entity::KnowledgeEntity,
knowledge_relationship::KnowledgeRelationship,
text_chunk::TextChunk,
text_content::TextContent,
user::User,
StoredObject,
},
};
@@ -167,3 +176,49 @@ pub async fn show_active_jobs(
},
))
}
pub async fn serve_file(
State(state): State<HtmlState>,
RequireUser(user): RequireUser,
Path(file_id): Path<String>,
) -> Result<impl IntoResponse, HtmlError> {
let file_info = match FileInfo::get_by_id(&file_id, &state.db).await {
Ok(info) => info,
_ => return Ok(TemplateResponse::not_found().into_response()),
};
if file_info.user_id != user.id {
return Ok(TemplateResponse::unauthorized().into_response());
}
// 3. Open the file asynchronously from the stored path
let path = std::path::Path::new(&file_info.path);
let file = match File::open(path).await {
Ok(f) => f,
Err(e) => return Ok(TemplateResponse::server_error().into_response()),
};
let stream = ReaderStream::new(file);
let body = Body::from_stream(stream);
let mut headers = HeaderMap::new();
headers.insert(
header::CONTENT_TYPE,
HeaderValue::from_str(&file_info.mime_type)
.unwrap_or_else(|_| HeaderValue::from_static("application/octet-stream")),
);
let Ok(disposition_value) =
HeaderValue::from_str(&format!("attachment; filename=\"{}\"", file_info.file_name))
else {
headers.insert(
header::CONTENT_DISPOSITION,
HeaderValue::from_static("attachment"),
);
return Ok((StatusCode::OK, headers, body).into_response());
};
headers.insert(header::CONTENT_DISPOSITION, disposition_value);
// 5. Return the response
Ok((StatusCode::OK, headers, body).into_response())
}

View File

@@ -5,7 +5,7 @@ use axum::{
routing::{delete, get},
Router,
};
use handlers::{delete_job, delete_text_content, index_handler, show_active_jobs};
use handlers::{delete_job, delete_text_content, index_handler, serve_file, show_active_jobs};
use crate::html_state::HtmlState;
@@ -26,4 +26,5 @@ where
.route("/jobs/{job_id}", delete(delete_job))
.route("/active-jobs", get(show_active_jobs))
.route("/text-content/{id}", delete(delete_text_content))
.route("/file/{id}", get(serve_file))
}

View File

@@ -54,9 +54,9 @@
<select name="query_model" class="select select-bordered w-full">
<option value="gpt-4o-mini" {% if settings.query_model=="gpt-4o-mini" %}selected{% endif %}>GPT-4o Mini
</option>
<option value="gpt-4o" {% if settings.query_model=="gpt-4o" %}selected{% endif %}>GPT-4o</option>
<option value="gpt-3.5-turbo" {% if settings.query_model=="gpt-3.5-turbo" %}selected{% endif %}>GPT-3.5
Turbo</option>
<option value="gpt-4.1" {% if settings.query_model=="gpt-4.1" %}selected{% endif %}>GPT-4.1</option>
<option value="gpt-4.1-mini" {% if settings.query_model=="gpt-4.1-mini" %}selected{% endif %}>GPT-4.1-mini
</option>
</select>
<p class="text-xs text-gray-500 mt-1">Model used for answering user queries</p>
</div>
@@ -66,11 +66,11 @@
<span class="label-text">Processing Model</span>
</label>
<select name="processing_model" class="select select-bordered w-full">
<option value="gpt-4o-mini" {% if settings.processing_model=="gpt-4o-mini" %}selected{% endif %}>GPT-4o Mini
{# Pre-select the option that matches the stored processing model. #}
<option value="gpt-4o-mini" {% if settings.processing_model=="gpt-4o-mini" %}selected{% endif %}>GPT-4o Mini
</option>
<option value="gpt-4.1" {% if settings.processing_model=="gpt-4.1" %}selected{% endif %}>GPT-4.1</option>
<option value="gpt-4.1-mini" {% if settings.processing_model=="gpt-4.1-mini" %}selected{% endif %}>GPT-4.1-mini
</option>
<option value="gpt-4o" {% if settings.processing_model=="gpt-4o" %}selected{% endif %}>GPT-4o</option>
<option value="gpt-3.5-turbo" {% if settings.processing_model=="gpt-3.5-turbo" %}selected{% endif %}>GPT-3.5
Turbo</option>
</select>
<p class="text-xs text-gray-500 mt-1">Model used for content processing and ingestion</p>
</div>

View File

@@ -1,6 +1,7 @@
<div class="grid sm:grid-cols-2 lg:grid-cols-3 gap-4" id="text_content_cards">
{% for text_content in text_contents %}
<div class="card min-w-72 bg-base-100 shadow">
{# Only URL-based content carries url_info, so render the image only when it is present. #}
{% if text_content.url_info %}
<img class="rounded-t-md overflow-clip" src="/file/{{text_content.url_info.image_id}}" alt="Page preview" />
{% endif %}
<div class="card-body">
<div class="flex justify-between space-x-2">
<h2 class="card-title truncate">

View File

@@ -4,7 +4,7 @@
{% for item in latest_text_contents %}
<li class="list-row">
<div class="bg-accent rounded-box size-10 flex justify-center items-center text-accent-content">
{% if item.url %}
{% if item.url_info %}
{% include "icons/globe_icon.html" %}
{% elif item.file_info %}
{% include "icons/document_icon.html" %}
@@ -14,8 +14,8 @@
</div>
<div>
<div class="truncate max-w-[160px]">
{% if item.url %}
{{item.url}}
{% if item.url_info %}
{{item.url_info.title}}
{% elif item.file_info%}
{{item.file_info.file_name}}
{% else %}

View File

@@ -31,6 +31,10 @@
</a>
</li>
{% endfor %}
<li>
<button class="btn btn-primary" hx-get="/ingress-form" hx-target="#modal" hx-swap="innerHTML">Add
Content</button>
</li>
<div class="divider "></div>
</div>

View File

@@ -13,14 +13,17 @@ serde_json = { workspace = true }
futures = { workspace = true }
async-openai = { workspace = true }
surrealdb = { workspace = true }
dom_smoothie = { workspace = true }
tempfile = { workspace = true }
axum_typed_multipart = { workspace = true}
tiktoken-rs = "0.6.0"
reqwest = {version = "0.12.12", features = ["charset", "json"]}
scraper = "0.22.0"
chrono = { version = "0.4.39", features = ["serde"] }
text-splitter = "0.18.1"
url = { version = "2.5.2", features = ["serde"] }
uuid = { version = "1.10.0", features = ["v4", "serde"] }
dom_smoothie = "0.10.0"
headless_chrome = { git = "https://github.com/rust-headless-chrome/rust-headless-chrome", features = ["fetch"] }
common = { path = "../common" }
composite-retrieval = { path = "../composite-retrieval" }

View File

@@ -111,7 +111,7 @@ impl IngestionEnricher {
let request = CreateChatCompletionRequestArgs::default()
.model(&settings.processing_model)
.temperature(0.2)
.max_tokens(3048u32)
.max_tokens(6048u32)
.messages([
ChatCompletionRequestSystemMessage::from(INGRESS_ANALYSIS_SYSTEM_MESSAGE).into(),
ChatCompletionRequestUserMessage::from(user_message).into(),

View File

@@ -53,7 +53,7 @@ impl IngestionPipeline {
)
.await?;
let text_content = to_text_content(task.content, &self.openai_client, &self.db).await?;
let text_content = to_text_content(task.content, &self.db).await?;
match self.process(&text_content).await {
Ok(_) => {

View File

@@ -1,29 +1,28 @@
pub mod llm_enrichment_result;
use std::{sync::Arc, time::Duration};
use std::io::Write;
use std::time::Instant;
use async_openai::types::{
ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage,
CreateChatCompletionRequestArgs,
};
use axum::http::HeaderMap;
use axum_typed_multipart::{FieldData, FieldMetadata};
use chrono::Utc;
use common::storage::db::SurrealDbClient;
use common::{
error::AppError,
storage::types::{
file_info::FileInfo, ingestion_payload::IngestionPayload, system_settings::SystemSettings,
text_content::TextContent,
file_info::FileInfo,
ingestion_payload::IngestionPayload,
text_content::{TextContent, UrlInfo},
},
};
use dom_smoothie::TextMode;
use reqwest;
use scraper::{Html, Selector};
use std::fmt::Write;
use tiktoken_rs::{o200k_base, CoreBPE};
use dom_smoothie::{Article, Readability, TextMode};
use headless_chrome::Browser;
use tempfile::NamedTempFile;
use tracing::{error, info};
pub async fn to_text_content(
ingestion_payload: IngestionPayload,
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
db_client: &Arc<SurrealDbClient>,
db: &SurrealDbClient,
) -> Result<TextContent, AppError> {
match ingestion_payload {
IngestionPayload::Url {
@@ -32,13 +31,17 @@ pub async fn to_text_content(
category,
user_id,
} => {
let text = fetch_text_from_url(&url, openai_client, db_client).await?;
let (article, file_info) = fetch_article_from_url(&url, db, &user_id).await?;
Ok(TextContent::new(
text,
article.text_content.into(),
instructions,
category,
None,
Some(url),
Some(UrlInfo {
url,
title: article.title,
image_id: file_info.id,
}),
user_id,
))
}
@@ -73,161 +76,104 @@ pub async fn to_text_content(
}
}
}
use std::io::{Seek, SeekFrom}; // <-- Add Seek and SeekFrom
/// Get text from url, will return it as a markdown formatted string
async fn fetch_text_from_url(
/// Fetches web content from a URL, extracts the main article text as Markdown,
/// captures a screenshot, and stores the screenshot, returning its [`FileInfo`].
///
/// This function handles browser automation, content extraction via Readability,
/// screenshot capture, temporary file handling, and persisting the screenshot
/// details (including deduplication based on content hash via [`FileInfo::new`]).
///
/// # Arguments
///
/// * `url` - The URL of the web page to fetch.
/// * `db` - A reference to the database client (`SurrealDbClient`).
/// * `user_id` - The ID of the user performing the action, used for associating the file.
///
/// # Returns
///
/// A `Result` containing:
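/// * `Ok`: A tuple `(Article, FileInfo)` where `Article` contains the parsed Markdown
/// content and metadata, and `FileInfo` contains the details of the stored screenshot.
/// * `Err`: An `AppError` if any step fails (browser launch or navigation, screenshot
/// capture, temp-file handling, URL parsing, storing the screenshot, or Readability
/// parsing). A failure to rewind the temp file is only logged, not returned.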
async fn fetch_article_from_url(
url: &str,
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
db_client: &Arc<SurrealDbClient>,
) -> Result<String, AppError> {
// Use a client with timeouts and reuse
let client = reqwest::ClientBuilder::new()
.timeout(Duration::from_secs(30))
.build()?;
let response = client.get(url).send().await?.text().await?;
db: &SurrealDbClient,
user_id: &str,
) -> Result<(Article, FileInfo), AppError> {
info!("Fetching URL: {}", url);
// Instantiate timer
let now = Instant::now();
// Setup browser, navigate and wait
let browser = Browser::default()?;
let tab = browser.new_tab()?;
let page = tab.navigate_to(url)?;
let loaded_page = page.wait_until_navigated()?;
// Get content
let raw_content = loaded_page.get_content()?;
// Get screenshot
let screenshot = loaded_page.capture_screenshot(
headless_chrome::protocol::cdp::Page::CaptureScreenshotFormatOption::Jpeg,
None,
None,
true,
)?;
// Preallocate string with capacity
let mut structured_content = String::with_capacity(response.len() / 2);
// Create temp file
let mut tmp_file = NamedTempFile::new()?;
let temp_path_str = format!("{:?}", tmp_file.path());
let document = Html::parse_document(&response);
let main_selectors = Selector::parse(
"article, main, .article-content, .post-content, .entry-content, [role='main']",
)
.unwrap();
// Write screenshot TO the temp file
tmp_file.write_all(&screenshot)?;
let content_element = document
.select(&main_selectors)
.next()
.or_else(|| document.select(&Selector::parse("body").unwrap()).next())
.ok_or(AppError::NotFound("No content found".into()))?;
// Ensure the OS buffer is written to the file system _before_ we proceed.
tmp_file.as_file().sync_all()?;
// Compile selectors once
let heading_selector = Selector::parse("h1, h2, h3").unwrap();
let paragraph_selector = Selector::parse("p").unwrap();
// Process content in one pass
for element in content_element.select(&heading_selector) {
let _ = writeln!(
structured_content,
"<heading>{}</heading>",
element.text().collect::<String>().trim()
);
}
for element in content_element.select(&paragraph_selector) {
let _ = writeln!(
structured_content,
"<paragraph>{}</paragraph>",
element.text().collect::<String>().trim()
);
// Ensure the file handle's read cursor is at the beginning before hashing occurs.
if let Err(e) = tmp_file.seek(SeekFrom::Start(0)) {
error!("URL: {}. Failed to seek temp file {} to start: {:?}. Proceeding, but hashing might fail.", url, temp_path_str, e);
}
let content = structured_content
.replace(|c: char| c.is_control(), " ")
.replace(" ", " ");
// Prepare file metadata
let parsed_url =
url::Url::parse(url).map_err(|_| AppError::Processing("Invalid URL".to_string()))?;
let domain = parsed_url
.host_str()
.unwrap_or("unknown")
.replace(|c: char| !c.is_alphanumeric(), "_");
let timestamp = Utc::now().format("%Y%m%d%H%M%S");
let file_name = format!("{}_{}_{}.jpg", domain, "screenshot", timestamp);
process_web_content(content, openai_client, db_client).await
// let config = dom_smoothie::Config {
// text_mode: TextMode::Markdown,
// ..Default::default()
// };
// panic!("YOU SHALL NOT PASS");
}
pub async fn process_web_content(
content: String,
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
db_client: &Arc<SurrealDbClient>,
) -> Result<String, AppError> {
const MAX_TOKENS: usize = 122000;
const SYSTEM_PROMPT: &str = r#"
You are a precise content extractor for web pages. Your task:
1. Extract ONLY the main article/content from the provided text
2. Maintain the original content - do not summarize or modify the core information
3. Ignore peripheral content such as:
- Navigation elements
- Error messages (e.g., "JavaScript required")
- Related articles sections
- Comments
- Social media links
- Advertisement text
FORMAT:
- Convert <heading> tags to markdown headings (#, ##, ###)
- Convert <paragraph> tags to markdown paragraphs
- Preserve quotes and important formatting
- Remove duplicate content
- Remove any metadata or technical artifacts
OUTPUT RULES:
- Output ONLY the cleaned content in markdown
- Do not add any explanations or meta-commentary
- Do not add summaries or conclusions
- Do not use any XML/HTML tags in the output
"#;
let bpe = o200k_base()?;
let settings = SystemSettings::get_current(db_client).await?;
// Process content in chunks if needed
let truncated_content = if bpe.encode_with_special_tokens(&content).len() > MAX_TOKENS {
truncate_content(&content, MAX_TOKENS, &bpe)?
} else {
content
// Construct FieldData and FieldMetadata
let metadata = FieldMetadata {
file_name: Some(file_name),
content_type: Some("image/jpeg".to_string()),
name: None,
headers: HeaderMap::new(),
};
let field_data = FieldData {
contents: tmp_file,
metadata,
};
let request = CreateChatCompletionRequestArgs::default()
.model(&settings.processing_model)
.temperature(0.0)
.max_tokens(16200u32)
.messages([
ChatCompletionRequestSystemMessage::from(SYSTEM_PROMPT).into(),
ChatCompletionRequestUserMessage::from(truncated_content).into(),
])
.build()?;
// Store screenshot
let file_info = FileInfo::new(field_data, db, user_id).await?;
let response = openai_client.chat().create(request).await?;
// Parse content...
let config = dom_smoothie::Config {
text_mode: TextMode::Markdown,
..Default::default()
};
let mut readability = Readability::new(raw_content, None, Some(config))?;
let article: Article = readability.parse()?;
let end = now.elapsed();
info!(
"URL: {}. Total time: {:?}. Final File ID: {}",
url, end, file_info.id
);
// Extract and return the content
response
.choices
.first()
.and_then(|choice| choice.message.content.clone())
.ok_or(AppError::LLMParsing(
"No content found in LLM response".into(),
))
}
fn truncate_content(
content: &str,
max_tokens: usize,
tokenizer: &CoreBPE,
) -> Result<String, AppError> {
// Pre-allocate with estimated size
let mut result = String::with_capacity(content.len() / 2);
let mut current_tokens = 0;
// Process content by paragraph to maintain context
for paragraph in content.split("\n\n") {
let tokens = tokenizer.encode_with_special_tokens(paragraph).len();
// Check if adding paragraph exceeds limit
if current_tokens + tokens > max_tokens {
break;
}
result.push_str(paragraph);
result.push_str("\n\n");
current_tokens += tokens;
}
// Ensure we return valid content
if result.is_empty() {
return Err(AppError::Processing("Content exceeds token limit".into()));
}
Ok(result.trim_end().to_string())
Ok((article, file_info))
}
/// Extracts text from a file based on its MIME type.

View File

@@ -50,7 +50,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Create Axum router
let app = Router::new()
.nest("/api/v1", api_routes_v1(&api_state))
.nest("/", html_routes(&html_state))
.merge(html_routes(&html_state))
.with_state(AppState {
api_state,
html_state,