mirror of
https://github.com/perstarkse/minne.git
synced 2026-03-13 05:45:35 +01:00
feat: readability parsing, screenshot of page, file serving
This commit is contained in:
507
Cargo.lock
generated
507
Cargo.lock
generated
@@ -201,6 +201,9 @@ name = "arbitrary"
|
||||
version = "1.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223"
|
||||
dependencies = [
|
||||
"derive_arbitrary",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "argon2"
|
||||
@@ -451,6 +454,20 @@ dependencies = [
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "auto_generate_cdp"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6e1961a0d5d77969057eba90d448e610d3c439024d135d9dbd98e33ec973520"
|
||||
dependencies = [
|
||||
"convert_case 0.4.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"ureq 2.12.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.4.0"
|
||||
@@ -835,17 +852,6 @@ dependencies = [
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bstr"
|
||||
version = "1.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"regex-automata 0.4.9",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.17.0"
|
||||
@@ -895,6 +901,25 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bzip2"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47"
|
||||
dependencies = [
|
||||
"bzip2-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bzip2-sys"
|
||||
version = "0.1.13+1.0.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "castaway"
|
||||
version = "0.2.3"
|
||||
@@ -910,6 +935,8 @@ version = "1.2.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362"
|
||||
dependencies = [
|
||||
"jobserver",
|
||||
"libc",
|
||||
"shlex",
|
||||
]
|
||||
|
||||
@@ -1070,6 +1097,7 @@ dependencies = [
|
||||
"chrono",
|
||||
"chrono-tz",
|
||||
"config",
|
||||
"dom_smoothie",
|
||||
"futures",
|
||||
"mime",
|
||||
"mime_guess",
|
||||
@@ -1124,7 +1152,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "595aae20e65c3be792d05818e8c63025294ac3cb7e200f11459063a352a6ef80"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"convert_case",
|
||||
"convert_case 0.6.0",
|
||||
"json5",
|
||||
"pathdiff",
|
||||
"ron",
|
||||
@@ -1162,6 +1190,12 @@ version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
|
||||
|
||||
[[package]]
|
||||
name = "convert_case"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
|
||||
|
||||
[[package]]
|
||||
name = "convert_case"
|
||||
version = "0.6.0"
|
||||
@@ -1221,6 +1255,21 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crc"
|
||||
version = "3.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636"
|
||||
dependencies = [
|
||||
"crc-catalog",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crc-catalog"
|
||||
version = "2.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
|
||||
|
||||
[[package]]
|
||||
name = "crc32fast"
|
||||
version = "1.4.2"
|
||||
@@ -1385,6 +1434,12 @@ version = "2.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476"
|
||||
|
||||
[[package]]
|
||||
name = "deflate64"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da692b8d1080ea3045efaab14434d40468c3d8657e42abddfffca87b428f4c1b"
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.4.0"
|
||||
@@ -1395,6 +1450,17 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_arbitrary"
|
||||
version = "1.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_builder"
|
||||
version = "0.20.2"
|
||||
@@ -1465,6 +1531,15 @@ dependencies = [
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "directories"
|
||||
version = "6.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "16f5094c54661b38d03bd7e50df373292118db60b585c08a411c6d840017fe7d"
|
||||
dependencies = [
|
||||
"dirs-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-next"
|
||||
version = "2.0.0"
|
||||
@@ -1475,6 +1550,18 @@ dependencies = [
|
||||
"dirs-sys-next",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-sys"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"option-ext",
|
||||
"redox_users 0.5.0",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-sys-next"
|
||||
version = "0.1.2"
|
||||
@@ -1482,7 +1569,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"redox_users",
|
||||
"redox_users 0.4.6",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
@@ -1588,12 +1675,6 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ego-tree"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.15.0"
|
||||
@@ -1624,6 +1705,12 @@ version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
|
||||
|
||||
[[package]]
|
||||
name = "env_home"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7f84e12ccf0a7ddc17a6c41c93326024c42920d7ee630d04950e6926645c0fe"
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.2"
|
||||
@@ -1695,17 +1782,6 @@ dependencies = [
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fancy-regex"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
|
||||
dependencies = [
|
||||
"bit-set 0.5.3",
|
||||
"regex-automata 0.4.9",
|
||||
"regex-syntax 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.3.0"
|
||||
@@ -1736,6 +1812,16 @@ version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7ac824320a75a52197e8f2d787f6a38b6718bb6897a35142d749af3c0e8f4fe"
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "float_next_after"
|
||||
version = "1.0.0"
|
||||
@@ -1995,15 +2081,6 @@ dependencies = [
|
||||
"libm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getopts"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.16"
|
||||
@@ -2126,6 +2203,32 @@ dependencies = [
|
||||
"hashbrown 0.15.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "headless_chrome"
|
||||
version = "1.0.17"
|
||||
source = "git+https://github.com/rust-headless-chrome/rust-headless-chrome#8b66992826245cbf60377d619fc780f8c45abf8e"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"auto_generate_cdp",
|
||||
"base64 0.22.1",
|
||||
"derive_builder",
|
||||
"directories",
|
||||
"log",
|
||||
"rand 0.9.1",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror 2.0.12",
|
||||
"tungstenite 0.26.2",
|
||||
"ureq 3.0.11",
|
||||
"url",
|
||||
"walkdir",
|
||||
"which",
|
||||
"winreg",
|
||||
"zip",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heapless"
|
||||
version = "0.8.0"
|
||||
@@ -2207,23 +2310,12 @@ dependencies = [
|
||||
"tempfile",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tower-http",
|
||||
"tower-serve-static",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "html5ever"
|
||||
version = "0.29.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c"
|
||||
dependencies = [
|
||||
"log",
|
||||
"mac",
|
||||
"markup5ever 0.14.1",
|
||||
"match_token",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "html5ever"
|
||||
version = "0.30.0"
|
||||
@@ -2607,20 +2699,22 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-openai",
|
||||
"axum",
|
||||
"axum_typed_multipart",
|
||||
"chrono",
|
||||
"common",
|
||||
"composite-retrieval",
|
||||
"dom_smoothie",
|
||||
"futures",
|
||||
"headless_chrome",
|
||||
"reqwest",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"surrealdb",
|
||||
"tempfile",
|
||||
"text-splitter",
|
||||
"tiktoken-rs",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"url",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
@@ -2701,6 +2795,16 @@ version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
||||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.33"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a"
|
||||
dependencies = [
|
||||
"getrandom 0.3.2",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.77"
|
||||
@@ -2882,6 +2986,27 @@ dependencies = [
|
||||
"hashbrown 0.15.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lzma-rs"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"crc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lzma-sys"
|
||||
version = "0.1.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mac"
|
||||
version = "0.1.1"
|
||||
@@ -2915,20 +3040,6 @@ version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever"
|
||||
version = "0.14.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18"
|
||||
dependencies = [
|
||||
"log",
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
"string_cache",
|
||||
"string_cache_codegen",
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever"
|
||||
version = "0.15.0"
|
||||
@@ -3422,6 +3533,12 @@ dependencies = [
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "option-ext"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
|
||||
|
||||
[[package]]
|
||||
name = "ordered-multimap"
|
||||
version = "0.7.3"
|
||||
@@ -3869,7 +3986,7 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"quinn-proto",
|
||||
"quinn-udp",
|
||||
"rustc-hash 2.1.1",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"socket2",
|
||||
"thiserror 2.0.12",
|
||||
@@ -3888,7 +4005,7 @@ dependencies = [
|
||||
"getrandom 0.3.2",
|
||||
"rand 0.9.1",
|
||||
"ring",
|
||||
"rustc-hash 2.1.1",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"slab",
|
||||
@@ -4055,6 +4172,17 @@ dependencies = [
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_users"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b"
|
||||
dependencies = [
|
||||
"getrandom 0.2.16",
|
||||
"libredox",
|
||||
"thiserror 2.0.12",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ref-cast"
|
||||
version = "1.0.24"
|
||||
@@ -4284,7 +4412,7 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rinja_parser",
|
||||
"rustc-hash 2.1.1",
|
||||
"rustc-hash",
|
||||
"serde",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
@@ -4444,12 +4572,6 @@ version = "0.1.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.1"
|
||||
@@ -4588,21 +4710,6 @@ version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||
|
||||
[[package]]
|
||||
name = "scraper"
|
||||
version = "0.22.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cc3d051b884f40e309de6c149734eab57aa8cc1347992710dc80bcc1c2194c15"
|
||||
dependencies = [
|
||||
"cssparser 0.34.0",
|
||||
"ego-tree",
|
||||
"getopts",
|
||||
"html5ever 0.29.1",
|
||||
"precomputed-hash",
|
||||
"selectors",
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scrypt"
|
||||
version = "0.11.0"
|
||||
@@ -4888,6 +4995,12 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "simd-adler32"
|
||||
version = "0.3.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
|
||||
|
||||
[[package]]
|
||||
name = "simdutf8"
|
||||
version = "0.1.5"
|
||||
@@ -4974,6 +5087,17 @@ dependencies = [
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "socks"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spade"
|
||||
version = "2.13.1"
|
||||
@@ -5427,22 +5551,6 @@ dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tiktoken-rs"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64 0.21.7",
|
||||
"bstr",
|
||||
"fancy-regex",
|
||||
"lazy_static",
|
||||
"parking_lot",
|
||||
"regex",
|
||||
"rustc-hash 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.41"
|
||||
@@ -5597,7 +5705,7 @@ dependencies = [
|
||||
"rustls-pki-types",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tungstenite",
|
||||
"tungstenite 0.23.0",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
||||
@@ -5829,6 +5937,23 @@ dependencies = [
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tungstenite"
|
||||
version = "0.26.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4793cb5e56680ecbb1d843515b23b6de9a75eb04b66643e256a396d43be33c13"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"data-encoding",
|
||||
"http",
|
||||
"httparse",
|
||||
"log",
|
||||
"rand 0.9.1",
|
||||
"sha1",
|
||||
"thiserror 2.0.12",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typeid"
|
||||
version = "1.0.3"
|
||||
@@ -5935,6 +6060,53 @@ version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
||||
|
||||
[[package]]
|
||||
name = "ureq"
|
||||
version = "2.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"flate2",
|
||||
"log",
|
||||
"once_cell",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"socks",
|
||||
"url",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ureq"
|
||||
version = "3.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7a3e9af6113ecd57b8c63d3cd76a385b2e3881365f1f489e54f49801d0c83ea"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"flate2",
|
||||
"log",
|
||||
"percent-encoding",
|
||||
"rustls",
|
||||
"rustls-pemfile",
|
||||
"rustls-pki-types",
|
||||
"ureq-proto",
|
||||
"utf-8",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ureq-proto"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fadf18427d33828c311234884b7ba2afb57143e6e7e69fda7ee883b624661e36"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"http",
|
||||
"httparse",
|
||||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.5.4"
|
||||
@@ -6191,6 +6363,18 @@ dependencies = [
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "which"
|
||||
version = "7.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d643ce3fd3e5b54854602a080f34fb10ab75e0b813ee32d00ca2b44fa74762"
|
||||
dependencies = [
|
||||
"either",
|
||||
"env_home",
|
||||
"rustix",
|
||||
"winsafe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
@@ -6509,6 +6693,22 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winreg"
|
||||
version = "0.55.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb5a765337c50e9ec252c2069be9bf91c7df47afb103b642ba3a53bf8101be97"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winsafe"
|
||||
version = "0.0.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904"
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen-rt"
|
||||
version = "0.39.0"
|
||||
@@ -6564,6 +6764,15 @@ version = "0.8.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a62ce76d9b56901b19a74f19431b0d8b3bc7ca4ad685a746dfd78ca8f4fc6bda"
|
||||
|
||||
[[package]]
|
||||
name = "xz2"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
|
||||
dependencies = [
|
||||
"lzma-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "yaml-rust2"
|
||||
version = "0.10.1"
|
||||
@@ -6665,6 +6874,20 @@ name = "zeroize"
|
||||
version = "1.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
|
||||
dependencies = [
|
||||
"zeroize_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zeroize_derive"
|
||||
version = "1.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerovec"
|
||||
@@ -6687,3 +6910,71 @@ dependencies = [
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zip"
|
||||
version = "2.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1dcb24d0152526ae49b9b96c1dcf71850ca1e0b882e4e28ed898a93c41334744"
|
||||
dependencies = [
|
||||
"aes",
|
||||
"arbitrary",
|
||||
"bzip2",
|
||||
"constant_time_eq",
|
||||
"crc32fast",
|
||||
"crossbeam-utils",
|
||||
"deflate64",
|
||||
"flate2",
|
||||
"getrandom 0.3.2",
|
||||
"hmac",
|
||||
"indexmap 2.9.0",
|
||||
"lzma-rs",
|
||||
"memchr",
|
||||
"pbkdf2",
|
||||
"sha1",
|
||||
"time",
|
||||
"xz2",
|
||||
"zeroize",
|
||||
"zopfli",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zopfli"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edfc5ee405f504cd4984ecc6f14d02d55cfda60fa4b689434ef4102aae150cd7"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"crc32fast",
|
||||
"log",
|
||||
"simd-adler32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.13.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "7.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
|
||||
dependencies = [
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "2.0.15+zstd.1.5.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
@@ -26,3 +26,4 @@ axum_session_auth = "0.16"
|
||||
axum_session_surreal = "0.4"
|
||||
axum_typed_multipart = "0.16"
|
||||
tempfile = "3.12.0"
|
||||
dom_smoothie = "0.10.0"
|
||||
|
||||
@@ -16,6 +16,7 @@ surrealdb = { workspace = true, features = ["kv-mem"] }
|
||||
async-openai = { workspace = true }
|
||||
futures = { workspace = true }
|
||||
tempfile = { workspace = true }
|
||||
dom_smoothie = { workspace = true }
|
||||
|
||||
async-trait = "0.1.88"
|
||||
axum_session = { workspace = true }
|
||||
|
||||
@@ -33,4 +33,6 @@ pub enum AppError {
|
||||
Tiktoken(#[from] anyhow::Error),
|
||||
#[error("Ingress Processing error: {0}")]
|
||||
Processing(String),
|
||||
#[error("DOM smoothie error: {0}")]
|
||||
DomSmoothie(#[from] dom_smoothie::ReadabilityError),
|
||||
}
|
||||
|
||||
@@ -38,7 +38,8 @@ stored_object!(FileInfo, "file", {
|
||||
sha256: String,
|
||||
path: String,
|
||||
file_name: String,
|
||||
mime_type: String
|
||||
mime_type: String,
|
||||
user_id: String
|
||||
});
|
||||
|
||||
impl FileInfo {
|
||||
@@ -83,6 +84,7 @@ impl FileInfo {
|
||||
.to_string_lossy()
|
||||
.into(),
|
||||
mime_type: Self::guess_mime_type(Path::new(&sanitized_file_name)),
|
||||
user_id: user_id.to_string(),
|
||||
};
|
||||
|
||||
// Store in database
|
||||
@@ -258,6 +260,22 @@ impl FileInfo {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Retrieves a `FileInfo` by its ID.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `id` - The ID string of the file.
|
||||
/// * `db_client` - Reference to the SurrealDbClient.
|
||||
///
|
||||
/// # Returns
|
||||
/// * `Result<FileInfo, FileError>` - The `FileInfo` or an error if not found or on DB issues.
|
||||
pub async fn get_by_id(id: &str, db_client: &SurrealDbClient) -> Result<FileInfo, FileError> {
|
||||
match db_client.get_item::<FileInfo>(id).await {
|
||||
Ok(Some(file_info)) => Ok(file_info),
|
||||
Ok(None) => Err(FileError::FileNotFound(id.to_string())),
|
||||
Err(e) => Err(FileError::SurrealError(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -460,6 +478,7 @@ mod tests {
|
||||
id: Uuid::new_v4().to_string(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
user_id: "user123".to_string(),
|
||||
sha256: "test_sha256_hash".to_string(),
|
||||
path: "/path/to/file.txt".to_string(),
|
||||
file_name: "manual_file.txt".to_string(),
|
||||
@@ -517,6 +536,7 @@ mod tests {
|
||||
// The file path should point to our test file
|
||||
let file_info = FileInfo {
|
||||
id: file_id.clone(),
|
||||
user_id: "user123".to_string(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
sha256: "test_sha256_hash".to_string(),
|
||||
@@ -586,4 +606,72 @@ mod tests {
|
||||
_ => panic!("Expected FileNotFound error"),
|
||||
}
|
||||
}
|
||||
#[tokio::test]
|
||||
async fn test_get_by_id() {
|
||||
// Setup in-memory database for testing
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
// Create a FileInfo instance directly
|
||||
let now = Utc::now();
|
||||
let file_id = Uuid::new_v4().to_string();
|
||||
let original_file_info = FileInfo {
|
||||
id: file_id.clone(),
|
||||
user_id: "user123".to_string(),
|
||||
created_at: now,
|
||||
updated_at: now,
|
||||
sha256: "test_sha256_for_get_by_id".to_string(),
|
||||
path: "/path/to/get_by_id_test.txt".to_string(),
|
||||
file_name: "get_by_id_test.txt".to_string(),
|
||||
mime_type: "text/plain".to_string(),
|
||||
};
|
||||
|
||||
// Store it in the database
|
||||
db.store_item(original_file_info.clone())
|
||||
.await
|
||||
.expect("Failed to store item for get_by_id test");
|
||||
|
||||
// Retrieve it using get_by_id
|
||||
let result = FileInfo::get_by_id(&file_id, &db).await;
|
||||
|
||||
// Assert success and content match
|
||||
assert!(result.is_ok());
|
||||
let retrieved_info = result.unwrap();
|
||||
assert_eq!(retrieved_info.id, original_file_info.id);
|
||||
assert_eq!(retrieved_info.sha256, original_file_info.sha256);
|
||||
assert_eq!(retrieved_info.file_name, original_file_info.file_name);
|
||||
assert_eq!(retrieved_info.path, original_file_info.path);
|
||||
assert_eq!(retrieved_info.mime_type, original_file_info.mime_type);
|
||||
// Optionally compare timestamps if precision isn't an issue
|
||||
// assert_eq!(retrieved_info.created_at, original_file_info.created_at);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_by_id_not_found() {
|
||||
// Setup in-memory database for testing
|
||||
let namespace = "test_ns";
|
||||
let database = &Uuid::new_v4().to_string();
|
||||
let db = SurrealDbClient::memory(namespace, database)
|
||||
.await
|
||||
.expect("Failed to start in-memory surrealdb");
|
||||
|
||||
// Try to retrieve a non-existent ID
|
||||
let non_existent_id = "non-existent-file-id";
|
||||
let result = FileInfo::get_by_id(non_existent_id, &db).await;
|
||||
|
||||
// Assert failure
|
||||
assert!(result.is_err());
|
||||
|
||||
// Assert the specific error type is FileNotFound
|
||||
match result {
|
||||
Err(FileError::FileNotFound(id)) => {
|
||||
assert_eq!(id, non_existent_id);
|
||||
}
|
||||
Err(e) => panic!("Expected FileNotFound error, but got {:?}", e),
|
||||
Ok(_) => panic!("Expected an error, but got Ok"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -114,6 +114,7 @@ mod tests {
|
||||
id: mock.id,
|
||||
sha256: "mock-sha256".to_string(),
|
||||
path: "/mock/path".to_string(),
|
||||
user_id: "user123".to_string(),
|
||||
file_name: "mock.txt".to_string(),
|
||||
mime_type: "text/plain".to_string(),
|
||||
created_at: Utc::now(),
|
||||
|
||||
@@ -31,19 +31,7 @@ impl SystemSettings {
|
||||
let settings: Option<Self> = db.get_item("current").await?;
|
||||
|
||||
if settings.is_none() {
|
||||
let created_settings = SystemSettings {
|
||||
id: "current".to_string(),
|
||||
registrations_enabled: true,
|
||||
require_email_verification: false,
|
||||
query_model: "gpt-4o-mini".to_string(),
|
||||
processing_model: "gpt-4o-mini".to_string(),
|
||||
query_system_prompt:
|
||||
crate::storage::types::system_prompts::DEFAULT_QUERY_SYSTEM_PROMPT.to_string(),
|
||||
ingestion_system_prompt:
|
||||
crate::storage::types::system_prompts::DEFAULT_INGRESS_ANALYSIS_SYSTEM_PROMPT
|
||||
.to_string(),
|
||||
};
|
||||
|
||||
let created_settings = Self::new();
|
||||
let stored: Option<Self> = db.store_item(created_settings).await?;
|
||||
return stored.ok_or(AppError::Validation("Failed to initialize settings".into()));
|
||||
}
|
||||
|
||||
@@ -5,10 +5,17 @@ use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};
|
||||
|
||||
use super::file_info::FileInfo;
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
|
||||
pub struct UrlInfo {
|
||||
pub url: String,
|
||||
pub title: String,
|
||||
pub image_id: String,
|
||||
}
|
||||
|
||||
stored_object!(TextContent, "text_content", {
|
||||
text: String,
|
||||
file_info: Option<FileInfo>,
|
||||
url: Option<String>,
|
||||
url_info: Option<UrlInfo>,
|
||||
instructions: String,
|
||||
category: String,
|
||||
user_id: String
|
||||
@@ -20,7 +27,7 @@ impl TextContent {
|
||||
instructions: String,
|
||||
category: String,
|
||||
file_info: Option<FileInfo>,
|
||||
url: Option<String>,
|
||||
url_info: Option<UrlInfo>,
|
||||
user_id: String,
|
||||
) -> Self {
|
||||
let now = Utc::now();
|
||||
@@ -30,7 +37,7 @@ impl TextContent {
|
||||
updated_at: now,
|
||||
text,
|
||||
file_info,
|
||||
url,
|
||||
url_info,
|
||||
instructions,
|
||||
category,
|
||||
user_id,
|
||||
@@ -85,7 +92,7 @@ mod tests {
|
||||
assert_eq!(text_content.category, category);
|
||||
assert_eq!(text_content.user_id, user_id);
|
||||
assert!(text_content.file_info.is_none());
|
||||
assert!(text_content.url.is_none());
|
||||
assert!(text_content.url_info.is_none());
|
||||
assert!(!text_content.id.is_empty());
|
||||
}
|
||||
|
||||
@@ -96,19 +103,27 @@ mod tests {
|
||||
let instructions = "URL instructions".to_string();
|
||||
let category = "URL category".to_string();
|
||||
let user_id = "user123".to_string();
|
||||
let url = Some("https://example.com/document.pdf".to_string());
|
||||
let title = "page_title".to_string();
|
||||
let image_id = "image12312".to_string();
|
||||
let url = "https://example.com/document.pdf".to_string();
|
||||
|
||||
let url_info = Some(UrlInfo {
|
||||
url,
|
||||
title,
|
||||
image_id,
|
||||
});
|
||||
|
||||
let text_content = TextContent::new(
|
||||
text.clone(),
|
||||
instructions.clone(),
|
||||
category.clone(),
|
||||
None,
|
||||
url.clone(),
|
||||
url_info.clone(),
|
||||
user_id.clone(),
|
||||
);
|
||||
|
||||
// Check URL field is set
|
||||
assert_eq!(text_content.url, url);
|
||||
assert_eq!(text_content.url_info, url_info);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -31,6 +31,7 @@ tower-http = { version = "0.6.2", features = ["fs"] }
|
||||
chrono-tz = "0.10.1"
|
||||
tower-serve-static = "0.1.1"
|
||||
include_dir = "0.7.4"
|
||||
tokio-util = { version = "0.7.15", features = ["io"] }
|
||||
|
||||
common = { path = "../common" }
|
||||
composite-retrieval = { path = "../composite-retrieval" }
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
use axum::{
|
||||
body::Body,
|
||||
extract::{Path, State},
|
||||
http::{header, HeaderMap, HeaderValue, StatusCode},
|
||||
response::IntoResponse,
|
||||
};
|
||||
use serde::Serialize;
|
||||
use tokio::join;
|
||||
use tokio::{fs::File, join};
|
||||
use tokio_util::io::ReaderStream;
|
||||
|
||||
use crate::{
|
||||
middlewares::{
|
||||
@@ -15,9 +18,15 @@ use crate::{
|
||||
use common::{
|
||||
error::AppError,
|
||||
storage::types::{
|
||||
conversation::Conversation, file_info::FileInfo, ingestion_task::IngestionTask,
|
||||
knowledge_entity::KnowledgeEntity, knowledge_relationship::KnowledgeRelationship,
|
||||
text_chunk::TextChunk, text_content::TextContent, user::User,
|
||||
conversation::Conversation,
|
||||
file_info::{FileError, FileInfo},
|
||||
ingestion_task::IngestionTask,
|
||||
knowledge_entity::KnowledgeEntity,
|
||||
knowledge_relationship::KnowledgeRelationship,
|
||||
text_chunk::TextChunk,
|
||||
text_content::TextContent,
|
||||
user::User,
|
||||
StoredObject,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -167,3 +176,49 @@ pub async fn show_active_jobs(
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
pub async fn serve_file(
|
||||
State(state): State<HtmlState>,
|
||||
RequireUser(user): RequireUser,
|
||||
Path(file_id): Path<String>,
|
||||
) -> Result<impl IntoResponse, HtmlError> {
|
||||
let file_info = match FileInfo::get_by_id(&file_id, &state.db).await {
|
||||
Ok(info) => info,
|
||||
_ => return Ok(TemplateResponse::not_found().into_response()),
|
||||
};
|
||||
|
||||
if file_info.user_id != user.id {
|
||||
return Ok(TemplateResponse::unauthorized().into_response());
|
||||
}
|
||||
|
||||
// 3. Open the file asynchronously from the stored path
|
||||
let path = std::path::Path::new(&file_info.path);
|
||||
|
||||
let file = match File::open(path).await {
|
||||
Ok(f) => f,
|
||||
Err(e) => return Ok(TemplateResponse::server_error().into_response()),
|
||||
};
|
||||
|
||||
let stream = ReaderStream::new(file);
|
||||
let body = Body::from_stream(stream);
|
||||
|
||||
let mut headers = HeaderMap::new();
|
||||
headers.insert(
|
||||
header::CONTENT_TYPE,
|
||||
HeaderValue::from_str(&file_info.mime_type)
|
||||
.unwrap_or_else(|_| HeaderValue::from_static("application/octet-stream")),
|
||||
);
|
||||
let Ok(disposition_value) =
|
||||
HeaderValue::from_str(&format!("attachment; filename=\"{}\"", file_info.file_name))
|
||||
else {
|
||||
headers.insert(
|
||||
header::CONTENT_DISPOSITION,
|
||||
HeaderValue::from_static("attachment"),
|
||||
);
|
||||
return Ok((StatusCode::OK, headers, body).into_response());
|
||||
};
|
||||
headers.insert(header::CONTENT_DISPOSITION, disposition_value);
|
||||
|
||||
// 5. Return the response
|
||||
Ok((StatusCode::OK, headers, body).into_response())
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use axum::{
|
||||
routing::{delete, get},
|
||||
Router,
|
||||
};
|
||||
use handlers::{delete_job, delete_text_content, index_handler, show_active_jobs};
|
||||
use handlers::{delete_job, delete_text_content, index_handler, serve_file, show_active_jobs};
|
||||
|
||||
use crate::html_state::HtmlState;
|
||||
|
||||
@@ -26,4 +26,5 @@ where
|
||||
.route("/jobs/{job_id}", delete(delete_job))
|
||||
.route("/active-jobs", get(show_active_jobs))
|
||||
.route("/text-content/{id}", delete(delete_text_content))
|
||||
.route("/file/{id}", get(serve_file))
|
||||
}
|
||||
|
||||
@@ -54,9 +54,9 @@
|
||||
<select name="query_model" class="select select-bordered w-full">
|
||||
<option value="gpt-4o-mini" {% if settings.query_model=="gpt-4o-mini" %}selected{% endif %}>GPT-4o Mini
|
||||
</option>
|
||||
<option value="gpt-4o" {% if settings.query_model=="gpt-4o" %}selected{% endif %}>GPT-4o</option>
|
||||
<option value="gpt-3.5-turbo" {% if settings.query_model=="gpt-3.5-turbo" %}selected{% endif %}>GPT-3.5
|
||||
Turbo</option>
|
||||
<option value="gpt-4.1" {% if settings.query_model=="gpt-4.1" %}selected{% endif %}>GPT-4.1</option>
|
||||
<option value="gpt-4.1-mini" {% if settings.query_model=="gpt-4.1-mini" %}selected{% endif %}>GPT-4.1-mini
|
||||
</option>
|
||||
</select>
|
||||
<p class="text-xs text-gray-500 mt-1">Model used for answering user queries</p>
|
||||
</div>
|
||||
@@ -66,11 +66,11 @@
|
||||
<span class="label-text">Processing Model</span>
|
||||
</label>
|
||||
<select name="processing_model" class="select select-bordered w-full">
|
||||
<option value="gpt-4o-mini" {% if settings.processing_model=="gpt-4o-mini" %}selected{% endif %}>GPT-4o Mini
|
||||
{# Options of the processing-model select: each one must be compared against settings.processing_model so the saved value is pre-selected. #}
<option value="gpt-4o-mini" {% if settings.processing_model=="gpt-4o-mini" %}selected{% endif %}>GPT-4o Mini
</option>
<option value="gpt-4.1" {% if settings.processing_model=="gpt-4.1" %}selected{% endif %}>GPT-4.1</option>
<option value="gpt-4.1-mini" {% if settings.processing_model=="gpt-4.1-mini" %}selected{% endif %}>GPT-4.1-mini
</option>
|
||||
<option value="gpt-4o" {% if settings.processing_model=="gpt-4o" %}selected{% endif %}>GPT-4o</option>
|
||||
<option value="gpt-3.5-turbo" {% if settings.processing_model=="gpt-3.5-turbo" %}selected{% endif %}>GPT-3.5
|
||||
Turbo</option>
|
||||
</select>
|
||||
<p class="text-xs text-gray-500 mt-1">Model used for content processing and ingestion</p>
|
||||
</div>
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
<div class="grid sm:grid-cols-2 lg:grid-cols-3 gap-4" id="text_content_cards">
|
||||
{% for text_content in text_contents %}
|
||||
<div class="card min-w-72 bg-base-100 shadow">
|
||||
{# Only content ingested from a URL has a screenshot; skip the image otherwise to avoid a broken /file/ request. #}
{% if text_content.url_info %}
<img class="rounded-t-md overflow-clip" src="/file/{{text_content.url_info.image_id}}" alt="Page screenshot" />
{% endif %}
|
||||
<div class="card-body">
|
||||
<div class="flex justify-between space-x-2">
|
||||
<h2 class="card-title truncate">
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
{% for item in latest_text_contents %}
|
||||
<li class="list-row">
|
||||
<div class="bg-accent rounded-box size-10 flex justify-center items-center text-accent-content">
|
||||
{% if item.url %}
|
||||
{% if item.url_info %}
|
||||
{% include "icons/globe_icon.html" %}
|
||||
{% elif item.file_info %}
|
||||
{% include "icons/document_icon.html" %}
|
||||
@@ -14,8 +14,8 @@
|
||||
</div>
|
||||
<div>
|
||||
<div class="truncate max-w-[160px]">
|
||||
{% if item.url %}
|
||||
{{item.url}}
|
||||
{% if item.url_info %}
|
||||
{{item.url_info.title}}
|
||||
{% elif item.file_info%}
|
||||
{{item.file_info.file_name}}
|
||||
{% else %}
|
||||
|
||||
@@ -31,6 +31,10 @@
|
||||
</a>
|
||||
</li>
|
||||
{% endfor %}
|
||||
<li>
|
||||
<button class="btn btn-primary" hx-get="/ingress-form" hx-target="#modal" hx-swap="innerHTML">Add
|
||||
Content</button>
|
||||
</li>
|
||||
<div class="divider "></div>
|
||||
</div>
|
||||
|
||||
|
||||
@@ -13,14 +13,17 @@ serde_json = { workspace = true }
|
||||
futures = { workspace = true }
|
||||
async-openai = { workspace = true }
|
||||
surrealdb = { workspace = true }
|
||||
dom_smoothie = { workspace = true }
|
||||
tempfile = { workspace = true }
|
||||
axum_typed_multipart = { workspace = true}
|
||||
|
||||
tiktoken-rs = "0.6.0"
|
||||
reqwest = {version = "0.12.12", features = ["charset", "json"]}
|
||||
scraper = "0.22.0"
|
||||
chrono = { version = "0.4.39", features = ["serde"] }
|
||||
text-splitter = "0.18.1"
|
||||
url = { version = "2.5.2", features = ["serde"] }
|
||||
uuid = { version = "1.10.0", features = ["v4", "serde"] }
|
||||
dom_smoothie = "0.10.0"
|
||||
|
||||
headless_chrome = { git = "https://github.com/rust-headless-chrome/rust-headless-chrome", features = ["fetch"] }
|
||||
|
||||
common = { path = "../common" }
|
||||
composite-retrieval = { path = "../composite-retrieval" }
|
||||
|
||||
@@ -111,7 +111,7 @@ impl IngestionEnricher {
|
||||
let request = CreateChatCompletionRequestArgs::default()
|
||||
.model(&settings.processing_model)
|
||||
.temperature(0.2)
|
||||
.max_tokens(3048u32)
|
||||
.max_tokens(6048u32)
|
||||
.messages([
|
||||
ChatCompletionRequestSystemMessage::from(INGRESS_ANALYSIS_SYSTEM_MESSAGE).into(),
|
||||
ChatCompletionRequestUserMessage::from(user_message).into(),
|
||||
|
||||
@@ -53,7 +53,7 @@ impl IngestionPipeline {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let text_content = to_text_content(task.content, &self.openai_client, &self.db).await?;
|
||||
let text_content = to_text_content(task.content, &self.db).await?;
|
||||
|
||||
match self.process(&text_content).await {
|
||||
Ok(_) => {
|
||||
|
||||
@@ -1,29 +1,28 @@
|
||||
pub mod llm_enrichment_result;
|
||||
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use std::io::Write;
|
||||
use std::time::Instant;
|
||||
|
||||
use async_openai::types::{
|
||||
ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage,
|
||||
CreateChatCompletionRequestArgs,
|
||||
};
|
||||
use axum::http::HeaderMap;
|
||||
use axum_typed_multipart::{FieldData, FieldMetadata};
|
||||
use chrono::Utc;
|
||||
use common::storage::db::SurrealDbClient;
|
||||
use common::{
|
||||
error::AppError,
|
||||
storage::types::{
|
||||
file_info::FileInfo, ingestion_payload::IngestionPayload, system_settings::SystemSettings,
|
||||
text_content::TextContent,
|
||||
file_info::FileInfo,
|
||||
ingestion_payload::IngestionPayload,
|
||||
text_content::{TextContent, UrlInfo},
|
||||
},
|
||||
};
|
||||
use dom_smoothie::TextMode;
|
||||
use reqwest;
|
||||
use scraper::{Html, Selector};
|
||||
use std::fmt::Write;
|
||||
use tiktoken_rs::{o200k_base, CoreBPE};
|
||||
use dom_smoothie::{Article, Readability, TextMode};
|
||||
use headless_chrome::Browser;
|
||||
use tempfile::NamedTempFile;
|
||||
use tracing::{error, info};
|
||||
|
||||
pub async fn to_text_content(
|
||||
ingestion_payload: IngestionPayload,
|
||||
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
||||
db_client: &Arc<SurrealDbClient>,
|
||||
db: &SurrealDbClient,
|
||||
) -> Result<TextContent, AppError> {
|
||||
match ingestion_payload {
|
||||
IngestionPayload::Url {
|
||||
@@ -32,13 +31,17 @@ pub async fn to_text_content(
|
||||
category,
|
||||
user_id,
|
||||
} => {
|
||||
let text = fetch_text_from_url(&url, openai_client, db_client).await?;
|
||||
let (article, file_info) = fetch_article_from_url(&url, db, &user_id).await?;
|
||||
Ok(TextContent::new(
|
||||
text,
|
||||
article.text_content.into(),
|
||||
instructions,
|
||||
category,
|
||||
None,
|
||||
Some(url),
|
||||
Some(UrlInfo {
|
||||
url,
|
||||
title: article.title,
|
||||
image_id: file_info.id,
|
||||
}),
|
||||
user_id,
|
||||
))
|
||||
}
|
||||
@@ -73,161 +76,104 @@ pub async fn to_text_content(
|
||||
}
|
||||
}
|
||||
}
|
||||
use std::io::{Seek, SeekFrom}; // Needed to rewind the screenshot temp file before it is stored.
|
||||
|
||||
/// Get text from url, will return it as a markdown formatted string
|
||||
async fn fetch_text_from_url(
|
||||
/// Fetches web content from a URL, extracts the main article text as Markdown,
|
||||
/// captures a screenshot, and stores the screenshot returning [`FileInfo`].
|
||||
///
|
||||
/// This function handles browser automation, content extraction via Readability,
|
||||
/// screenshot capture, temporary file handling, and persisting the screenshot
|
||||
/// details (including deduplication based on content hash via [`FileInfo::new`]).
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `url` - The URL of the web page to fetch.
|
||||
/// * `db` - A reference to the database client (`SurrealDbClient`).
|
||||
/// * `user_id` - The ID of the user performing the action, used for associating the file.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A `Result` containing:
|
||||
/// * Ok: A tuple `(Article, FileInfo)` where `Article` contains the parsed markdown
|
||||
/// content and metadata, and `FileInfo` contains the details of the stored screenshot.
|
||||
/// * Err: An `AppError` if any step fails (navigation, screenshot, file handling, DB operation).
|
||||
async fn fetch_article_from_url(
|
||||
url: &str,
|
||||
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
||||
db_client: &Arc<SurrealDbClient>,
|
||||
) -> Result<String, AppError> {
|
||||
// Use a client with timeouts and reuse
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()?;
|
||||
let response = client.get(url).send().await?.text().await?;
|
||||
db: &SurrealDbClient,
|
||||
user_id: &str,
|
||||
) -> Result<(Article, FileInfo), AppError> {
|
||||
info!("Fetching URL: {}", url);
|
||||
// Instantiate timer
|
||||
let now = Instant::now();
|
||||
// Setup browser, navigate and wait
|
||||
let browser = Browser::default()?;
|
||||
let tab = browser.new_tab()?;
|
||||
let page = tab.navigate_to(url)?;
|
||||
let loaded_page = page.wait_until_navigated()?;
|
||||
// Get content
|
||||
let raw_content = loaded_page.get_content()?;
|
||||
// Get screenshot
|
||||
let screenshot = loaded_page.capture_screenshot(
|
||||
headless_chrome::protocol::cdp::Page::CaptureScreenshotFormatOption::Jpeg,
|
||||
None,
|
||||
None,
|
||||
true,
|
||||
)?;
|
||||
|
||||
// Preallocate string with capacity
|
||||
let mut structured_content = String::with_capacity(response.len() / 2);
|
||||
// Create temp file
|
||||
let mut tmp_file = NamedTempFile::new()?;
|
||||
let temp_path_str = format!("{:?}", tmp_file.path());
|
||||
|
||||
let document = Html::parse_document(&response);
|
||||
let main_selectors = Selector::parse(
|
||||
"article, main, .article-content, .post-content, .entry-content, [role='main']",
|
||||
)
|
||||
.unwrap();
|
||||
// Write screenshot TO the temp file
|
||||
tmp_file.write_all(&screenshot)?;
|
||||
|
||||
let content_element = document
|
||||
.select(&main_selectors)
|
||||
.next()
|
||||
.or_else(|| document.select(&Selector::parse("body").unwrap()).next())
|
||||
.ok_or(AppError::NotFound("No content found".into()))?;
|
||||
// Ensure the OS buffer is written to the file system _before_ we proceed.
|
||||
tmp_file.as_file().sync_all()?;
|
||||
|
||||
// Compile selectors once
|
||||
let heading_selector = Selector::parse("h1, h2, h3").unwrap();
|
||||
let paragraph_selector = Selector::parse("p").unwrap();
|
||||
|
||||
// Process content in one pass
|
||||
for element in content_element.select(&heading_selector) {
|
||||
let _ = writeln!(
|
||||
structured_content,
|
||||
"<heading>{}</heading>",
|
||||
element.text().collect::<String>().trim()
|
||||
);
|
||||
}
|
||||
for element in content_element.select(¶graph_selector) {
|
||||
let _ = writeln!(
|
||||
structured_content,
|
||||
"<paragraph>{}</paragraph>",
|
||||
element.text().collect::<String>().trim()
|
||||
);
|
||||
// Ensure the file handle's read cursor is at the beginning before hashing occurs.
|
||||
if let Err(e) = tmp_file.seek(SeekFrom::Start(0)) {
|
||||
error!("URL: {}. Failed to seek temp file {} to start: {:?}. Proceeding, but hashing might fail.", url, temp_path_str, e);
|
||||
}
|
||||
|
||||
let content = structured_content
|
||||
.replace(|c: char| c.is_control(), " ")
|
||||
.replace(" ", " ");
|
||||
// Prepare file metadata
|
||||
let parsed_url =
|
||||
url::Url::parse(url).map_err(|_| AppError::Processing("Invalid URL".to_string()))?;
|
||||
let domain = parsed_url
|
||||
.host_str()
|
||||
.unwrap_or("unknown")
|
||||
.replace(|c: char| !c.is_alphanumeric(), "_");
|
||||
let timestamp = Utc::now().format("%Y%m%d%H%M%S");
|
||||
let file_name = format!("{}_{}_{}.jpg", domain, "screenshot", timestamp);
|
||||
|
||||
process_web_content(content, openai_client, db_client).await
|
||||
|
||||
// let config = dom_smoothie::Config {
|
||||
// text_mode: TextMode::Markdown,
|
||||
// ..Default::default()
|
||||
// };
|
||||
// panic!("YOU SHALL NOT PASS");
|
||||
}
|
||||
|
||||
pub async fn process_web_content(
|
||||
content: String,
|
||||
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
||||
db_client: &Arc<SurrealDbClient>,
|
||||
) -> Result<String, AppError> {
|
||||
const MAX_TOKENS: usize = 122000;
|
||||
const SYSTEM_PROMPT: &str = r#"
|
||||
You are a precise content extractor for web pages. Your task:
|
||||
|
||||
1. Extract ONLY the main article/content from the provided text
|
||||
2. Maintain the original content - do not summarize or modify the core information
|
||||
3. Ignore peripheral content such as:
|
||||
- Navigation elements
|
||||
- Error messages (e.g., "JavaScript required")
|
||||
- Related articles sections
|
||||
- Comments
|
||||
- Social media links
|
||||
- Advertisement text
|
||||
|
||||
FORMAT:
|
||||
- Convert <heading> tags to markdown headings (#, ##, ###)
|
||||
- Convert <paragraph> tags to markdown paragraphs
|
||||
- Preserve quotes and important formatting
|
||||
- Remove duplicate content
|
||||
- Remove any metadata or technical artifacts
|
||||
|
||||
OUTPUT RULES:
|
||||
- Output ONLY the cleaned content in markdown
|
||||
- Do not add any explanations or meta-commentary
|
||||
- Do not add summaries or conclusions
|
||||
- Do not use any XML/HTML tags in the output
|
||||
"#;
|
||||
|
||||
let bpe = o200k_base()?;
|
||||
let settings = SystemSettings::get_current(db_client).await?;
|
||||
|
||||
// Process content in chunks if needed
|
||||
let truncated_content = if bpe.encode_with_special_tokens(&content).len() > MAX_TOKENS {
|
||||
truncate_content(&content, MAX_TOKENS, &bpe)?
|
||||
} else {
|
||||
content
|
||||
// Construct FieldData and FieldMetadata
|
||||
let metadata = FieldMetadata {
|
||||
file_name: Some(file_name),
|
||||
content_type: Some("image/jpeg".to_string()),
|
||||
name: None,
|
||||
headers: HeaderMap::new(),
|
||||
};
|
||||
let field_data = FieldData {
|
||||
contents: tmp_file,
|
||||
metadata,
|
||||
};
|
||||
|
||||
let request = CreateChatCompletionRequestArgs::default()
|
||||
.model(&settings.processing_model)
|
||||
.temperature(0.0)
|
||||
.max_tokens(16200u32)
|
||||
.messages([
|
||||
ChatCompletionRequestSystemMessage::from(SYSTEM_PROMPT).into(),
|
||||
ChatCompletionRequestUserMessage::from(truncated_content).into(),
|
||||
])
|
||||
.build()?;
|
||||
// Store screenshot
|
||||
let file_info = FileInfo::new(field_data, db, user_id).await?;
|
||||
|
||||
let response = openai_client.chat().create(request).await?;
|
||||
// Parse content...
|
||||
let config = dom_smoothie::Config {
|
||||
text_mode: TextMode::Markdown,
|
||||
..Default::default()
|
||||
};
|
||||
let mut readability = Readability::new(raw_content, None, Some(config))?;
|
||||
let article: Article = readability.parse()?;
|
||||
let end = now.elapsed();
|
||||
info!(
|
||||
"URL: {}. Total time: {:?}. Final File ID: {}",
|
||||
url, end, file_info.id
|
||||
);
|
||||
|
||||
// Extract and return the content
|
||||
response
|
||||
.choices
|
||||
.first()
|
||||
.and_then(|choice| choice.message.content.clone())
|
||||
.ok_or(AppError::LLMParsing(
|
||||
"No content found in LLM response".into(),
|
||||
))
|
||||
}
|
||||
|
||||
fn truncate_content(
|
||||
content: &str,
|
||||
max_tokens: usize,
|
||||
tokenizer: &CoreBPE,
|
||||
) -> Result<String, AppError> {
|
||||
// Pre-allocate with estimated size
|
||||
let mut result = String::with_capacity(content.len() / 2);
|
||||
let mut current_tokens = 0;
|
||||
|
||||
// Process content by paragraph to maintain context
|
||||
for paragraph in content.split("\n\n") {
|
||||
let tokens = tokenizer.encode_with_special_tokens(paragraph).len();
|
||||
|
||||
// Check if adding paragraph exceeds limit
|
||||
if current_tokens + tokens > max_tokens {
|
||||
break;
|
||||
}
|
||||
|
||||
result.push_str(paragraph);
|
||||
result.push_str("\n\n");
|
||||
current_tokens += tokens;
|
||||
}
|
||||
|
||||
// Ensure we return valid content
|
||||
if result.is_empty() {
|
||||
return Err(AppError::Processing("Content exceeds token limit".into()));
|
||||
}
|
||||
|
||||
Ok(result.trim_end().to_string())
|
||||
Ok((article, file_info))
|
||||
}
|
||||
|
||||
/// Extracts text from a file based on its MIME type.
|
||||
|
||||
@@ -50,7 +50,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Create Axum router
|
||||
let app = Router::new()
|
||||
.nest("/api/v1", api_routes_v1(&api_state))
|
||||
.nest("/", html_routes(&html_state))
|
||||
.merge(html_routes(&html_state))
|
||||
.with_state(AppState {
|
||||
api_state,
|
||||
html_state,
|
||||
|
||||
Reference in New Issue
Block a user