mirror of
https://github.com/perstarkse/minne.git
synced 2026-06-12 09:14:27 +02:00
feat: readability parsing, screenshot of page, file serving
This commit is contained in:
Generated
+399
-108
@@ -201,6 +201,9 @@ name = "arbitrary"
|
|||||||
version = "1.4.1"
|
version = "1.4.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223"
|
checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223"
|
||||||
|
dependencies = [
|
||||||
|
"derive_arbitrary",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "argon2"
|
name = "argon2"
|
||||||
@@ -451,6 +454,20 @@ dependencies = [
|
|||||||
"syn 2.0.100",
|
"syn 2.0.100",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "auto_generate_cdp"
|
||||||
|
version = "0.4.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d6e1961a0d5d77969057eba90d448e610d3c439024d135d9dbd98e33ec973520"
|
||||||
|
dependencies = [
|
||||||
|
"convert_case 0.4.0",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"ureq 2.12.1",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "autocfg"
|
name = "autocfg"
|
||||||
version = "1.4.0"
|
version = "1.4.0"
|
||||||
@@ -835,17 +852,6 @@ dependencies = [
|
|||||||
"syn 2.0.100",
|
"syn 2.0.100",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "bstr"
|
|
||||||
version = "1.12.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4"
|
|
||||||
dependencies = [
|
|
||||||
"memchr",
|
|
||||||
"regex-automata 0.4.9",
|
|
||||||
"serde",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bumpalo"
|
name = "bumpalo"
|
||||||
version = "3.17.0"
|
version = "3.17.0"
|
||||||
@@ -895,6 +901,25 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bzip2"
|
||||||
|
version = "0.5.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47"
|
||||||
|
dependencies = [
|
||||||
|
"bzip2-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bzip2-sys"
|
||||||
|
version = "0.1.13+1.0.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"pkg-config",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "castaway"
|
name = "castaway"
|
||||||
version = "0.2.3"
|
version = "0.2.3"
|
||||||
@@ -910,6 +935,8 @@ version = "1.2.19"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362"
|
checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"jobserver",
|
||||||
|
"libc",
|
||||||
"shlex",
|
"shlex",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -1070,6 +1097,7 @@ dependencies = [
|
|||||||
"chrono",
|
"chrono",
|
||||||
"chrono-tz",
|
"chrono-tz",
|
||||||
"config",
|
"config",
|
||||||
|
"dom_smoothie",
|
||||||
"futures",
|
"futures",
|
||||||
"mime",
|
"mime",
|
||||||
"mime_guess",
|
"mime_guess",
|
||||||
@@ -1124,7 +1152,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "595aae20e65c3be792d05818e8c63025294ac3cb7e200f11459063a352a6ef80"
|
checksum = "595aae20e65c3be792d05818e8c63025294ac3cb7e200f11459063a352a6ef80"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"convert_case",
|
"convert_case 0.6.0",
|
||||||
"json5",
|
"json5",
|
||||||
"pathdiff",
|
"pathdiff",
|
||||||
"ron",
|
"ron",
|
||||||
@@ -1162,6 +1190,12 @@ version = "0.3.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
|
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "convert_case"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "convert_case"
|
name = "convert_case"
|
||||||
version = "0.6.0"
|
version = "0.6.0"
|
||||||
@@ -1221,6 +1255,21 @@ dependencies = [
|
|||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "crc"
|
||||||
|
version = "3.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636"
|
||||||
|
dependencies = [
|
||||||
|
"crc-catalog",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "crc-catalog"
|
||||||
|
version = "2.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crc32fast"
|
name = "crc32fast"
|
||||||
version = "1.4.2"
|
version = "1.4.2"
|
||||||
@@ -1385,6 +1434,12 @@ version = "2.9.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476"
|
checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "deflate64"
|
||||||
|
version = "0.1.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "da692b8d1080ea3045efaab14434d40468c3d8657e42abddfffca87b428f4c1b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "deranged"
|
name = "deranged"
|
||||||
version = "0.4.0"
|
version = "0.4.0"
|
||||||
@@ -1395,6 +1450,17 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "derive_arbitrary"
|
||||||
|
version = "1.4.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.100",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "derive_builder"
|
name = "derive_builder"
|
||||||
version = "0.20.2"
|
version = "0.20.2"
|
||||||
@@ -1465,6 +1531,15 @@ dependencies = [
|
|||||||
"subtle",
|
"subtle",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "directories"
|
||||||
|
version = "6.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "16f5094c54661b38d03bd7e50df373292118db60b585c08a411c6d840017fe7d"
|
||||||
|
dependencies = [
|
||||||
|
"dirs-sys",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "dirs-next"
|
name = "dirs-next"
|
||||||
version = "2.0.0"
|
version = "2.0.0"
|
||||||
@@ -1475,6 +1550,18 @@ dependencies = [
|
|||||||
"dirs-sys-next",
|
"dirs-sys-next",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dirs-sys"
|
||||||
|
version = "0.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"option-ext",
|
||||||
|
"redox_users 0.5.0",
|
||||||
|
"windows-sys 0.59.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "dirs-sys-next"
|
name = "dirs-sys-next"
|
||||||
version = "0.1.2"
|
version = "0.1.2"
|
||||||
@@ -1482,7 +1569,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
|
checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"redox_users",
|
"redox_users 0.4.6",
|
||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -1588,12 +1675,6 @@ dependencies = [
|
|||||||
"num-traits",
|
"num-traits",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "ego-tree"
|
|
||||||
version = "0.10.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "either"
|
name = "either"
|
||||||
version = "1.15.0"
|
version = "1.15.0"
|
||||||
@@ -1624,6 +1705,12 @@ version = "0.1.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
|
checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "env_home"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c7f84e12ccf0a7ddc17a6c41c93326024c42920d7ee630d04950e6926645c0fe"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "equivalent"
|
name = "equivalent"
|
||||||
version = "1.0.2"
|
version = "1.0.2"
|
||||||
@@ -1695,17 +1782,6 @@ dependencies = [
|
|||||||
"tempfile",
|
"tempfile",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "fancy-regex"
|
|
||||||
version = "0.13.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
|
|
||||||
dependencies = [
|
|
||||||
"bit-set 0.5.3",
|
|
||||||
"regex-automata 0.4.9",
|
|
||||||
"regex-syntax 0.8.5",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fastrand"
|
name = "fastrand"
|
||||||
version = "2.3.0"
|
version = "2.3.0"
|
||||||
@@ -1736,6 +1812,16 @@ version = "0.4.7"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b7ac824320a75a52197e8f2d787f6a38b6718bb6897a35142d749af3c0e8f4fe"
|
checksum = "b7ac824320a75a52197e8f2d787f6a38b6718bb6897a35142d749af3c0e8f4fe"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "flate2"
|
||||||
|
version = "1.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece"
|
||||||
|
dependencies = [
|
||||||
|
"crc32fast",
|
||||||
|
"miniz_oxide",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "float_next_after"
|
name = "float_next_after"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
@@ -1995,15 +2081,6 @@ dependencies = [
|
|||||||
"libm",
|
"libm",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "getopts"
|
|
||||||
version = "0.2.21"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
|
|
||||||
dependencies = [
|
|
||||||
"unicode-width",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "getrandom"
|
name = "getrandom"
|
||||||
version = "0.2.16"
|
version = "0.2.16"
|
||||||
@@ -2126,6 +2203,32 @@ dependencies = [
|
|||||||
"hashbrown 0.15.2",
|
"hashbrown 0.15.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "headless_chrome"
|
||||||
|
version = "1.0.17"
|
||||||
|
source = "git+https://github.com/rust-headless-chrome/rust-headless-chrome#8b66992826245cbf60377d619fc780f8c45abf8e"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"auto_generate_cdp",
|
||||||
|
"base64 0.22.1",
|
||||||
|
"derive_builder",
|
||||||
|
"directories",
|
||||||
|
"log",
|
||||||
|
"rand 0.9.1",
|
||||||
|
"regex",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"tempfile",
|
||||||
|
"thiserror 2.0.12",
|
||||||
|
"tungstenite 0.26.2",
|
||||||
|
"ureq 3.0.11",
|
||||||
|
"url",
|
||||||
|
"walkdir",
|
||||||
|
"which",
|
||||||
|
"winreg",
|
||||||
|
"zip",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "heapless"
|
name = "heapless"
|
||||||
version = "0.8.0"
|
version = "0.8.0"
|
||||||
@@ -2207,23 +2310,12 @@ dependencies = [
|
|||||||
"tempfile",
|
"tempfile",
|
||||||
"thiserror 1.0.69",
|
"thiserror 1.0.69",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
"tokio-util",
|
||||||
"tower-http",
|
"tower-http",
|
||||||
"tower-serve-static",
|
"tower-serve-static",
|
||||||
"tracing",
|
"tracing",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "html5ever"
|
|
||||||
version = "0.29.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c"
|
|
||||||
dependencies = [
|
|
||||||
"log",
|
|
||||||
"mac",
|
|
||||||
"markup5ever 0.14.1",
|
|
||||||
"match_token",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "html5ever"
|
name = "html5ever"
|
||||||
version = "0.30.0"
|
version = "0.30.0"
|
||||||
@@ -2607,20 +2699,22 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"async-openai",
|
"async-openai",
|
||||||
"axum",
|
"axum",
|
||||||
|
"axum_typed_multipart",
|
||||||
"chrono",
|
"chrono",
|
||||||
"common",
|
"common",
|
||||||
"composite-retrieval",
|
"composite-retrieval",
|
||||||
"dom_smoothie",
|
"dom_smoothie",
|
||||||
"futures",
|
"futures",
|
||||||
|
"headless_chrome",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"scraper",
|
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"surrealdb",
|
"surrealdb",
|
||||||
|
"tempfile",
|
||||||
"text-splitter",
|
"text-splitter",
|
||||||
"tiktoken-rs",
|
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
"url",
|
||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -2701,6 +2795,16 @@ version = "1.0.15"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "jobserver"
|
||||||
|
version = "0.1.33"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a"
|
||||||
|
dependencies = [
|
||||||
|
"getrandom 0.3.2",
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "js-sys"
|
name = "js-sys"
|
||||||
version = "0.3.77"
|
version = "0.3.77"
|
||||||
@@ -2882,6 +2986,27 @@ dependencies = [
|
|||||||
"hashbrown 0.15.2",
|
"hashbrown 0.15.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lzma-rs"
|
||||||
|
version = "0.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"crc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lzma-sys"
|
||||||
|
version = "0.1.20"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"libc",
|
||||||
|
"pkg-config",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mac"
|
name = "mac"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
@@ -2915,20 +3040,6 @@ version = "1.0.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
|
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "markup5ever"
|
|
||||||
version = "0.14.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18"
|
|
||||||
dependencies = [
|
|
||||||
"log",
|
|
||||||
"phf",
|
|
||||||
"phf_codegen",
|
|
||||||
"string_cache",
|
|
||||||
"string_cache_codegen",
|
|
||||||
"tendril",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "markup5ever"
|
name = "markup5ever"
|
||||||
version = "0.15.0"
|
version = "0.15.0"
|
||||||
@@ -3422,6 +3533,12 @@ dependencies = [
|
|||||||
"vcpkg",
|
"vcpkg",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "option-ext"
|
||||||
|
version = "0.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ordered-multimap"
|
name = "ordered-multimap"
|
||||||
version = "0.7.3"
|
version = "0.7.3"
|
||||||
@@ -3869,7 +3986,7 @@ dependencies = [
|
|||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"quinn-proto",
|
"quinn-proto",
|
||||||
"quinn-udp",
|
"quinn-udp",
|
||||||
"rustc-hash 2.1.1",
|
"rustc-hash",
|
||||||
"rustls",
|
"rustls",
|
||||||
"socket2",
|
"socket2",
|
||||||
"thiserror 2.0.12",
|
"thiserror 2.0.12",
|
||||||
@@ -3888,7 +4005,7 @@ dependencies = [
|
|||||||
"getrandom 0.3.2",
|
"getrandom 0.3.2",
|
||||||
"rand 0.9.1",
|
"rand 0.9.1",
|
||||||
"ring",
|
"ring",
|
||||||
"rustc-hash 2.1.1",
|
"rustc-hash",
|
||||||
"rustls",
|
"rustls",
|
||||||
"rustls-pki-types",
|
"rustls-pki-types",
|
||||||
"slab",
|
"slab",
|
||||||
@@ -4055,6 +4172,17 @@ dependencies = [
|
|||||||
"thiserror 1.0.69",
|
"thiserror 1.0.69",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "redox_users"
|
||||||
|
version = "0.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b"
|
||||||
|
dependencies = [
|
||||||
|
"getrandom 0.2.16",
|
||||||
|
"libredox",
|
||||||
|
"thiserror 2.0.12",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ref-cast"
|
name = "ref-cast"
|
||||||
version = "1.0.24"
|
version = "1.0.24"
|
||||||
@@ -4284,7 +4412,7 @@ dependencies = [
|
|||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"rinja_parser",
|
"rinja_parser",
|
||||||
"rustc-hash 2.1.1",
|
"rustc-hash",
|
||||||
"serde",
|
"serde",
|
||||||
"syn 2.0.100",
|
"syn 2.0.100",
|
||||||
]
|
]
|
||||||
@@ -4444,12 +4572,6 @@ version = "0.1.24"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
|
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "rustc-hash"
|
|
||||||
version = "1.1.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustc-hash"
|
name = "rustc-hash"
|
||||||
version = "2.1.1"
|
version = "2.1.1"
|
||||||
@@ -4588,21 +4710,6 @@ version = "1.2.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "scraper"
|
|
||||||
version = "0.22.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "cc3d051b884f40e309de6c149734eab57aa8cc1347992710dc80bcc1c2194c15"
|
|
||||||
dependencies = [
|
|
||||||
"cssparser 0.34.0",
|
|
||||||
"ego-tree",
|
|
||||||
"getopts",
|
|
||||||
"html5ever 0.29.1",
|
|
||||||
"precomputed-hash",
|
|
||||||
"selectors",
|
|
||||||
"tendril",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "scrypt"
|
name = "scrypt"
|
||||||
version = "0.11.0"
|
version = "0.11.0"
|
||||||
@@ -4888,6 +4995,12 @@ dependencies = [
|
|||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "simd-adler32"
|
||||||
|
version = "0.3.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "simdutf8"
|
name = "simdutf8"
|
||||||
version = "0.1.5"
|
version = "0.1.5"
|
||||||
@@ -4974,6 +5087,17 @@ dependencies = [
|
|||||||
"windows-sys 0.52.0",
|
"windows-sys 0.52.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "socks"
|
||||||
|
version = "0.3.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"libc",
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "spade"
|
name = "spade"
|
||||||
version = "2.13.1"
|
version = "2.13.1"
|
||||||
@@ -5427,22 +5551,6 @@ dependencies = [
|
|||||||
"once_cell",
|
"once_cell",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "tiktoken-rs"
|
|
||||||
version = "0.6.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6"
|
|
||||||
dependencies = [
|
|
||||||
"anyhow",
|
|
||||||
"base64 0.21.7",
|
|
||||||
"bstr",
|
|
||||||
"fancy-regex",
|
|
||||||
"lazy_static",
|
|
||||||
"parking_lot",
|
|
||||||
"regex",
|
|
||||||
"rustc-hash 1.1.0",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "time"
|
name = "time"
|
||||||
version = "0.3.41"
|
version = "0.3.41"
|
||||||
@@ -5597,7 +5705,7 @@ dependencies = [
|
|||||||
"rustls-pki-types",
|
"rustls-pki-types",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-rustls",
|
"tokio-rustls",
|
||||||
"tungstenite",
|
"tungstenite 0.23.0",
|
||||||
"webpki-roots",
|
"webpki-roots",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -5829,6 +5937,23 @@ dependencies = [
|
|||||||
"utf-8",
|
"utf-8",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tungstenite"
|
||||||
|
version = "0.26.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4793cb5e56680ecbb1d843515b23b6de9a75eb04b66643e256a396d43be33c13"
|
||||||
|
dependencies = [
|
||||||
|
"bytes",
|
||||||
|
"data-encoding",
|
||||||
|
"http",
|
||||||
|
"httparse",
|
||||||
|
"log",
|
||||||
|
"rand 0.9.1",
|
||||||
|
"sha1",
|
||||||
|
"thiserror 2.0.12",
|
||||||
|
"utf-8",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "typeid"
|
name = "typeid"
|
||||||
version = "1.0.3"
|
version = "1.0.3"
|
||||||
@@ -5935,6 +6060,53 @@ version = "0.9.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ureq"
|
||||||
|
version = "2.12.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d"
|
||||||
|
dependencies = [
|
||||||
|
"base64 0.22.1",
|
||||||
|
"flate2",
|
||||||
|
"log",
|
||||||
|
"once_cell",
|
||||||
|
"rustls",
|
||||||
|
"rustls-pki-types",
|
||||||
|
"socks",
|
||||||
|
"url",
|
||||||
|
"webpki-roots",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ureq"
|
||||||
|
version = "3.0.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b7a3e9af6113ecd57b8c63d3cd76a385b2e3881365f1f489e54f49801d0c83ea"
|
||||||
|
dependencies = [
|
||||||
|
"base64 0.22.1",
|
||||||
|
"flate2",
|
||||||
|
"log",
|
||||||
|
"percent-encoding",
|
||||||
|
"rustls",
|
||||||
|
"rustls-pemfile",
|
||||||
|
"rustls-pki-types",
|
||||||
|
"ureq-proto",
|
||||||
|
"utf-8",
|
||||||
|
"webpki-roots",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ureq-proto"
|
||||||
|
version = "0.4.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fadf18427d33828c311234884b7ba2afb57143e6e7e69fda7ee883b624661e36"
|
||||||
|
dependencies = [
|
||||||
|
"base64 0.22.1",
|
||||||
|
"http",
|
||||||
|
"httparse",
|
||||||
|
"log",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "url"
|
name = "url"
|
||||||
version = "2.5.4"
|
version = "2.5.4"
|
||||||
@@ -6191,6 +6363,18 @@ dependencies = [
|
|||||||
"rustls-pki-types",
|
"rustls-pki-types",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "which"
|
||||||
|
version = "7.0.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "24d643ce3fd3e5b54854602a080f34fb10ab75e0b813ee32d00ca2b44fa74762"
|
||||||
|
dependencies = [
|
||||||
|
"either",
|
||||||
|
"env_home",
|
||||||
|
"rustix",
|
||||||
|
"winsafe",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "winapi"
|
name = "winapi"
|
||||||
version = "0.3.9"
|
version = "0.3.9"
|
||||||
@@ -6509,6 +6693,22 @@ dependencies = [
|
|||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winreg"
|
||||||
|
version = "0.55.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cb5a765337c50e9ec252c2069be9bf91c7df47afb103b642ba3a53bf8101be97"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"windows-sys 0.59.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winsafe"
|
||||||
|
version = "0.0.19"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wit-bindgen-rt"
|
name = "wit-bindgen-rt"
|
||||||
version = "0.39.0"
|
version = "0.39.0"
|
||||||
@@ -6564,6 +6764,15 @@ version = "0.8.26"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a62ce76d9b56901b19a74f19431b0d8b3bc7ca4ad685a746dfd78ca8f4fc6bda"
|
checksum = "a62ce76d9b56901b19a74f19431b0d8b3bc7ca4ad685a746dfd78ca8f4fc6bda"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "xz2"
|
||||||
|
version = "0.1.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
|
||||||
|
dependencies = [
|
||||||
|
"lzma-sys",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "yaml-rust2"
|
name = "yaml-rust2"
|
||||||
version = "0.10.1"
|
version = "0.10.1"
|
||||||
@@ -6665,6 +6874,20 @@ name = "zeroize"
|
|||||||
version = "1.8.1"
|
version = "1.8.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
|
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
|
||||||
|
dependencies = [
|
||||||
|
"zeroize_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zeroize_derive"
|
||||||
|
version = "1.4.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.100",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "zerovec"
|
name = "zerovec"
|
||||||
@@ -6687,3 +6910,71 @@ dependencies = [
|
|||||||
"quote",
|
"quote",
|
||||||
"syn 2.0.100",
|
"syn 2.0.100",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zip"
|
||||||
|
version = "2.6.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1dcb24d0152526ae49b9b96c1dcf71850ca1e0b882e4e28ed898a93c41334744"
|
||||||
|
dependencies = [
|
||||||
|
"aes",
|
||||||
|
"arbitrary",
|
||||||
|
"bzip2",
|
||||||
|
"constant_time_eq",
|
||||||
|
"crc32fast",
|
||||||
|
"crossbeam-utils",
|
||||||
|
"deflate64",
|
||||||
|
"flate2",
|
||||||
|
"getrandom 0.3.2",
|
||||||
|
"hmac",
|
||||||
|
"indexmap 2.9.0",
|
||||||
|
"lzma-rs",
|
||||||
|
"memchr",
|
||||||
|
"pbkdf2",
|
||||||
|
"sha1",
|
||||||
|
"time",
|
||||||
|
"xz2",
|
||||||
|
"zeroize",
|
||||||
|
"zopfli",
|
||||||
|
"zstd",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zopfli"
|
||||||
|
version = "0.8.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "edfc5ee405f504cd4984ecc6f14d02d55cfda60fa4b689434ef4102aae150cd7"
|
||||||
|
dependencies = [
|
||||||
|
"bumpalo",
|
||||||
|
"crc32fast",
|
||||||
|
"log",
|
||||||
|
"simd-adler32",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zstd"
|
||||||
|
version = "0.13.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
|
||||||
|
dependencies = [
|
||||||
|
"zstd-safe",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zstd-safe"
|
||||||
|
version = "7.2.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
|
||||||
|
dependencies = [
|
||||||
|
"zstd-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zstd-sys"
|
||||||
|
version = "2.0.15+zstd.1.5.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"pkg-config",
|
||||||
|
]
|
||||||
|
|||||||
@@ -26,3 +26,4 @@ axum_session_auth = "0.16"
|
|||||||
axum_session_surreal = "0.4"
|
axum_session_surreal = "0.4"
|
||||||
axum_typed_multipart = "0.16"
|
axum_typed_multipart = "0.16"
|
||||||
tempfile = "3.12.0"
|
tempfile = "3.12.0"
|
||||||
|
dom_smoothie = "0.10.0"
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ surrealdb = { workspace = true, features = ["kv-mem"] }
|
|||||||
async-openai = { workspace = true }
|
async-openai = { workspace = true }
|
||||||
futures = { workspace = true }
|
futures = { workspace = true }
|
||||||
tempfile = { workspace = true }
|
tempfile = { workspace = true }
|
||||||
|
dom_smoothie = { workspace = true }
|
||||||
|
|
||||||
async-trait = "0.1.88"
|
async-trait = "0.1.88"
|
||||||
axum_session = { workspace = true }
|
axum_session = { workspace = true }
|
||||||
|
|||||||
@@ -33,4 +33,6 @@ pub enum AppError {
|
|||||||
Tiktoken(#[from] anyhow::Error),
|
Tiktoken(#[from] anyhow::Error),
|
||||||
#[error("Ingress Processing error: {0}")]
|
#[error("Ingress Processing error: {0}")]
|
||||||
Processing(String),
|
Processing(String),
|
||||||
|
#[error("DOM smoothie error: {0}")]
|
||||||
|
DomSmoothie(#[from] dom_smoothie::ReadabilityError),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -38,7 +38,8 @@ stored_object!(FileInfo, "file", {
|
|||||||
sha256: String,
|
sha256: String,
|
||||||
path: String,
|
path: String,
|
||||||
file_name: String,
|
file_name: String,
|
||||||
mime_type: String
|
mime_type: String,
|
||||||
|
user_id: String
|
||||||
});
|
});
|
||||||
|
|
||||||
impl FileInfo {
|
impl FileInfo {
|
||||||
@@ -83,6 +84,7 @@ impl FileInfo {
|
|||||||
.to_string_lossy()
|
.to_string_lossy()
|
||||||
.into(),
|
.into(),
|
||||||
mime_type: Self::guess_mime_type(Path::new(&sanitized_file_name)),
|
mime_type: Self::guess_mime_type(Path::new(&sanitized_file_name)),
|
||||||
|
user_id: user_id.to_string(),
|
||||||
};
|
};
|
||||||
|
|
||||||
// Store in database
|
// Store in database
|
||||||
@@ -258,6 +260,22 @@ impl FileInfo {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Retrieves a `FileInfo` by its ID.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `id` - The ID string of the file.
|
||||||
|
/// * `db_client` - Reference to the SurrealDbClient.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// * `Result<FileInfo, FileError>` - The `FileInfo` or an error if not found or on DB issues.
|
||||||
|
pub async fn get_by_id(id: &str, db_client: &SurrealDbClient) -> Result<FileInfo, FileError> {
|
||||||
|
match db_client.get_item::<FileInfo>(id).await {
|
||||||
|
Ok(Some(file_info)) => Ok(file_info),
|
||||||
|
Ok(None) => Err(FileError::FileNotFound(id.to_string())),
|
||||||
|
Err(e) => Err(FileError::SurrealError(e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -460,6 +478,7 @@ mod tests {
|
|||||||
id: Uuid::new_v4().to_string(),
|
id: Uuid::new_v4().to_string(),
|
||||||
created_at: now,
|
created_at: now,
|
||||||
updated_at: now,
|
updated_at: now,
|
||||||
|
user_id: "user123".to_string(),
|
||||||
sha256: "test_sha256_hash".to_string(),
|
sha256: "test_sha256_hash".to_string(),
|
||||||
path: "/path/to/file.txt".to_string(),
|
path: "/path/to/file.txt".to_string(),
|
||||||
file_name: "manual_file.txt".to_string(),
|
file_name: "manual_file.txt".to_string(),
|
||||||
@@ -517,6 +536,7 @@ mod tests {
|
|||||||
// The file path should point to our test file
|
// The file path should point to our test file
|
||||||
let file_info = FileInfo {
|
let file_info = FileInfo {
|
||||||
id: file_id.clone(),
|
id: file_id.clone(),
|
||||||
|
user_id: "user123".to_string(),
|
||||||
created_at: now,
|
created_at: now,
|
||||||
updated_at: now,
|
updated_at: now,
|
||||||
sha256: "test_sha256_hash".to_string(),
|
sha256: "test_sha256_hash".to_string(),
|
||||||
@@ -586,4 +606,72 @@ mod tests {
|
|||||||
_ => panic!("Expected FileNotFound error"),
|
_ => panic!("Expected FileNotFound error"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_get_by_id() {
|
||||||
|
// Setup in-memory database for testing
|
||||||
|
let namespace = "test_ns";
|
||||||
|
let database = &Uuid::new_v4().to_string();
|
||||||
|
let db = SurrealDbClient::memory(namespace, database)
|
||||||
|
.await
|
||||||
|
.expect("Failed to start in-memory surrealdb");
|
||||||
|
|
||||||
|
// Create a FileInfo instance directly
|
||||||
|
let now = Utc::now();
|
||||||
|
let file_id = Uuid::new_v4().to_string();
|
||||||
|
let original_file_info = FileInfo {
|
||||||
|
id: file_id.clone(),
|
||||||
|
user_id: "user123".to_string(),
|
||||||
|
created_at: now,
|
||||||
|
updated_at: now,
|
||||||
|
sha256: "test_sha256_for_get_by_id".to_string(),
|
||||||
|
path: "/path/to/get_by_id_test.txt".to_string(),
|
||||||
|
file_name: "get_by_id_test.txt".to_string(),
|
||||||
|
mime_type: "text/plain".to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Store it in the database
|
||||||
|
db.store_item(original_file_info.clone())
|
||||||
|
.await
|
||||||
|
.expect("Failed to store item for get_by_id test");
|
||||||
|
|
||||||
|
// Retrieve it using get_by_id
|
||||||
|
let result = FileInfo::get_by_id(&file_id, &db).await;
|
||||||
|
|
||||||
|
// Assert success and content match
|
||||||
|
assert!(result.is_ok());
|
||||||
|
let retrieved_info = result.unwrap();
|
||||||
|
assert_eq!(retrieved_info.id, original_file_info.id);
|
||||||
|
assert_eq!(retrieved_info.sha256, original_file_info.sha256);
|
||||||
|
assert_eq!(retrieved_info.file_name, original_file_info.file_name);
|
||||||
|
assert_eq!(retrieved_info.path, original_file_info.path);
|
||||||
|
assert_eq!(retrieved_info.mime_type, original_file_info.mime_type);
|
||||||
|
// Optionally compare timestamps if precision isn't an issue
|
||||||
|
// assert_eq!(retrieved_info.created_at, original_file_info.created_at);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_get_by_id_not_found() {
|
||||||
|
// Setup in-memory database for testing
|
||||||
|
let namespace = "test_ns";
|
||||||
|
let database = &Uuid::new_v4().to_string();
|
||||||
|
let db = SurrealDbClient::memory(namespace, database)
|
||||||
|
.await
|
||||||
|
.expect("Failed to start in-memory surrealdb");
|
||||||
|
|
||||||
|
// Try to retrieve a non-existent ID
|
||||||
|
let non_existent_id = "non-existent-file-id";
|
||||||
|
let result = FileInfo::get_by_id(non_existent_id, &db).await;
|
||||||
|
|
||||||
|
// Assert failure
|
||||||
|
assert!(result.is_err());
|
||||||
|
|
||||||
|
// Assert the specific error type is FileNotFound
|
||||||
|
match result {
|
||||||
|
Err(FileError::FileNotFound(id)) => {
|
||||||
|
assert_eq!(id, non_existent_id);
|
||||||
|
}
|
||||||
|
Err(e) => panic!("Expected FileNotFound error, but got {:?}", e),
|
||||||
|
Ok(_) => panic!("Expected an error, but got Ok"),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -114,6 +114,7 @@ mod tests {
|
|||||||
id: mock.id,
|
id: mock.id,
|
||||||
sha256: "mock-sha256".to_string(),
|
sha256: "mock-sha256".to_string(),
|
||||||
path: "/mock/path".to_string(),
|
path: "/mock/path".to_string(),
|
||||||
|
user_id: "user123".to_string(),
|
||||||
file_name: "mock.txt".to_string(),
|
file_name: "mock.txt".to_string(),
|
||||||
mime_type: "text/plain".to_string(),
|
mime_type: "text/plain".to_string(),
|
||||||
created_at: Utc::now(),
|
created_at: Utc::now(),
|
||||||
|
|||||||
@@ -31,19 +31,7 @@ impl SystemSettings {
|
|||||||
let settings: Option<Self> = db.get_item("current").await?;
|
let settings: Option<Self> = db.get_item("current").await?;
|
||||||
|
|
||||||
if settings.is_none() {
|
if settings.is_none() {
|
||||||
let created_settings = SystemSettings {
|
let created_settings = Self::new();
|
||||||
id: "current".to_string(),
|
|
||||||
registrations_enabled: true,
|
|
||||||
require_email_verification: false,
|
|
||||||
query_model: "gpt-4o-mini".to_string(),
|
|
||||||
processing_model: "gpt-4o-mini".to_string(),
|
|
||||||
query_system_prompt:
|
|
||||||
crate::storage::types::system_prompts::DEFAULT_QUERY_SYSTEM_PROMPT.to_string(),
|
|
||||||
ingestion_system_prompt:
|
|
||||||
crate::storage::types::system_prompts::DEFAULT_INGRESS_ANALYSIS_SYSTEM_PROMPT
|
|
||||||
.to_string(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let stored: Option<Self> = db.store_item(created_settings).await?;
|
let stored: Option<Self> = db.store_item(created_settings).await?;
|
||||||
return stored.ok_or(AppError::Validation("Failed to initialize settings".into()));
|
return stored.ok_or(AppError::Validation("Failed to initialize settings".into()));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,10 +5,17 @@ use crate::{error::AppError, storage::db::SurrealDbClient, stored_object};
|
|||||||
|
|
||||||
use super::file_info::FileInfo;
|
use super::file_info::FileInfo;
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
|
||||||
|
pub struct UrlInfo {
|
||||||
|
pub url: String,
|
||||||
|
pub title: String,
|
||||||
|
pub image_id: String,
|
||||||
|
}
|
||||||
|
|
||||||
stored_object!(TextContent, "text_content", {
|
stored_object!(TextContent, "text_content", {
|
||||||
text: String,
|
text: String,
|
||||||
file_info: Option<FileInfo>,
|
file_info: Option<FileInfo>,
|
||||||
url: Option<String>,
|
url_info: Option<UrlInfo>,
|
||||||
instructions: String,
|
instructions: String,
|
||||||
category: String,
|
category: String,
|
||||||
user_id: String
|
user_id: String
|
||||||
@@ -20,7 +27,7 @@ impl TextContent {
|
|||||||
instructions: String,
|
instructions: String,
|
||||||
category: String,
|
category: String,
|
||||||
file_info: Option<FileInfo>,
|
file_info: Option<FileInfo>,
|
||||||
url: Option<String>,
|
url_info: Option<UrlInfo>,
|
||||||
user_id: String,
|
user_id: String,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let now = Utc::now();
|
let now = Utc::now();
|
||||||
@@ -30,7 +37,7 @@ impl TextContent {
|
|||||||
updated_at: now,
|
updated_at: now,
|
||||||
text,
|
text,
|
||||||
file_info,
|
file_info,
|
||||||
url,
|
url_info,
|
||||||
instructions,
|
instructions,
|
||||||
category,
|
category,
|
||||||
user_id,
|
user_id,
|
||||||
@@ -85,7 +92,7 @@ mod tests {
|
|||||||
assert_eq!(text_content.category, category);
|
assert_eq!(text_content.category, category);
|
||||||
assert_eq!(text_content.user_id, user_id);
|
assert_eq!(text_content.user_id, user_id);
|
||||||
assert!(text_content.file_info.is_none());
|
assert!(text_content.file_info.is_none());
|
||||||
assert!(text_content.url.is_none());
|
assert!(text_content.url_info.is_none());
|
||||||
assert!(!text_content.id.is_empty());
|
assert!(!text_content.id.is_empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -96,19 +103,27 @@ mod tests {
|
|||||||
let instructions = "URL instructions".to_string();
|
let instructions = "URL instructions".to_string();
|
||||||
let category = "URL category".to_string();
|
let category = "URL category".to_string();
|
||||||
let user_id = "user123".to_string();
|
let user_id = "user123".to_string();
|
||||||
let url = Some("https://example.com/document.pdf".to_string());
|
let title = "page_title".to_string();
|
||||||
|
let image_id = "image12312".to_string();
|
||||||
|
let url = "https://example.com/document.pdf".to_string();
|
||||||
|
|
||||||
|
let url_info = Some(UrlInfo {
|
||||||
|
url,
|
||||||
|
title,
|
||||||
|
image_id,
|
||||||
|
});
|
||||||
|
|
||||||
let text_content = TextContent::new(
|
let text_content = TextContent::new(
|
||||||
text.clone(),
|
text.clone(),
|
||||||
instructions.clone(),
|
instructions.clone(),
|
||||||
category.clone(),
|
category.clone(),
|
||||||
None,
|
None,
|
||||||
url.clone(),
|
url_info.clone(),
|
||||||
user_id.clone(),
|
user_id.clone(),
|
||||||
);
|
);
|
||||||
|
|
||||||
// Check URL field is set
|
// Check URL field is set
|
||||||
assert_eq!(text_content.url, url);
|
assert_eq!(text_content.url_info, url_info);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ tower-http = { version = "0.6.2", features = ["fs"] }
|
|||||||
chrono-tz = "0.10.1"
|
chrono-tz = "0.10.1"
|
||||||
tower-serve-static = "0.1.1"
|
tower-serve-static = "0.1.1"
|
||||||
include_dir = "0.7.4"
|
include_dir = "0.7.4"
|
||||||
|
tokio-util = { version = "0.7.15", features = ["io"] }
|
||||||
|
|
||||||
common = { path = "../common" }
|
common = { path = "../common" }
|
||||||
composite-retrieval = { path = "../composite-retrieval" }
|
composite-retrieval = { path = "../composite-retrieval" }
|
||||||
|
|||||||
@@ -1,9 +1,12 @@
|
|||||||
use axum::{
|
use axum::{
|
||||||
|
body::Body,
|
||||||
extract::{Path, State},
|
extract::{Path, State},
|
||||||
|
http::{header, HeaderMap, HeaderValue, StatusCode},
|
||||||
response::IntoResponse,
|
response::IntoResponse,
|
||||||
};
|
};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use tokio::join;
|
use tokio::{fs::File, join};
|
||||||
|
use tokio_util::io::ReaderStream;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
middlewares::{
|
middlewares::{
|
||||||
@@ -15,9 +18,15 @@ use crate::{
|
|||||||
use common::{
|
use common::{
|
||||||
error::AppError,
|
error::AppError,
|
||||||
storage::types::{
|
storage::types::{
|
||||||
conversation::Conversation, file_info::FileInfo, ingestion_task::IngestionTask,
|
conversation::Conversation,
|
||||||
knowledge_entity::KnowledgeEntity, knowledge_relationship::KnowledgeRelationship,
|
file_info::{FileError, FileInfo},
|
||||||
text_chunk::TextChunk, text_content::TextContent, user::User,
|
ingestion_task::IngestionTask,
|
||||||
|
knowledge_entity::KnowledgeEntity,
|
||||||
|
knowledge_relationship::KnowledgeRelationship,
|
||||||
|
text_chunk::TextChunk,
|
||||||
|
text_content::TextContent,
|
||||||
|
user::User,
|
||||||
|
StoredObject,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -167,3 +176,49 @@ pub async fn show_active_jobs(
|
|||||||
},
|
},
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn serve_file(
|
||||||
|
State(state): State<HtmlState>,
|
||||||
|
RequireUser(user): RequireUser,
|
||||||
|
Path(file_id): Path<String>,
|
||||||
|
) -> Result<impl IntoResponse, HtmlError> {
|
||||||
|
let file_info = match FileInfo::get_by_id(&file_id, &state.db).await {
|
||||||
|
Ok(info) => info,
|
||||||
|
_ => return Ok(TemplateResponse::not_found().into_response()),
|
||||||
|
};
|
||||||
|
|
||||||
|
if file_info.user_id != user.id {
|
||||||
|
return Ok(TemplateResponse::unauthorized().into_response());
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. Open the file asynchronously from the stored path
|
||||||
|
let path = std::path::Path::new(&file_info.path);
|
||||||
|
|
||||||
|
let file = match File::open(path).await {
|
||||||
|
Ok(f) => f,
|
||||||
|
Err(e) => return Ok(TemplateResponse::server_error().into_response()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let stream = ReaderStream::new(file);
|
||||||
|
let body = Body::from_stream(stream);
|
||||||
|
|
||||||
|
let mut headers = HeaderMap::new();
|
||||||
|
headers.insert(
|
||||||
|
header::CONTENT_TYPE,
|
||||||
|
HeaderValue::from_str(&file_info.mime_type)
|
||||||
|
.unwrap_or_else(|_| HeaderValue::from_static("application/octet-stream")),
|
||||||
|
);
|
||||||
|
let Ok(disposition_value) =
|
||||||
|
HeaderValue::from_str(&format!("attachment; filename=\"{}\"", file_info.file_name))
|
||||||
|
else {
|
||||||
|
headers.insert(
|
||||||
|
header::CONTENT_DISPOSITION,
|
||||||
|
HeaderValue::from_static("attachment"),
|
||||||
|
);
|
||||||
|
return Ok((StatusCode::OK, headers, body).into_response());
|
||||||
|
};
|
||||||
|
headers.insert(header::CONTENT_DISPOSITION, disposition_value);
|
||||||
|
|
||||||
|
// 5. Return the response
|
||||||
|
Ok((StatusCode::OK, headers, body).into_response())
|
||||||
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ use axum::{
|
|||||||
routing::{delete, get},
|
routing::{delete, get},
|
||||||
Router,
|
Router,
|
||||||
};
|
};
|
||||||
use handlers::{delete_job, delete_text_content, index_handler, show_active_jobs};
|
use handlers::{delete_job, delete_text_content, index_handler, serve_file, show_active_jobs};
|
||||||
|
|
||||||
use crate::html_state::HtmlState;
|
use crate::html_state::HtmlState;
|
||||||
|
|
||||||
@@ -26,4 +26,5 @@ where
|
|||||||
.route("/jobs/{job_id}", delete(delete_job))
|
.route("/jobs/{job_id}", delete(delete_job))
|
||||||
.route("/active-jobs", get(show_active_jobs))
|
.route("/active-jobs", get(show_active_jobs))
|
||||||
.route("/text-content/{id}", delete(delete_text_content))
|
.route("/text-content/{id}", delete(delete_text_content))
|
||||||
|
.route("/file/{id}", get(serve_file))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -54,9 +54,9 @@
|
|||||||
<select name="query_model" class="select select-bordered w-full">
|
<select name="query_model" class="select select-bordered w-full">
|
||||||
<option value="gpt-4o-mini" {% if settings.query_model=="gpt-4o-mini" %}selected{% endif %}>GPT-4o Mini
|
<option value="gpt-4o-mini" {% if settings.query_model=="gpt-4o-mini" %}selected{% endif %}>GPT-4o Mini
|
||||||
</option>
|
</option>
|
||||||
<option value="gpt-4o" {% if settings.query_model=="gpt-4o" %}selected{% endif %}>GPT-4o</option>
|
<option value="gpt-4.1" {% if settings.query_model=="gpt-4.1" %}selected{% endif %}>GPT-4.1</option>
|
||||||
<option value="gpt-3.5-turbo" {% if settings.query_model=="gpt-3.5-turbo" %}selected{% endif %}>GPT-3.5
|
<option value="gpt-4.1-mini" {% if settings.query_model=="gpt-4.1-mini" %}selected{% endif %}>GPT-4.1-mini
|
||||||
Turbo</option>
|
</option>
|
||||||
</select>
|
</select>
|
||||||
<p class="text-xs text-gray-500 mt-1">Model used for answering user queries</p>
|
<p class="text-xs text-gray-500 mt-1">Model used for answering user queries</p>
|
||||||
</div>
|
</div>
|
||||||
@@ -66,11 +66,11 @@
|
|||||||
<span class="label-text">Processing Model</span>
|
<span class="label-text">Processing Model</span>
|
||||||
</label>
|
</label>
|
||||||
<select name="processing_model" class="select select-bordered w-full">
|
<select name="processing_model" class="select select-bordered w-full">
|
||||||
<option value="gpt-4o-mini" {% if settings.processing_model=="gpt-4o-mini" %}selected{% endif %}>GPT-4o Mini
|
<option value="gpt-4o-mini" {% if settings.query_model=="gpt-4o-mini" %}selected{% endif %}>GPT-4o Mini
|
||||||
|
</option>
|
||||||
|
<option value="gpt-4.1" {% if settings.query_model=="gpt-4.1" %}selected{% endif %}>GPT-4.1</option>
|
||||||
|
<option value="gpt-4.1-mini" {% if settings.query_model=="gpt-4.1-mini" %}selected{% endif %}>GPT-4.1-mini
|
||||||
</option>
|
</option>
|
||||||
<option value="gpt-4o" {% if settings.processing_model=="gpt-4o" %}selected{% endif %}>GPT-4o</option>
|
|
||||||
<option value="gpt-3.5-turbo" {% if settings.processing_model=="gpt-3.5-turbo" %}selected{% endif %}>GPT-3.5
|
|
||||||
Turbo</option>
|
|
||||||
</select>
|
</select>
|
||||||
<p class="text-xs text-gray-500 mt-1">Model used for content processing and ingestion</p>
|
<p class="text-xs text-gray-500 mt-1">Model used for content processing and ingestion</p>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
<div class="grid sm:grid-cols-2 lg:grid-cols-3 gap-4" id="text_content_cards">
|
<div class="grid sm:grid-cols-2 lg:grid-cols-3 gap-4" id="text_content_cards">
|
||||||
{% for text_content in text_contents %}
|
{% for text_content in text_contents %}
|
||||||
<div class="card min-w-72 bg-base-100 shadow">
|
<div class="card min-w-72 bg-base-100 shadow">
|
||||||
|
<img class="rounded-t-md overflow-clip" src="/file/{{text_content.url_info.image_id}}" />
|
||||||
<div class="card-body">
|
<div class="card-body">
|
||||||
<div class="flex justify-between space-x-2">
|
<div class="flex justify-between space-x-2">
|
||||||
<h2 class="card-title truncate">
|
<h2 class="card-title truncate">
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
{% for item in latest_text_contents %}
|
{% for item in latest_text_contents %}
|
||||||
<li class="list-row">
|
<li class="list-row">
|
||||||
<div class="bg-accent rounded-box size-10 flex justify-center items-center text-accent-content">
|
<div class="bg-accent rounded-box size-10 flex justify-center items-center text-accent-content">
|
||||||
{% if item.url %}
|
{% if item.url_info %}
|
||||||
{% include "icons/globe_icon.html" %}
|
{% include "icons/globe_icon.html" %}
|
||||||
{% elif item.file_info %}
|
{% elif item.file_info %}
|
||||||
{% include "icons/document_icon.html" %}
|
{% include "icons/document_icon.html" %}
|
||||||
@@ -14,8 +14,8 @@
|
|||||||
</div>
|
</div>
|
||||||
<div>
|
<div>
|
||||||
<div class="truncate max-w-[160px]">
|
<div class="truncate max-w-[160px]">
|
||||||
{% if item.url %}
|
{% if item.url_info %}
|
||||||
{{item.url}}
|
{{item.url_info.title}}
|
||||||
{% elif item.file_info%}
|
{% elif item.file_info%}
|
||||||
{{item.file_info.file_name}}
|
{{item.file_info.file_name}}
|
||||||
{% else %}
|
{% else %}
|
||||||
|
|||||||
@@ -31,6 +31,10 @@
|
|||||||
</a>
|
</a>
|
||||||
</li>
|
</li>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
<li>
|
||||||
|
<button class="btn btn-primary" hx-get="/ingress-form" hx-target="#modal" hx-swap="innerHTML">Add
|
||||||
|
Content</button>
|
||||||
|
</li>
|
||||||
<div class="divider "></div>
|
<div class="divider "></div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|||||||
@@ -13,14 +13,17 @@ serde_json = { workspace = true }
|
|||||||
futures = { workspace = true }
|
futures = { workspace = true }
|
||||||
async-openai = { workspace = true }
|
async-openai = { workspace = true }
|
||||||
surrealdb = { workspace = true }
|
surrealdb = { workspace = true }
|
||||||
|
dom_smoothie = { workspace = true }
|
||||||
|
tempfile = { workspace = true }
|
||||||
|
axum_typed_multipart = { workspace = true}
|
||||||
|
|
||||||
tiktoken-rs = "0.6.0"
|
|
||||||
reqwest = {version = "0.12.12", features = ["charset", "json"]}
|
reqwest = {version = "0.12.12", features = ["charset", "json"]}
|
||||||
scraper = "0.22.0"
|
|
||||||
chrono = { version = "0.4.39", features = ["serde"] }
|
chrono = { version = "0.4.39", features = ["serde"] }
|
||||||
text-splitter = "0.18.1"
|
text-splitter = "0.18.1"
|
||||||
|
url = { version = "2.5.2", features = ["serde"] }
|
||||||
uuid = { version = "1.10.0", features = ["v4", "serde"] }
|
uuid = { version = "1.10.0", features = ["v4", "serde"] }
|
||||||
dom_smoothie = "0.10.0"
|
|
||||||
|
headless_chrome = { git = "https://github.com/rust-headless-chrome/rust-headless-chrome", features = ["fetch"] }
|
||||||
|
|
||||||
common = { path = "../common" }
|
common = { path = "../common" }
|
||||||
composite-retrieval = { path = "../composite-retrieval" }
|
composite-retrieval = { path = "../composite-retrieval" }
|
||||||
|
|||||||
@@ -111,7 +111,7 @@ impl IngestionEnricher {
|
|||||||
let request = CreateChatCompletionRequestArgs::default()
|
let request = CreateChatCompletionRequestArgs::default()
|
||||||
.model(&settings.processing_model)
|
.model(&settings.processing_model)
|
||||||
.temperature(0.2)
|
.temperature(0.2)
|
||||||
.max_tokens(3048u32)
|
.max_tokens(6048u32)
|
||||||
.messages([
|
.messages([
|
||||||
ChatCompletionRequestSystemMessage::from(INGRESS_ANALYSIS_SYSTEM_MESSAGE).into(),
|
ChatCompletionRequestSystemMessage::from(INGRESS_ANALYSIS_SYSTEM_MESSAGE).into(),
|
||||||
ChatCompletionRequestUserMessage::from(user_message).into(),
|
ChatCompletionRequestUserMessage::from(user_message).into(),
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ impl IngestionPipeline {
|
|||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
let text_content = to_text_content(task.content, &self.openai_client, &self.db).await?;
|
let text_content = to_text_content(task.content, &self.db).await?;
|
||||||
|
|
||||||
match self.process(&text_content).await {
|
match self.process(&text_content).await {
|
||||||
Ok(_) => {
|
Ok(_) => {
|
||||||
|
|||||||
+105
-159
@@ -1,29 +1,28 @@
|
|||||||
pub mod llm_enrichment_result;
|
pub mod llm_enrichment_result;
|
||||||
|
|
||||||
use std::{sync::Arc, time::Duration};
|
use std::io::Write;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
use async_openai::types::{
|
use axum::http::HeaderMap;
|
||||||
ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage,
|
use axum_typed_multipart::{FieldData, FieldMetadata};
|
||||||
CreateChatCompletionRequestArgs,
|
use chrono::Utc;
|
||||||
};
|
|
||||||
use common::storage::db::SurrealDbClient;
|
use common::storage::db::SurrealDbClient;
|
||||||
use common::{
|
use common::{
|
||||||
error::AppError,
|
error::AppError,
|
||||||
storage::types::{
|
storage::types::{
|
||||||
file_info::FileInfo, ingestion_payload::IngestionPayload, system_settings::SystemSettings,
|
file_info::FileInfo,
|
||||||
text_content::TextContent,
|
ingestion_payload::IngestionPayload,
|
||||||
|
text_content::{TextContent, UrlInfo},
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
use dom_smoothie::TextMode;
|
use dom_smoothie::{Article, Readability, TextMode};
|
||||||
use reqwest;
|
use headless_chrome::Browser;
|
||||||
use scraper::{Html, Selector};
|
use tempfile::NamedTempFile;
|
||||||
use std::fmt::Write;
|
use tracing::{error, info};
|
||||||
use tiktoken_rs::{o200k_base, CoreBPE};
|
|
||||||
|
|
||||||
pub async fn to_text_content(
|
pub async fn to_text_content(
|
||||||
ingestion_payload: IngestionPayload,
|
ingestion_payload: IngestionPayload,
|
||||||
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
db: &SurrealDbClient,
|
||||||
db_client: &Arc<SurrealDbClient>,
|
|
||||||
) -> Result<TextContent, AppError> {
|
) -> Result<TextContent, AppError> {
|
||||||
match ingestion_payload {
|
match ingestion_payload {
|
||||||
IngestionPayload::Url {
|
IngestionPayload::Url {
|
||||||
@@ -32,13 +31,17 @@ pub async fn to_text_content(
|
|||||||
category,
|
category,
|
||||||
user_id,
|
user_id,
|
||||||
} => {
|
} => {
|
||||||
let text = fetch_text_from_url(&url, openai_client, db_client).await?;
|
let (article, file_info) = fetch_article_from_url(&url, db, &user_id).await?;
|
||||||
Ok(TextContent::new(
|
Ok(TextContent::new(
|
||||||
text,
|
article.text_content.into(),
|
||||||
instructions,
|
instructions,
|
||||||
category,
|
category,
|
||||||
None,
|
None,
|
||||||
Some(url),
|
Some(UrlInfo {
|
||||||
|
url,
|
||||||
|
title: article.title,
|
||||||
|
image_id: file_info.id,
|
||||||
|
}),
|
||||||
user_id,
|
user_id,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
@@ -73,161 +76,104 @@ pub async fn to_text_content(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
use std::io::{Seek, SeekFrom}; // <-- Add Seek and SeekFrom
|
||||||
|
|
||||||
/// Get text from url, will return it as a markdown formatted string
|
/// Fetches web content from a URL, extracts the main article text as Markdown,
|
||||||
async fn fetch_text_from_url(
|
/// captures a screenshot, and stores the screenshot returning [`FileInfo`].
|
||||||
|
///
|
||||||
|
/// This function handles browser automation, content extraction via Readability,
|
||||||
|
/// screenshot capture, temporary file handling, and persisting the screenshot
|
||||||
|
/// details (including deduplication based on content hash via [`FileInfo::new`]).
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `url` - The URL of the web page to fetch.
|
||||||
|
/// * `db` - A reference to the database client (`SurrealDbClient`).
|
||||||
|
/// * `user_id` - The ID of the user performing the action, used for associating the file.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
///
|
||||||
|
/// A `Result` containing:
|
||||||
|
/// * Ok: A tuple `(Article, FileInfo)` where `Article` contains the parsed markdown
|
||||||
|
/// content and metadata, and `FileInfo` contains the details of the stored screenshot.
|
||||||
|
/// * Err: An `AppError` if any step fails (navigation, screenshot, file handling, DB operation).
|
||||||
|
async fn fetch_article_from_url(
|
||||||
url: &str,
|
url: &str,
|
||||||
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
db: &SurrealDbClient,
|
||||||
db_client: &Arc<SurrealDbClient>,
|
user_id: &str,
|
||||||
) -> Result<String, AppError> {
|
) -> Result<(Article, FileInfo), AppError> {
|
||||||
// Use a client with timeouts and reuse
|
info!("Fetching URL: {}", url);
|
||||||
let client = reqwest::ClientBuilder::new()
|
// Instantiate timer
|
||||||
.timeout(Duration::from_secs(30))
|
let now = Instant::now();
|
||||||
.build()?;
|
// Setup browser, navigate and wait
|
||||||
let response = client.get(url).send().await?.text().await?;
|
let browser = Browser::default()?;
|
||||||
|
let tab = browser.new_tab()?;
|
||||||
|
let page = tab.navigate_to(url)?;
|
||||||
|
let loaded_page = page.wait_until_navigated()?;
|
||||||
|
// Get content
|
||||||
|
let raw_content = loaded_page.get_content()?;
|
||||||
|
// Get screenshot
|
||||||
|
let screenshot = loaded_page.capture_screenshot(
|
||||||
|
headless_chrome::protocol::cdp::Page::CaptureScreenshotFormatOption::Jpeg,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
true,
|
||||||
|
)?;
|
||||||
|
|
||||||
// Preallocate string with capacity
|
// Create temp file
|
||||||
let mut structured_content = String::with_capacity(response.len() / 2);
|
let mut tmp_file = NamedTempFile::new()?;
|
||||||
|
let temp_path_str = format!("{:?}", tmp_file.path());
|
||||||
|
|
||||||
let document = Html::parse_document(&response);
|
// Write screenshot TO the temp file
|
||||||
let main_selectors = Selector::parse(
|
tmp_file.write_all(&screenshot)?;
|
||||||
"article, main, .article-content, .post-content, .entry-content, [role='main']",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let content_element = document
|
// Ensure the OS buffer is written to the file system _before_ we proceed.
|
||||||
.select(&main_selectors)
|
tmp_file.as_file().sync_all()?;
|
||||||
.next()
|
|
||||||
.or_else(|| document.select(&Selector::parse("body").unwrap()).next())
|
|
||||||
.ok_or(AppError::NotFound("No content found".into()))?;
|
|
||||||
|
|
||||||
// Compile selectors once
|
// Ensure the file handle's read cursor is at the beginning before hashing occurs.
|
||||||
let heading_selector = Selector::parse("h1, h2, h3").unwrap();
|
if let Err(e) = tmp_file.seek(SeekFrom::Start(0)) {
|
||||||
let paragraph_selector = Selector::parse("p").unwrap();
|
error!("URL: {}. Failed to seek temp file {} to start: {:?}. Proceeding, but hashing might fail.", url, temp_path_str, e);
|
||||||
|
|
||||||
// Process content in one pass
|
|
||||||
for element in content_element.select(&heading_selector) {
|
|
||||||
let _ = writeln!(
|
|
||||||
structured_content,
|
|
||||||
"<heading>{}</heading>",
|
|
||||||
element.text().collect::<String>().trim()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
for element in content_element.select(¶graph_selector) {
|
|
||||||
let _ = writeln!(
|
|
||||||
structured_content,
|
|
||||||
"<paragraph>{}</paragraph>",
|
|
||||||
element.text().collect::<String>().trim()
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let content = structured_content
|
// Prepare file metadata
|
||||||
.replace(|c: char| c.is_control(), " ")
|
let parsed_url =
|
||||||
.replace(" ", " ");
|
url::Url::parse(url).map_err(|_| AppError::Processing("Invalid URL".to_string()))?;
|
||||||
|
let domain = parsed_url
|
||||||
|
.host_str()
|
||||||
|
.unwrap_or("unknown")
|
||||||
|
.replace(|c: char| !c.is_alphanumeric(), "_");
|
||||||
|
let timestamp = Utc::now().format("%Y%m%d%H%M%S");
|
||||||
|
let file_name = format!("{}_{}_{}.jpg", domain, "screenshot", timestamp);
|
||||||
|
|
||||||
process_web_content(content, openai_client, db_client).await
|
// Construct FieldData and FieldMetadata
|
||||||
|
let metadata = FieldMetadata {
|
||||||
// let config = dom_smoothie::Config {
|
file_name: Some(file_name),
|
||||||
// text_mode: TextMode::Markdown,
|
content_type: Some("image/jpeg".to_string()),
|
||||||
// ..Default::default()
|
name: None,
|
||||||
// };
|
headers: HeaderMap::new(),
|
||||||
// panic!("YOU SHALL NOT PASS");
|
};
|
||||||
}
|
let field_data = FieldData {
|
||||||
|
contents: tmp_file,
|
||||||
pub async fn process_web_content(
|
metadata,
|
||||||
content: String,
|
|
||||||
openai_client: &Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
|
|
||||||
db_client: &Arc<SurrealDbClient>,
|
|
||||||
) -> Result<String, AppError> {
|
|
||||||
const MAX_TOKENS: usize = 122000;
|
|
||||||
const SYSTEM_PROMPT: &str = r#"
|
|
||||||
You are a precise content extractor for web pages. Your task:
|
|
||||||
|
|
||||||
1. Extract ONLY the main article/content from the provided text
|
|
||||||
2. Maintain the original content - do not summarize or modify the core information
|
|
||||||
3. Ignore peripheral content such as:
|
|
||||||
- Navigation elements
|
|
||||||
- Error messages (e.g., "JavaScript required")
|
|
||||||
- Related articles sections
|
|
||||||
- Comments
|
|
||||||
- Social media links
|
|
||||||
- Advertisement text
|
|
||||||
|
|
||||||
FORMAT:
|
|
||||||
- Convert <heading> tags to markdown headings (#, ##, ###)
|
|
||||||
- Convert <paragraph> tags to markdown paragraphs
|
|
||||||
- Preserve quotes and important formatting
|
|
||||||
- Remove duplicate content
|
|
||||||
- Remove any metadata or technical artifacts
|
|
||||||
|
|
||||||
OUTPUT RULES:
|
|
||||||
- Output ONLY the cleaned content in markdown
|
|
||||||
- Do not add any explanations or meta-commentary
|
|
||||||
- Do not add summaries or conclusions
|
|
||||||
- Do not use any XML/HTML tags in the output
|
|
||||||
"#;
|
|
||||||
|
|
||||||
let bpe = o200k_base()?;
|
|
||||||
let settings = SystemSettings::get_current(db_client).await?;
|
|
||||||
|
|
||||||
// Process content in chunks if needed
|
|
||||||
let truncated_content = if bpe.encode_with_special_tokens(&content).len() > MAX_TOKENS {
|
|
||||||
truncate_content(&content, MAX_TOKENS, &bpe)?
|
|
||||||
} else {
|
|
||||||
content
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let request = CreateChatCompletionRequestArgs::default()
|
// Store screenshot
|
||||||
.model(&settings.processing_model)
|
let file_info = FileInfo::new(field_data, db, user_id).await?;
|
||||||
.temperature(0.0)
|
|
||||||
.max_tokens(16200u32)
|
|
||||||
.messages([
|
|
||||||
ChatCompletionRequestSystemMessage::from(SYSTEM_PROMPT).into(),
|
|
||||||
ChatCompletionRequestUserMessage::from(truncated_content).into(),
|
|
||||||
])
|
|
||||||
.build()?;
|
|
||||||
|
|
||||||
let response = openai_client.chat().create(request).await?;
|
// Parse content...
|
||||||
|
let config = dom_smoothie::Config {
|
||||||
|
text_mode: TextMode::Markdown,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
let mut readability = Readability::new(raw_content, None, Some(config))?;
|
||||||
|
let article: Article = readability.parse()?;
|
||||||
|
let end = now.elapsed();
|
||||||
|
info!(
|
||||||
|
"URL: {}. Total time: {:?}. Final File ID: {}",
|
||||||
|
url, end, file_info.id
|
||||||
|
);
|
||||||
|
|
||||||
// Extract and return the content
|
Ok((article, file_info))
|
||||||
response
|
|
||||||
.choices
|
|
||||||
.first()
|
|
||||||
.and_then(|choice| choice.message.content.clone())
|
|
||||||
.ok_or(AppError::LLMParsing(
|
|
||||||
"No content found in LLM response".into(),
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn truncate_content(
|
|
||||||
content: &str,
|
|
||||||
max_tokens: usize,
|
|
||||||
tokenizer: &CoreBPE,
|
|
||||||
) -> Result<String, AppError> {
|
|
||||||
// Pre-allocate with estimated size
|
|
||||||
let mut result = String::with_capacity(content.len() / 2);
|
|
||||||
let mut current_tokens = 0;
|
|
||||||
|
|
||||||
// Process content by paragraph to maintain context
|
|
||||||
for paragraph in content.split("\n\n") {
|
|
||||||
let tokens = tokenizer.encode_with_special_tokens(paragraph).len();
|
|
||||||
|
|
||||||
// Check if adding paragraph exceeds limit
|
|
||||||
if current_tokens + tokens > max_tokens {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
result.push_str(paragraph);
|
|
||||||
result.push_str("\n\n");
|
|
||||||
current_tokens += tokens;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ensure we return valid content
|
|
||||||
if result.is_empty() {
|
|
||||||
return Err(AppError::Processing("Content exceeds token limit".into()));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(result.trim_end().to_string())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extracts text from a file based on its MIME type.
|
/// Extracts text from a file based on its MIME type.
|
||||||
|
|||||||
+1
-1
@@ -50,7 +50,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||||||
// Create Axum router
|
// Create Axum router
|
||||||
let app = Router::new()
|
let app = Router::new()
|
||||||
.nest("/api/v1", api_routes_v1(&api_state))
|
.nest("/api/v1", api_routes_v1(&api_state))
|
||||||
.nest("/", html_routes(&html_state))
|
.merge(html_routes(&html_state))
|
||||||
.with_state(AppState {
|
.with_state(AppState {
|
||||||
api_state,
|
api_state,
|
||||||
html_state,
|
html_state,
|
||||||
|
|||||||
Reference in New Issue
Block a user