webscraping implemented

This commit is contained in:
Per Stark
2025-01-08 11:12:54 +01:00
parent ad9215e251
commit 1066f27162
3 changed files with 418 additions and 48 deletions

321
Cargo.lock generated
View File

@@ -116,7 +116,7 @@ version = "4.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ab99eae5ee58501ab236beb6f20f6ca39be615267b014899c89b2f0bc18a459"
dependencies = [
"html5ever",
"html5ever 0.27.0",
"maplit",
"once_cell",
"tendril",
@@ -661,7 +661,7 @@ dependencies = [
"serde_urlencoded",
"sync_wrapper",
"tokio",
"tower 0.5.2",
"tower",
"tower-layer",
"tower-service",
"tracing",
@@ -1008,6 +1008,17 @@ dependencies = [
"syn_derive",
]
[[package]]
name = "bstr"
version = "1.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0"
dependencies = [
"memchr",
"regex-automata 0.4.7",
"serde",
]
[[package]]
name = "bumpalo"
version = "3.16.0"
@@ -1365,6 +1376,29 @@ dependencies = [
"typenum",
]
[[package]]
name = "cssparser"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf",
"smallvec",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.87",
]
[[package]]
name = "ctr"
version = "0.9.2"
@@ -1521,6 +1555,17 @@ dependencies = [
"syn 2.0.87",
]
[[package]]
name = "derive_more"
version = "0.99.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.87",
]
[[package]]
name = "derive_utils"
version = "0.14.2"
@@ -1620,6 +1665,21 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1"
[[package]]
name = "dtoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
[[package]]
name = "dtoa-short"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
dependencies = [
"dtoa",
]
[[package]]
name = "earcutr"
version = "0.4.3"
@@ -1630,6 +1690,12 @@ dependencies = [
"num-traits",
]
[[package]]
name = "ego-tree"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
[[package]]
name = "either"
version = "1.13.0"
@@ -1739,6 +1805,17 @@ dependencies = [
"async-trait",
]
[[package]]
name = "fancy-regex"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
dependencies = [
"bit-set",
"regex-automata 0.4.7",
"regex-syntax 0.8.4",
]
[[package]]
name = "fastrand"
version = "1.9.0"
@@ -2004,6 +2081,15 @@ dependencies = [
"thread_local",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "generic-array"
version = "0.14.7"
@@ -2054,6 +2140,15 @@ dependencies = [
"libm",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
]
[[package]]
name = "getrandom"
version = "0.2.15"
@@ -2083,6 +2178,25 @@ version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64"
[[package]]
name = "h2"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e"
dependencies = [
"atomic-waker",
"bytes",
"fnv",
"futures-core",
"futures-sink",
"http",
"indexmap 2.6.0",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "half"
version = "2.4.1"
@@ -2204,7 +2318,21 @@ checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4"
dependencies = [
"log",
"mac",
"markup5ever",
"markup5ever 0.12.1",
"proc-macro2",
"quote",
"syn 2.0.87",
]
[[package]]
name = "html5ever"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e15626aaf9c351bc696217cbe29cb9b5e86c43f8a46b5e2f5c6c5cf7cb904ce"
dependencies = [
"log",
"mac",
"markup5ever 0.14.0",
"proc-macro2",
"quote",
"syn 2.0.87",
@@ -2277,6 +2405,7 @@ dependencies = [
"bytes",
"futures-channel",
"futures-util",
"h2",
"http",
"http-body",
"httparse",
@@ -2308,10 +2437,26 @@ dependencies = [
]
[[package]]
name = "hyper-util"
version = "0.1.8"
name = "hyper-tls"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da62f120a8a37763efb0cf8fdf264b884c7b8b9ac8660b900c8661030c00e6ba"
checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
dependencies = [
"bytes",
"http-body-util",
"hyper",
"hyper-util",
"native-tls",
"tokio",
"tokio-native-tls",
"tower-service",
]
[[package]]
name = "hyper-util"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4"
dependencies = [
"bytes",
"futures-channel",
@@ -2322,7 +2467,6 @@ dependencies = [
"pin-project-lite",
"socket2 0.5.7",
"tokio",
"tower 0.4.13",
"tower-service",
"tracing",
]
@@ -2864,6 +3008,20 @@ dependencies = [
"tendril",
]
[[package]]
name = "markup5ever"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82c88c6129bd24319e62a0359cb6b958fa7e8be6e19bb1663bc396b90883aca5"
dependencies = [
"log",
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "matchers"
version = "0.1.0"
@@ -3601,26 +3759,6 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5be167a7af36ee22fe3115051bc51f6e6c7054c9348e28deb4f49bd6f705a315"
[[package]]
name = "pin-project"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.87",
]
[[package]]
name = "pin-project-lite"
version = "0.2.14"
@@ -3881,7 +4019,7 @@ dependencies = [
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash",
"rustc-hash 2.0.0",
"rustls",
"socket2 0.5.7",
"thiserror",
@@ -3898,7 +4036,7 @@ dependencies = [
"bytes",
"rand",
"ring",
"rustc-hash",
"rustc-hash 2.0.0",
"rustls",
"slab",
"thiserror",
@@ -4108,25 +4246,29 @@ dependencies = [
[[package]]
name = "reqwest"
version = "0.12.8"
version = "0.12.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b"
checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da"
dependencies = [
"base64 0.22.1",
"bytes",
"encoding_rs",
"futures-core",
"futures-util",
"h2",
"http",
"http-body",
"http-body-util",
"hyper",
"hyper-rustls",
"hyper-tls",
"hyper-util",
"ipnet",
"js-sys",
"log",
"mime",
"mime_guess",
"native-tls",
"once_cell",
"percent-encoding",
"pin-project-lite",
@@ -4139,9 +4281,12 @@ dependencies = [
"serde_json",
"serde_urlencoded",
"sync_wrapper",
"system-configuration",
"tokio",
"tokio-native-tls",
"tokio-rustls",
"tokio-util",
"tower",
"tower-service",
"url",
"wasm-bindgen",
@@ -4342,6 +4487,12 @@ version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc-hash"
version = "2.0.0"
@@ -4531,6 +4682,21 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc3d051b884f40e309de6c149734eab57aa8cc1347992710dc80bcc1c2194c15"
dependencies = [
"cssparser",
"ego-tree",
"getopts",
"html5ever 0.29.0",
"precomputed-hash",
"selectors",
"tendril",
]
[[package]]
name = "scrypt"
version = "0.11.0"
@@ -4582,6 +4748,25 @@ dependencies = [
"libc",
]
[[package]]
name = "selectors"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8"
dependencies = [
"bitflags 2.6.0",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]]
name = "self_cell"
version = "1.1.0"
@@ -4706,6 +4891,15 @@ dependencies = [
"syn 2.0.87",
]
[[package]]
name = "servo_arc"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae65c4249478a2647db249fb43e23cec56a2c8974a427e7bd8cb5a1d0964921a"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "sha1"
version = "0.10.6"
@@ -5163,6 +5357,27 @@ dependencies = [
"syn 2.0.87",
]
[[package]]
name = "system-configuration"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
dependencies = [
"bitflags 2.6.0",
"core-foundation",
"system-configuration-sys",
]
[[package]]
name = "system-configuration-sys"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "tap"
version = "1.0.1"
@@ -5269,6 +5484,22 @@ dependencies = [
"once_cell",
]
[[package]]
name = "tiktoken-rs"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6"
dependencies = [
"anyhow",
"base64 0.21.7",
"bstr",
"fancy-regex",
"lazy_static",
"parking_lot",
"regex",
"rustc-hash 1.1.0",
]
[[package]]
name = "time"
version = "0.3.36"
@@ -5363,6 +5594,16 @@ dependencies = [
"syn 2.0.87",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-rustls"
version = "0.26.0"
@@ -5449,21 +5690,6 @@ dependencies = [
"winnow",
]
[[package]]
name = "tower"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
dependencies = [
"futures-core",
"futures-util",
"pin-project",
"pin-project-lite",
"tokio",
"tower-layer",
"tower-service",
]
[[package]]
name = "tower"
version = "0.5.2"
@@ -6382,6 +6608,8 @@ dependencies = [
"minijinja",
"minijinja-autoreload",
"mockall",
"reqwest",
"scraper",
"serde",
"serde_json",
"sha2",
@@ -6389,6 +6617,7 @@ dependencies = [
"tempfile",
"text-splitter",
"thiserror",
"tiktoken-rs",
"tokio",
"tower-http",
"tracing",