webscraping implemented

This commit is contained in:
Per Stark
2025-01-08 11:12:54 +01:00
parent e739e74b8f
commit d2c5b31320
3 changed files with 418 additions and 48 deletions

321
Cargo.lock generated
View File

@@ -116,7 +116,7 @@ version = "4.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ab99eae5ee58501ab236beb6f20f6ca39be615267b014899c89b2f0bc18a459"
dependencies = [
"html5ever",
"html5ever 0.27.0",
"maplit",
"once_cell",
"tendril",
@@ -661,7 +661,7 @@ dependencies = [
"serde_urlencoded",
"sync_wrapper",
"tokio",
"tower 0.5.2",
"tower",
"tower-layer",
"tower-service",
"tracing",
@@ -1008,6 +1008,17 @@ dependencies = [
"syn_derive",
]
[[package]]
name = "bstr"
version = "1.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0"
dependencies = [
"memchr",
"regex-automata 0.4.7",
"serde",
]
[[package]]
name = "bumpalo"
version = "3.16.0"
@@ -1365,6 +1376,29 @@ dependencies = [
"typenum",
]
[[package]]
name = "cssparser"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf",
"smallvec",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.87",
]
[[package]]
name = "ctr"
version = "0.9.2"
@@ -1521,6 +1555,17 @@ dependencies = [
"syn 2.0.87",
]
[[package]]
name = "derive_more"
version = "0.99.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.87",
]
[[package]]
name = "derive_utils"
version = "0.14.2"
@@ -1620,6 +1665,21 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1"
[[package]]
name = "dtoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
[[package]]
name = "dtoa-short"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
dependencies = [
"dtoa",
]
[[package]]
name = "earcutr"
version = "0.4.3"
@@ -1630,6 +1690,12 @@ dependencies = [
"num-traits",
]
[[package]]
name = "ego-tree"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
[[package]]
name = "either"
version = "1.13.0"
@@ -1739,6 +1805,17 @@ dependencies = [
"async-trait",
]
[[package]]
name = "fancy-regex"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
dependencies = [
"bit-set",
"regex-automata 0.4.7",
"regex-syntax 0.8.4",
]
[[package]]
name = "fastrand"
version = "1.9.0"
@@ -2004,6 +2081,15 @@ dependencies = [
"thread_local",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "generic-array"
version = "0.14.7"
@@ -2054,6 +2140,15 @@ dependencies = [
"libm",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
]
[[package]]
name = "getrandom"
version = "0.2.15"
@@ -2083,6 +2178,25 @@ version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64"
[[package]]
name = "h2"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e"
dependencies = [
"atomic-waker",
"bytes",
"fnv",
"futures-core",
"futures-sink",
"http",
"indexmap 2.6.0",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "half"
version = "2.4.1"
@@ -2204,7 +2318,21 @@ checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4"
dependencies = [
"log",
"mac",
"markup5ever",
"markup5ever 0.12.1",
"proc-macro2",
"quote",
"syn 2.0.87",
]
[[package]]
name = "html5ever"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e15626aaf9c351bc696217cbe29cb9b5e86c43f8a46b5e2f5c6c5cf7cb904ce"
dependencies = [
"log",
"mac",
"markup5ever 0.14.0",
"proc-macro2",
"quote",
"syn 2.0.87",
@@ -2277,6 +2405,7 @@ dependencies = [
"bytes",
"futures-channel",
"futures-util",
"h2",
"http",
"http-body",
"httparse",
@@ -2308,10 +2437,26 @@ dependencies = [
]
[[package]]
name = "hyper-util"
version = "0.1.8"
name = "hyper-tls"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da62f120a8a37763efb0cf8fdf264b884c7b8b9ac8660b900c8661030c00e6ba"
checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
dependencies = [
"bytes",
"http-body-util",
"hyper",
"hyper-util",
"native-tls",
"tokio",
"tokio-native-tls",
"tower-service",
]
[[package]]
name = "hyper-util"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4"
dependencies = [
"bytes",
"futures-channel",
@@ -2322,7 +2467,6 @@ dependencies = [
"pin-project-lite",
"socket2 0.5.7",
"tokio",
"tower 0.4.13",
"tower-service",
"tracing",
]
@@ -2864,6 +3008,20 @@ dependencies = [
"tendril",
]
[[package]]
name = "markup5ever"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82c88c6129bd24319e62a0359cb6b958fa7e8be6e19bb1663bc396b90883aca5"
dependencies = [
"log",
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "matchers"
version = "0.1.0"
@@ -3601,26 +3759,6 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5be167a7af36ee22fe3115051bc51f6e6c7054c9348e28deb4f49bd6f705a315"
[[package]]
name = "pin-project"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.87",
]
[[package]]
name = "pin-project-lite"
version = "0.2.14"
@@ -3881,7 +4019,7 @@ dependencies = [
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash",
"rustc-hash 2.0.0",
"rustls",
"socket2 0.5.7",
"thiserror",
@@ -3898,7 +4036,7 @@ dependencies = [
"bytes",
"rand",
"ring",
"rustc-hash",
"rustc-hash 2.0.0",
"rustls",
"slab",
"thiserror",
@@ -4108,25 +4246,29 @@ dependencies = [
[[package]]
name = "reqwest"
version = "0.12.8"
version = "0.12.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b"
checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da"
dependencies = [
"base64 0.22.1",
"bytes",
"encoding_rs",
"futures-core",
"futures-util",
"h2",
"http",
"http-body",
"http-body-util",
"hyper",
"hyper-rustls",
"hyper-tls",
"hyper-util",
"ipnet",
"js-sys",
"log",
"mime",
"mime_guess",
"native-tls",
"once_cell",
"percent-encoding",
"pin-project-lite",
@@ -4139,9 +4281,12 @@ dependencies = [
"serde_json",
"serde_urlencoded",
"sync_wrapper",
"system-configuration",
"tokio",
"tokio-native-tls",
"tokio-rustls",
"tokio-util",
"tower",
"tower-service",
"url",
"wasm-bindgen",
@@ -4342,6 +4487,12 @@ version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc-hash"
version = "2.0.0"
@@ -4531,6 +4682,21 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc3d051b884f40e309de6c149734eab57aa8cc1347992710dc80bcc1c2194c15"
dependencies = [
"cssparser",
"ego-tree",
"getopts",
"html5ever 0.29.0",
"precomputed-hash",
"selectors",
"tendril",
]
[[package]]
name = "scrypt"
version = "0.11.0"
@@ -4582,6 +4748,25 @@ dependencies = [
"libc",
]
[[package]]
name = "selectors"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8"
dependencies = [
"bitflags 2.6.0",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]]
name = "self_cell"
version = "1.1.0"
@@ -4706,6 +4891,15 @@ dependencies = [
"syn 2.0.87",
]
[[package]]
name = "servo_arc"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae65c4249478a2647db249fb43e23cec56a2c8974a427e7bd8cb5a1d0964921a"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "sha1"
version = "0.10.6"
@@ -5163,6 +5357,27 @@ dependencies = [
"syn 2.0.87",
]
[[package]]
name = "system-configuration"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
dependencies = [
"bitflags 2.6.0",
"core-foundation",
"system-configuration-sys",
]
[[package]]
name = "system-configuration-sys"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "tap"
version = "1.0.1"
@@ -5269,6 +5484,22 @@ dependencies = [
"once_cell",
]
[[package]]
name = "tiktoken-rs"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6"
dependencies = [
"anyhow",
"base64 0.21.7",
"bstr",
"fancy-regex",
"lazy_static",
"parking_lot",
"regex",
"rustc-hash 1.1.0",
]
[[package]]
name = "time"
version = "0.3.36"
@@ -5363,6 +5594,16 @@ dependencies = [
"syn 2.0.87",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-rustls"
version = "0.26.0"
@@ -5449,21 +5690,6 @@ dependencies = [
"winnow",
]
[[package]]
name = "tower"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
dependencies = [
"futures-core",
"futures-util",
"pin-project",
"pin-project-lite",
"tokio",
"tower-layer",
"tower-service",
]
[[package]]
name = "tower"
version = "0.5.2"
@@ -6382,6 +6608,8 @@ dependencies = [
"minijinja",
"minijinja-autoreload",
"mockall",
"reqwest",
"scraper",
"serde",
"serde_json",
"sha2",
@@ -6389,6 +6617,7 @@ dependencies = [
"tempfile",
"text-splitter",
"thiserror",
"tiktoken-rs",
"tokio",
"tower-http",
"tracing",

View File

@@ -21,6 +21,8 @@ mime_guess = "2.0.5"
minijinja = { version = "2.5.0", features = ["loader", "multi_template"] }
minijinja-autoreload = "2.5.0"
mockall = "0.13.0"
reqwest = {version = "0.12.12", features = ["charset", "json"]}
scraper = "0.22.0"
serde = { version = "1.0.210", features = ["derive"] }
serde_json = "1.0.128"
sha2 = "0.10.8"
@@ -28,6 +30,7 @@ surrealdb = "2.0.4"
tempfile = "3.12.0"
text-splitter = "0.18.1"
thiserror = "1.0.63"
tiktoken-rs = "0.6.0"
tokio = { version = "1.40.0", features = ["full"] }
tower-http = { version = "0.6.2", features = ["fs"] }
tracing = "0.1.40"

View File

@@ -2,7 +2,15 @@ use crate::{
error::AppError,
storage::types::{file_info::FileInfo, text_content::TextContent},
};
use async_openai::types::{
ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage,
CreateChatCompletionRequestArgs,
};
use reqwest;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use tiktoken_rs::o200k_base;
use tracing::info;
/// Knowledge object type, containing the content or reference to it, as well as metadata
#[derive(Debug, Serialize, Deserialize, Clone)]
@@ -83,8 +91,138 @@ impl IngressObject {
}
/// Fetches and extracts text from a URL.
async fn fetch_text_from_url(_url: &str) -> Result<String, AppError> {
unimplemented!()
/// Fetches a web page and returns its main textual content as cleaned markdown.
///
/// The page is downloaded, the first "main content" container (or `<body>` as a
/// fallback) is located, and its `h1`–`h3` headings and `p` paragraphs are wrapped in
/// `<heading>` / `<paragraph>` markers, one per line and in document order. That
/// intermediate text is then handed to [`Self::process_web_content`] for final cleanup.
///
/// # Errors
///
/// Returns an error if the request fails or answers with a 4xx/5xx status, if the
/// document has no usable content element or no extractable text, or if the
/// downstream LLM processing fails.
async fn fetch_text_from_url(url: &str) -> Result<String, AppError> {
    // `error_for_status` keeps 4xx/5xx error pages from being ingested as content.
    let response = reqwest::get(url)
        .await?
        .error_for_status()?
        .text()
        .await?;
    let document = Html::parse_document(&response);

    // All selectors are compile-time constants, so a parse failure is a programming bug.
    let main_selector = Selector::parse(concat!(
        "article, main, .article-content,", // Common main content classes
        ".post-content, .entry-content,",   // Common blog/article classes
        "[role='main']"                     // Accessibility marker
    ))
    .expect("static main-content selector is valid CSS");
    let body_selector = Selector::parse("body").expect("static `body` selector is valid CSS");
    let block_selector =
        Selector::parse("h1, h2, h3, p").expect("static block selector is valid CSS");

    // Prefer a dedicated main-content container; fall back to the whole body.
    let content_element = document
        .select(&main_selector)
        .next()
        .or_else(|| document.select(&body_selector).next())
        .ok_or_else(|| AppError::NotFound("No content found".into()))?;

    // Walk headings and paragraphs in a single pass so that each paragraph stays
    // under the heading it belongs to. Navigation, ads and similar boilerplate are
    // not filtered here; that is left to the LLM pass below.
    let mut structured_content = String::new();
    for element in content_element.select(&block_selector) {
        // Normalise per element (control chars -> space, whitespace runs -> one
        // space) so the '\n' separators between blocks survive the cleanup.
        let text = element
            .text()
            .collect::<String>()
            .replace(|c: char| c.is_control(), " ");
        let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
        if text.is_empty() {
            continue;
        }

        let tag = if element.value().name() == "p" {
            "paragraph"
        } else {
            "heading"
        };
        structured_content.push_str(&format!("<{0}>{1}</{0}>\n", tag, text));
    }

    // Nothing to extract: fail early instead of asking the model to clean up nothing.
    if structured_content.is_empty() {
        return Err(AppError::NotFound("No text content found".into()));
    }

    let processed_content =
        Self::process_web_content(structured_content.trim_end().to_string()).await?;
    info!("Extracted content from page: {:?}", processed_content);
    Ok(processed_content)
}
/// Cleans scraped page text with an LLM and returns the main content as markdown.
///
/// `content` is expected to be the `<heading>` / `<paragraph>` annotated text built by
/// the scraper. If it does not fit into the model's input budget it is truncated on a
/// token boundary before being sent.
///
/// # Errors
///
/// Returns an error if the tokenizer cannot be loaded, the request cannot be built,
/// the OpenAI call fails, or the response carries no message content.
pub async fn process_web_content(content: String) -> Result<String, AppError> {
    const MODEL: &str = "gpt-4o-mini";
    // NOTE(review): 128k is the context window documented for gpt-4o-mini at the time
    // of writing — confirm if MODEL is changed.
    const CONTEXT_WINDOW_TOKENS: usize = 128_000;
    // Tokens reserved for the completion; sent as `max_tokens` below.
    const MAX_COMPLETION_TOKENS: u32 = 16_200;
    // Safety margin for per-message framing tokens and tokenizer drift.
    const REQUEST_OVERHEAD_TOKENS: usize = 256;
    const SYSTEM_PROMPT: &str = r#"
You are a precise content extractor for web pages. Your task:
1. Extract ONLY the main article/content from the provided text
2. Maintain the original content - do not summarize or modify the core information
3. Ignore peripheral content such as:
- Navigation elements
- Error messages (e.g., "JavaScript required")
- Related articles sections
- Comments
- Social media links
- Advertisement text
FORMAT:
- Convert <heading> tags to markdown headings (#, ##, ###)
- Convert <paragraph> tags to markdown paragraphs
- Preserve quotes and important formatting
- Remove duplicate content
- Remove any metadata or technical artifacts
OUTPUT RULES:
- Output ONLY the cleaned content in markdown
- Do not add any explanations or meta-commentary
- Do not add summaries or conclusions
- Do not use any XML/HTML tags in the output
"#;

    let bpe = o200k_base()?;

    // The prompt, the user content and the reserved completion must all fit into the
    // context window together, so derive the input budget instead of hard-coding it.
    let system_tokens = bpe.encode_with_special_tokens(SYSTEM_PROMPT).len();
    let input_budget = CONTEXT_WINDOW_TOKENS
        .saturating_sub(MAX_COMPLETION_TOKENS as usize) // widening cast, cannot truncate
        .saturating_sub(system_tokens)
        .saturating_sub(REQUEST_OVERHEAD_TOKENS);

    let tokens = bpe.encode_with_special_tokens(&content);
    let content = if tokens.len() > input_budget {
        // Truncate on a token boundary. This works regardless of how the input is
        // laid out; in particular a single oversized line still yields a non-empty
        // prefix. A cut may land inside a multi-byte character, in which case
        // decoding fails and we back off one token at a time until it succeeds.
        let mut keep = input_budget;
        loop {
            match bpe.decode(tokens[..keep].to_vec()) {
                Ok(prefix) => break prefix,
                Err(_) if keep > 0 => keep -= 1,
                Err(err) => return Err(err.into()),
            }
        }
    } else {
        content
    };

    let request = CreateChatCompletionRequestArgs::default()
        .model(MODEL)
        .temperature(0.0)
        .max_tokens(MAX_COMPLETION_TOKENS)
        .messages([
            ChatCompletionRequestSystemMessage::from(SYSTEM_PROMPT).into(),
            ChatCompletionRequestUserMessage::from(content).into(),
        ])
        .build()?;

    let openai_client = async_openai::Client::new();
    let response = openai_client.chat().create(request).await?;

    // Move the text out of the first choice instead of cloning it.
    response
        .choices
        .into_iter()
        .next()
        .and_then(|choice| choice.message.content)
        .ok_or_else(|| AppError::LLMParsing("No content in response".into()))
}
/// Extracts text from a file based on its MIME type.