From 1066f271625e944aa901a1b6fc38fc4e4abb32f4 Mon Sep 17 00:00:00 2001 From: Per Stark Date: Wed, 8 Jan 2025 11:12:54 +0100 Subject: [PATCH] webscraping implemented --- Cargo.lock | 321 ++++++++++++++++++++++++---- Cargo.toml | 3 + src/ingress/types/ingress_object.rs | 142 +++++++++++- 3 files changed, 418 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bfe1f3f..5b63a20 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -116,7 +116,7 @@ version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ab99eae5ee58501ab236beb6f20f6ca39be615267b014899c89b2f0bc18a459" dependencies = [ - "html5ever", + "html5ever 0.27.0", "maplit", "once_cell", "tendril", @@ -661,7 +661,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", "tracing", @@ -1008,6 +1008,17 @@ dependencies = [ "syn_derive", ] +[[package]] +name = "bstr" +version = "1.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0" +dependencies = [ + "memchr", + "regex-automata 0.4.7", + "serde", +] + [[package]] name = "bumpalo" version = "3.16.0" @@ -1365,6 +1376,29 @@ dependencies = [ "typenum", ] +[[package]] +name = "cssparser" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.87", +] + [[package]] name = "ctr" version = "0.9.2" @@ -1521,6 +1555,17 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "derive_more" +version = "0.99.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "derive_utils" version = "0.14.2" @@ -1620,6 +1665,21 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1" +[[package]] +name = "dtoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + [[package]] name = "earcutr" version = "0.4.3" @@ -1630,6 +1690,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "ego-tree" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" + [[package]] name = "either" version = "1.13.0" @@ -1739,6 +1805,17 @@ dependencies = [ "async-trait", ] +[[package]] +name = "fancy-regex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" +dependencies = [ + "bit-set", + "regex-automata 0.4.7", + "regex-syntax 0.8.4", +] + [[package]] name = "fastrand" version = "1.9.0" @@ -2004,6 +2081,15 @@ dependencies = [ "thread_local", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -2054,6 +2140,15 @@ dependencies = [ "libm", ] +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -2083,6 +2178,25 @@ version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64" +[[package]] +name = "h2" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap 2.6.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "2.4.1" @@ -2204,7 +2318,21 @@ checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" dependencies = [ "log", "mac", - "markup5ever", + "markup5ever 0.12.1", + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "html5ever" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e15626aaf9c351bc696217cbe29cb9b5e86c43f8a46b5e2f5c6c5cf7cb904ce" +dependencies = [ + "log", + "mac", + "markup5ever 0.14.0", "proc-macro2", "quote", "syn 2.0.87", @@ -2277,6 +2405,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", + "h2", "http", "http-body", "httparse", @@ -2308,10 +2437,26 @@ dependencies = [ ] [[package]] -name = "hyper-util" -version = "0.1.8" +name = "hyper-tls" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da62f120a8a37763efb0cf8fdf264b884c7b8b9ac8660b900c8661030c00e6ba" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" dependencies = [ "bytes", "futures-channel", @@ -2322,7 +2467,6 @@ dependencies = [ "pin-project-lite", "socket2 0.5.7", "tokio", - "tower 0.4.13", "tower-service", "tracing", ] @@ -2864,6 +3008,20 @@ dependencies = [ "tendril", ] +[[package]] +name = "markup5ever" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82c88c6129bd24319e62a0359cb6b958fa7e8be6e19bb1663bc396b90883aca5" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + [[package]] name = "matchers" version = "0.1.0" @@ -3601,26 +3759,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5be167a7af36ee22fe3115051bc51f6e6c7054c9348e28deb4f49bd6f705a315" -[[package]] -name = "pin-project" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.87", -] - [[package]] name = "pin-project-lite" version = "0.2.14" @@ -3881,7 +4019,7 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash", + "rustc-hash 2.0.0", "rustls", "socket2 0.5.7", "thiserror", @@ -3898,7 +4036,7 @@ dependencies = [ "bytes", "rand", "ring", - "rustc-hash", + "rustc-hash 2.0.0", "rustls", "slab", "thiserror", @@ -4108,25 +4246,29 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.8" +version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b" +checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ "base64 0.22.1", "bytes", + "encoding_rs", "futures-core", "futures-util", + "h2", "http", "http-body", "http-body-util", "hyper", "hyper-rustls", + "hyper-tls", "hyper-util", "ipnet", "js-sys", "log", "mime", "mime_guess", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -4139,9 +4281,12 @@ dependencies = [ "serde_json", "serde_urlencoded", "sync_wrapper", + "system-configuration", "tokio", + "tokio-native-tls", "tokio-rustls", "tokio-util", + "tower", "tower-service", "url", "wasm-bindgen", @@ -4342,6 +4487,12 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc-hash" version = "2.0.0" @@ -4531,6 +4682,21 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "scraper" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc3d051b884f40e309de6c149734eab57aa8cc1347992710dc80bcc1c2194c15" +dependencies = [ + "cssparser", + "ego-tree", + "getopts", + "html5ever 0.29.0", + "precomputed-hash", + "selectors", + "tendril", +] + [[package]] name = "scrypt" version = "0.11.0" @@ -4582,6 +4748,25 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8" +dependencies = [ + "bitflags 2.6.0", + "cssparser", + "derive_more", + "fxhash", + "log", + "new_debug_unreachable", + "phf", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec", +] + [[package]] name = "self_cell" version = "1.1.0" @@ -4706,6 +4891,15 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "servo_arc" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae65c4249478a2647db249fb43e23cec56a2c8974a427e7bd8cb5a1d0964921a" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "sha1" version = "0.10.6" @@ -5163,6 +5357,27 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.6.0", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tap" version = "1.0.1" @@ -5269,6 +5484,22 @@ dependencies = [ "once_cell", ] +[[package]] +name = "tiktoken-rs" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6" +dependencies = [ + "anyhow", + "base64 0.21.7", + "bstr", + "fancy-regex", + "lazy_static", + "parking_lot", + "regex", + "rustc-hash 1.1.0", +] + [[package]] name = "time" version = "0.3.36" @@ -5363,6 +5594,16 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.26.0" @@ -5449,21 +5690,6 @@ dependencies = [ "winnow", ] -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "pin-project", - "pin-project-lite", - "tokio", - "tower-layer", - "tower-service", -] - [[package]] name = "tower" version = "0.5.2" @@ -6382,6 +6608,8 @@ dependencies = [ "minijinja", "minijinja-autoreload", "mockall", + "reqwest", + "scraper", "serde", "serde_json", "sha2", @@ -6389,6 +6617,7 @@ dependencies = [ "tempfile", "text-splitter", "thiserror", + "tiktoken-rs", "tokio", "tower-http", "tracing", diff --git a/Cargo.toml b/Cargo.toml index 3b6d1d6..7d15177 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,8 @@ mime_guess = "2.0.5" minijinja = { version = "2.5.0", features = ["loader", "multi_template"] } minijinja-autoreload = "2.5.0" mockall = "0.13.0" +reqwest = {version = "0.12.12", features = ["charset", "json"]} +scraper = "0.22.0" serde = { version = "1.0.210", features = ["derive"] } serde_json = "1.0.128" sha2 = "0.10.8" @@ -28,6 +30,7 @@ surrealdb = "2.0.4" tempfile = "3.12.0" text-splitter = "0.18.1" thiserror = "1.0.63" +tiktoken-rs = "0.6.0" tokio = { version = "1.40.0", features = ["full"] } tower-http = { version = "0.6.2", features = ["fs"] } tracing = "0.1.40" diff --git a/src/ingress/types/ingress_object.rs b/src/ingress/types/ingress_object.rs index e7dc55b..d101739 100644 --- a/src/ingress/types/ingress_object.rs +++ b/src/ingress/types/ingress_object.rs @@ -2,7 +2,15 @@ use crate::{ error::AppError, storage::types::{file_info::FileInfo, text_content::TextContent}, }; +use async_openai::types::{ + ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage, + CreateChatCompletionRequestArgs, +}; +use reqwest; +use scraper::{Html, Selector}; use serde::{Deserialize, Serialize}; +use tiktoken_rs::o200k_base; +use tracing::info; /// Knowledge object type, containing the content or reference to it, as well as metadata #[derive(Debug, Serialize, Deserialize, Clone)] @@ -83,8 +91,138 @@ impl IngressObject { } /// Fetches and extracts text from a URL. - async fn fetch_text_from_url(_url: &str) -> Result { - unimplemented!() + async fn fetch_text_from_url(url: &str) -> Result { + let response = reqwest::get(url).await?.text().await?; + let document = Html::parse_document(&response); + + // Select main content areas first + let main_selectors = Selector::parse(concat!( + "article, main, .article-content,", // Common main content classes + ".post-content, .entry-content,", // Common blog/article classes + "[role='main']" // Accessibility marker + )) + .unwrap(); + + // If no main content found, fallback to body + let content_element = document + .select(&main_selectors) + .next() + .or_else(|| document.select(&Selector::parse("body").unwrap()).next()) + .ok_or(AppError::NotFound("No content found".into()))?; + + // Remove unwanted elements but preserve structure + // let exclude_selector = Selector::parse(concat!( + // "script, style, noscript,", + // "[class*='window'], [id*='window'],", + // "[class*='env'], [id*='env'],", + // "iframe, nav, footer, .comments,", + // ".advertisement, .social-share" + // )) + // .unwrap(); + + // Collect structured content + let mut structured_content = String::new(); + + // Process headings + for heading in content_element.select(&Selector::parse("h1, h2, h3").unwrap()) { + structured_content.push_str(&format!( + "{}\n", + heading.text().collect::().trim() + )); + } + + // Process paragraphs + for paragraph in content_element.select(&Selector::parse("p").unwrap()) { + structured_content.push_str(&format!( + "{}\n", + paragraph.text().collect::().trim() + )); + } + + // Clean up + let content = structured_content + .replace(|c: char| c.is_control(), " ") + .replace(" ", " "); + + let processed_content = Self::process_web_content(content.trim().to_string()).await?; + + info!("Extracted content from page: {:?}", processed_content); + + Ok(processed_content) + } + + pub async fn process_web_content(content: String) -> Result { + let openai_client = async_openai::Client::new(); + const MAX_TOKENS: usize = 122000; + const SYSTEM_PROMPT: &str = r#" + You are a precise content extractor for web pages. Your task: + + 1. Extract ONLY the main article/content from the provided text + 2. Maintain the original content - do not summarize or modify the core information + 3. Ignore peripheral content such as: + - Navigation elements + - Error messages (e.g., "JavaScript required") + - Related articles sections + - Comments + - Social media links + - Advertisement text + + FORMAT: + - Convert tags to markdown headings (#, ##, ###) + - Convert tags to markdown paragraphs + - Preserve quotes and important formatting + - Remove duplicate content + - Remove any metadata or technical artifacts + + OUTPUT RULES: + - Output ONLY the cleaned content in markdown + - Do not add any explanations or meta-commentary + - Do not add summaries or conclusions + - Do not use any XML/HTML tags in the output + "#; + + let bpe = o200k_base()?; + let token_count = bpe.encode_with_special_tokens(&content).len(); + + let content = if token_count > MAX_TOKENS { + // Split content into structural blocks + let blocks: Vec<&str> = content.split(|c| c == '\n').collect(); + let mut truncated = String::new(); + let mut current_tokens = 0; + + // Keep adding blocks until we approach the limit + for block in blocks { + let block_tokens = bpe.encode_with_special_tokens(block).len(); + if current_tokens + block_tokens > MAX_TOKENS { + break; + } + truncated.push_str(block); + truncated.push('\n'); + current_tokens += block_tokens; + } + truncated + } else { + content + }; + + let request = CreateChatCompletionRequestArgs::default() + .model("gpt-4o-mini") + .temperature(0.0) + .max_tokens(16200u32) + .messages([ + ChatCompletionRequestSystemMessage::from(SYSTEM_PROMPT).into(), + ChatCompletionRequestUserMessage::from(content).into(), + ]) + .build()?; + + let response = openai_client.chat().create(request).await?; + + response + .choices + .first() + .and_then(|choice| choice.message.content.as_ref()) + .map(|content| content.to_string()) + .ok_or(AppError::LLMParsing("No content in response".into())) } /// Extracts text from a file based on its MIME type.