From 466233b149107ba12fdbfdb92a881f9dc4d7e0ca Mon Sep 17 00:00:00 2001 From: j-mendez Date: Thu, 4 May 2023 09:08:13 -0400 Subject: [PATCH] chore(fetch): fix resource fetching --- Cargo.lock | 72 ++++++++++++++++++------------------ Cargo.toml | 2 +- README.md | 2 +- examples/cargo.toml | 4 +- proto/website.proto | 11 ++++++ src/packages/spider/utils.rs | 8 ++-- 6 files changed, 54 insertions(+), 45 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1cdc24f..3c7494e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,9 +55,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.70" +version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4" +checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" [[package]] name = "async-compression" @@ -103,9 +103,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "axum" -version = "0.6.16" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "113713495a32dd0ab52baf5c10044725aa3aec00b31beda84218e469029b72a3" +checksum = "f8175979259124331c1d7bf6586ee7e0da434155e4b2d48ec2c8386281d8df39" dependencies = [ "async-trait", "axum-core", @@ -513,9 +513,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flate2" -version = "1.0.25" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" +checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" dependencies = [ "crc32fast", "miniz_oxide", @@ -949,9 +949,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.3.4" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36eb31c1778188ae1e64398743890d0877fef36d11521ac60406b42016e8c2cf" +checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f" [[package]] name = "lock_api" @@ -1004,9 +1004,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.6.2" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" dependencies = [ "adler", ] @@ -1096,9 +1096,9 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "openssl" -version = "0.10.51" +version = "0.10.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97ea2d98598bf9ada7ea6ee8a30fb74f9156b63bbe495d64ec2b87c269d2dda3" +checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56" dependencies = [ "bitflags", "cfg-if", @@ -1128,9 +1128,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.86" +version = "0.9.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "992bac49bdbab4423199c654a5515bd2a6c6a23bf03f2dd3bdb7e5ae6259bc69" +checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e" dependencies = [ "cc", "libc", @@ -1342,9 +1342,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" [[package]] name = "ppv-lite86" @@ -1564,9 +1564,9 @@ checksum = "a5996294f19bd3aae0453a862ad728f60e6600695733dd5df01da90c54363a3c" [[package]] name = "reqwest" -version = "0.11.16" +version = "0.11.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254" +checksum = "13293b639a097af28fc8a90f22add145a9c954e49d77da06263d58cf44d5fb91" dependencies = [ "async-compression", "base64 0.21.0", @@ -1629,9 +1629,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.14" +version = "0.37.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b864d3c18a5785a05953adeed93e2dca37ed30f18e69bba9f30079d51f363f" +checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" dependencies = [ "bitflags", "errno", @@ -2010,9 +2010,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.27.0" +version = "1.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0de47a4eecbe11f498978a9b29d792f0d2692d1dd003650c24c76510e3bc001" +checksum = "c3c786bf8134e5a3a166db9b29ab8f48134739014a3eca7bc6bfa95d673b136f" dependencies = [ "autocfg", "bytes", @@ -2023,7 +2023,7 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -2038,9 +2038,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce" +checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", @@ -2071,9 +2071,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.12" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" dependencies = [ "futures-core", "pin-project-lite", @@ -2082,9 +2082,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.7" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" dependencies = [ "bytes", "futures-core", @@ -2181,13 +2181,13 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" +checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.15", ] [[package]] @@ -2208,7 +2208,7 @@ checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" [[package]] name = "ua_generator" version = "0.3.5" -source = "git+https://github.com/a11ywatch/ua_generator.git#ee71df851e1551e0bdcbadcb4f009afe352f1827" +source = "git+https://github.com/a11ywatch/ua_generator.git#43e456ad47228b3133014de9c6a0ba0821281a55" dependencies = [ "fastrand", "serde", @@ -2442,7 +2442,7 @@ dependencies = [ [[package]] name = "website_crawler" -version = "0.7.83" +version = "0.7.86" dependencies = [ "ahash", "cc", @@ -2691,6 +2691,6 @@ dependencies = [ [[package]] name = "xml-rs" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" +checksum = "374b609fb36c36ce3501094dc0548f7df5d8d102224b65bc59812e4a5425d571" diff --git a/Cargo.toml b/Cargo.toml index 89416d1..36bf4b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "website_crawler" -version = "0.7.83" +version = "0.7.86" authors = ["Jeff Mendez "] edition = "2021" description = "gRPC tokio based web crawler" diff --git a/README.md b/README.md index e4a771b..2b9113e 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ This is a basic example crawling a web page, add spider to your `Cargo.toml`: ```toml [dependencies] -website_crawler = "0.7.38" +website_crawler = "0.7.85" ``` And then the code: diff --git a/examples/cargo.toml b/examples/cargo.toml index e26180b..07055e1 100644 --- a/examples/cargo.toml +++ b/examples/cargo.toml @@ -1,6 +1,6 @@ [package] name = "website_crawler_example" -version = "0.7.78" +version = "0.7.84" authors = ["Jeff Mendez "] edition = "2018" description = "gRPC tokio based web crawler" @@ -18,7 +18,7 @@ env_logger = "0.9.0" htr = "0.5.23" [dependencies.website_crawler] -version = "0.7.78" +version = "0.7.84" path = "../" default-features = false diff --git a/proto/website.proto b/proto/website.proto index 69d3f99..7d3030f 100644 --- a/proto/website.proto +++ b/proto/website.proto @@ -2,12 +2,15 @@ syntax = "proto3"; package website; +import "google/protobuf/struct.proto"; + // Central API that manages your website between starting single and multi page scans. service WebsiteService { rpc ScanStart (ScanInitParams) returns (Empty) {} // track when scan starts. rpc ScanEnd (ScanInitParams) returns (Empty) {} // tracks when scan completes. rpc Scan (ScanParams) returns (Empty) {} // non stream scanning allowing for full track up time for keep alive cost. rpc ScanStream (ScanParams) returns (stream ScanStreamResponse) {} // stream the scan request and return if scan should continue. + // rpc PageSet (LightHouse) returns (Empty) {} // update a page response directly 1:1 useful for lighthouse and other integrations. } // params to send when scanning pages. @@ -31,4 +34,12 @@ message Empty {} // send streamed response message ScanStreamResponse { string message = 1; // message of the scan success or if should terminate. +} + +// Lighthouse report insight from Pagemind. +message LightHouse { + uint32 user_id = 1; // the user that made the request. + string domain = 2; // the domain for the request [example.com]. + string url = 3; // the url of the request with http or https + google.protobuf.Struct insight = 4; // the json details from lighthouse } \ No newline at end of file diff --git a/src/packages/spider/utils.rs b/src/packages/spider/utils.rs index f56521b..aa49c98 100644 --- a/src/packages/spider/utils.rs +++ b/src/packages/spider/utils.rs @@ -1,13 +1,12 @@ use log::{info, log_enabled, Level}; use reqwest::Client; -use reqwest::StatusCode; -/// Perform a network request to a resource extracting all content as text streaming. +/// Perform a network request to a resource extracting all content streaming. pub async fn fetch_page_html(url: &str, client: &Client) -> Option { use tokio_stream::StreamExt; match client.get(url).send().await { - Ok(res) if res.status() == StatusCode::OK => { + Ok(res) if res.status().is_success() => { let mut stream = res.bytes_stream(); let mut data: String = String::new(); @@ -24,13 +23,12 @@ pub async fn fetch_page_html(url: &str, client: &Client) -> Option { } Ok(_) => None, Err(_) => { - log("- error parsing html text {}", &url); + log("- error parsing {}", &url); None } } } - /// log to console if configuration verbose. pub fn log(message: &'static str, data: impl AsRef) { if log_enabled!(Level::Info) {