Skip to content

Commit

Permalink
chore(fetch): fix resource fetching
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed May 4, 2023
1 parent e331f31 commit 466233b
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 45 deletions.
72 changes: 36 additions & 36 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "website_crawler"
version = "0.7.83"
version = "0.7.86"
authors = ["Jeff Mendez <[email protected]>"]
edition = "2021"
description = "gRPC tokio based web crawler"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ This is a basic example crawling a web page, add spider to your `Cargo.toml`:

```toml
[dependencies]
website_crawler = "0.7.38"
website_crawler = "0.7.85"
```

And then the code:
Expand Down
4 changes: 2 additions & 2 deletions examples/cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "website_crawler_example"
version = "0.7.78"
version = "0.7.84"
authors = ["Jeff Mendez <[email protected]>"]
edition = "2018"
description = "gRPC tokio based web crawler"
Expand All @@ -18,7 +18,7 @@ env_logger = "0.9.0"
htr = "0.5.23"

[dependencies.website_crawler]
version = "0.7.78"
version = "0.7.84"
path = "../"
default-features = false

Expand Down
11 changes: 11 additions & 0 deletions proto/website.proto
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@ syntax = "proto3";

package website;

import "google/protobuf/struct.proto";

// Central API that manages your website between starting single and multi page scans.
// ScanStart/ScanEnd take ScanInitParams; Scan/ScanStream take ScanParams.
service WebsiteService {
rpc ScanStart (ScanInitParams) returns (Empty) {} // tracks when a scan starts.
rpc ScanEnd (ScanInitParams) returns (Empty) {} // tracks when a scan completes.
rpc Scan (ScanParams) returns (Empty) {} // non-streaming scan; allows tracking the full up time for keep-alive cost.
rpc ScanStream (ScanParams) returns (stream ScanStreamResponse) {} // streams the scan request and returns whether the scan should continue.
// rpc PageSet (LightHouse) returns (Empty) {} // (currently disabled) updates a page response directly 1:1; useful for lighthouse and other integrations.
}

// params to send when scanning pages.
Expand All @@ -31,4 +34,12 @@ message Empty {}
// Streamed response message returned by the ScanStream rpc.
message ScanStreamResponse {
string message = 1; // message reporting scan success, or signalling that the scan should terminate.
}

// Lighthouse report insight from Pagemind.
message LightHouse {
uint32 user_id = 1; // the user that made the request.
string domain = 2; // the domain for the request [example.com].
string url = 3; // the url of the request, including the http or https scheme.
google.protobuf.Struct insight = 4; // the json details from lighthouse, carried as a generic protobuf Struct.
}
8 changes: 3 additions & 5 deletions src/packages/spider/utils.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
use log::{info, log_enabled, Level};
use reqwest::Client;
use reqwest::StatusCode;

/// Perform a network request to a resource extracting all content as text streaming.
/// Perform a network request to a resource extracting all content streaming.
pub async fn fetch_page_html(url: &str, client: &Client) -> Option<String> {
use tokio_stream::StreamExt;

match client.get(url).send().await {
Ok(res) if res.status() == StatusCode::OK => {
Ok(res) if res.status().is_success() => {
let mut stream = res.bytes_stream();
let mut data: String = String::new();

Expand All @@ -24,13 +23,12 @@ pub async fn fetch_page_html(url: &str, client: &Client) -> Option<String> {
}
Ok(_) => None,
Err(_) => {
log("- error parsing html text {}", &url);
log("- error parsing {}", &url);
None
}
}
}


/// log to console if configuration verbose.
pub fn log(message: &'static str, data: impl AsRef<str>) {
if log_enabled!(Level::Info) {
Expand Down

0 comments on commit 466233b

Please sign in to comment.