From d3aa4a353a40c0e3417742164c5c39df3db8c141 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Fri, 29 Sep 2023 09:02:44 -0400 Subject: [PATCH] perf(innate): add x2 scraper link crawling auto kayle --- kayle/package.json | 2 +- kayle_innate/Cargo.lock | 2 +- kayle_innate/Cargo.toml | 10 +-- kayle_innate/README.md | 2 + kayle_innate/src/lib.rs | 154 +++++++++++++++++++++----------------- kayle_innate/src/utils.rs | 17 +++++ kayle_innate/tests/web.rs | 9 ++- 7 files changed, 115 insertions(+), 81 deletions(-) diff --git a/kayle/package.json b/kayle/package.json index ee82d6f4..b48b33a9 100644 --- a/kayle/package.json +++ b/kayle/package.json @@ -1,6 +1,6 @@ { "name": "kayle", - "version": "0.7.9", + "version": "0.7.10", "description": "Extremely fast and accurate accessibility engine built for any headless tool like playwright or puppeteer.", "main": "./build/index.js", "keywords": [ diff --git a/kayle_innate/Cargo.lock b/kayle_innate/Cargo.lock index fba0b445..6107318e 100644 --- a/kayle_innate/Cargo.lock +++ b/kayle_innate/Cargo.lock @@ -233,7 +233,7 @@ dependencies = [ [[package]] name = "kayle_innate" -version = "0.0.17" +version = "0.0.18" dependencies = [ "case_insensitive_string", "console_error_panic_hook", diff --git a/kayle_innate/Cargo.toml b/kayle_innate/Cargo.toml index 32a31bda..0800114e 100644 --- a/kayle_innate/Cargo.toml +++ b/kayle_innate/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "kayle_innate" -version = "0.0.17" +version = "0.0.18" authors = ["j-mendez"] edition = "2018" license = "MIT" @@ -12,18 +12,18 @@ crate-type = ["cdylib", "rlib"] [features] default = ["console_error_panic_hook"] -accessibility = ["scraper", "getrandom"] +accessibility = ["select"] [dependencies] wasm-bindgen = "0.2.63" console_error_panic_hook = { version = "0.1.6", optional = true } wee_alloc = { version = "0.4.5", optional = true } -select = "0.6.0" +select = { version = "0.6.0", optional = true } url = "2.4.0" lazy_static = "1.4.0" case_insensitive_string = "0.1.0" -scraper = { version = "0.17.1", optional = true } -getrandom = { version = "0.2", features = ["js"], optional = true } +scraper = { version = "0.17.1" } +getrandom = { version = "0.2", features = ["js"] } [dev-dependencies] wasm-bindgen-test = "0.3.37" diff --git a/kayle_innate/README.md b/kayle_innate/README.md index ec8ae83f..122dae0d 100644 --- a/kayle_innate/README.md +++ b/kayle_innate/README.md @@ -4,6 +4,8 @@ The rust lib for accessibility things. ## Building +Target the platform that you need like nodejs or browsers etc. + `wasm-pack build --target nodejs` ## Testing diff --git a/kayle_innate/src/lib.rs b/kayle_innate/src/lib.rs index 8d201627..d5875dfd 100644 --- a/kayle_innate/src/lib.rs +++ b/kayle_innate/src/lib.rs @@ -3,33 +3,14 @@ extern crate lazy_static; mod utils; use case_insensitive_string::CaseInsensitiveString; -use select::document::Document; -use select::predicate::Name; use std::collections::HashSet; -use utils::{convert_abs_path, convert_base_path, set_panic_hook}; +use utils::{convert_abs_path, convert_base_path, set_panic_hook, domain_name}; use wasm_bindgen::prelude::*; #[cfg(feature = "wee_alloc")] #[global_allocator] static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT; -/// get the clean domain name -pub fn domain_name(domain: &url::Url) -> &str { - match domain.host_str() { - Some(b) => { - let b = b.split('.').collect::>(); - let bsize = b.len(); - - if bsize > 0 { - b[bsize - 1] - } else { - "" - } - } - _ => "", - } -} - #[wasm_bindgen] /// setup a structure tree alg for parsing and find links in document. Allow user to perform hybrid audits realtime. pub fn get_document_links(res: &str, domain: &str) -> Box<[JsValue]> { @@ -57,58 +38,87 @@ pub fn get_document_links(res: &str, domain: &str) -> Box<[JsValue]> { let parent_host_scheme = base_url.scheme(); let parent_host = base_url.host_str().unwrap_or_default(); - // todo: move to scraper for x2 performance flat - Document::from(res) - .find(Name("a")) - .filter_map(|n| match n.attr("href") { - Some(link) => { - let mut abs = convert_abs_path(&base_url, link); - let mut can_process = match abs.host_str() { - Some(host) => parent_host.ends_with(host), - _ => false, - }; - - let process = if can_process { - if abs.scheme() != parent_host_scheme { - let _ = abs.set_scheme(parent_host_scheme); - } - - let hchars = abs.path(); - - if let Some(position) = hchars.find('.') { - let resource_ext = &hchars[position + 1..hchars.len()]; - - if !ONLY_RESOURCES - .contains::(&resource_ext.into()) - { - can_process = false; + let h = scraper::Html::parse_fragment(res); + + h.tree + .into_iter() + .filter_map(|node| { + if let Some(element) = node.as_element() { + if element.name() == "a" { + match element.attr("href") { + Some(link) => { + let mut abs = convert_abs_path(&base_url, link); + let mut can_process = match abs.host_str() { + Some(host) => parent_host.ends_with(host), + _ => false, + }; + + let process = if can_process { + if abs.scheme() != parent_host_scheme { + let _ = abs.set_scheme(parent_host_scheme); + } + + let hchars = abs.path(); + + if let Some(position) = hchars.find('.') { + let resource_ext = &hchars[position + 1..hchars.len()]; + + if !ONLY_RESOURCES.contains::( + &resource_ext.into(), + ) { + can_process = false; + } + } + + if can_process + && (base_domain.is_empty() + || base_domain == domain_name(&abs)) + { + Some(JsValue::from_str(&abs.as_str())) + } else { + None + } + } else { + None + }; + + process } + _ => None, } - - if can_process - && (base_domain.is_empty() || base_domain == domain_name(&abs)) - { - Some(JsValue::from_str(&abs.as_str())) - } else { - None + } else { + None + } + } else { + None + } + }) + .collect::>() + } + _ => { + let h = scraper::Html::parse_fragment(res); + + h.tree + .into_iter() + .filter_map(|node| { + if let Some(element) = node.as_element() { + if element.name() == "a" { + match element.attr("href") { + Some(link) => { + // TODO: validate only web links + Some(JsValue::from_str(&link)) + } + _ => None, } } else { None - }; - - process + } + } else { + None } - _ => None, }) .collect::>() } - _ => Document::from(res) - .find(Name("a")) - .filter_map(|n| match n.attr("href") { - Some(link) => Some(JsValue::from_str(link)), - _ => None, - }) - .collect::>(), }; links.into_boxed_slice() @@ -144,12 +154,15 @@ pub fn parse_accessibility_tree(html: &str) { // Element siblings. // Element descendant. // Element props. + // Challenges in binding css to nodes arise from external sheets. + // The chrome browser we can set to ignore all assets and fetch them here but, it would be re-doing the wheel. + // If we can send the Stylesheets from node to rust this could leverage the sheets attached since we just need the node references. let mut n = 0; let t = now(); // measure select parsing doc 1:1 around 34ms - gets slower when using methods possibly due to clones - while let Some(node) = Document::from(html).nth(n) { + while let Some(node) = select::document::Document::from(html).nth(n) { let element_name = node.name(); console_log!("{:?}", element_name); n += 1; @@ -158,23 +171,24 @@ pub fn parse_accessibility_tree(html: &str) { let t = now(); - let h = scraper::Html::parse_fragment(html); - let mut hh = h.tree.into_iter(); + // parse doc will start from html downwards + let h = scraper::Html::parse_document(html); + let mut hh = h.tree.nodes(); - // measure select parsing doc 1:1 around 10ms + // measure select parsing doc 1:1 around 10ms while let Some(node) = hh.next() { - if let Some(element) = node.as_element() { + if let Some(element) = node.value().as_element() { let element_name = element.name(); console_log!("{:?}", element_name); } } - - // "body" // "html" + // "head" // "title" // "meta" // "link" // "style" + // "body" // "header" // "nav" // "a" diff --git a/kayle_innate/src/utils.rs b/kayle_innate/src/utils.rs index c18d2320..37f1d1ed 100644 --- a/kayle_innate/src/utils.rs +++ b/kayle_innate/src/utils.rs @@ -5,6 +5,23 @@ pub fn set_panic_hook() { console_error_panic_hook::set_once(); } +/// get the clean domain name +pub fn domain_name(domain: &url::Url) -> &str { + match domain.host_str() { + Some(b) => { + let b = b.split('.').collect::>(); + let bsize = b.len(); + + if bsize > 0 { + b[bsize - 1] + } else { + "" + } + } + _ => "", + } +} + /// convert to absolute path #[inline] pub fn convert_base_path(mut base: Url) -> Url { diff --git a/kayle_innate/tests/web.rs b/kayle_innate/tests/web.rs index 473f3403..cc3ae970 100644 --- a/kayle_innate/tests/web.rs +++ b/kayle_innate/tests/web.rs @@ -3,7 +3,7 @@ #![cfg(target_arch = "wasm32")] extern crate wasm_bindgen_test; -use kayle_innate::{get_document_links, parse_accessibility_tree}; +use kayle_innate::get_document_links; use wasm_bindgen_test::*; wasm_bindgen_test_configure!(run_in_browser); @@ -63,8 +63,9 @@ fn _get_document_links() { } #[wasm_bindgen_test] +#[cfg(feature = "accessibility")] fn _parse_accessibility_tree() { - parse_accessibility_tree( + kayle_innate::parse_accessibility_tree( r#" @@ -99,6 +100,6 @@ fn _parse_accessibility_tree() { - "# + "#, ); -} \ No newline at end of file +}