Skip to content

Commit

Permalink
perf(innate): add x2 scraper link crawling auto kayle
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Sep 29, 2023
1 parent fa7d14b commit 37c0901
Show file tree
Hide file tree
Showing 7 changed files with 117 additions and 83 deletions.
2 changes: 1 addition & 1 deletion kayle/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "kayle",
"version": "0.7.9",
"version": "0.7.10",
"description": "Extremely fast and accurate accessibility engine built for any headless tool like playwright or puppeteer.",
"main": "./build/index.js",
"keywords": [
Expand Down
2 changes: 1 addition & 1 deletion kayle_innate/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 6 additions & 6 deletions kayle_innate/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "kayle_innate"
version = "0.0.17"
version = "0.0.18"
authors = ["j-mendez"]
edition = "2018"
license = "MIT"
Expand All @@ -11,19 +11,19 @@ repository = "https://github.com/a11ywatch/kayle"
crate-type = ["cdylib", "rlib"]

[features]
default = ["console_error_panic_hook"]
accessibility = ["scraper", "getrandom"]
default = ["console_error_panic_hook", "accessibility"]
accessibility = ["select"]

[dependencies]
wasm-bindgen = "0.2.63"
console_error_panic_hook = { version = "0.1.6", optional = true }
wee_alloc = { version = "0.4.5", optional = true }
select = "0.6.0"
select = { version = "0.6.0", optional = true }
url = "2.4.0"
lazy_static = "1.4.0"
case_insensitive_string = "0.1.0"
scraper = { version = "0.17.1", optional = true }
getrandom = { version = "0.2", features = ["js"], optional = true }
scraper = { version = "0.17.1" }
getrandom = { version = "0.2", features = ["js"] }

[dev-dependencies]
wasm-bindgen-test = "0.3.37"
Expand Down
2 changes: 2 additions & 0 deletions kayle_innate/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ The rust lib for accessibility things.

## Building

Target the platform you need, such as Node.js or the browser.

`wasm-pack build --target nodejs`

## Testing
Expand Down
156 changes: 85 additions & 71 deletions kayle_innate/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,14 @@ extern crate lazy_static;

mod utils;
use case_insensitive_string::CaseInsensitiveString;
use select::document::Document;
use select::predicate::Name;
use std::collections::HashSet;
use utils::{convert_abs_path, convert_base_path, set_panic_hook};
use utils::{convert_abs_path, convert_base_path, set_panic_hook, domain_name};
use wasm_bindgen::prelude::*;

#[cfg(feature = "wee_alloc")]
#[global_allocator]
static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT;

/// Return the last dot-separated label of the URL's host.
///
/// The host string is split on `.` and the final segment is returned, so a
/// host of `www.example.com` yields `"com"`. When the URL has no host the
/// empty string is returned.
pub fn domain_name(domain: &url::Url) -> &str {
    domain
        .host_str()
        // `rsplit` walks the labels from the right, so the first item it
        // yields is the same segment a forward split would yield last.
        .and_then(|host| host.rsplit('.').next())
        .unwrap_or("")
}

#[wasm_bindgen]
/// Set up a structure tree algorithm for parsing and finding links in a document. Allows the user to perform hybrid audits in real time.
pub fn get_document_links(res: &str, domain: &str) -> Box<[JsValue]> {
Expand Down Expand Up @@ -57,58 +38,87 @@ pub fn get_document_links(res: &str, domain: &str) -> Box<[JsValue]> {
let parent_host_scheme = base_url.scheme();
let parent_host = base_url.host_str().unwrap_or_default();

// todo: move to scraper for x2 performance flat
Document::from(res)
.find(Name("a"))
.filter_map(|n| match n.attr("href") {
Some(link) => {
let mut abs = convert_abs_path(&base_url, link);
let mut can_process = match abs.host_str() {
Some(host) => parent_host.ends_with(host),
_ => false,
};

let process = if can_process {
if abs.scheme() != parent_host_scheme {
let _ = abs.set_scheme(parent_host_scheme);
}

let hchars = abs.path();

if let Some(position) = hchars.find('.') {
let resource_ext = &hchars[position + 1..hchars.len()];

if !ONLY_RESOURCES
.contains::<CaseInsensitiveString>(&resource_ext.into())
{
can_process = false;
let h = scraper::Html::parse_fragment(res);

h.tree
.into_iter()
.filter_map(|node| {
if let Some(element) = node.as_element() {
if element.name() == "a" {
match element.attr("href") {
Some(link) => {
let mut abs = convert_abs_path(&base_url, link);
let mut can_process = match abs.host_str() {
Some(host) => parent_host.ends_with(host),
_ => false,
};

let process = if can_process {
if abs.scheme() != parent_host_scheme {
let _ = abs.set_scheme(parent_host_scheme);
}

let hchars = abs.path();

if let Some(position) = hchars.find('.') {
let resource_ext = &hchars[position + 1..hchars.len()];

if !ONLY_RESOURCES.contains::<CaseInsensitiveString>(
&resource_ext.into(),
) {
can_process = false;
}
}

if can_process
&& (base_domain.is_empty()
|| base_domain == domain_name(&abs))
{
Some(JsValue::from_str(&abs.as_str()))
} else {
None
}
} else {
None
};

process
}
_ => None,
}

if can_process
&& (base_domain.is_empty() || base_domain == domain_name(&abs))
{
Some(JsValue::from_str(&abs.as_str()))
} else {
None
} else {
None
}
} else {
None
}
})
.collect::<Vec<_>>()
}
_ => {
let h = scraper::Html::parse_fragment(res);

h.tree
.into_iter()
.filter_map(|node| {
if let Some(element) = node.as_element() {
if element.name() == "a" {
match element.attr("href") {
Some(link) => {
// TODO: validate only web links
Some(JsValue::from_str(&link))
}
_ => None,
}
} else {
None
};

process
}
} else {
None
}
_ => None,
})
.collect::<Vec<_>>()
}
_ => Document::from(res)
.find(Name("a"))
.filter_map(|n| match n.attr("href") {
Some(link) => Some(JsValue::from_str(link)),
_ => None,
})
.collect::<Vec<_>>(),
};

links.into_boxed_slice()
Expand Down Expand Up @@ -144,12 +154,15 @@ pub fn parse_accessibility_tree(html: &str) {
// Element siblings.
// Element descendant.
// Element props.

// Challenges in binding CSS to nodes arise from external stylesheets.
// We could set the Chrome browser to ignore all assets and fetch them here, but that would be reinventing the wheel.
// If we can send the CSSOM from Node to Rust, this could leverage the attached sheets, since we just need the node references.

let mut n = 0;
let t = now();

// measure select parsing doc 1:1 around 34ms - gets slower when using methods possibly due to clones
while let Some(node) = Document::from(html).nth(n) {
while let Some(node) = select::document::Document::from(html).nth(n) {
let element_name = node.name();
console_log!("{:?}", element_name);
n += 1;
Expand All @@ -158,23 +171,24 @@ pub fn parse_accessibility_tree(html: &str) {

let t = now();

let h = scraper::Html::parse_fragment(html);
let mut hh = h.tree.into_iter();
// parse doc will start from html downwards
let h = scraper::Html::parse_document(html);
let mut hh = h.tree.nodes();

// measure select parsing doc 1:1 around 10ms
// measure select parsing doc 1:1 around 10ms
while let Some(node) = hh.next() {
if let Some(element) = node.as_element() {
if let Some(element) = node.value().as_element() {
let element_name = element.name();
console_log!("{:?}", element_name);
}
}

// "body"
// "html"
// "head"
// "title"
// "meta"
// "link"
// "style"
// "body"
// "header"
// "nav"
// "a"
Expand Down
17 changes: 17 additions & 0 deletions kayle_innate/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,23 @@ pub fn set_panic_hook() {
console_error_panic_hook::set_once();
}

/// Extract the trailing label of a URL's host.
///
/// Splits the host on `.` and returns the right-most piece (for example
/// `"com"` for a host of `www.example.com`). Returns `""` when the URL
/// carries no host at all.
pub fn domain_name(domain: &url::Url) -> &str {
    // Guard: URLs without a host have nothing to split.
    let host = match domain.host_str() {
        Some(host) => host,
        None => return "",
    };

    // Splitting from the right always produces at least one item, so the
    // fallback only exists to satisfy the `Option` returned by `next`.
    host.rsplit('.').next().unwrap_or("")
}

/// convert to absolute path
#[inline]
pub fn convert_base_path(mut base: Url) -> Url {
Expand Down
9 changes: 5 additions & 4 deletions kayle_innate/tests/web.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#![cfg(target_arch = "wasm32")]
extern crate wasm_bindgen_test;

use kayle_innate::{get_document_links, parse_accessibility_tree};
use kayle_innate::get_document_links;
use wasm_bindgen_test::*;

wasm_bindgen_test_configure!(run_in_browser);
Expand Down Expand Up @@ -63,8 +63,9 @@ fn _get_document_links() {
}

#[wasm_bindgen_test]
#[cfg(feature = "accessibility")]
fn _parse_accessibility_tree() {
parse_accessibility_tree(
kayle_innate::parse_accessibility_tree(
r#"<!DOCTYPE html>
<html>
<head>
Expand Down Expand Up @@ -99,6 +100,6 @@ fn _parse_accessibility_tree() {
</ul>
</footer>
</body>
</html>"#
</html>"#,
);
}
}

0 comments on commit 37c0901

Please sign in to comment.