From d3aa4a353a40c0e3417742164c5c39df3db8c141 Mon Sep 17 00:00:00 2001
From: j-mendez <jeff@a11ywatch.com>
Date: Fri, 29 Sep 2023 09:02:44 -0400
Subject: [PATCH] perf(innate): add x2 scraper link crawling auto kayle

---
 kayle/package.json        |   2 +-
 kayle_innate/Cargo.lock   |   2 +-
 kayle_innate/Cargo.toml   |  10 +--
 kayle_innate/README.md    |   2 +
 kayle_innate/src/lib.rs   | 154 +++++++++++++++++++++-----------------
 kayle_innate/src/utils.rs |  17 +++++
 kayle_innate/tests/web.rs |   9 ++-
 7 files changed, 115 insertions(+), 81 deletions(-)

diff --git a/kayle/package.json b/kayle/package.json
index ee82d6f4..b48b33a9 100644
--- a/kayle/package.json
+++ b/kayle/package.json
@@ -1,6 +1,6 @@
 {
   "name": "kayle",
-  "version": "0.7.9",
+  "version": "0.7.10",
   "description": "Extremely fast and accurate accessibility engine built for any headless tool like playwright or puppeteer.",
   "main": "./build/index.js",
   "keywords": [
diff --git a/kayle_innate/Cargo.lock b/kayle_innate/Cargo.lock
index fba0b445..6107318e 100644
--- a/kayle_innate/Cargo.lock
+++ b/kayle_innate/Cargo.lock
@@ -233,7 +233,7 @@ dependencies = [
 
 [[package]]
 name = "kayle_innate"
-version = "0.0.17"
+version = "0.0.18"
 dependencies = [
  "case_insensitive_string",
  "console_error_panic_hook",
diff --git a/kayle_innate/Cargo.toml b/kayle_innate/Cargo.toml
index 32a31bda..0800114e 100644
--- a/kayle_innate/Cargo.toml
+++ b/kayle_innate/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "kayle_innate"
-version = "0.0.17"
+version = "0.0.18"
 authors = ["j-mendez"]
 edition = "2018"
 license = "MIT"
@@ -12,18 +12,18 @@ crate-type = ["cdylib", "rlib"]
 
 [features]
 default = ["console_error_panic_hook"]
-accessibility = ["scraper", "getrandom"]
+accessibility = ["select"]
 
 [dependencies]
 wasm-bindgen = "0.2.63"
 console_error_panic_hook = { version = "0.1.6", optional = true }
 wee_alloc = { version = "0.4.5", optional = true }
-select = "0.6.0"
+select = { version = "0.6.0", optional = true }
 url = "2.4.0"
 lazy_static = "1.4.0"
 case_insensitive_string = "0.1.0"
-scraper = { version = "0.17.1", optional = true }
-getrandom = { version = "0.2", features = ["js"], optional = true }
+scraper = { version = "0.17.1" }
+getrandom = { version = "0.2", features = ["js"] }
 
 [dev-dependencies]
 wasm-bindgen-test = "0.3.37"
diff --git a/kayle_innate/README.md b/kayle_innate/README.md
index ec8ae83f..122dae0d 100644
--- a/kayle_innate/README.md
+++ b/kayle_innate/README.md
@@ -4,6 +4,8 @@ The rust lib for accessibility things.
 
 ## Building
 
+Pass the `--target` that matches the platform you need, such as `nodejs` or `web` for browsers.
+
 `wasm-pack build --target nodejs`
 
 ## Testing
diff --git a/kayle_innate/src/lib.rs b/kayle_innate/src/lib.rs
index 8d201627..d5875dfd 100644
--- a/kayle_innate/src/lib.rs
+++ b/kayle_innate/src/lib.rs
@@ -3,33 +3,14 @@ extern crate lazy_static;
 
 mod utils;
 use case_insensitive_string::CaseInsensitiveString;
-use select::document::Document;
-use select::predicate::Name;
 use std::collections::HashSet;
-use utils::{convert_abs_path, convert_base_path, set_panic_hook};
+use utils::{convert_abs_path, convert_base_path, set_panic_hook, domain_name};
 use wasm_bindgen::prelude::*;
 
 #[cfg(feature = "wee_alloc")]
 #[global_allocator]
 static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT;
 
-/// get the clean domain name
-pub fn domain_name(domain: &url::Url) -> &str {
-    match domain.host_str() {
-        Some(b) => {
-            let b = b.split('.').collect::<Vec<&str>>();
-            let bsize = b.len();
-
-            if bsize > 0 {
-                b[bsize - 1]
-            }  else {
-                ""
-            }
-        }
-        _ => "",
-    }
-}
-
 #[wasm_bindgen]
 /// setup a structure tree alg for parsing and find links in document. Allow user to perform hybrid audits realtime.
 pub fn get_document_links(res: &str, domain: &str) -> Box<[JsValue]> {
@@ -57,58 +38,87 @@ pub fn get_document_links(res: &str, domain: &str) -> Box<[JsValue]> {
             let parent_host_scheme = base_url.scheme();
             let parent_host = base_url.host_str().unwrap_or_default();
 
-            // todo: move to scraper for x2 performance flat
-            Document::from(res)
-                .find(Name("a"))
-                .filter_map(|n| match n.attr("href") {
-                    Some(link) => {
-                        let mut abs = convert_abs_path(&base_url, link);
-                        let mut can_process = match abs.host_str() {
-                            Some(host) => parent_host.ends_with(host),
-                            _ => false,
-                        };
-
-                        let process = if can_process {
-                            if abs.scheme() != parent_host_scheme {
-                                let _ = abs.set_scheme(parent_host_scheme);
-                            }
-
-                            let hchars = abs.path();
-
-                            if let Some(position) = hchars.find('.') {
-                                let resource_ext = &hchars[position + 1..hchars.len()];
-
-                                if !ONLY_RESOURCES
-                                    .contains::<CaseInsensitiveString>(&resource_ext.into())
-                                {
-                                    can_process = false;
+            let h = scraper::Html::parse_fragment(res);
+
+            h.tree
+                .into_iter()
+                .filter_map(|node| {
+                    if let Some(element) = node.as_element() {
+                        if element.name() == "a" {
+                            match element.attr("href") {
+                                Some(link) => {
+                                    let mut abs = convert_abs_path(&base_url, link);
+                                    let mut can_process = match abs.host_str() {
+                                        Some(host) => parent_host.ends_with(host),
+                                        _ => false,
+                                    };
+
+                                    let process = if can_process {
+                                        if abs.scheme() != parent_host_scheme {
+                                            let _ = abs.set_scheme(parent_host_scheme);
+                                        }
+
+                                        let hchars = abs.path();
+
+                                        if let Some(position) = hchars.find('.') {
+                                            let resource_ext = &hchars[position + 1..hchars.len()];
+
+                                            if !ONLY_RESOURCES.contains::<CaseInsensitiveString>(
+                                                &resource_ext.into(),
+                                            ) {
+                                                can_process = false;
+                                            }
+                                        }
+
+                                        if can_process
+                                            && (base_domain.is_empty()
+                                                || base_domain == domain_name(&abs))
+                                        {
+                                            Some(JsValue::from_str(&abs.as_str()))
+                                        } else {
+                                            None
+                                        }
+                                    } else {
+                                        None
+                                    };
+
+                                    process
                                 }
+                                _ => None,
                             }
-
-                            if can_process
-                                && (base_domain.is_empty() || base_domain == domain_name(&abs))
-                            {
-                                Some(JsValue::from_str(&abs.as_str()))
-                            } else {
-                                None
+                        } else {
+                            None
+                        }
+                    } else {
+                        None
+                    }
+                })
+                .collect::<Vec<_>>()
+        }
+        _ => {
+            let h = scraper::Html::parse_fragment(res);
+
+            h.tree
+                .into_iter()
+                .filter_map(|node| {
+                    if let Some(element) = node.as_element() {
+                        if element.name() == "a" {
+                            match element.attr("href") {
+                                Some(link) => {
+                                    // TODO: validate only web links
+                                    Some(JsValue::from_str(&link))
+                                }
+                                _ => None,
                             }
                         } else {
                             None
-                        };
-
-                        process
+                        }
+                    } else {
+                        None
                     }
-                    _ => None,
                 })
                 .collect::<Vec<_>>()
         }
-        _ => Document::from(res)
-            .find(Name("a"))
-            .filter_map(|n| match n.attr("href") {
-                Some(link) => Some(JsValue::from_str(link)),
-                _ => None,
-            })
-            .collect::<Vec<_>>(),
     };
 
     links.into_boxed_slice()
@@ -144,12 +154,15 @@ pub fn parse_accessibility_tree(html: &str) {
     // Element siblings.
     // Element descendant.
     // Element props.
+    // Challenges in binding css to nodes arise from external sheets.
+    // We could set the Chrome browser to ignore all assets and fetch them here, but that would be reinventing the wheel.
+    // If we can send the stylesheets from Node to Rust, this could leverage the attached sheets since we only need the node references.
 
     let mut n = 0;
     let t = now();
 
     // measure select parsing doc 1:1 around 34ms - gets slower when using methods possibly due to clones
-    while let Some(node) = Document::from(html).nth(n) {
+    while let Some(node) = select::document::Document::from(html).nth(n) {
         let element_name = node.name();
         console_log!("{:?}", element_name);
         n += 1;
@@ -158,23 +171,24 @@ pub fn parse_accessibility_tree(html: &str) {
 
     let t = now();
 
-    let h = scraper::Html::parse_fragment(html);
-    let mut hh = h.tree.into_iter();
+    // parse doc will start from html downwards
+    let h = scraper::Html::parse_document(html);
+    let mut hh = h.tree.nodes();
 
-        // measure select parsing doc 1:1 around 10ms
+    // measure scraper parsing doc 1:1 around 10ms
     while let Some(node) = hh.next() {
-        if let Some(element) = node.as_element() {
+        if let Some(element) = node.value().as_element() {
             let element_name = element.name();
             console_log!("{:?}", element_name);
         }
     }
-
-    // "body"
     // "html"
+    // "head"
     // "title"
     // "meta"
     // "link"
     // "style"
+    // "body"
     // "header"
     // "nav"
     // "a"
diff --git a/kayle_innate/src/utils.rs b/kayle_innate/src/utils.rs
index c18d2320..37f1d1ed 100644
--- a/kayle_innate/src/utils.rs
+++ b/kayle_innate/src/utils.rs
@@ -5,6 +5,23 @@ pub fn set_panic_hook() {
     console_error_panic_hook::set_once();
 }
 
+/// get the last dot-separated label of the url host (e.g. "com" for "a11ywatch.com"), or "" when the url has no host
+pub fn domain_name(domain: &url::Url) -> &str {
+    match domain.host_str() {
+        Some(b) => {
+            let b = b.split('.').collect::<Vec<&str>>(); // host labels in order, e.g. ["a11ywatch", "com"]
+            let bsize = b.len();
+
+            if bsize > 0 { // str::split always yields at least one item, so this branch is the one taken
+                b[bsize - 1]
+            } else {
+                ""
+            }
+        }
+        _ => "", // the url has no host
+    }
+}
+
 /// convert to absolute path
 #[inline]
 pub fn convert_base_path(mut base: Url) -> Url {
diff --git a/kayle_innate/tests/web.rs b/kayle_innate/tests/web.rs
index 473f3403..cc3ae970 100644
--- a/kayle_innate/tests/web.rs
+++ b/kayle_innate/tests/web.rs
@@ -3,7 +3,7 @@
 #![cfg(target_arch = "wasm32")]
 extern crate wasm_bindgen_test;
 
-use kayle_innate::{get_document_links, parse_accessibility_tree};
+use kayle_innate::get_document_links;
 use wasm_bindgen_test::*;
 
 wasm_bindgen_test_configure!(run_in_browser);
@@ -63,8 +63,9 @@ fn _get_document_links() {
 }
 
 #[wasm_bindgen_test]
+#[cfg(feature = "accessibility")]
 fn _parse_accessibility_tree() {
-    parse_accessibility_tree(
+    kayle_innate::parse_accessibility_tree(
         r#"<!DOCTYPE html>
         <html>
             <head>
@@ -99,6 +100,6 @@ fn _parse_accessibility_tree() {
                     </ul>
                 </footer>
             </body>
-        </html>"#
+        </html>"#,
     );
-}
\ No newline at end of file
+}