From 0874bfeb79c87eba955c4b9f562ac125d8770fb8 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Thu, 5 Oct 2023 23:25:19 -0400 Subject: [PATCH] chore(audit): add auditor base --- kayle_innate/Cargo.lock | 4 +- kayle_innate/Cargo.toml | 2 +- kayle_innate/src/engine/audit/auditor.rs | 65 +++++ kayle_innate/src/engine/audit/mod.rs | 4 + kayle_innate/src/engine/audit/tree.rs | 74 +++++ kayle_innate/src/engine/audit/wcag.rs | 7 +- kayle_innate/src/engine/styles/css_cache.rs | 11 + kayle_innate/src/engine/styles/errors.rs | 12 +- kayle_innate/src/engine/styles/mod.rs | 2 + kayle_innate/src/lib.rs | 290 +++----------------- kayle_innate/src/links.rs | 119 ++++++++ 11 files changed, 319 insertions(+), 271 deletions(-) create mode 100644 kayle_innate/src/engine/audit/auditor.rs create mode 100644 kayle_innate/src/engine/audit/tree.rs create mode 100644 kayle_innate/src/engine/styles/css_cache.rs create mode 100644 kayle_innate/src/links.rs diff --git a/kayle_innate/Cargo.lock b/kayle_innate/Cargo.lock index 0bbbce6..52745e3 100644 --- a/kayle_innate/Cargo.lock +++ b/kayle_innate/Cargo.lock @@ -1209,9 +1209,9 @@ dependencies = [ [[package]] name = "victor_tree" -version = "0.0.5" +version = "0.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "054bfc272757f38a0e8ef28447f2525e1a96dfbc2e2a21c3fafda9f77640108e" +checksum = "6b47ffd0cf2b7c91d117709862b6a8092c31e971d1e6af406876b7ccc3daafaf" dependencies = [ "atomic_refcell", "cssparser", diff --git a/kayle_innate/Cargo.toml b/kayle_innate/Cargo.toml index e50aa14..dcd2fcf 100644 --- a/kayle_innate/Cargo.toml +++ b/kayle_innate/Cargo.toml @@ -30,7 +30,7 @@ js-sys = "0.3.64" selectors = "=0.21.0" smallvec = "0.6" ego-tree = "0.6.2" -victor_tree = { version = "0.0.5" } +victor_tree = { version = "0.0.7" } markup5ever = "0.11.0" [dependencies.cssparser] diff --git a/kayle_innate/src/engine/audit/auditor.rs b/kayle_innate/src/engine/audit/auditor.rs new file mode 100644 index 0000000..d4886b1 --- /dev/null +++ b/kayle_innate/src/engine/audit/auditor.rs @@ -0,0 +1,65 @@ +use scraper_forky::ElementRef; +use scraper_forky::Html; +use victor_tree::style::StyleSet; + +use super::tree::parse_accessibility_tree; + +/// the intro to an audit +pub struct Auditor<'a> { + /// the html document + pub document: &'a Html, + /// the tree to map to nodes + pub tree: std::collections::BTreeMap<&'a str, Vec>>, + /// styles for the audit + pub author: StyleSet, + // /// the matching context for css selectors + pub match_context: selectors::matching::MatchingContext<'a, scraper_forky::selector::Simple>, +} + +impl<'a> Auditor<'a> { + pub fn new( + document: &'a Html, + css_rules: &str, + match_context: selectors::matching::MatchingContext<'a, scraper_forky::selector::Simple>, + ) -> Auditor<'a> { + use crate::{console_log, now}; + let t = now(); + let tree = parse_accessibility_tree(&document); + console_log!("Tree Build Time {:?}", now() - t); + let tt = now(); + + // TODO: make stylesheet building optional and only on first requirement + let author = { + let mut author = victor_tree::style::StyleSetBuilder::new(); + if !css_rules.is_empty() { + author.add_stylesheet(css_rules); + } else { + use markup5ever::local_name; + match tree.get("style") { + Some(styles) => { + for node in styles { + // https://html.spec.whatwg.org/multipage/semantics.html#update-a-style-block + if let Some(type_attr) = node.attr(&local_name!("type")) { + if !type_attr.eq_ignore_ascii_case("text/css") { + continue; + } + author.add_stylesheet(&node.inner_html()) + } + } + } + _ => (), + } + } + author.finish() + }; + + console_log!("StyleSheets Build Time {:?}", now() - tt); + + Auditor { + document, + tree, + author, + match_context, + } + } +} diff --git a/kayle_innate/src/engine/audit/mod.rs b/kayle_innate/src/engine/audit/mod.rs index f039155..c31cc56 100644 --- a/kayle_innate/src/engine/audit/mod.rs +++ b/kayle_innate/src/engine/audit/mod.rs @@ -1,2 +1,6 @@ +/// the auditor +pub mod auditor; +/// the node tree +pub mod tree; /// WCAG audit pub mod wcag; diff --git a/kayle_innate/src/engine/audit/tree.rs b/kayle_innate/src/engine/audit/tree.rs new file mode 100644 index 0000000..153f5c9 --- /dev/null +++ b/kayle_innate/src/engine/audit/tree.rs @@ -0,0 +1,74 @@ +use scraper_forky::ElementRef; +use std::collections::BTreeMap; + +/// try to fix all possible issues using a spec against the tree. +pub fn parse_accessibility_tree( + html: &scraper_forky::Html, + // todo: return the nodes with a tuple of the layout node and the element node +) -> std::collections::BTreeMap<&str, Vec>> { + // use taffy::prelude::*; + // // todo: use optional variable for clips or layout creation + // let mut taffy = Taffy::new(); + + // let header_node = taffy + // .new_leaf(Style { + // size: Size { + // width: points(800.0), + // height: points(100.0), + // }, + // ..Default::default() + // }) + // .unwrap(); + + // let body_node = taffy + // .new_leaf(Style { + // size: Size { + // width: points(800.0), + // height: auto(), + // }, + // flex_grow: 1.0, + // ..Default::default() + // }) + // .unwrap(); + + // let root_node = taffy + // .new_with_children( + // Style { + // flex_direction: FlexDirection::Column, + // size: Size { + // width: points(800.0), + // height: points(600.0), + // }, + // ..Default::default() + // }, + // &[header_node, body_node], + // ) + // .unwrap(); + + // // Call compute_layout on the root of your tree to run the layout algorithm + // taffy.compute_layout(root_node, Size::MAX_CONTENT).unwrap(); + // console_log!("Header Layout {:?}", taffy.layout(header_node).unwrap()); + // We can get the x,y, and height, width of the element on proper tree insert + + // parse doc will start from html downwards + // accessibility tree for ordered element mappings + let mut accessibility_tree: BTreeMap<&str, Vec>> = + BTreeMap::from([("title".into(), Default::default())]); + + for node in html.tree.nodes() { + match scraper_forky::element_ref::ElementRef::wrap(node) { + Some(element) => { + accessibility_tree + .entry(element.value().name()) + .and_modify(|n| n.push(element)) + .or_insert(Vec::from([element])); + } + _ => (), + }; + } + + // console_log!("Getting tree links {:?}", accessibility_tree.get("a")); + // console_log!("Tree {:?}", accessibility_tree); + + accessibility_tree +} diff --git a/kayle_innate/src/engine/audit/wcag.rs b/kayle_innate/src/engine/audit/wcag.rs index 559e499..51a0a98 100644 --- a/kayle_innate/src/engine/audit/wcag.rs +++ b/kayle_innate/src/engine/audit/wcag.rs @@ -1,7 +1,7 @@ use crate::engine::rules::wcag_rule_map::RULES_A; use crate::i18n::locales::{get_message, Langs}; +use crate::Auditor; use crate::{console_log, engine::issue::Issue}; -use scraper_forky::ElementRef; /// baseline for all rules #[derive(Default)] @@ -12,14 +12,13 @@ impl WCAG3AA { /// init the rules pub fn audit( // allow tree mutation until threads or setup the tree with initial elements. - tree: &std::collections::BTreeMap<&str, Vec>>, - // _css: cssparser::Parser<'_, '_>, + auditor: &Auditor<'_>, // todo: get configs like viewport ) -> Vec { let mut issues: Vec = Vec::new(); // go through nodes and map to validation rules - for node in tree { + for node in &auditor.tree { if RULES_A.contains_key(&*node.0) { let rules = RULES_A.get(&*node.0); match rules { diff --git a/kayle_innate/src/engine/styles/css_cache.rs b/kayle_innate/src/engine/styles/css_cache.rs new file mode 100644 index 0000000..08ade4f --- /dev/null +++ b/kayle_innate/src/engine/styles/css_cache.rs @@ -0,0 +1,11 @@ +/// build matching context +pub fn build_matching_context<'a>( + nth_index_cache: &'a mut selectors::NthIndexCache, +) -> selectors::matching::MatchingContext<'a, scraper_forky::selector::Simple> { + selectors::matching::MatchingContext::new( + selectors::matching::MatchingMode::Normal, + None, + Some(nth_index_cache), + selectors::matching::QuirksMode::NoQuirks, + ) +} diff --git a/kayle_innate/src/engine/styles/errors.rs b/kayle_innate/src/engine/styles/errors.rs index 01859f0..3114816 100644 --- a/kayle_innate/src/engine/styles/errors.rs +++ b/kayle_innate/src/engine/styles/errors.rs @@ -1,12 +1,12 @@ -use cssparser::{CowRcStr, ParseError}; +// use cssparser::{CowRcStr, ParseError}; use selectors::parser::SelectorParseErrorKind; -pub type PropertyParseError<'i> = ParseError<'i, PropertyParseErrorKind<'i>>; +// pub type PropertyParseError<'i> = ParseError<'i, PropertyParseErrorKind<'i>>; -pub enum PropertyParseErrorKind<'i> { - UnknownProperty(CowRcStr<'i>), - UnknownUnit(CowRcStr<'i>), -} +// pub enum PropertyParseErrorKind<'i> { +// UnknownProperty(CowRcStr<'i>), +// UnknownUnit(CowRcStr<'i>), +// } pub enum RuleParseErrorKind<'i> { Selector(SelectorParseErrorKind<'i>), diff --git a/kayle_innate/src/engine/styles/mod.rs b/kayle_innate/src/engine/styles/mod.rs index 7578794..ec5fbc3 100644 --- a/kayle_innate/src/engine/styles/mod.rs +++ b/kayle_innate/src/engine/styles/mod.rs @@ -1,5 +1,7 @@ +pub mod css_cache; pub mod errors; pub mod rules; + use crate::console_log; use cssparser::{Parser, ParserInput}; use markup5ever::local_name; diff --git a/kayle_innate/src/lib.rs b/kayle_innate/src/lib.rs index bc71d81..eb085d4 100644 --- a/kayle_innate/src/lib.rs +++ b/kayle_innate/src/lib.rs @@ -1,22 +1,24 @@ #[cfg(feature = "wee_alloc")] #[global_allocator] static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT; -use wasm_bindgen::prelude::*; +pub use wasm_bindgen::prelude::*; #[macro_use] extern crate lazy_static; +/// the main engine for audits mod engine; +/// locales mod i18n; +/// used for auto-kayle to gather all links in a page. +mod links; +/// helpers mod utils; -use case_insensitive_string::CaseInsensitiveString; #[cfg(feature = "accessibility")] -use scraper_forky::ElementRef; +use crate::engine::audit::auditor::Auditor; #[cfg(feature = "accessibility")] -use std::collections::BTreeMap; -use std::collections::HashSet; -use utils::{convert_abs_path, convert_base_path, domain_name, set_panic_hook}; +use scraper_forky::ElementRef; #[cfg(feature = "accessibility")] #[wasm_bindgen] @@ -37,264 +39,36 @@ macro_rules! console_log { ($($t:tt)*) => (crate::log(&format_args!($($t)*).to_string())) } -#[wasm_bindgen] -/// setup a structure tree alg for parsing and find links in document. Allow user to perform hybrid audits realtime. -pub fn get_document_links(res: &str, domain: &str) -> Box<[JsValue]> { - set_panic_hook(); - - lazy_static! { - /// include only list of resources - static ref ONLY_RESOURCES: HashSet = { - let mut m: HashSet = HashSet::with_capacity(14); - - m.extend([ - "html", "htm", "asp", "aspx", "php", "jps", "jpsx", - // handle .. prefix for urls ending with an extra ending - ".html", ".htm", ".asp", ".aspx", ".php", ".jps", ".jpsx", - ].map(|s| s.into())); - - m - }; - } - - let links = match url::Url::parse(domain) { - Ok(base) => { - let base_url = convert_base_path(base); - let base_domain = domain_name(&base_url); - let parent_host_scheme = base_url.scheme(); - let parent_host = base_url.host_str().unwrap_or_default(); - - let h = scraper_forky::Html::parse_fragment(res); - - h.tree - .into_iter() - .filter_map(|node| { - if let Some(element) = node.as_element() { - if element.name() == "a" { - match element.attr("href") { - Some(link) => { - let mut abs = convert_abs_path(&base_url, link); - let mut can_process = match abs.host_str() { - Some(host) => parent_host.ends_with(host), - _ => false, - }; - - let process = if can_process { - if abs.scheme() != parent_host_scheme { - let _ = abs.set_scheme(parent_host_scheme); - } - - let hchars = abs.path(); - - if let Some(position) = hchars.find('.') { - let resource_ext = &hchars[position + 1..hchars.len()]; - - if !ONLY_RESOURCES.contains::( - &resource_ext.into(), - ) { - can_process = false; - } - } - - if can_process - && (base_domain.is_empty() - || base_domain == domain_name(&abs)) - { - Some(JsValue::from_str(&abs.as_str())) - } else { - None - } - } else { - None - }; - - process - } - _ => None, - } - } else { - None - } - } else { - None - } - }) - .collect::>() - } - _ => { - let h = scraper_forky::Html::parse_fragment(res); - - h.tree - .into_iter() - .filter_map(|node| { - if let Some(element) = node.as_element() { - if element.name() == "a" { - match element.attr("href") { - Some(link) => { - // TODO: validate only web links - Some(JsValue::from_str(&link)) - } - _ => None, - } - } else { - None - } - } else { - None - } - }) - .collect::>() - } - }; - - links.into_boxed_slice() -} - -// RUST_LOG=info wasm-pack test --firefox --headless --features accessibility --release -#[cfg(feature = "accessibility")] -/// try to fix all possible issues using a spec against the tree. -pub fn parse_accessibility_tree( - html: &scraper_forky::Html, - // todo: return the nodes with a tuple of the layout node and the element node -) -> std::collections::BTreeMap<&str, Vec>> { - // use taffy::prelude::*; - // // todo: use optional variable for clips or layout creation - // let mut taffy = Taffy::new(); - - // let header_node = taffy - // .new_leaf(Style { - // size: Size { - // width: points(800.0), - // height: points(100.0), - // }, - // ..Default::default() - // }) - // .unwrap(); - - // let body_node = taffy - // .new_leaf(Style { - // size: Size { - // width: points(800.0), - // height: auto(), - // }, - // flex_grow: 1.0, - // ..Default::default() - // }) - // .unwrap(); - - // let root_node = taffy - // .new_with_children( - // Style { - // flex_direction: FlexDirection::Column, - // size: Size { - // width: points(800.0), - // height: points(600.0), - // }, - // ..Default::default() - // }, - // &[header_node, body_node], - // ) - // .unwrap(); - - // // Call compute_layout on the root of your tree to run the layout algorithm - // taffy.compute_layout(root_node, Size::MAX_CONTENT).unwrap(); - // console_log!("Header Layout {:?}", taffy.layout(header_node).unwrap()); - // We can get the x,y, and height, width of the element on proper tree insert - - let t = now(); - // parse doc will start from html downwards - // accessibility tree for ordered element mappings - let mut accessibility_tree: BTreeMap<&str, Vec>> = - BTreeMap::from([("title".into(), Default::default())]); - - for node in html.tree.nodes() { - match scraper_forky::element_ref::ElementRef::wrap(node) { - Some(element) => { - accessibility_tree - .entry(element.value().name()) - .and_modify(|n| n.push(element)) - .or_insert(Vec::from([element])); - } - _ => (), - }; - } - - console_log!("Scraper Parser: duration {:?}ms", now() - t); - // console_log!("Getting tree links {:?}", accessibility_tree.get("a")); - // console_log!("Tree {:?}", accessibility_tree); - - accessibility_tree -} - #[wasm_bindgen] #[cfg(feature = "accessibility")] /// audit a web page passing the html and css rules. -pub fn _audit_not_ready(html: &str, _css_rules: &str) -> Result { +pub fn _audit_not_ready(html: &str, css_rules: &str) -> Result { + let t = now(); let document = scraper_forky::Html::parse_document(html); - let _tree = parse_accessibility_tree(&document); - let _author = { - let mut author = victor_tree::style::StyleSetBuilder::new(); - if !_css_rules.is_empty() { - author.add_stylesheet(_css_rules); - } else { - use markup5ever::local_name; - match _tree.get("style") { - Some(styles) => { - for node in styles { - // https://html.spec.whatwg.org/multipage/semantics.html#update-a-style-block - if let Some(type_attr) = node.attr(&local_name!("type")) { - if !type_attr.eq_ignore_ascii_case("text/css") { - continue; - } - author.add_stylesheet(&node.inner_html()) - } - } - } - _ => (), - } - } - author.finish() - }; - let _audit = engine::audit::wcag::WCAG3AA::audit(&_tree); - - // // TODO: build struct that can keep lifetimes + console_log!("Parse Document Time {:?}", now() - t); let mut nth_index_cache = selectors::NthIndexCache::from(Default::default()); - let mut match_context = selectors::matching::MatchingContext::new( - selectors::matching::MatchingMode::Normal, - None, - Some(&mut nth_index_cache), - selectors::matching::QuirksMode::NoQuirks, - // selectors::matching::NeedsSelectorFlags::No, - // selectors::matching::IgnoreNthChildForInvalidation::No, + let auditor = Auditor::new( + &document, + &css_rules, + engine::styles::css_cache::build_matching_context(&mut nth_index_cache), ); - - for item in _tree { - for node in item.1 { - let parent_styles = match node.parent() { - Some(n) => match scraper_forky::element_ref::ElementRef::wrap(n) { - Some(element) => { - let _parent_styles = crate::engine::styles::style_for_element( - &_author, - &document, - element, - None, - &mut match_context, - ); - Some(_parent_styles) - } - _ => None, - }, - _ => None, - }; - let _style = crate::engine::styles::style_for_element( - &_author, - &document, - node, - parent_styles.as_deref(), - &mut match_context, - ); - } - } + let ttt = now(); + let _audit = engine::audit::wcag::WCAG3AA::audit(&auditor); + console_log!("Audit Time {:?}", now() - ttt); + + // let mut _match_context = auditor.match_context; + + // for item in auditor.tree { + // for node in item.1 { + // let _style = victor_tree::style::cascade::style_for_element_ref( + // &node, + // &auditor.author, + // &document, + // &mut _match_context, + // ); + // console_log!("{:?}", _style.as_ref().box_size()) + // } + // } // todo: map to JsValues instead of serde Ok(serde_wasm_bindgen::to_value(&_audit)?) diff --git a/kayle_innate/src/links.rs b/kayle_innate/src/links.rs new file mode 100644 index 0000000..4bb6b77 --- /dev/null +++ b/kayle_innate/src/links.rs @@ -0,0 +1,119 @@ +use crate::wasm_bindgen; +use crate::JsValue; + +use crate::utils::{convert_abs_path, convert_base_path, domain_name, set_panic_hook}; +use case_insensitive_string::CaseInsensitiveString; +use std::collections::HashSet; + +#[wasm_bindgen] +/// setup a structure tree alg for parsing and find links in document. Allow user to perform hybrid audits realtime. +pub fn get_document_links(res: &str, domain: &str) -> Box<[JsValue]> { + set_panic_hook(); + + lazy_static! { + /// include only list of resources + static ref ONLY_RESOURCES: HashSet = { + let mut m: HashSet = HashSet::with_capacity(14); + + m.extend([ + "html", "htm", "asp", "aspx", "php", "jps", "jpsx", + // handle .. prefix for urls ending with an extra ending + ".html", ".htm", ".asp", ".aspx", ".php", ".jps", ".jpsx", + ].map(|s| s.into())); + + m + }; + } + + let links = match url::Url::parse(domain) { + Ok(base) => { + let base_url = convert_base_path(base); + let base_domain = domain_name(&base_url); + let parent_host_scheme = base_url.scheme(); + let parent_host = base_url.host_str().unwrap_or_default(); + + let h = scraper_forky::Html::parse_fragment(res); + + h.tree + .into_iter() + .filter_map(|node| { + if let Some(element) = node.as_element() { + if element.name() == "a" { + match element.attr("href") { + Some(link) => { + let mut abs = convert_abs_path(&base_url, link); + let mut can_process = match abs.host_str() { + Some(host) => parent_host.ends_with(host), + _ => false, + }; + + let process = if can_process { + if abs.scheme() != parent_host_scheme { + let _ = abs.set_scheme(parent_host_scheme); + } + + let hchars = abs.path(); + + if let Some(position) = hchars.find('.') { + let resource_ext = &hchars[position + 1..hchars.len()]; + + if !ONLY_RESOURCES.contains::( + &resource_ext.into(), + ) { + can_process = false; + } + } + + if can_process + && (base_domain.is_empty() + || base_domain == domain_name(&abs)) + { + Some(JsValue::from_str(&abs.as_str())) + } else { + None + } + } else { + None + }; + + process + } + _ => None, + } + } else { + None + } + } else { + None + } + }) + .collect::>() + } + _ => { + let h = scraper_forky::Html::parse_fragment(res); + + h.tree + .into_iter() + .filter_map(|node| { + if let Some(element) = node.as_element() { + if element.name() == "a" { + match element.attr("href") { + Some(link) => { + // TODO: validate only web links + Some(JsValue::from_str(&link)) + } + _ => None, + } + } else { + None + } + } else { + None + } + }) + .collect::>() + } + }; + + links.into_boxed_slice() +}