Skip to content

Commit

Permalink
feat(spider): add sitewide auditing
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Mar 29, 2024
1 parent e184e89 commit 13211e0
Show file tree
Hide file tree
Showing 28 changed files with 2,596 additions and 156 deletions.
2,413 changes: 2,318 additions & 95 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ members = [
smallvec = "1"
selectors = "0.22.0"
cssparser = "0.27.0"
html5ever = "0.27"
fast_html5ever = "0.26.1"
ego-tree = "0.6.2"
lazy_static = "1.4"
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ async fn main() {
1. Shortest path CSS selectors for elements.
1. i18n support for multiple languages.
1. Re-creating layout tree to get element position coordinates.
1. Crawling full websites lightning-fast using [spider](https://github.com/spider-rs/spider).

## [Benchmarks](./benches/)

Expand All @@ -78,6 +79,8 @@ audit-speed/core/audit: medium html (4k iterations)
time: [824.07 µs 830.30 µs 839.37 µs]
audit-speed/core/audit: large html (4k iterations)
time: [1.1206 ms 1.1260 ms 1.1321 ms]
audit-speed/core/audit: spider audit html (4k iterations)
time: [263.33 ms 266.55 ms 269.93 ms]
```

## Examples
Expand All @@ -90,6 +93,7 @@ time: [1.1206 ms 1.1260 ms 1.1321 ms]
1. [tokio](https://docs.rs/tokio/latest/tokio/): Enable tokio async runtime handling. Recommended for high freq server usage.
1. [rayon](https://docs.rs/rayon/latest/rayon/): Parallelism with rayon. (Expensive test future handling)
1. [rayon_wasm](https://lib.rs/crates/rayon-wasm): Enable the wasm runtime for rayon.
1. [spider](https://docs.rs/spider-rs/latest/spider/): Crawl entire websites using spider. Full website audits of 100-1,000 pages within milliseconds.

### Contributing

Expand Down
26 changes: 23 additions & 3 deletions accessibility-rs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "accessibility-rs"
version = "0.0.59"
version = "0.0.60"
authors = ["The A11yWatch Project Developers", "Jeff Mendez <[email protected]>"]
edition = "2021"
license = "MIT OR Apache-2.0"
Expand All @@ -12,8 +12,8 @@ include = ["/src", "../LICENSE_MIT", "../LICENSE_APACHE", "../README.md", "local

[dependencies]
lazy_static = { workspace = true }
accessibility-scraper = { version = "0.0.11", features = ["main"], default-features = false, path = "../accessibility-scraper" }
accessibility-tree = { version = "0.0.11", path = "../accessibility-tree/victor" }
accessibility-scraper = { version = "0.0.12", features = ["main"], default-features = false, path = "../accessibility-scraper" }
accessibility-tree = { version = "0.0.12", path = "../accessibility-tree/victor" }
getrandom = { version = "0.2", features = ["js"] }
taffy = { version = "0.4.0" }
serde = { version = "1.0", features = ["derive"] }
Expand All @@ -31,12 +31,32 @@ rayon = { version = "1.10.0", optional = true }
crossbeam-channel = { version = "0.5.12", optional = true }
tokio = { version = "1.36.0", features = ["macros"], optional = true }
tokio-stream = { version = "0.1.15", optional = true }
spider = { version = "1.89.3", optional = true }

[features]
default = []
rayon = ["dep:rayon", "dep:crossbeam-channel"]
rayon_wasm = ["rayon/web_spin_lock"]
tokio = ["dep:tokio", "dep:tokio-stream", "accessibility-scraper/tokio", "accessibility-tree/tokio"]
spider = ["tokio", "dep:spider", "accessibility-scraper/spider"]
spider_ua_generator = ["tokio", "spider/ua_generator"]
spider_chrome = ["spider", "spider/chrome"]
spider_chrome_cpu = ["spider", "spider/chrome_cpu"]
spider_screenshot = ["spider", "spider/chrome_screenshot"]
spider_smart = ["spider", "spider/smart"]
spider_headed = ["spider", "spider/chrome_headed"]
spider_store_page = ["spider", "spider/chrome_store_page"]
spider_stealth = ["spider", "spider/chrome_stealth"]
spider_intercept = ["spider", "spider/chrome_intercept"]
spider_headless_new = ["spider", "spider/chrome_headless_new"]
spider_real_browser = ["spider", "spider/real_browser"]
spider_openai = ["spider", "spider/openai"]
spider_openai_slim_fit = ["spider", "spider/openai_slim_fit"]
spider_budget = ["spider", "spider/budget"]
spider_cache = ["spider", "spider/cache"]
spider_cache_mem = ["spider", "spider/cache_mem"]
spider_sitemap = ["tokio", "spider/sitemap"]
spider_control = ["tokio", "spider/control"]

[dev-dependencies]
maud = "0.25.0"
Expand Down
6 changes: 3 additions & 3 deletions accessibility-rs/src/engine/issue.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use serde::{Deserialize, Serialize};

/// clip bounding box
#[derive(Default, Debug, Serialize, Deserialize)]
#[derive(Default, Debug, Clone, Serialize, Deserialize)]
pub struct Clip {
/// the x coords
pub x: u32,
Expand All @@ -14,7 +14,7 @@ pub struct Clip {
}

/// Extra help information for the issue
#[derive(Default, Debug, Serialize, Deserialize)]
#[derive(Default, Debug, Clone, Serialize, Deserialize)]
pub struct RunnerExtras {
/// the url to get more information on the issue
pub help_url: &'static str,
Expand All @@ -25,7 +25,7 @@ pub struct RunnerExtras {
}

/// Details of the problem
#[derive(Default, Debug, Serialize, Deserialize)]
#[derive(Default, Debug, Clone, Serialize, Deserialize)]
pub struct Issue {
/// the context of the issue or raw html
pub context: String,
Expand Down
105 changes: 104 additions & 1 deletion accessibility-rs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@
//! let audit = audit(config).await;
//! println!("{:?}", audit);
//! }
//!
//! #[cfg(feature = "spider")]
//! #[tokio::main]
//! async fn main() {
//! let mut config = AuditConfig::default();
//! config.url = "https://example.com".into();
//! let audit = audit(config).await;
//! println!("{:?}", audit);
//! }
//! ```
//!
Expand All @@ -45,6 +54,9 @@ extern crate lazy_static;
#[macro_use]
extern crate rust_i18n;

#[cfg(feature = "spider")]
pub use spider;

/// the main engine for accessibility auditing.
pub mod engine;
/// locales for translations.
Expand Down Expand Up @@ -78,6 +90,9 @@ pub struct AuditConfig {
pub locale: String,
/// the guideline spec
pub conformance: Conformance,
/// crawl and perform audits on the entire website
#[cfg(feature = "spider")]
pub url: String,
}

/// configs for the audit
Expand All @@ -94,6 +109,9 @@ pub struct AuditConfig<'a> {
pub locale: &'a str,
/// the guideline spec
pub conformance: Conformance,
/// crawl and perform audits on the entire website
#[cfg(feature = "spider")]
pub url: &'a str,
}

#[cfg(not(feature = "tokio"))]
Expand All @@ -116,6 +134,29 @@ impl<'a> AuditConfig<'a> {
..Default::default()
}
}

/// Build an audit configuration that crawls and audits an entire website
/// starting from `url`. Only available when the 'spider' feature is enabled.
#[cfg(feature = "spider")]
pub fn new_website(url: &'a str, css: &'a str, bounding_box: bool, locale: &'a str) -> Self {
    let mut config = Self::default();
    config.url = url.into();
    config.css = css.into();
    config.bounding_box = bounding_box;
    config.locale = locale.into();
    config
}

/// a new audit configuration crawling a website. This does nothing without the 'spider' flag enabled.
#[cfg(not(feature = "spider"))]
pub fn new_website(
_url: &'a str,
_css: &'a str,
_bounding_box: bool,
_locale: &'a str,
) -> Self {
AuditConfig::default()
}
}

#[cfg(feature = "tokio")]
Expand All @@ -138,16 +179,78 @@ impl AuditConfig {
..Default::default()
}
}

/// Build an audit configuration that crawls and audits an entire website
/// starting from `url`. Only available when the 'spider' feature is enabled.
#[cfg(feature = "spider")]
pub fn new_website(url: &str, css: &str, bounding_box: bool, locale: &str) -> Self {
    let mut config = Self::default();
    config.url = url.into();
    config.css = css.into();
    config.bounding_box = bounding_box;
    config.locale = locale.into();
    config
}
/// a new audit configuration crawling a website. This does nothing without the 'spider' flag enabled.
#[cfg(not(feature = "spider"))]
pub fn new_website(_url: &str, _css: &str, _bounding_box: bool, _locale: &str) -> Self {
AuditConfig::default()
}
}

/// audit a web page passing the html and css rules.
#[cfg(feature = "tokio")]
// Single-document audit path: active when `tokio` is on but `spider` is not
// (the `spider` build replaces this with the crawling variant below).
#[cfg(all(feature = "tokio", not(feature = "spider")))]
pub async fn audit(config: AuditConfig) -> Vec<Issue> {
// Parse the provided raw HTML into a queryable document (async parser under `tokio`).
let document = accessibility_scraper::Html::parse_document(&config.html).await;
// Auditor borrows the parsed document; CSS, bounding-box flag, and locale come from the config.
let auditor = Auditor::new(&document, &config.css, config.bounding_box, &config.locale);
// Run the WCAG AAA rule engine and return every issue found.
engine::audit::wcag::WCAGAAA::audit(auditor).await
}

/// The accessibility audit results for either a single page or an entire website.
#[cfg(feature = "spider")]
#[derive(Debug, Clone)]
pub enum AuditResults {
/// Issues for every crawled page of a website, keyed by page URL.
Page(spider::hashbrown::HashMap<String, Vec<Issue>>),
/// Issues for one raw HTML document audited directly.
Html(Vec<Issue>),
}

/// Audit a web page or an entire website.
///
/// When `config.url` is non-empty the site is crawled with `spider` and each
/// streamed page is audited as it arrives, returning `AuditResults::Page`.
/// Otherwise `config.html`/`config.css` are audited directly, returning
/// `AuditResults::Html`.
#[cfg(feature = "spider")]
pub async fn audit(config: AuditConfig) -> AuditResults {
if !config.url.is_empty() {
use spider::website::Website;
let mut website: Website = Website::new(&config.url);
// Subscribe BEFORE crawling so no pages are missed by the audit task.
// NOTE(review): `subscribe` returns an Option and this unwrap will panic
// if a subscription cannot be created — consider `expect` with a message.
let mut rx2: tokio::sync::broadcast::Receiver<spider::page::Page> =
website.subscribe(16).unwrap();
// Copy/move these out of `config` so the spawned task can own them.
let bounding_box = config.bounding_box;
let locale = config.locale;

// Consumer task: audits pages concurrently while the crawl produces them.
let audits = tokio::spawn(async move {
let mut issues: spider::hashbrown::HashMap<String, Vec<Issue>> =
spider::hashbrown::HashMap::new();

// Loop ends when the broadcast sender side is dropped (unsubscribe below).
while let Ok(res) = rx2.recv().await {
let document = accessibility_scraper::Html::parse_document(&res.get_html()).await;
// NOTE(review): `config.css` is not forwarded here — crawled pages are
// audited with an empty extra stylesheet; confirm this is intended.
let auditor = Auditor::new(&document, &"", bounding_box, &locale);
let issue = engine::audit::wcag::WCAGAAA::audit(auditor).await;
issues.insert(res.get_url().into(), issue);
}

issues
});

// Drive the crawl to completion, then close the channel so the task's
// recv loop exits and the collected issues can be joined.
website.crawl().await;
website.unsubscribe();
// A panicked audit task yields an empty result map instead of propagating.
AuditResults::Page(audits.await.unwrap_or_default())
} else {
// No URL configured: fall back to auditing the supplied raw markup.
let document = accessibility_scraper::Html::parse_document(&config.html).await;
let auditor = Auditor::new(&document, &config.css, config.bounding_box, &config.locale);
AuditResults::Html(engine::audit::wcag::WCAGAAA::audit(auditor).await)
}
}

/// audit a web page passing the html and css rules.
#[cfg(not(feature = "tokio"))]
pub fn audit(config: AuditConfig) -> Vec<Issue> {
Expand Down
9 changes: 9 additions & 0 deletions accessibility-rs/tests/integration_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,12 @@ async fn _audit_xlarge() {
.await;
println!("{:?}", report)
}

// Smoke test: crawl a small real website and print the sitewide audit report.
// Requires network access; gated behind the `spider` feature.
#[cfg(feature = "spider")]
#[spider::tokio::test]
async fn _audit_website() {
    let mut audit_config = AuditConfig::default();
    // The `spider` feature enables `tokio`, whose `AuditConfig` stores
    // `url: String`; `.into()` converts the literal and is also a no-op
    // against the borrowed `&str` variant of the config.
    audit_config.url = "https://choosealicense.com".into();
    let report = accessibility_rs::audit(audit_config).await;
    println!("{:?}", report)
}
2 changes: 2 additions & 0 deletions accessibility-rs/tests/mocks/mock.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#![allow(dead_code)]

/// mock website data from drake.com
pub static MOCK_WEBSITE_HTML: &'static str = r###"
<html class="no-js" lang="en"><!--<![endif]--><head><style>.hs-cta-wrapper p, .hs-cta-wrapper div { margin: 0; padding: 0; } a#cta_button_5479613_e9c3ff65-5303-42df-b466-cd64a84bfc67 {
Expand Down
7 changes: 4 additions & 3 deletions accessibility-scraper/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "accessibility-scraper"
version = "0.0.11"
version = "0.0.12"
edition = "2021"
description = "HTML parsing and querying with CSS selectors with CSS binding styles to elements."
keywords = ["html", "css", "selector", "scraping"]
Expand All @@ -14,7 +14,7 @@ readme = "README.md"
[dependencies]
cssparser = { workspace = true }
ego-tree = { workspace = true }
html5ever = { workspace = true }
fast_html5ever = { workspace = true }
selectors = { workspace = true }
smallvec = { workspace = true }
tendril = "0.4.3"
Expand All @@ -34,4 +34,5 @@ deterministic = ["indexmap"]
main = ["getopts"]
atomic = []
errors = []
tokio = ["dep:tokio", "dep:tokio-stream"]
tokio = ["dep:tokio", "dep:tokio-stream"]
spider = []
2 changes: 1 addition & 1 deletion accessibility-scraper/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ assert_eq!(vec!["Hello, ", "world!"], text);
### Manipulating the DOM

```rust
use html5ever::tree_builder::TreeSink;
use fast_html5ever::tree_builder::TreeSink;
use accessibility-scraper::{Html, Selector};

let html = "<html><body>hello<p class=\"hello\">REMOVE ME</p></body></html>";
Expand Down
2 changes: 1 addition & 1 deletion accessibility-scraper/src/element_ref/element.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use super::ElementRef;
use crate::selector::{CssLocalName, CssString, NonTSPseudoClass, PseudoElement, Simple};
use html5ever::{LocalName, Namespace};
use fast_html5ever::{LocalName, Namespace};
use selectors::{
attr::{AttrSelectorOperation, CaseSensitivity, NamespaceConstraint},
matching, Element, OpaqueElement,
Expand Down
2 changes: 1 addition & 1 deletion accessibility-scraper/src/element_ref/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::ops::Deref;

use ego_tree::iter::{Edge, Traverse};
use ego_tree::NodeRef;
use html5ever::serialize::{serialize, SerializeOpts, TraversalScope};
use fast_html5ever::serialize::{serialize, SerializeOpts, TraversalScope};

use crate::node::Element;
use crate::{Node, Selector};
Expand Down
2 changes: 1 addition & 1 deletion accessibility-scraper/src/element_ref/serializable.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::io::Error;

use html5ever::serialize::{Serialize, Serializer, TraversalScope};
use fast_html5ever::serialize::{Serialize, Serializer, TraversalScope};

use crate::ElementRef;

Expand Down
Loading

0 comments on commit 13211e0

Please sign in to comment.