diff --git a/.snyk b/.snyk
new file mode 100644
index 00000000..8cb541f9
--- /dev/null
+++ b/.snyk
@@ -0,0 +1,4 @@
+# Snyk (https://snyk.io) policy file, patches or ignores known vulnerabilities.
+version: v1.14.1
+ignore: {}
+patch: {}
diff --git a/README.md b/README.md
index 8fc178ed..d8276ecc 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,7 @@ Append columns to each row with metadata about the scan itself, such as how long
 * `trustymail`: The `trustymail` command, available from the [`trustymail`](https://github.com/dhs-ncats/trustymail) Python package from the [Department of Homeland Security's NCATS team](https://github.com/dhs-ncats). (Override path by setting the `TRUSTYMAIL_PATH` environment variable.)
 * `third_parties` - What third party web services are in use, using [headless Chrome](https://developers.google.com/web/updates/2017/04/headless-chrome) to trap outgoing requests. (See documentation for [using](#headless-chrome) or [writing](#developing-chrome-scanners) Chrome-based scanners.)
 * `a11y` - Accessibility issues, using [`pa11y`](https://github.com/pa11y/pa11y).
+* `lighthouse` - Scanner that runs [`Google Lighthouse`](https://developers.google.com/web/tools/lighthouse).
 * `noop` - Test scanner (no-op) used for development and debugging. Does nothing.
 
 ### Parallelization
diff --git a/lighthouse.js b/lighthouse.js
new file mode 100644
index 00000000..a4089ab8
--- /dev/null
+++ b/lighthouse.js
@@ -0,0 +1,65 @@
+#!/usr/bin/env node
+
+/**
+ * Lighthouse scanner
+ * This module orchestrates parallel Lighthouse scans over one headless Chrome
+ * instance.
+ */
+
+const chromeLauncher = require('chrome-launcher');
+const lighthouse = require('lighthouse');
+const puppeteer = require('puppeteer');
+
+
+var getBrowser = async () => {
+  return await puppeteer.launch({
+    // TODO: Let executable path be overrideable.
+    // executablePath: config.executablePath,
+    headless: true,
+    ignoreHTTPSErrors: true,
+    args: [
+      '--no-sandbox',
+      '--disable-gpu',
+      '--single-process'
+    ]
+  });
+};
+
+function launchChromeAndRunLighthouse(url, opts, config = null) {
+  return chromeLauncher.launch({chromeFlags: opts.chromeFlags}).then(chrome => {
+    opts.port = chrome.port;
+    return lighthouse(url, opts, config).then(results => {
+      return chrome.kill().then(() => results.lhr)
+    });
+  });
+}
+
+const opts = {
+  chromeFlags: ['--headless', '--no-sandbox']
+};
+
+/*
+function configure() {
+  let port;
+  chromeLauncher.launch({
+    chromeFlags: ['--headless', '--no-sandbox']
+  }).then(chrome => {
+    port = chrome.port
+  });
+}
+
+launchChromeAndRunLighthouse('https://www.whitehouse.gov', opts).then(results => {
+  console.log(results);
+});
+*/
+
+getBrowser().then(async browser => {
+  const url = 'https://www.whitehouse.gov';
+  const {lhr} = await lighthouse(url, {
+    port: (new URL(browser.wsEndpoint())).port,
+    output: 'json',
+    logLevel: 'info',
+  });
+  console.log(lhr);
+  await browser.close();
+});
diff --git a/package.json b/package.json
index d5de209c..15068a53 100644
--- a/package.json
+++ b/package.json
@@ -2,7 +2,8 @@
   "name": "domain-scan-headless-lambda",
   "description": "Dependencies for building Lambda containers for headless Chrome in domain-scan.",
   "dependencies": {
-    "puppeteer": "^2.0.0",
+    "lighthouse": "^5.6.0",
+    "puppeteer": "^2.1.1",
     "tar": "^5.0.5"
   }
 }
diff --git a/scanners/lighthouse.js b/scanners/lighthouse.js
new file mode 100644
index 00000000..6174d0a6
--- /dev/null
+++ b/scanners/lighthouse.js
@@ -0,0 +1,50 @@
+'use strict';
+
+const lighthouse = require('lighthouse');
+
+
+const LIGHTHOUSE_AUDITS = [
+  'color-contrast',
+  'font-size',
+  'image-alt',
+  'input-image-alt',
+  'performance-budget',
+  'speed-index',
+  'tap-targets',
+  'timing-budget',
+  'total-byte-weight',
+  'unminified-css',
+  'unminified-javascript',
+  'uses-text-compression',
+  'viewport',
+]
+
+
+// JS entry point for Lighthouse scan.
+module.exports = {
+  scan: async (domain, environment, options, browser, page) => {
+    const url = 'https://' + domain;
+    try {
+      const output = await lighthouse(url, {
+        port: (new URL(browser.wsEndpoint())).port,
+        onlyAudits: LIGHTHOUSE_AUDITS,
+
+        disableStorageReset: false,
+        saveAssets: false,
+        listAllAudits: false,
+        listTraceCategories: false,
+        printConfig: false,
+        output: [ 'json' ],
+        chromeFlags: '',
+        enableErrorReporting: false,
+        logLevel: 'silent',
+        outputPath: 'stdout',
+      });
+      return output.lhr.audits;
+
+    } catch (exc) {
+      console.log('problem scanning ' + domain + ' ' + exc.message);
+      return null;
+    }
+  }
+}
diff --git a/scanners/lighthouse.py b/scanners/lighthouse.py
new file mode 100644
index 00000000..7be7c3ef
--- /dev/null
+++ b/scanners/lighthouse.py
@@ -0,0 +1,128 @@
+"""
+Implements a Google Lighthouse scan.
+
+https://developers.google.com/web/tools/lighthouse
+
+To use, set the `LIGHTHOUSE_PATH` environment variable to the Lighthouse path.
+"""
+
+import os
+
+
+# Can also be run in Lambda.
+# NOTE: untested
+lambda_support = False
+
+# Signal that this is a JS-based scan using headless Chrome.
+# The scan method will be defined in lighthouse.js instead.
+scan_headless = True
+
+LIGHTHOUSE_PATH = os.environ.get('LIGHTHOUSE_PATH', 'lighthouse')
+LIGHTHOUSE_AUDITS = [
+    'color-contrast',
+    'font-size',
+    'image-alt',
+    'input-image-alt',
+    'performance-budget',
+    'tap-targets',
+    'timing-budget',
+    'total-byte-weight',
+    'unminified-css',
+    'unminified-javascript',
+    'uses-text-compression',
+    'viewport',
+    'speed-index',
+]
+CHROME_PATH = os.environ.get('CHROME_PATH')
+
+
+# Set a default number of workers for a particular scan type.
+# Overridden by a --workers flag.
+workers = 1
+
+
+# Required CSV row conversion function. Usually one row, can be more.
+#
+# Run locally.
+def to_rows(data):
+    return [[
+        audit['id'],
+        audit['description'],
+        audit['title'],
+        audit['score'],
+        audit['scoreDisplayMode']
+    ] for name, audit in data.items() if name != 'error']
+
+
+# CSV headers for each row of data. Referenced locally.
+headers = ['ID', 'Description', 'Title', 'Score', 'Score Display Mode']
+
+
+#
+# Below is an implementation that will spawn Lighthouse via its cli rather than
+# use a Puppeteer-managed headless Chrome.
+#
+
+# def _url_for_domain(domain: str, cache_dir: str):
+#     if domain.startswith('http://') or domain.startswith('https://'):
+#         return domain
+
+#     # If we have data from pshtt, use the canonical endpoint.
+#     canonical = utils.domain_canonical(domain, cache_dir=cache_dir)
+#     if canonical:
+#         return canonical
+
+#     # Otherwise, well, whatever.
+#     return 'http://' + domain
+
+# Required scan function. This is the meat of the scanner, where things
+# that use the network or are otherwise expensive would go.
+#
+# Runs locally or in the cloud (Lambda).
+# def scan(domain: str, environment: dict, options: dict) -> dict:
+#     logging.debug('Scan function called with options: %s', options)
+
+#     cache_dir = options.get('_', {}).get('cache_dir', './cache')
+
+#     url = _url_for_domain(domain, cache_dir)
+#     lighthouse_cmd = ' '.join([
+#         LIGHTHOUSE_PATH,
+#         url,
+#         '--quiet',
+#         '--output=json',
+#         '--chrome-flags="--headless --no-sandbox"',
+#         *(f'--only-audits={audit}' for audit in LIGHTHOUSE_AUDITS),
+#     ])
+
+#     logging.info('Running Lighthouse CLI...')
+
+#     try:
+#         response = subprocess.check_output(
+#             lighthouse_cmd,
+#             stderr=subprocess.STDOUT,
+#             shell=True, env=None
+#         )
+#         raw = str(response, encoding='UTF-8')
+#         logging.info('Done running Lighthouse CLI')
+#         return json.loads(raw)['audits']
+#     except subprocess.CalledProcessError:
+#         logging.warning("Error running Lighthouse scan for URL %s." % url)
+#         return {}
+
+# TODO: Add ability to override default LIGHTHOUSE_AUDITS
+# Optional handler for custom CLI parameters. Takes the args (as a list of
+# strings) and returns a dict of the options values and names that the scanner
+# expects, and a list of the arguments it didn't know how to parse.
+#
+# Should return a dict of the options parsed by this parser (not a mutated form
+# of the opts that are passed to it) and a list of the remaining args that it
+# didn't recognize.
+# def handle_scanner_args(args, opts) -> Tuple[dict, list]:
+#     parser = ArgumentParser(prefix_chars='--')
+#     parser.add_argument('--noop-delay', nargs=1)
+#     parsed, unknown = parser.parse_known_args(args)
+#     dicted = vars(parsed)
+#     should_be_single = ['noop_delay']
+#     dicted = make_values_single(dicted, should_be_single)
+#     dicted['noop_delay'] = int(dicted['noop_delay'], 10)
+#     return dicted, unknown
diff --git a/utils/utils.py b/utils/utils.py
index 3722df0b..0fe73516 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -416,12 +416,12 @@ def try_command(command):
     return False
 
 
-def scan(command, env=None, allowed_return_codes=[]):
+def scan(command, env=None, allowed_return_codes=[], shell=False):
     try:
         response = subprocess.check_output(
             command,
             stderr=subprocess.STDOUT,
-            shell=False, env=env
+            shell=shell, env=env
         )
         return str(response, encoding='UTF-8')
     except subprocess.CalledProcessError as exc:
@@ -707,5 +707,5 @@ def suffix_pattern(suffixes):
     return re.compile("(?:%s)$" % center)
 
 
-def flatten(l):
-    return list(chain.from_iterable(l))
+def flatten(lst):
+    return list(chain.from_iterable(lst))