From 47aca99a0a04872fbca917723198ede7b43bc613 Mon Sep 17 00:00:00 2001 From: Daniel Naab Date: Thu, 14 May 2020 17:17:57 -0500 Subject: [PATCH] Add in a first-pass at a node.js-based scanner. --- .snyk | 4 ++ lighthouse.js | 65 ++++++++++++++++++++ scanners/lighthouse.js | 51 ++++++++++++++++ scanners/lighthouse.py | 135 ++++++++++++++++++----------------------- 4 files changed, 178 insertions(+), 77 deletions(-) create mode 100644 .snyk create mode 100644 lighthouse.js create mode 100644 scanners/lighthouse.js diff --git a/.snyk b/.snyk new file mode 100644 index 00000000..8cb541f9 --- /dev/null +++ b/.snyk @@ -0,0 +1,4 @@ +# Snyk (https://snyk.io) policy file, patches or ignores known vulnerabilities. +version: v1.14.1 +ignore: {} +patch: {} diff --git a/lighthouse.js b/lighthouse.js new file mode 100644 index 00000000..a4089ab8 --- /dev/null +++ b/lighthouse.js @@ -0,0 +1,65 @@ +#!/usr/bin/env node + +/** + * Lighthouse scanner + * This module orchestrates parallel Lighthouse scans over one headless Chrome + * instance. + */ + +const chromeLauncher = require('chrome-launcher'); +const lighthouse = require('lighthouse'); +const puppeteer = require('puppeteer'); + + +var getBrowser = async () => { + return await puppeteer.launch({ + // TODO: Let executable path be overrideable. + // executablePath: config.executablePath, + headless: true, + ignoreHTTPSErrors: true, + args: [ + '--no-sandbox', + '--disable-gpu', + '--single-process' + ] + }); +}; + +function launchChromeAndRunLighthouse(url, opts, config = null) { + return chromeLauncher.launch({chromeFlags: opts.chromeFlags}).then(chrome => { + opts.port = chrome.port; + return lighthouse(url, opts, config).then(results => { + return chrome.kill().then(() => results.lhr) + }); + }); +} + +const opts = { + chromeFlags: ['--headless', '--no-sandbox'] +}; + +/* +function configure() { + let port; + chromeLauncher.launch({ + chromeFlags: ['--headless', '--no-sandbox'] + }).then(chrome => { + port = chrome.port + }); +} + +launchChromeAndRunLighthouse('https://www.whitehouse.gov', opts).then(results => { + console.log(results); +}); +*/ + +getBrowser().then(async browser => { + const url = 'https://www.whitehouse.gov'; + const {lhr} = await lighthouse(url, { + port: (new URL(browser.wsEndpoint())).port, + output: 'json', + logLevel: 'info', + }); + console.log(lhr); + await browser.close(); +}); diff --git a/scanners/lighthouse.js b/scanners/lighthouse.js new file mode 100644 index 00000000..e1b1ee2a --- /dev/null +++ b/scanners/lighthouse.js @@ -0,0 +1,51 @@ +'use strict'; + +const lighthouse = require('lighthouse'); + + +const LIGHTHOUSE_AUDITS = [ + 'color-contrast', + 'font-size', + 'image-alt', + 'input-image-alt', + 'performance-budget', + 'speed-index', + 'tap-targets', + 'timing-budget', + 'total-byte-weight', + 'unminified-css', + 'unminified-javascript', + 'uses-text-compression', + 'viewport', +] + + +// JS entry point for Lighthouse scan. +module.exports = { + scan: async (domain, environment, options, browser, page) => { + const url = 'https://' + domain; + try { + const output = await lighthouse(url, { + port: (new URL(browser.wsEndpoint())).port, + onlyAudits: LIGHTHOUSE_AUDITS, + + disableStorageReset: false, + saveAssets: false, + listAllAudits: false, + listTraceCategories: false, + printConfig: false, + output: [ 'json' ], + chromeFlags: '', + enableErrorReporting: false, + logLevel: 'silent', + outputPath: 'stdout', + }); + return output.lhr.audits; + + } catch (exc) { + return { + error: exc.message + } + } + } +} diff --git a/scanners/lighthouse.py b/scanners/lighthouse.py index 0eaa2bc9..5d9e0274 100644 --- a/scanners/lighthouse.py +++ b/scanners/lighthouse.py @@ -6,14 +6,16 @@ To use, set the `LIGHTHOUSE_PATH` environment variable to the Lighthouse path. """ - -import json -import logging import os -import subprocess -from utils import utils +# Can also be run in Lambda. +# NOTE: untested +lambda_support = False + +# Signal that this is a JS-based scan using headless Chrome. +# The scan method will be defined in lighthouse.js instead. +scan_headless = True LIGHTHOUSE_PATH = os.environ.get('LIGHTHOUSE_PATH', 'lighthouse') LIGHTHOUSE_AUDITS = [ @@ -39,78 +41,6 @@ workers = 1 -# Optional one-time initialization for all scans. -# If defined, any data returned will be passed to every scan instance and used -# to update the environment dict for that instance -# Will halt scan execution if it returns False or raises an exception. -# -# Run locally. -# def init(environment: dict, options: dict) -> dict: -# logging.debug("Init function.") - -# #cache_dir = options.get('_', {}).get('cache_dir', './cache') - -# return {'constant': 12345} - - -# Optional one-time initialization per-scan. If defined, any data -# returned will be passed to the instance for that domain and used to update -# the environment dict for that particular domain. -# -# Run locally. -# def init_domain(domain: str, environment: dict, options: dict) -> dict: -# logging.debug("Init function for %s." % domain) -# return {'variable': domain} - - -def _url_for_domain(domain: str, cache_dir: str): - if domain.startswith('http://') or domain.startswith('https://'): - return domain - - # If we have data from pshtt, use the canonical endpoint. - canonical = utils.domain_canonical(domain, cache_dir=cache_dir) - if canonical: - return canonical - - # Otherwise, well, whatever. - return 'http://' + domain - - -# Required scan function. This is the meat of the scanner, where things -# that use the network or are otherwise expensive would go. -# -# Runs locally or in the cloud (Lambda). -def scan(domain: str, environment: dict, options: dict) -> dict: - logging.debug('Scan function called with options: %s', options) - - cache_dir = options.get('_', {}).get('cache_dir', './cache') - - url = _url_for_domain(domain, cache_dir) - lighthouse_cmd = ' '.join([ - LIGHTHOUSE_PATH, - url, - '--quiet', - '--output=json', - '--chrome-flags="--headless --no-sandbox"', - *(f'--only-audits={audit}' for audit in LIGHTHOUSE_AUDITS), - ]) - - logging.info('Running Lighthouse CLI...') - - try: - response = subprocess.check_output( - lighthouse_cmd, - stderr=subprocess.STDOUT, - shell=True, env=None - ) - raw = str(response, encoding='UTF-8') - logging.info('Done running Lighthouse CLI') - return json.loads(raw)['audits'] - except subprocess.CalledProcessError: - logging.warning("Error running Lighthouse scan for URL %s." % url) - return {} - - # Required CSV row conversion function. Usually one row, can be more. # # Run locally. @@ -128,6 +58,57 @@ def to_rows(data): headers = ['ID', 'Description', 'Title', 'Score', 'Score Display Mode'] +# +# Below is an implementation that will spawn Lighthouse via its cli rather than +# use a Puppeteer-managed headless Chrome. +# + +# def _url_for_domain(domain: str, cache_dir: str): +# if domain.startswith('http://') or domain.startswith('https://'): +# return domain + +# # If we have data from pshtt, use the canonical endpoint. +# canonical = utils.domain_canonical(domain, cache_dir=cache_dir) +# if canonical: +# return canonical + +# # Otherwise, well, whatever. +# return 'http://' + domain + +# Required scan function. This is the meat of the scanner, where things +# that use the network or are otherwise expensive would go. +# +# Runs locally or in the cloud (Lambda). +# def scan(domain: str, environment: dict, options: dict) -> dict: +# logging.debug('Scan function called with options: %s', options) + +# cache_dir = options.get('_', {}).get('cache_dir', './cache') + +# url = _url_for_domain(domain, cache_dir) +# lighthouse_cmd = ' '.join([ +# LIGHTHOUSE_PATH, +# url, +# '--quiet', +# '--output=json', +# '--chrome-flags="--headless --no-sandbox"', +# *(f'--only-audits={audit}' for audit in LIGHTHOUSE_AUDITS), +# ]) + +# logging.info('Running Lighthouse CLI...') + +# try: +# response = subprocess.check_output( +# lighthouse_cmd, +# stderr=subprocess.STDOUT, +# shell=True, env=None +# ) +# raw = str(response, encoding='UTF-8') +# logging.info('Done running Lighthouse CLI') +# return json.loads(raw)['audits'] +# except subprocess.CalledProcessError: +# logging.warning("Error running Lighthouse scan for URL %s." % url) +# return {} + # TODO: Add ability to override default LIGHTHOUSE_AUDITS # Optional handler for custom CLI parameters. Takes the args (as a list of # strings) and returns a dict of the options values and names that the scanner