18F · danielnaab · Apr 20, 2020 · Apr 20, 2020 · Apr 21, 2020 · Apr 23, 2020
diff --git a/.snyk b/.snyk
@@ -0,0 +1,4 @@
+# Snyk (https://snyk.io) policy file, patches or ignores known vulnerabilities.
+version: v1.14.1
+ignore: {}
+patch: {}
diff --git a/README.md b/README.md
@@ -74,6 +74,7 @@ Append columns to each row with metadata about the scan itself, such as how long
 * `trustymail`: The `trustymail` command, available from the [`trustymail`](https://github.com/dhs-ncats/trustymail) Python package from the [Department of Homeland Security's NCATS team](https://github.com/dhs-ncats). (Override path by setting the `TRUSTYMAIL_PATH` environment variable.)
 * `third_parties` - What third party web services are in use, using [headless Chrome](https://developers.google.com/web/updates/2017/04/headless-chrome) to trap outgoing requests. (See documentation for [using](#headless-chrome) or [writing](#developing-chrome-scanners) Chrome-based scanners.)
 * `a11y` - Accessibility issues, using [`pa11y`](https://github.com/pa11y/pa11y).
+* `lighthouse` - Page quality audits (accessibility, performance, mobile-friendliness), using [Google Lighthouse](https://developers.google.com/web/tools/lighthouse) in headless Chrome.
 * `noop` - Test scanner (no-op) used for development and debugging. Does nothing.
 
 ### Parallelization

diff --git a/lighthouse.js b/lighthouse.js
@@ -0,0 +1,65 @@
+#!/usr/bin/env node
+
+/**
+ * Lighthouse scanner
+ * Standalone trial script: starts one headless Chrome instance via Puppeteer
+ * and runs a single Lighthouse scan against a hard-coded URL.
+ */
+
+const chromeLauncher = require('chrome-launcher');
+const lighthouse = require('lighthouse');
+const puppeteer = require('puppeteer');
+
+
+// Launches the headless Chrome instance (through Puppeteer) that Lighthouse
+// connects to; resolves to the Puppeteer browser handle.
+const getBrowser = async () => puppeteer.launch({
+  // TODO: Let executable path be overrideable.
+  // executablePath: config.executablePath,
+  headless: true,
+  ignoreHTTPSErrors: true,
+  args: [
+    '--no-sandbox',
+    '--disable-gpu',
+    '--single-process',
+  ],
+});
+
+// Runs Lighthouse against `url` in a freshly launched Chrome and resolves to
+// the Lighthouse result (`lhr`). Chrome is killed even if the audit rejects,
+// and the caller's `opts` is copied rather than mutated to carry the port.
+async function launchChromeAndRunLighthouse(url, opts, config = null) {
+  const chrome = await chromeLauncher.launch({chromeFlags: opts.chromeFlags});
+  try { return (await lighthouse(url, {...opts, port: chrome.port}, config)).lhr; }
+  finally { await chrome.kill(); }
+}
+
+const opts = {  // Only referenced by the commented-out example below.
+  chromeFlags: ['--headless', '--no-sandbox']
+};
+
+/*
+function configure() {
+  let port;
+  chromeLauncher.launch({
+    chromeFlags: ['--headless', '--no-sandbox']
+  }).then(chrome => {
+    port = chrome.port
+  });
+}
+
+launchChromeAndRunLighthouse('https://www.whitehouse.gov', opts).then(results => {
+  console.log(results);
+});
+*/
+
+// Demo run: audit one hard-coded URL via the Puppeteer browser's WebSocket port.
+getBrowser().then(async browser => {
+  try {
+    const port = (new URL(browser.wsEndpoint())).port;
+    const {lhr} = await lighthouse('https://www.whitehouse.gov', {port, output: 'json', logLevel: 'info'});
+    console.log(lhr);
+  } finally {
+    await browser.close();  // Always release Chrome, even if the scan rejects.
+  }
+}).catch(err => { console.error(err); process.exitCode = 1; });
diff --git a/package.json b/package.json
@@ -2,7 +2,8 @@
   "name": "domain-scan-headless-lambda",
   "description": "Dependencies for building Lambda containers for headless Chrome in domain-scan.",
   "dependencies": {
-    "puppeteer": "^2.0.0",
+    "lighthouse": "^5.6.0",
+    "puppeteer": "^2.1.1",
     "tar": "^5.0.5"
   }
 }
diff --git a/scanners/lighthouse.js b/scanners/lighthouse.js
@@ -0,0 +1,50 @@
+'use strict';
+
+const lighthouse = require('lighthouse');
+
+
+const LIGHTHOUSE_AUDITS = [  // Audit IDs handed to Lighthouse as `onlyAudits`.
+  'color-contrast',
+  'font-size',
+  'image-alt',
+  'input-image-alt',
+  'performance-budget',
+  'speed-index',
+  'tap-targets',
+  'timing-budget',
+  'total-byte-weight',
+  'unminified-css',
+  'unminified-javascript',
+  'uses-text-compression',
+  'viewport',
+];
+
+
+// JS entry point for Lighthouse scan: resolves to the report's `audits` object,
+// or to null (after logging the problem) when the scan throws.
+module.exports = {
+  scan: async (domain, environment, options, browser, page) => {
+    const target = `https://${domain}`;
+    try {
+      const flags = {
+        port: new URL(browser.wsEndpoint()).port,
+        onlyAudits: LIGHTHOUSE_AUDITS,
+        disableStorageReset: false,
+        saveAssets: false,
+        listAllAudits: false,
+        listTraceCategories: false,
+        printConfig: false,
+        output: [ 'json' ],
+        chromeFlags: '',
+        enableErrorReporting: false,
+        logLevel: 'silent',
+        outputPath: 'stdout',
+      };
+      const report = await lighthouse(target, flags);
+      return report.lhr.audits;
+    } catch (exc) {
+      console.log(`problem scanning ${domain} ${exc.message}`);
+      return null;
+    }
+  },
+};
diff --git a/scanners/lighthouse.py b/scanners/lighthouse.py
@@ -0,0 +1,128 @@
+"""
+Implements a Google Lighthouse scan.
+
+https://developers.google.com/web/tools/lighthouse
+
+The active scan runs in lighthouse.js; `LIGHTHOUSE_PATH` is only read by the commented-out CLI implementation below.
+"""
+
+import os
+
+
+# Can also be run in Lambda.
+# NOTE: untested
+lambda_support = False
+
+# Signal that this is a JS-based scan using headless Chrome.
+# The scan method will be defined in lighthouse.js instead.
+scan_headless = True
+
+LIGHTHOUSE_PATH = os.environ.get('LIGHTHOUSE_PATH', 'lighthouse')
+LIGHTHOUSE_AUDITS = [
+    'color-contrast',
+    'font-size',
+    'image-alt',
+    'input-image-alt',
+    'performance-budget',
+    'tap-targets',
+    'timing-budget',
+    'total-byte-weight',
+    'unminified-css',
+    'unminified-javascript',
+    'uses-text-compression',
+    'viewport',
+    'speed-index',
+]
+CHROME_PATH = os.environ.get('CHROME_PATH')
+
+
+# Set a default number of workers for a particular scan type.
+# Overridden by a --workers flag.
+workers = 1
+
+
+# Required CSV row conversion function. Produces one CSV row per audit in
+# `data`, skipping any entry stored under the 'error' key.
+#
+# Run locally.
+def to_rows(data):
+    columns = ('id', 'description', 'title', 'score', 'scoreDisplayMode')
+    rows = []
+    for name, audit in data.items():
+        if name != 'error':
+            rows.append([audit[column] for column in columns])
+    return rows
+
+
+# CSV headers for each row of data. Referenced locally.
+headers = ['ID', 'Description', 'Title', 'Score', 'Score Display Mode']
+
+
+#
+# Below is an implementation that will spawn Lighthouse via its cli rather than
+# use a Puppeteer-managed headless Chrome.
+#
+
+# def _url_for_domain(domain: str, cache_dir: str):
+#     if domain.startswith('http://') or domain.startswith('https://'):
+#         return domain
+
+#     # If we have data from pshtt, use the canonical endpoint.
+#     canonical = utils.domain_canonical(domain, cache_dir=cache_dir)
+#     if canonical:
+#         return canonical
+
+#     # Otherwise, well, whatever.
+#     return 'http://' + domain
+
+# Required scan function. This is the meat of the scanner, where things
+# that use the network or are otherwise expensive would go.
+#
+# Runs locally or in the cloud (Lambda).
+# def scan(domain: str, environment: dict, options: dict) -> dict:
+#     logging.debug('Scan function called with options: %s', options)
+
+#     cache_dir = options.get('_', {}).get('cache_dir', './cache')
+
+#     url = _url_for_domain(domain, cache_dir)
+#     lighthouse_cmd = ' '.join([
+#         LIGHTHOUSE_PATH,
+#         url,
+#         '--quiet',
+#         '--output=json',
+#         '--chrome-flags="--headless --no-sandbox"',
+#         *(f'--only-audits={audit}' for audit in LIGHTHOUSE_AUDITS),
+#     ])
+
+#     logging.info('Running Lighthouse CLI...')
+
+#     try:
+#         response = subprocess.check_output(
+#             lighthouse_cmd,
+#             stderr=subprocess.STDOUT,
+#             shell=True, env=None
+#         )
+#         raw = str(response, encoding='UTF-8')
+#         logging.info('Done running Lighthouse CLI')
+#         return json.loads(raw)['audits']
+#     except subprocess.CalledProcessError:
+#         logging.warning("Error running Lighthouse scan for URL %s." % url)
+#         return {}
+
+# TODO: Add ability to override default LIGHTHOUSE_AUDITS
+# Optional handler for custom CLI parameters. Takes the args (as a list of
+# strings) and returns a dict of the options values and names that the scanner
+# expects, and a list of the arguments it didn't know how to parse.
+#
+# Should return a dict of the options parsed by this parser (not a mutated form
+# of the opts that are passed to it) and a list of the remaining args that it
+# didn't recognize.
+# def handle_scanner_args(args, opts) -> Tuple[dict, list]:
+#     parser = ArgumentParser(prefix_chars='--')
+#     parser.add_argument('--noop-delay', nargs=1)
+#     parsed, unknown = parser.parse_known_args(args)
+#     dicted = vars(parsed)
+#     should_be_single = ['noop_delay']
+#     dicted = make_values_single(dicted, should_be_single)
+#     dicted['noop_delay'] = int(dicted['noop_delay'], 10)
+#     return dicted, unknown
diff --git a/utils/utils.py b/utils/utils.py
@@ -416,12 +416,12 @@ def try_command(command):
         return False
 
 
-def scan(command, env=None, allowed_return_codes=[]):
+def scan(command, env=None, allowed_return_codes=[], shell=False):
     try:
         response = subprocess.check_output(
             command,
             stderr=subprocess.STDOUT,
-            shell=False, env=env
+            shell=shell, env=env
         )
         return str(response, encoding='UTF-8')
     except subprocess.CalledProcessError as exc:
@@ -707,5 +707,5 @@ def suffix_pattern(suffixes):
     return re.compile("(?:%s)$" % center)
 
 
-def flatten(l):
-    return list(chain.from_iterable(l))
+def flatten(lst):  # Concatenate an iterable of iterables into one flat list.
+    return [*chain.from_iterable(lst)]