From 47aca99a0a04872fbca917723198ede7b43bc613 Mon Sep 17 00:00:00 2001
From: Daniel Naab <dnaab@flexion.us>
Date: Thu, 14 May 2020 17:17:57 -0500
Subject: [PATCH] Add in a first-pass at a node.js-based scanner.

---
 .snyk                  |   4 ++
 lighthouse.js          |  65 ++++++++++++++++++++
 scanners/lighthouse.js |  51 ++++++++++++++++
 scanners/lighthouse.py | 135 ++++++++++++++++++-----------------------
 4 files changed, 178 insertions(+), 77 deletions(-)
 create mode 100644 .snyk
 create mode 100644 lighthouse.js
 create mode 100644 scanners/lighthouse.js

diff --git a/.snyk b/.snyk
new file mode 100644
index 00000000..8cb541f9
--- /dev/null
+++ b/.snyk
@@ -0,0 +1,4 @@
+# Snyk (https://snyk.io) policy file, patches or ignores known vulnerabilities.
+version: v1.14.1
+ignore: {}
+patch: {}
diff --git a/lighthouse.js b/lighthouse.js
new file mode 100644
index 00000000..a4089ab8
--- /dev/null
+++ b/lighthouse.js
@@ -0,0 +1,65 @@
+#!/usr/bin/env node
+
+/**
+ * Lighthouse scanner
+ * Proof of concept: runs a single Lighthouse scan against a hard-coded URL
+ * over one Puppeteer-launched headless Chrome instance.
+ */
+
+const chromeLauncher = require('chrome-launcher');
+const lighthouse = require('lighthouse');
+const puppeteer = require('puppeteer');
+
+
+// Launches the Puppeteer-managed headless Chrome that Lighthouse attaches to.
+// Resolves with the Puppeteer browser object.
+const getBrowser = async () => puppeteer.launch({
+  // TODO: Let executable path be overridable.
+  // executablePath: config.executablePath,
+  headless: true,
+  ignoreHTTPSErrors: true,
+  args: [
+    '--no-sandbox',
+    '--disable-gpu',
+    '--single-process',
+  ],
+});
+
+// Runs Lighthouse in a throwaway Chrome and resolves with the report (lhr).
+async function launchChromeAndRunLighthouse(url, opts, config = null) {
+  const chrome = await chromeLauncher.launch({chromeFlags: opts.chromeFlags});
+  try {
+    // Pass a copy so the caller's opts object is not mutated with the port.
+    return (await lighthouse(url, {...opts, port: chrome.port}, config)).lhr;
+  } finally { await chrome.kill(); }  // kill Chrome even if the scan rejects
+}
+
+// Chrome flags handed to launchChromeAndRunLighthouse via `opts.chromeFlags`;
+// only the commented-out call below passes this object.
+const opts = {chromeFlags: ['--headless', '--no-sandbox']};
+
+/*
+function configure() {
+  let port;
+  chromeLauncher.launch({
+    chromeFlags: ['--headless', '--no-sandbox']
+  }).then(chrome => {
+    port = chrome.port
+  });
+}
+
+launchChromeAndRunLighthouse('https://www.whitehouse.gov', opts).then(results => {
+  console.log(results);
+});
+*/
+
+// Demo run: audit one hard-coded URL and print the report (lhr) to stdout.
+getBrowser().then(async browser => {
+  try {
+    const port = (new URL(browser.wsEndpoint())).port;
+    const flags = {port, output: 'json', logLevel: 'info'};
+    console.log((await lighthouse('https://www.whitehouse.gov', flags)).lhr);
+  } finally {
+    await browser.close();  // release Chrome even when the scan rejects
+  }
+});
diff --git a/scanners/lighthouse.js b/scanners/lighthouse.js
new file mode 100644
index 00000000..e1b1ee2a
--- /dev/null
+++ b/scanners/lighthouse.js
@@ -0,0 +1,51 @@
+'use strict';
+
+const lighthouse = require('lighthouse');
+
+
+const LIGHTHOUSE_AUDITS = [
+  'color-contrast',
+  'font-size',
+  'image-alt',
+  'input-image-alt',
+  'performance-budget',
+  'speed-index',
+  'tap-targets',
+  'timing-budget',
+  'total-byte-weight',
+  'unminified-css',
+  'unminified-javascript',
+  'uses-text-compression',
+  'viewport',
+];
+
+
+// JS entry point for Lighthouse scan.
+module.exports = {
+  // Audits `domain` through the running headless Chrome `browser`; resolves with
+  // Lighthouse's `audits` map, or `{error: <message>}` if anything throws.
+  scan: async (domain, environment, options, browser, page) => {
+    // Keep a scheme the caller already supplied; otherwise default to https.
+    const url = /^https?:\/\//i.test(domain) ? domain : `https://${domain}`;
+    try {
+      const output = await lighthouse(url, {
+        // Attach to the existing browser via the port of its websocket endpoint.
+        port: (new URL(browser.wsEndpoint())).port,
+        onlyAudits: LIGHTHOUSE_AUDITS,
+        disableStorageReset: false,
+        saveAssets: false,
+        listAllAudits: false,
+        listTraceCategories: false,
+        printConfig: false,
+        output: [ 'json' ],
+        chromeFlags: '',
+        enableErrorReporting: false,
+        logLevel: 'silent',
+        outputPath: 'stdout',
+      });
+      return output.lhr.audits;
+    } catch (exc) {
+      return {error: exc.message};
+    }
+  },
+};
diff --git a/scanners/lighthouse.py b/scanners/lighthouse.py
index 0eaa2bc9..5d9e0274 100644
--- a/scanners/lighthouse.py
+++ b/scanners/lighthouse.py
@@ -6,14 +6,16 @@
 To use, set the `LIGHTHOUSE_PATH` environment variable to the Lighthouse path.
 """
 
-
-import json
-import logging
 import os
-import subprocess
 
-from utils import utils
 
+# Whether this scanner can also be run in Lambda.
+# NOTE: untested, so left disabled.
+lambda_support = False
+
+# Signal that this is a JS-based scan using headless Chrome.
+# The scan method is defined in scanners/lighthouse.js instead of this module.
+scan_headless = True
 
 LIGHTHOUSE_PATH = os.environ.get('LIGHTHOUSE_PATH', 'lighthouse')
 LIGHTHOUSE_AUDITS = [
@@ -39,78 +41,6 @@
 workers = 1
 
 
-# Optional one-time initialization for all scans.
-# If defined, any data returned will be passed to every scan instance and used
-# to update the environment dict for that instance
-# Will halt scan execution if it returns False or raises an exception.
-#
-# Run locally.
-# def init(environment: dict, options: dict) -> dict:
-#     logging.debug("Init function.")
-
-#     #cache_dir = options.get('_', {}).get('cache_dir', './cache')
-
-#     return {'constant': 12345}
-
-
-# Optional one-time initialization per-scan. If defined, any data
-# returned will be passed to the instance for that domain and used to update
-# the environment dict for that particular domain.
-#
-# Run locally.
-# def init_domain(domain: str, environment: dict, options: dict) -> dict:
-#     logging.debug("Init function for %s." % domain)
-#     return {'variable': domain}
-
-
-def _url_for_domain(domain: str, cache_dir: str):
-    if domain.startswith('http://') or domain.startswith('https://'):
-        return domain
-
-    # If we have data from pshtt, use the canonical endpoint.
-    canonical = utils.domain_canonical(domain, cache_dir=cache_dir)
-    if canonical:
-        return canonical
-
-    # Otherwise, well, whatever.
-    return 'http://' + domain
-
-
-# Required scan function. This is the meat of the scanner, where things
-# that use the network or are otherwise expensive would go.
-#
-# Runs locally or in the cloud (Lambda).
-def scan(domain: str, environment: dict, options: dict) -> dict:
-    logging.debug('Scan function called with options: %s', options)
-
-    cache_dir = options.get('_', {}).get('cache_dir', './cache')
-
-    url = _url_for_domain(domain, cache_dir)
-    lighthouse_cmd = ' '.join([
-        LIGHTHOUSE_PATH,
-        url,
-        '--quiet',
-        '--output=json',
-        '--chrome-flags="--headless --no-sandbox"',
-        *(f'--only-audits={audit}' for audit in LIGHTHOUSE_AUDITS),
-    ])
-
-    logging.info('Running Lighthouse CLI...')
-
-    try:
-        response = subprocess.check_output(
-            lighthouse_cmd,
-            stderr=subprocess.STDOUT,
-            shell=True, env=None
-        )
-        raw = str(response, encoding='UTF-8')
-        logging.info('Done running Lighthouse CLI')
-        return json.loads(raw)['audits']
-    except subprocess.CalledProcessError:
-        logging.warning("Error running Lighthouse scan for URL %s." % url)
-        return {}
-
-
 # Required CSV row conversion function. Usually one row, can be more.
 #
 # Run locally.
@@ -128,6 +58,57 @@ def to_rows(data):
 headers = ['ID', 'Description', 'Title', 'Score', 'Score Display Mode']
 
 
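+#
+# Below is a commented-out implementation that spawns Lighthouse via its CLI
+# rather than using a Puppeteer-managed headless Chrome. Re-enabling it also
+# requires restoring the json, logging, subprocess and utils imports it uses.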
+
+# def _url_for_domain(domain: str, cache_dir: str):
+#     if domain.startswith('http://') or domain.startswith('https://'):
+#         return domain
+
+#     # If we have data from pshtt, use the canonical endpoint.
+#     canonical = utils.domain_canonical(domain, cache_dir=cache_dir)
+#     if canonical:
+#         return canonical
+
+#     # Otherwise, well, whatever.
+#     return 'http://' + domain
+
+# Required scan function. This is the meat of the scanner, where things
+# that use the network or are otherwise expensive would go.
+#
+# Runs locally or in the cloud (Lambda).
+# def scan(domain: str, environment: dict, options: dict) -> dict:
+#     logging.debug('Scan function called with options: %s', options)
+
+#     cache_dir = options.get('_', {}).get('cache_dir', './cache')
+
+#     url = _url_for_domain(domain, cache_dir)
+#     lighthouse_cmd = ' '.join([
+#         LIGHTHOUSE_PATH,
+#         url,
+#         '--quiet',
+#         '--output=json',
+#         '--chrome-flags="--headless --no-sandbox"',
+#         *(f'--only-audits={audit}' for audit in LIGHTHOUSE_AUDITS),
+#     ])
+
+#     logging.info('Running Lighthouse CLI...')
+
+#     try:
+#         response = subprocess.check_output(
+#             lighthouse_cmd,
+#             stderr=subprocess.STDOUT,
+#             shell=True, env=None
+#         )
+#         raw = str(response, encoding='UTF-8')
+#         logging.info('Done running Lighthouse CLI')
+#         return json.loads(raw)['audits']
+#     except subprocess.CalledProcessError:
+#         logging.warning("Error running Lighthouse scan for URL %s." % url)
+#         return {}
+
 # TODO: Add ability to override default LIGHTHOUSE_AUDITS
 # Optional handler for custom CLI parameters. Takes the args (as a list of
 # strings) and returns a dict of the options values and names that the scanner