diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..a150a3c1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,28 @@
+FROM node:lts-alpine
+
+# Installation of packages for purple-hats and chromium
+RUN apk add --no-cache g++ make python3 chromium zip bash git
+
+WORKDIR /app
+
+# Copy package.json to working directory, perform npm install before copying the remaining files
+COPY package*.json ./
+
+# Environment variables for node
+ENV NODE_ENV=production
+
+RUN npm ci --omit=dev
+
+COPY . .
+
+# Add non-privileged user so we don't need puppeteer --no-sandbox.
+RUN addgroup -S purple && adduser -S -G purple purple
+RUN chown -R purple:purple ./
+
+# Run everything after as non-privileged user.
+USER purple
+
+# Environment variables for chromium
+ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
+    PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser
+
diff --git a/constants/constants.js b/constants/constants.js
index 97701ea3..a5bf213b 100644
--- a/constants/constants.js
+++ b/constants/constants.js
@@ -1,5 +1,7 @@
 import path from 'path';
 import { fileURLToPath } from 'url';
+import fs from 'fs-extra';
+
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = path.dirname(__filename);
 
@@ -18,6 +20,13 @@ const scannerTypes = {
   website: 'website',
 };
 
+
+// Check if running in docker container
+let launchOptionsArgs = [];
+if (fs.existsSync('/.dockerenv')) {
+  launchOptionsArgs = ['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage'];
+}
+
 const devices = [
   'Desktop',
   'Blackberry_PlayBook',
@@ -158,6 +167,7 @@
   urlsCrawledObj,
   impactOrder,
   devices,
+  launchOptionsArgs,
 }
 
 export const rootPath = __dirname;
diff --git a/crawlers/crawlDomain.js b/crawlers/crawlDomain.js
index c3a3e5d8..33147589 100644
--- a/crawlers/crawlDomain.js
+++ b/crawlers/crawlDomain.js
@@ -29,9 +29,14 @@ const crawlDomain = async (url, randomToken, host, viewportSettings) => {
   }
 
   const crawler = new crawlee.PuppeteerCrawler({
+    launchContext: {
+      launchOptions: {
+        args: constants.launchOptionsArgs,
+      }
+    },
     requestQueue,
     preNavigationHooks,
     requestHandler: async ({ page, request, enqueueLinks }) => {
       if (deviceChosen === 'Custom') {
         if (device) {
           await page.emulate(device);
@@ -49,7 +54,7 @@ const crawlDomain = async (url, randomToken, host, viewportSettings) => {
           isMobile: true,
         });
       }
-      
+
       const currentUrl = request.url;
 
       const location = await page.evaluate('location');
@@ -66,6 +71,7 @@ const crawlDomain = async (url, randomToken, host, viewportSettings) => {
       } else {
         urlsCrawled.outOfDomain.push(currentUrl);
       }
+
     },
     failedRequestHandler,
     maxRequestsPerCrawl,
diff --git a/crawlers/crawlSitemap.js b/crawlers/crawlSitemap.js
index 9d715df7..098dd35f 100644
--- a/crawlers/crawlSitemap.js
+++ b/crawlers/crawlSitemap.js
@@ -33,6 +33,11 @@ const crawlSitemap = async (sitemapUrl, randomToken, host, viewportSettings) =>
     }
   }
   const crawler = new crawlee.PuppeteerCrawler({
+    launchContext: {
+      launchOptions: {
+        args: constants.launchOptionsArgs,
+      }
+    },
     requestList,
     requestQueue,
     preNavigationHooks,