Skip to content

Commit

Permalink
Run in docker env
Browse files Browse the repository at this point in the history
* Dockerfile
* launchOptions for browser run in container
  • Loading branch information
younglim authored Jan 17, 2023
1 parent fb2334e commit acd9767
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 2 deletions.
28 changes: 28 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
FROM node:lts-alpine

# System packages: build toolchain for native npm modules (g++, make, python3),
# the distribution's Chromium for the crawler, plus zip, bash and git.
RUN apk add --no-cache g++ make python3 chromium zip bash git

WORKDIR /app

# Environment variables for node and chromium.
# These are declared BEFORE `npm ci` on purpose: PUPPETEER_SKIP_CHROMIUM_DOWNLOAD
# is consulted by Puppeteer's install step, so setting it after the install
# (as was done previously) does not prevent the bundled Chromium download.
# The browser installed via apk above is used instead.
ENV NODE_ENV=production \
    PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
    PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser

# Copy package.json to working directory, perform npm install before copying
# the remaining files, so the dependency layer is reused when only sources change.
COPY package*.json ./

RUN npm ci --omit=dev

COPY . .

# Add a non-privileged user and hand it ownership of the application directory.
# NOTE(review): the crawler launch options still pass --no-sandbox when
# /.dockerenv exists, so running as non-root does not by itself enable the
# Chromium sandbox - confirm whether that flag is still required.
RUN addgroup -S purple \
 && adduser -S -G purple purple \
 && chown -R purple:purple ./

# Run everything after as non-privileged user.
USER purple

10 changes: 10 additions & 0 deletions constants/constants.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import path from 'path';
import { fileURLToPath } from 'url';
import fs from 'fs-extra';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

Expand All @@ -18,6 +20,13 @@ const scannerTypes = {
website: 'website',
};


// Extra Chromium launch flags, applied only when the process runs inside a
// Docker container (detected by the presence of the /.dockerenv marker file).
// Outside a container the list is empty.
const launchOptionsArgs = fs.existsSync('/.dockerenv')
  ? ['--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage']
  : [];

const devices = [
'Desktop',
'Blackberry_PlayBook',
Expand Down Expand Up @@ -158,6 +167,7 @@ export default {
urlsCrawledObj,
impactOrder,
devices,
launchOptionsArgs: launchOptionsArgs,
}

export const rootPath = __dirname;
Expand Down
10 changes: 8 additions & 2 deletions crawlers/crawlDomain.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,14 @@ const crawlDomain = async (url, randomToken, host, viewportSettings) => {
}

const crawler = new crawlee.PuppeteerCrawler({
launchContext: {
launchOptions: {
args: constants.launchOptionsArgs,
}
},
requestQueue,
preNavigationHooks,
requestHandler: async ({ page, request, enqueueLinks }) => {
requestHandler: async ({page, request, enqueueLinks }) => {
if (deviceChosen === 'Custom') {
if (device) {
await page.emulate(device);
Expand All @@ -49,7 +54,7 @@ const crawlDomain = async (url, randomToken, host, viewportSettings) => {
isMobile: true,
});
}

const currentUrl = request.url;
const location = await page.evaluate('location');

Expand All @@ -66,6 +71,7 @@ const crawlDomain = async (url, randomToken, host, viewportSettings) => {
} else {
urlsCrawled.outOfDomain.push(currentUrl);
}

},
failedRequestHandler,
maxRequestsPerCrawl,
Expand Down
5 changes: 5 additions & 0 deletions crawlers/crawlSitemap.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ const crawlSitemap = async (sitemapUrl, randomToken, host, viewportSettings) =>
}
}
const crawler = new crawlee.PuppeteerCrawler({
launchContext: {
launchOptions: {
args: constants.launchOptionsArgs,
}
},
requestList,
requestQueue,
preNavigationHooks,
Expand Down

0 comments on commit acd9767

Please sign in to comment.