From cee29ed2b505bc62b84d1910d027adb256b5af90 Mon Sep 17 00:00:00 2001 From: Siddharth VP Date: Sat, 13 Apr 2024 22:42:07 +0530 Subject: [PATCH] Support custom JS preprocessing for db reports --- botbase.ts | 2 +- db-tabulator/MariadbMetadataStore.ts | 5 +- db-tabulator/app.ts | 43 +++++++--- db-tabulator/external-update.ts | 24 ++++++ db-tabulator/main.ts | 4 +- db-tabulator/preprocess.ts | 117 +++++++++++++++++++++++++++ db-tabulator/test.ts | 12 +++ db-tabulator/web-endpoint.ts | 14 +++- jobs.yml | 2 +- utils.ts | 7 +- 10 files changed, 209 insertions(+), 21 deletions(-) create mode 100644 db-tabulator/external-update.ts create mode 100644 db-tabulator/preprocess.ts diff --git a/botbase.ts b/botbase.ts index 8f35b40..7592d5e 100644 --- a/botbase.ts +++ b/botbase.ts @@ -25,7 +25,7 @@ export function emailOnError(err: Error, taskname: string, isFatal = true) { sendMail(`${taskname} error`, `${taskname} task resulted in the error:\n\n${err.stack}\n`); // exit normally } -export function logFullError(err: Error, isFatal = true) { +export function logFullError(err: any, isFatal = true) { if (err.request?.headers?.Authorization) { err.request.headers.Authorization = '***'; } diff --git a/db-tabulator/MariadbMetadataStore.ts b/db-tabulator/MariadbMetadataStore.ts index 18998a8..ac7ee7a 100644 --- a/db-tabulator/MariadbMetadataStore.ts +++ b/db-tabulator/MariadbMetadataStore.ts @@ -1,14 +1,13 @@ -import {TOOLS_DB_HOST, toolsdb} from "../db"; +import {toolsdb} from "../db"; import {fetchQueriesForPage, Query} from "./app"; import {MetadataStore} from "./MetadataStore"; -import {createLocalSSHTunnel, setDifference} from "../utils"; +import {setDifference} from "../utils"; import * as crypto from "crypto"; export class MariadbMetadataStore implements MetadataStore { db: toolsdb; async init() { - await createLocalSSHTunnel(TOOLS_DB_HOST); this.db = new toolsdb('dbreports_p'); } diff --git a/db-tabulator/app.ts b/db-tabulator/app.ts index 553c1dd..349aa42 100644 --- a/db-tabulator/app.ts +++ b/db-tabulator/app.ts @@ -1,11 +1,12 @@ import { argv, bot, emailOnError, log, Mwn, TextExtractor } from "../botbase"; import { enwikidb, SQLError } from "../db"; import { Template } from "../../mwn/build/wikitext"; -import { arrayChunk, createLogStream, lowerFirst, readFile, writeFile } from "../utils"; +import { arrayChunk, createLogStream, lowerFirst, readFile, stripOuterNowikis, writeFile } from "../utils"; import {NS_CATEGORY, NS_FILE, NS_MAIN} from "../namespaces"; import { formatSummary } from "../reports/commons"; import {MetadataStore} from "./MetadataStore"; import {HybridMetadataStore} from "./HybridMetadataStore"; +import {applyJsPreprocessing, processQueriesExternally} from "./preprocess"; export const BOT_NAME = 'SDZeroBot'; export const TEMPLATE = 'Database report'; @@ -40,18 +41,27 @@ export function getQueriesFromText(text: string, title: string): Query[] { log(`[E] Failed to find template on ${title}`); return []; } - return templates.map((template, idx) => new Query(template, title, idx + 1)); + return templates.map((template, idx) => + new Query(template, title, idx + 1, !!template.getValue('js_preprocess')?.trim())); } -// Called from the cronjob export async function processQueries(allQueries: Record) { await bot.batchOperation(Object.entries(allQueries), async ([page, queries]) => { - log(`[+] Processing page ${page}`); - await processQueriesForPage(queries); + if (queries.filter(q => q.needsExternalRun).length > 0) { + // Needs an external process for security + log(`[+] Processing page ${page} using child process`); + await processQueriesExternally(page); + } else { + log(`[+] Processing page ${page}`); + await processQueriesForPage(queries); + } }, CONCURRENCY); } export async function fetchQueriesForPage(page: string): Promise { + if (argv.fake) { + return getQueriesFromText(readFile(FAKE_INPUT_FILE), 'Fake-Configs'); + } let text = (await bot.read(page, { redirects: false }))?.revisions?.[0]?.content; if (!text) { return []; @@ -120,13 +130,18 @@ export class Query { /** Invocation mode */ context: string; - /** Internal tracking for edit summary */ + /** Internal tracking: for edit summary */ endNotFound = false; - constructor(template: Template, page: string, idxOnPage: number) { + /** Internal tracking: for queries with JS preprocessing enabled */ + needsExternalRun = false; + needsForceKill = false; + + constructor(template: Template, page: string, idxOnPage: number, external?: boolean) { this.page = page; this.template = template; this.idx = idxOnPage; + this.needsExternalRun = external; this.context = getContext(); } @@ -155,8 +170,7 @@ export class Query { getSql() { let sql = this.getTemplateValue('sql'); if (/^\s*/.test(sql)) { - return sql.replace(/^\s*/, '') - .replace(/<\/nowiki ?>\s*$/, ''); + return stripOuterNowikis(sql); } else { // @deprecated return sql @@ -238,6 +252,8 @@ export class Query { } this.config.silent = !!this.getTemplateValue('silent'); + + return this; } async runQuery() { @@ -372,6 +388,15 @@ export class Query { return String(value); }); } + if (this.getTemplateValue('js_preprocess')) { + const jsCode = stripOuterNowikis(this.getTemplateValue('js_preprocess')); + try { + result = await applyJsPreprocessing(result, jsCode, this.toString(), this); + } catch (e) { + log(`[E] Error in applyJsPreprocessing`); + log(e); + } + } // Add excerpts for (let {srcIndex, destIndex, namespace, charLimit, charHardLimit} of this.config.excerpts) { diff --git a/db-tabulator/external-update.ts b/db-tabulator/external-update.ts new file mode 100644 index 0000000..e8da10b --- /dev/null +++ b/db-tabulator/external-update.ts @@ -0,0 +1,24 @@ +import {argv, bot, emailOnError} from "../botbase"; +import {metadataStore, fetchQueriesForPage, processQueriesForPage} from "./app"; + +/** + * Entry point invoked in a child Node.js process for queries + * with custom JS preprocessing enabled. + */ +(async function () { + + process.chdir(__dirname); + + await Promise.all([ + bot.getTokensAndSiteInfo(), + metadataStore.init() + ]); + + const queries = await fetchQueriesForPage(argv.page); + await processQueriesForPage(queries); + + if (queries.filter(q => q.needsForceKill).length > 0) { + process.send({ code: 'catastrophic-error' }); + } + +})().catch(e => emailOnError(e, 'db-tabulator-child')); diff --git a/db-tabulator/main.ts b/db-tabulator/main.ts index e049a40..7c66ff9 100644 --- a/db-tabulator/main.ts +++ b/db-tabulator/main.ts @@ -1,7 +1,6 @@ import { argv, bot, emailOnError, log } from "../botbase"; -import { closeTunnels, createLocalSSHTunnel, writeFile } from "../utils"; +import { closeTunnels, writeFile } from "../utils"; import { checkShutoff, FAKE_OUTPUT_FILE, fetchQueries, processQueries, metadataStore } from "./app"; -import { ENWIKI_DB_HOST } from "../db"; /** * Specs: @@ -37,7 +36,6 @@ import { ENWIKI_DB_HOST } from "../db"; await Promise.all([ bot.getTokensAndSiteInfo(), metadataStore.init(), - createLocalSSHTunnel(ENWIKI_DB_HOST), ]); if (argv.fake) { diff --git a/db-tabulator/preprocess.ts b/db-tabulator/preprocess.ts new file mode 100644 index 0000000..451485e --- /dev/null +++ b/db-tabulator/preprocess.ts @@ -0,0 +1,117 @@ +import {argv, log} from "../botbase"; +import {sleep} from "../../mwn/build/utils"; +import {fork} from "child_process"; + +const softTimeout = 1000; +const hardTimeout = 1500; +const processTimeout = 30000; + +interface PreprocessContext { + warnings: Array; + needsForceKill: boolean; +} + +export async function processQueriesExternally(page: string) { + const controller = new AbortController(); + await Promise.race([ + new Promise((resolve, reject) => { + const { signal } = controller; + const child = fork( + 'external-update.js', + [page].concat(argv.fake ? ['--fake'] : []), + { + execArgv: ['--no-node-snapshot'], // required for node 20+ + signal + } + ); + child.on('message', (message: any) => { + if (message.code === 'catastrophic-error') { + controller.abort(); // This triggers exit event + } + }); + child.on('error', (err) => { + log(`[E] Error from child process`); + log(err); + reject(); + }) + child.on('exit', () => resolve()); + }), + sleep(processTimeout).then(() => { + log(`[E] Aborting child process as it took more than 30 seconds`); + // FIXME? looks necessary as some errors in child process cause it to never resolve/reject + controller.abort(); + }) + ]); +} + +export async function applyJsPreprocessing(rows: Record[], jsCode: string, queryId: string, + ctx: PreprocessContext): Promise[]> { + log(`[+] Applying JS preprocessing for ${queryId}`); + let startTime = process.hrtime.bigint(); + + // Import dynamically as this has native dependencies + let {Isolate} = await import('isolated-vm'); + + const isolate = new Isolate({ + memoryLimit: 16, + onCatastrophicError(msg) { + log(`[E] Catastrophic error in isolated-vm: ${msg}`); + ctx.needsForceKill = true; + } + }); + const context = await isolate.createContext(); + const jail = context.global; + await jail.set('__dbQueryResult', JSON.stringify(rows)); + + let result = rows; + let preProcessingComplete = false; + let doPreprocessing = async () => { + try { + // jsCode is expected to declare function preprocess(rows) {...} + let userCode = await isolate.compileScript(jsCode + + '\n ; JSON.stringify(preprocess(JSON.parse(__dbQueryResult))); \n'); + + let userCodeResult = await userCode.run(context, { timeout: softTimeout }); + try { + if (typeof userCodeResult === 'string') { // returns undefined if non-transferable + let userCodeResultParsed = JSON.parse(userCodeResult); + if (Array.isArray(userCodeResultParsed)) { + result = userCodeResultParsed; + } else { + log(`[E] JS preprocessing for ${queryId} returned a non-array: ${userCodeResult.slice(0, 100)} ... Ignoring.`); + ctx.warnings.push(`JS preprocessing didn't return an array of rows, will be ignored`); + } + } else { + log(`[E] JS preprocessing for ${queryId} has an invalid return value: ${userCodeResult}. Ignoring.`); + ctx.warnings.push(`JS preprocessing must have a transferable return value`); + } + } catch (e) { // Shouldn't occur as we are the ones doing the JSON.stringify + log(`[E] JS preprocessing for ${queryId} returned a non-JSON: ${userCodeResult.slice(0, 100)}. Ignoring.`); + } + } catch (e) { + log(`[E] JS preprocessing for ${queryId} failed: ${e.toString()}`); + log(e); + ctx.warnings.push(`JS preprocessing failed: ${e.toString()}`); + } finally { + preProcessingComplete = true; + } + } + + await Promise.race([ + doPreprocessing(), + + // In case isolated-vm timeout doesn't work + sleep(hardTimeout).then(() => { + if (!preProcessingComplete) { + log(`[E] Past ${hardTimeout/1000} second timeout, force-disposing isolate`); + isolate.dispose(); + } + }) + ]); + + let endTime = process.hrtime.bigint(); + let timeTaken = Number(endTime - startTime) / 1e9; + log(`[+] JS preprocessing for ${queryId} took ${timeTaken.toFixed(3)} seconds, cpuTime: ${isolate.cpuTime}, wallTime: ${isolate.wallTime}.`); + + return result; +} diff --git a/db-tabulator/test.ts b/db-tabulator/test.ts index e69d34d..edaeeda 100644 --- a/db-tabulator/test.ts +++ b/db-tabulator/test.ts @@ -4,6 +4,7 @@ import assert = require("assert"); import {NoMetadataStore} from "./NoMetadataStore"; import {Template} from "../../mwn/build/wikitext"; import {MwnDate} from "../../mwn"; +import {applyJsPreprocessing} from "./preprocess"; describe('db-tabulator', () => { @@ -27,4 +28,15 @@ describe('db-tabulator', () => { assert.strictEqual(isUpdateDue(new bot.date().subtract(40, 'hour'), 2), true); }); + it('applyJsPreprocessing', async () => { + console.log(await applyJsPreprocessing( + [{id: '1', name: 'Main Page'}, {id: '2', name: "Talk:Main Page"}], + `function preprocess(rows) { + rows.forEach(row => { + row.id = parseInt(row.id) + 100; + }) + return rows; + }`, 'test', { warnings: [], needsForceKill: false })); + }) + }); diff --git a/db-tabulator/web-endpoint.ts b/db-tabulator/web-endpoint.ts index f8f8be8..f7d3f4b 100644 --- a/db-tabulator/web-endpoint.ts +++ b/db-tabulator/web-endpoint.ts @@ -1,5 +1,13 @@ import * as express from "express"; -import {checkShutoff, fetchQueriesForPage, metadataStore, processQueriesForPage, SHUTOFF_PAGE, TEMPLATE, SUBSCRIPTIONS_CATEGORY} from "./app"; +import { + checkShutoff, + fetchQueriesForPage, + metadataStore, + SHUTOFF_PAGE, + TEMPLATE, + SUBSCRIPTIONS_CATEGORY, + processQueries +} from "./app"; import { createLogStream, mapPath } from "../utils"; import {bot, enwikidb} from "../botbase"; import {getRedisInstance} from "../redis"; @@ -78,8 +86,8 @@ router.get('/', async function (req, res, next) { }); if (queries) { log(`Started processing ${page}`); - try { // should never throw but still ... - await processQueriesForPage(queries); + try { + await processQueries({[page]: queries}); } finally { redis.srem(redisKey, pgKey).catch(handleRedisError); } diff --git a/jobs.yml b/jobs.yml index 7855a4e..a8da5c7 100644 --- a/jobs.yml +++ b/jobs.yml @@ -21,7 +21,7 @@ - {"schedule": "5 0 * * *", "name": "job-g13watch", mem: "256Mi", "command": "~/SDZeroBot/job reports/g13-watch/g13-watch.js", "image": "node18", "emails": "onfailure"} - {"schedule": "15,45 * * * *", "name": "stream-check", mem: "128Mi", "command": "~/SDZeroBot/job eventstream-router/check.js", "image": "node18", "emails": "onfailure"} - {"schedule": "20 * * * *", "name": "bot-monitor", mem: "256Mi", "command": "~/SDZeroBot/job bot-monitor/main.js", "image": "node18", "emails": "onfailure"} -- {"schedule": "25 1,5,9,13,17,21 * * *", "name": "db-tabulator", mem: "256Mi", "command": "~/SDZeroBot/job db-tabulator/main.js", "image": "node18", "emails": "onfailure"} +- {"schedule": "25 1,5,9,13,17,21 * * *", "name": "db-tabulator", mem: "512Mi", "command": "~/SDZeroBot/job db-tabulator/main.js", "image": "node18", "emails": "onfailure"} - {"schedule": "8 16 * * *", "name": "gans-list", mem: "256Mi", "command": "~/SDZeroBot/job most-gans/gans-lister.js", "image": "node18", "emails": "onfailure"} - {"schedule": "0 3 1 * *", "name": "gans-users", mem: "256Mi", "command": "~/SDZeroBot/job most-gans/update-entries.js", "image": "node18", "emails": "onfailure"} - {"schedule": "0 4 * * *", "name": "shells", mem: "128Mi", "command": "~/SDZeroBot/job terminate-shell-pods.js", "image": "node18", "emails": "onfailure"} diff --git a/utils.ts b/utils.ts index f7f9005..37fbce4 100644 --- a/utils.ts +++ b/utils.ts @@ -3,7 +3,7 @@ import { spawn } from "child_process"; import { ENWIKI_DB_HOST, TOOLS_DB_HOST } from "./db"; import { REDIS_HOST } from "./redis"; -export function readFile(file) { +export function readFile(file: string) { try { return fs.readFileSync(file).toString(); } catch (e) { @@ -109,6 +109,11 @@ export function stringifyObject(obj) { } } +export function stripOuterNowikis(str: string): string { + return str.replace(/^\s*/, '') + .replace(/<\/nowiki ?>\s*$/, ''); +} + export function makeSentence(list: string[]) { var text = ''; for (let i = 0; i < list.length; i++) {