From 02f59f08e668b7a841009a380e6463bf12bd498c Mon Sep 17 00:00:00 2001 From: Siddharth VP Date: Sat, 1 Jun 2024 15:03:57 +0530 Subject: [PATCH] db-tabulator: expose more APIs in JS preprocessing --- db-tabulator/app.ts | 2 +- db-tabulator/database-report.hbs | 8 ++++++ db-tabulator/isolate.vm.js | 20 +++++++++---- db-tabulator/preprocess.ts | 48 ++++++++++++++++++++++++++++++-- 4 files changed, 68 insertions(+), 10 deletions(-) diff --git a/db-tabulator/app.ts b/db-tabulator/app.ts index f67ab55..b1a3e11 100644 --- a/db-tabulator/app.ts +++ b/db-tabulator/app.ts @@ -345,7 +345,7 @@ export class Query extends EventEmitter { let excerpts: Record = {}; for (let pageSet of arrayChunk(pages, 100)) { for await (let pg of bot.readGen(pageSet, { - rvsection: 0, + rvsection: '0', redirects: false })) { if (pg.invalid || pg.missing) { diff --git a/db-tabulator/database-report.hbs b/db-tabulator/database-report.hbs index 2cc3c38..2e5c6a3 100644 --- a/db-tabulator/database-report.hbs +++ b/db-tabulator/database-report.hbs @@ -18,6 +18,13 @@ function error(msg) { return `${msg}`; } + function safeStringify(obj) { + try { + return JSON.stringify(obj, undefined, 2); + } catch (e) { + return ''; + } + } let params = new Map(new URLSearchParams(location.search)); let page = params.get('page'); @@ -46,6 +53,7 @@ 'query-executing': data => `Query (${shorten(data.args[0], 80)}) submitted to database.`, 'query-executed': data => `Query finished running in ${data.args[0]} seconds.`, 'preprocessing': _ => `Started JS preprocessing on query result.`, + 'js-logging': data => `Logging output:
${safeStringify(data.args[0])}
`, 'js-no-array': _ => error(`JS preprocess() must return an array. `) + 'Saving result without preprocessing.', 'js-invalid-return': _ => error(`JS preprocess() returned a value which is not transferable. `) + 'Saving result without preprocessing.', diff --git a/db-tabulator/isolate.vm.js b/db-tabulator/isolate.vm.js index 6087b7f..9e61268 100644 --- a/db-tabulator/isolate.vm.js +++ b/db-tabulator/isolate.vm.js @@ -1,12 +1,20 @@ /* eslint-disable no-unused-vars */ -/* global __mwApiGet, __dbQueryResult, preprocess */ +/* global __mwApiGet, __rawReq, __dbQueryResult, preprocess */ (async function() { - "${JS_CODE}"; - - async function mwApiGet(params) { - const response = await __mwApiGet.applySyncPromise(undefined, [JSON.stringify(params)]); - return JSON.parse(response); + const bot = { + async request(url) { + if (typeof url !== 'string') throw new Error('bot.request() needs a string url'); + const response = await __rawReq.applySyncPromise(undefined, [url]); + return JSON.parse(response); + }, + async api(params) { + if (typeof params !== 'object') throw new Error('bot.api() parameters need to be an object'); + const response = await __mwApiGet.applySyncPromise(undefined, [JSON.stringify(params)]); + return JSON.parse(response); + } } + "${JS_CODE}"; + return JSON.stringify(await preprocess(JSON.parse(__dbQueryResult))); }) diff --git a/db-tabulator/preprocess.ts b/db-tabulator/preprocess.ts index 40b8940..437c78b 100644 --- a/db-tabulator/preprocess.ts +++ b/db-tabulator/preprocess.ts @@ -1,4 +1,4 @@ -import {argv, fs, log, Mwn} from "../botbase"; +import {argv, AuthManager, fs, log, Mwn} from "../botbase"; import {fork} from "child_process"; import EventEmitter from "events"; import type {Query} from "./app"; @@ -65,7 +65,8 @@ const apiClient = new Mwn({ apiUrl: 'https://en.wikipedia.org/w/api.php', maxRetries: 0, silent: true, - userAgent: '[[w:en:Template:Database report]], [[w:en:SDZeroBot]], node.js isolated-vm', + userAgent: '[[w:en:Template:Database 
report]] via [[w:en:SDZeroBot]], node.js isolated-vm', + OAuth2AccessToken: AuthManager.get('sdzerobot-dbreports').OAuth2AccessToken, defaultParams: { maxlag: undefined } @@ -82,7 +83,7 @@ export async function applyJsPreprocessing(rows: Record[], jsCod let startTime = process.hrtime.bigint(); // Import dynamically as this has native dependencies - let {Isolate, Reference} = await import('isolated-vm'); + let {Isolate, Callback, Reference} = await import('isolated-vm'); const isolate = new Isolate({ memoryLimit: 16, @@ -95,11 +96,17 @@ export async function applyJsPreprocessing(rows: Record[], jsCod const jail = context.global; await jail.set('__dbQueryResult', JSON.stringify(rows)); + await jail.set('log', new Callback(function(arg) { + console.log(arg); + query.emit('js-logging', arg); + })); + // Support readonly API access await jail.set('__mwApiGet', new Reference(async function (rawParams: string) { let params = JSON.parse(rawParams); // Disallow write operations params.action = 'query'; + params.format = 'json'; delete params.token; try { return JSON.stringify(await apiClient.query(params)); @@ -108,6 +115,41 @@ export async function applyJsPreprocessing(rows: Record[], jsCod } })); + await jail.set('__rawReq', new Reference(async function (url: string) { + const allowedDomains = [ + 'https://en.wikipedia.org/api/rest_v1/', // Wikimedia REST API + 'https://wikimedia.org/api/rest_v1/', // Wikimedia REST API + 'https://en.wikipedia.org/w/rest.php/', // MediaWiki REST API + 'https://en.wikipedia.org/w/api.php?', // Action API + 'https://api.wikimedia.org/', // Wikimedia API gateway + ]; + + if (!allowedDomains.find(domain => url.startsWith(domain))) { + return JSON.stringify({ error: `Disallowed domain. 
Allowed domains are: ${allowedDomains.join(', ')}` }); + } + + try { + const response = await apiClient.rawRequest({ + method: 'GET', + url: url, + timeout: 10000, + headers: { + // Bot grant enables apihighlimit (for Action API), and helps avoid throttling for some REST APIs. + // It has no write access. + 'Authorization': `Bearer ${AuthManager.get('sdzerobot-dbreports').OAuth2AccessToken}` + } + }); + try { + return JSON.stringify(response.data); + } catch (e) { + return JSON.stringify({ error: `Non JSON response from ${url}: ${response.data}` }); + } + } catch (err) { + let errMsg = err.statusCode ? (err.statusCode + ': ' + err.statusMessage) : err.message; + return JSON.stringify({ error: errMsg }); + } + })); + let result = rows; let doPreprocessing = async () => {