From b41b13d9a787883b7b14bc701a21f4c5f14500dd Mon Sep 17 00:00:00 2001
From: Siddharth VP
Date: Sat, 12 Jun 2021 18:57:18 +0530
Subject: [PATCH] TextExtractor

---
 TextExtractor.js                              | 176 ------------------
 TextExtractorTest.js => TextExtractor.test.js |   9 +-
 TextExtractor.ts                              | 169 +++++++++++++++++
 botbase.ts                                    |   3 +-
 old/g13-watch/old-sqlite/save-to-db.js        |   3 +-
 old/g13-watch/save-to-db.ts                   |   3 +-
 reports/peer-review.js                        |   3 +-
 reports/unreferenced-blps.js                  |   3 +-
 8 files changed, 178 insertions(+), 191 deletions(-)
 delete mode 100644 TextExtractor.js
 rename TextExtractorTest.js => TextExtractor.test.js (77%)
 create mode 100644 TextExtractor.ts

diff --git a/TextExtractor.js b/TextExtractor.js
deleted file mode 100644
index 542ac8c..0000000
--- a/TextExtractor.js
+++ /dev/null
@@ -1,176 +0,0 @@
-/**
- * @param {mwn} bot
- */
-module.exports = function(bot) {
-
-	class TextExtractor {
-
-		/**
-		 * Get wikitext extract. If you want plain text or HTML extracts, consider using
-		 * the TextExtracts API instead.
-		 * @param {string} pagetext - full page text
-		 * @param {number} [charLimit] - cut off the extract at this many readable characters, or wherever
-		 * the sentence ends after this limit
-		 * @param {number} [hardUpperLimit] - cut off the extract at this many readable characters even if
-		 * the sentence hasn't ended
-		 * @param {Function} [preprocessHook] - optional function to work on the text at the
-		 * beginning
-		 */
-		static getExtract(pagetext, charLimit, hardUpperLimit, preprocessHook) {
-
-			if (!pagetext) {
-				return '';
-			}
-			let extract = pagetext;
-
-			if (preprocessHook) {
-				extract = preprocessHook(extract);
-			}
-
-			// Remove images. Can't be done correctly with just regex as there could be wikilinks
-			// in the captions.
-			extract = this.removeImages(extract);
-
-			// Remove templates beginning on a new line, such as infoboxes.
-			// These occasionally contain parameters with part of the content
-			// beginning on a newline not starting with a | or * or # or !
-			// thus can't be handled with the line regex.
-			extract = this.removeTemplatesOnNewlines(extract);
-
-			// Remove some other templates too
-			extract = this.removeTemplates(extract, ['efn', 'refn']);
-
-			extract = extract
-				.replace(/<!--.*?-->/sg, '')
-				// remove refs, including named ref definitions and named ref invocations
-				.replace(/<ref.*?(?:\/>|<\/ref>)/sgi, '')
-				// the magic
-				.replace(/^\s*[-{|}=*#:<!].*$/mg, '');
-
-			if (charLimit) {
-				var sentenceEnd = /\.\s/g;
-
-				if (this.effCharCount(extract) > charLimit) {
-					var match = sentenceEnd.exec(extract);
-					while (match) {
-						if (this.effCharCount(extract.slice(0, match.index)) > charLimit) {
-							extract = extract.slice(0, match.index + 1);
-							break;
-						} else {
-							match = sentenceEnd.exec(extract);
-						}
-					}
-				}
-			}
-
-			if (hardUpperLimit) {
-				if (this.effCharCount(extract) > hardUpperLimit) {
-					extract = extract.slice(0, hardUpperLimit) + ' ...';
-				}
-			}
-
-			return extract;
-		}
-
-		static removeImages(text) {
-			var wkt = new bot.wikitext(text);
-			wkt.parseLinks();
-			wkt.files.forEach(file => {
-				wkt.removeEntity(file);
-			});
-			return wkt.getText();
-		}
-
-		static removeTemplatesOnNewlines(text) {
-			var templateOnNewline = /^\{\{/m; // g is omitted for a reason, the text is changing.
-			var match = templateOnNewline.exec(text);
-			while (match) {
-				var template = new bot.wikitext(text.slice(match.index)).parseTemplates({count: 1})[0];
-				if (template) {
-					text = text.replace(template.wikitext, '');
-				} else { // just get rid of that line, otherwise we'd enter an infinite loop
-					text = text.replace(/^\{\{.*$/m, '');
-				}
-				match = templateOnNewline.exec(text);
-			}
-			return text;
-		}
-
-		/**
-		 * @param {string} text
-		 * @param {string[]} templates
-		 */
-		static removeTemplates(text, templates) {
-			var wkt = new bot.wikitext(text);
-			// TODO: Things to think about: how to generate ONE regex that matches all the given
-			// templates and which is as efficient as possible? That is, for 'efn' and 'refn'
-			// the regex generated should be /[rR]?[eE]?fn/ (as efn is a substring of refn)
-			// Can this be solved using the longest common subsequence problem?
-			// Or maybe use tries?
-			const makeRegexFromTemplate = function(template) {
-				return new RegExp('^[' + template[0].toLowerCase() + template[0].toUpperCase() + ']' + template.slice(1) + '$', 'g');
-			}
-			wkt.parseTemplates({
-				namePredicate: name => {
-					return templates.some(template => {
-						return makeRegexFromTemplate(template).test(name);
-					});
-				}
-			});
-			for (let template of wkt.templates) {
-				wkt.removeEntity(template);
-			}
-			return wkt.getText();
-		}
-
-		static effCharCount(text) {
-			return text
-				.replace(/\[\[:?(?:[^|\]]+?\|)?([^\]|]+?)\]\]/g, '$1')
-				.replace(/''/g, '')
-				.length;
-		}
-
-
-		/**
-		 * Do away with some of the more bizarre stuff from page extracts that aren't worth
-		 * checking for on a per-page basis
-		 * Minimise the amount of removals done here, since if the extract was cut off, it may
-		 * happen one of the regexes below will match across two different extracts.
-		 * @param {string} content
-		 */
-		static finalSanitise(content) {
-			return content.replace(/\[\[Category:.*?\]\]/gi, '')
-				// these are just bad
-				.replace(/__[A-Z]+__/g, '')
-				// Openings of any unclosed ref tags
-				.replace(/<ref[^<]*?(>|(?=\n))/gi, '')
-				// Harvard referencing
-				.replace(/\{\{[sS]fnp?\|.*?\}\}/g, '')
-				// shortcut for named ref invocation
-				.replace(/\{\{r\|.*?\}\}/gi, '')
-				// inline parenthetical referencing
-				.replace(/\{\{[hH]arv\|.*?\}\}/g, '')
-				// pronunciation
-				.replace(/\{\{IPA.*?\}\}/g, '')
-				// audio
-				.replace(/\{\{[aA]udio\|.*?\}\}/g, '');
-		}
-	}
-
-	return TextExtractor;
-
-};
diff --git a/TextExtractorTest.js b/TextExtractor.test.js
similarity index 77%
rename from TextExtractorTest.js
rename to TextExtractor.test.js
index 95e5138..e4b1612 100644
--- a/TextExtractorTest.js
+++ b/TextExtractor.test.js
@@ -1,8 +1,5 @@
-/* globals it, before */
-
 const assert = require('assert');
-const {bot} = require('./botbase');
-const TE = require('./TextExtractor')(bot);
+const {bot, TextExtractor} = require('./botbase');
 before(function() {
 	return bot.getSiteInfo();
 });
@@ -22,7 +19,7 @@ it('removes templates on new lines', function() {
 
 Arthur was an fine tailor.
 `;
 
-	assert.strictEqual(TE.removeTemplatesOnNewlines(text), '\n\nArthur was an fine tailor.\n');
+	assert.strictEqual(TextExtractor.removeTemplatesOnNewlines(text), '\n\nArthur was an fine tailor.\n');
 
 });
@@ -31,7 +28,7 @@ it('runs preprocessHook', function () {
 
 ==References==`;
 
-	let extract = TE.getExtract(text, 250, 500, function(text) {
+	let extract = TextExtractor.getExtract(text, 250, 500, function(text) {
 		let wkt = new bot.wikitext(text);
 		wkt.parseTemplates({
 			namePredicate: name => {
diff --git a/TextExtractor.ts b/TextExtractor.ts
new file mode 100644
index 0000000..3cc5138
--- /dev/null
+++ b/TextExtractor.ts
@@ -0,0 +1,169 @@
+import {bot} from './botbase';
+
+export default class TextExtractor {
+
+	/**
+	 * Get wikitext extract. If you want plain text or HTML extracts, consider using
+	 * the TextExtracts API instead.
+	 * @param {string} pagetext - full page text
+	 * @param {number} [charLimit] - cut off the extract at this many readable characters, or wherever
+	 * the sentence ends after this limit
+	 * @param {number} [hardUpperLimit] - cut off the extract at this many readable characters even if
+	 * the sentence hasn't ended
+	 * @param {Function} [preprocessHook] - optional function to work on the text at the
+	 * beginning
+	 */
+	static getExtract(pagetext: string, charLimit: number, hardUpperLimit: number, preprocessHook: ((text: string) => string)) {
+
+		if (!pagetext) {
+			return '';
+		}
+		let extract = pagetext;
+
+		if (preprocessHook) {
+			extract = preprocessHook(extract);
+		}
+
+		// Remove images. Can't be done correctly with just regex as there could be wikilinks
+		// in the captions.
+		extract = this.removeImages(extract);
+
+		// Remove templates beginning on a new line, such as infoboxes.
+		// These occasionally contain parameters with part of the content
+		// beginning on a newline not starting with a | or * or # or !
+		// thus can't be handled with the line regex.
+		extract = this.removeTemplatesOnNewlines(extract);
+
+		// Remove some other templates too
+		extract = this.removeTemplates(extract, ['efn', 'refn']);
+
+		extract = extract
+			.replace(/<!--.*?-->/sg, '')
+			// remove refs, including named ref definitions and named ref invocations
+			.replace(/<ref.*?(?:\/>|<\/ref>)/sgi, '')
+			// the magic
+			.replace(/^\s*[-{|}=*#:<!].*$/mg, '');
+
+		if (charLimit) {
+			var sentenceEnd = /\.\s/g;
+
+			if (this.effCharCount(extract) > charLimit) {
+				var match = sentenceEnd.exec(extract);
+				while (match) {
+					if (this.effCharCount(extract.slice(0, match.index)) > charLimit) {
+						extract = extract.slice(0, match.index + 1);
+						break;
+					} else {
+						match = sentenceEnd.exec(extract);
+					}
+				}
+			}
+		}
+
+		if (hardUpperLimit) {
+			if (this.effCharCount(extract) > hardUpperLimit) {
+				extract = extract.slice(0, hardUpperLimit) + ' ...';
+			}
+		}
+
+		return extract;
+	}
+
+	static removeImages(text: string) {
+		var wkt = new bot.wikitext(text);
+		wkt.parseLinks();
+		wkt.files.forEach(file => {
+			wkt.removeEntity(file);
+		});
+		return wkt.getText();
+	}
+
+	static removeTemplatesOnNewlines(text: string) {
+		var templateOnNewline = /^\{\{/m; // g is omitted for a reason, the text is changing.
+		var match = templateOnNewline.exec(text);
+		while (match) {
+			var template = new bot.wikitext(text.slice(match.index)).parseTemplates({count: 1})[0];
+			if (template) {
+				text = text.replace(template.wikitext, '');
+			} else { // just get rid of that line, otherwise we'd enter an infinite loop
+				text = text.replace(/^\{\{.*$/m, '');
+			}
+			match = templateOnNewline.exec(text);
+		}
+		return text;
+	}
+
+	/**
+	 * @param {string} text
+	 * @param {string[]} templates
+	 */
+	static removeTemplates(text: string, templates: string[]) {
+		var wkt = new bot.wikitext(text);
+		// TODO: Things to think about: how to generate ONE regex that matches all the given
+		// templates and which is as efficient as possible? That is, for 'efn' and 'refn'
+		// the regex generated should be /[rR]?[eE]?fn/ (as efn is a substring of refn)
+		// Can this be solved using the longest common subsequence problem?
+		// Or maybe use tries?
+		const makeRegexFromTemplate = function(template) {
+			return new RegExp('^[' + template[0].toLowerCase() + template[0].toUpperCase() + ']' + template.slice(1) + '$', 'g');
+		}
+		wkt.parseTemplates({
+			namePredicate: name => {
+				return templates.some(template => {
+					return makeRegexFromTemplate(template).test(name);
+				});
+			}
+		});
+		for (let template of wkt.templates) {
+			wkt.removeEntity(template);
+		}
+		return wkt.getText();
+	}
+
+	static effCharCount(text: string) {
+		return text
+			.replace(/\[\[:?(?:[^|\]]+?\|)?([^\]|]+?)\]\]/g, '$1')
+			.replace(/''/g, '')
+			.length;
+	}
+
+
+	/**
+	 * Do away with some of the more bizarre stuff from page extracts that aren't worth
+	 * checking for on a per-page basis
+	 * Minimise the amount of removals done here, since if the extract was cut off, it may
+	 * happen one of the regexes below will match across two different extracts.
+	 * @param {string} content
+	 */
+	static finalSanitise(content: string) {
+		return content.replace(/\[\[Category:.*?\]\]/gi, '')
+			// these are just bad
+			.replace(/__[A-Z]+__/g, '')
+			// Openings of any unclosed ref tags
+			.replace(/<ref[^<]*?(>|(?=\n))/gi, '')
+			// Harvard referencing
+			.replace(/\{\{[sS]fnp?\|.*?\}\}/g, '')
+			// shortcut for named ref invocation
+			.replace(/\{\{r\|.*?\}\}/gi, '')
+			// inline parenthetical referencing
+			.replace(/\{\{[hH]arv\|.*?\}\}/g, '')
+			// pronunciation
+			.replace(/\{\{IPA.*?\}\}/g, '')
+			// audio
+			.replace(/\{\{[aA]udio\|.*?\}\}/g, '');
+	}
+}
diff --git a/botbase.ts b/botbase.ts
index 6a0764a..5afdf04 100644
--- a/botbase.ts
+++ b/botbase.ts
@@ -59,7 +59,8 @@ export const bot = new mwn({
 
 bot.initOAuth();
 
-export const TextExtractor = require('./TextExtractor')(bot);
+import TextExtractor from "./TextExtractor";
+export { TextExtractor };
 
 // Deprecated exports, import from ./db or ./utils directly
 export {mysql, db, enwikidb, toolsdb} from './db';
diff --git a/old/g13-watch/old-sqlite/save-to-db.js b/old/g13-watch/old-sqlite/save-to-db.js
index d0219d0..746e0a1 100644
--- a/old/g13-watch/old-sqlite/save-to-db.js
+++ b/old/g13-watch/old-sqlite/save-to-db.js
@@ -1,8 +1,7 @@
 // start job using: npm run start
 
-const {bot, log, emailOnError} = require('../botbase');
+const {bot, log, emailOnError, TextExtractor} = require('../botbase');
 const EventSource = require('eventsource');
-const TextExtractor = require('../TextExtractor')(bot);
 
 const sqlite3 = require('sqlite3');
 const sqlite = require('sqlite');
diff --git a/old/g13-watch/save-to-db.ts b/old/g13-watch/save-to-db.ts
index 78811f6..6cfd27b 100644
--- a/old/g13-watch/save-to-db.ts
+++ b/old/g13-watch/save-to-db.ts
@@ -3,10 +3,9 @@
 
 // start job using: npm run start
 
-import {fs, bot, log, mysql, argv} from '../../botbase';
+import {fs, bot, log, mysql, argv, TextExtractor} from '../../botbase';
 const {preprocessDraftForExtract} = require('../../reports/commons');
-const TextExtractor = require('../../TextExtractor')(bot);
 const auth = require('../../.auth');
 
 function logError(err) {
diff --git a/reports/peer-review.js b/reports/peer-review.js
index 36dfba7..98d9ab9 100644
--- a/reports/peer-review.js
+++ b/reports/peer-review.js
@@ -1,5 +1,4 @@
-const {bot, mwn, log, emailOnError} = require('../botbase');
-const TextExtractor = require('../TextExtractor')(bot);
+const {bot, mwn, log, emailOnError, TextExtractor} = require('../botbase');
 
 (async function() {
 
diff --git a/reports/unreferenced-blps.js b/reports/unreferenced-blps.js
index 676cc13..a59bfc6 100644
--- a/reports/unreferenced-blps.js
+++ b/reports/unreferenced-blps.js
@@ -1,5 +1,4 @@
-const {bot, mwn, log, fs} = require('../botbase');
-const TextExtractor = require('../TextExtractor')(bot);
+const {bot, mwn, log, fs, TextExtractor} = require('../botbase');
 
 const wd = new mwn({
 	...bot.options,
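
Usage after this change, for reference: a minimal TypeScript sketch based only on the botbase exports and call signatures that appear in the patch above. The sample wikitext, the 250/500 limits, and the pass-through preprocess hook are illustrative assumptions, not part of the commit.

import { bot, TextExtractor } from './botbase';

(async function () {
	// Wikitext parsing needs site info loaded first, as the test file's before() hook does.
	await bot.getSiteInfo();

	// Illustrative input; callers such as the reports above pass full page text here.
	const pagetext = "{{Infobox person\n|name = Arthur\n}}\n'''Arthur''' was a tailor.<ref>Example</ref>";

	// Aim for ~250 readable characters with a hard cap at 500; the optional hook runs on the raw text first.
	const extract = TextExtractor.getExtract(pagetext, 250, 500, (text) => text);

	console.log(TextExtractor.finalSanitise(extract));
})();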