diff --git a/TextExtractor.js b/TextExtractor.js
deleted file mode 100644
index 542ac8c..0000000
--- a/TextExtractor.js
+++ /dev/null
@@ -1,176 +0,0 @@
-/**
- * @param {mwn} bot
- */
-module.exports = function(bot) {
-
- class TextExtractor {
-
- /**
- * Get wikitext extract. If you want plain text or HTML extracts, consider using
- * the TextExtracts API instead.
- * @param {string} pagetext - full page text
- * @param {number} [charLimit] - cut off the extract at this many readable characters, or wherever
- * the sentence ends after this limit
- * @param {number} [hardUpperLimit] - cut off the extract at this many readable characters even if
- * the sentence hasn't ended
- * @param {Function} [preprocessHook] - optional function to work on the text at the
- * beginning
- */
- static getExtract(pagetext, charLimit, hardUpperLimit, preprocessHook) {
-
- if (!pagetext) {
- return '';
- }
- let extract = pagetext;
-
- if (preprocessHook) {
- extract = preprocessHook(extract);
- }
-
- // Remove images. Can't be done correctly with just regex as there could be wikilinks
- // in the captions.
- extract = this.removeImages(extract);
-
- // Remove templates beginning on a new line, such as infoboxes.
- // These occasionally contain parameters with part of the content
- // beginning on a newline not starting with a | or * or # or !
- // thus can't be handled with the line regex.
- extract = this.removeTemplatesOnNewlines(extract);
-
- // Remove some other templates too
- extract = this.removeTemplates(extract, ['efn', 'refn']);
-
- extract = extract
-			.replace(/<!--.*?-->/sg, '')
- // remove refs, including named ref definitions and named ref invocations
-			.replace(/<ref.*?(?:\/>|<\/ref>)/sgi, '')
- // the magic
-			.replace(/^\s*[-{|}=*#:<!].*$/mg, '')
-			.trim();
-
-		if (charLimit) {
-			var sentenceEnd = /\.\s/g;
-			if (this.effCharCount(extract) > charLimit) {
- var match = sentenceEnd.exec(extract);
- while (match) {
- if (this.effCharCount(extract.slice(0, match.index)) > charLimit) {
- extract = extract.slice(0, match.index + 1);
- break;
- } else {
- match = sentenceEnd.exec(extract);
- }
- }
- }
- }
-
- if (hardUpperLimit) {
- if (this.effCharCount(extract) > hardUpperLimit) {
- extract = extract.slice(0, hardUpperLimit) + ' ...';
- }
- }
-
- return extract;
- }
-
- static removeImages(text) {
- var wkt = new bot.wikitext(text);
- wkt.parseLinks();
- wkt.files.forEach(file => {
- wkt.removeEntity(file);
- });
- return wkt.getText();
- }
-
- static removeTemplatesOnNewlines(text) {
- var templateOnNewline = /^\{\{/m; // g is omitted for a reason, the text is changing.
- var match = templateOnNewline.exec(text);
- while (match) {
- var template = new bot.wikitext(text.slice(match.index)).parseTemplates({count: 1})[0];
- if (template) {
- text = text.replace(template.wikitext, '');
- } else { // just get rid of that line, otherwise we'd enter an infinite loop
- text = text.replace(/^\{\{.*$/m, '');
- }
- match = templateOnNewline.exec(text);
- }
- return text;
- }
-
- /**
- * @param {string} text
- * @param {string[]} templates
- */
- static removeTemplates(text, templates) {
- var wkt = new bot.wikitext(text);
- // TODO: Things to think about: how to generate ONE regex that matches all the given
- // templates and which is as efficient as possible? That is, for 'efn' and 'refn'
- // the regex generated should be /[rR]?[eE]?fn/ (as efn is a substring of refn)
- // Can this be solved using the longest common subsequence problem?
- // Or maybe use tries?
- const makeRegexFromTemplate = function(template) {
- return new RegExp('^[' + template[0].toLowerCase() + template[0].toUpperCase() + ']' + template.slice(1) + '$', 'g');
- }
- wkt.parseTemplates({
- namePredicate: name => {
- return templates.some(template => {
- return makeRegexFromTemplate(template).test(name);
- });
- }
- });
- for (let template of wkt.templates) {
- wkt.removeEntity(template);
- }
- return wkt.getText();
- }
-
- static effCharCount(text) {
- return text
- .replace(/\[\[:?(?:[^|\]]+?\|)?([^\]|]+?)\]\]/g, '$1')
- .replace(/''/g, '')
- .length;
- }
-
-
- /**
- * Do away with some of the more bizarre stuff from page extracts that aren't worth
- * checking for on a per-page basis
- * Minimise the amount of removals done here, since if the extract was cut off, it may
- * happen one of the regexes below will match across two different extracts.
- * @param {string} content
- */
- static finalSanitise(content) {
- return content.replace(/\[\[Category:.*?\]\]/gi, '')
- // these are just bad
- .replace(/__[A-Z]+__/g, '')
- // Openings of any unclosed ref tags
-		.replace(/<ref[^<]*?(>|(?=\n))/gi, '')
- // Harvard referencing
- .replace(/\{\{[sS]fnp?\|.*?\}\}/g, '')
- // shortcut for named ref invocation
- .replace(/\{\{r\|.*?\}\}/gi, '')
- // inline parenthetical referencing
- .replace(/\{\{[hH]arv\|.*?\}\}/g, '')
- // pronunciation
- .replace(/\{\{IPA.*?\}\}/g, '')
- // audio
- .replace(/\{\{[aA]udio\|.*?\}\}/g, '');
- }
- }
-
- return TextExtractor;
-
-};
diff --git a/TextExtractorTest.js b/TextExtractor.test.js
similarity index 77%
rename from TextExtractorTest.js
rename to TextExtractor.test.js
index 95e5138..e4b1612 100644
--- a/TextExtractorTest.js
+++ b/TextExtractor.test.js
@@ -1,8 +1,5 @@
-/* globals it, before */
-
const assert = require('assert');
-const {bot} = require('./botbase');
-const TE = require('./TextExtractor')(bot);
+const {bot, TextExtractor} = require('./botbase');
before(function() {
return bot.getSiteInfo();
@@ -22,7 +19,7 @@ it('removes templates on new lines', function() {
Arthur was an fine tailor.
`;
- assert.strictEqual(TE.removeTemplatesOnNewlines(text), '\n\nArthur was an fine tailor.\n');
+ assert.strictEqual(TextExtractor.removeTemplatesOnNewlines(text), '\n\nArthur was an fine tailor.\n');
});
@@ -31,7 +28,7 @@ it('runs preprocessHook', function () {
==References==`;
- let extract = TE.getExtract(text, 250, 500, function(text) {
+ let extract = TextExtractor.getExtract(text, 250, 500, function(text) {
let wkt = new bot.wikitext(text);
wkt.parseTemplates({
namePredicate: name => {
diff --git a/TextExtractor.ts b/TextExtractor.ts
new file mode 100644
index 0000000..3cc5138
--- /dev/null
+++ b/TextExtractor.ts
@@ -0,0 +1,169 @@
+import {bot} from './botbase';
+
+export default class TextExtractor {
+
+ /**
+ * Get wikitext extract. If you want plain text or HTML extracts, consider using
+ * the TextExtracts API instead.
+ * @param {string} pagetext - full page text
+ * @param {number} [charLimit] - cut off the extract at this many readable characters, or wherever
+ * the sentence ends after this limit
+ * @param {number} [hardUpperLimit] - cut off the extract at this many readable characters even if
+ * the sentence hasn't ended
+ * @param {Function} [preprocessHook] - optional function to work on the text at the
+ * beginning
+ */
+ static getExtract(pagetext: string, charLimit: number, hardUpperLimit: number, preprocessHook: ((text: string) => string)) {
+
+ if (!pagetext) {
+ return '';
+ }
+ let extract = pagetext;
+
+ if (preprocessHook) {
+ extract = preprocessHook(extract);
+ }
+
+ // Remove images. Can't be done correctly with just regex as there could be wikilinks
+ // in the captions.
+ extract = this.removeImages(extract);
+
+ // Remove templates beginning on a new line, such as infoboxes.
+ // These occasionally contain parameters with part of the content
+ // beginning on a newline not starting with a | or * or # or !
+ // thus can't be handled with the line regex.
+ extract = this.removeTemplatesOnNewlines(extract);
+
+ // Remove some other templates too
+ extract = this.removeTemplates(extract, ['efn', 'refn']);
+
+ extract = extract
+			.replace(/<!--.*?-->/sg, '')
+ // remove refs, including named ref definitions and named ref invocations
+			.replace(/<ref.*?(?:\/>|<\/ref>)/sgi, '')
+ // the magic
+			.replace(/^\s*[-{|}=*#:<!].*$/mg, '')
+			.trim();
+
+		if (charLimit) {
+			var sentenceEnd = /\.\s/g;
+			if (this.effCharCount(extract) > charLimit) {
+ var match = sentenceEnd.exec(extract);
+ while (match) {
+ if (this.effCharCount(extract.slice(0, match.index)) > charLimit) {
+ extract = extract.slice(0, match.index + 1);
+ break;
+ } else {
+ match = sentenceEnd.exec(extract);
+ }
+ }
+ }
+ }
+
+ if (hardUpperLimit) {
+ if (this.effCharCount(extract) > hardUpperLimit) {
+ extract = extract.slice(0, hardUpperLimit) + ' ...';
+ }
+ }
+
+ return extract;
+ }
+
+ static removeImages(text: string) {
+ var wkt = new bot.wikitext(text);
+ wkt.parseLinks();
+ wkt.files.forEach(file => {
+ wkt.removeEntity(file);
+ });
+ return wkt.getText();
+ }
+
+ static removeTemplatesOnNewlines(text: string) {
+ var templateOnNewline = /^\{\{/m; // g is omitted for a reason, the text is changing.
+ var match = templateOnNewline.exec(text);
+ while (match) {
+ var template = new bot.wikitext(text.slice(match.index)).parseTemplates({count: 1})[0];
+ if (template) {
+ text = text.replace(template.wikitext, '');
+ } else { // just get rid of that line, otherwise we'd enter an infinite loop
+ text = text.replace(/^\{\{.*$/m, '');
+ }
+ match = templateOnNewline.exec(text);
+ }
+ return text;
+ }
+
+ /**
+ * @param {string} text
+ * @param {string[]} templates
+ */
+ static removeTemplates(text: string, templates: string[]) {
+ var wkt = new bot.wikitext(text);
+ // TODO: Things to think about: how to generate ONE regex that matches all the given
+ // templates and which is as efficient as possible? That is, for 'efn' and 'refn'
+ // the regex generated should be /[rR]?[eE]?fn/ (as efn is a substring of refn)
+ // Can this be solved using the longest common subsequence problem?
+ // Or maybe use tries?
+ const makeRegexFromTemplate = function(template) {
+ return new RegExp('^[' + template[0].toLowerCase() + template[0].toUpperCase() + ']' + template.slice(1) + '$', 'g');
+ }
+ wkt.parseTemplates({
+ namePredicate: name => {
+ return templates.some(template => {
+ return makeRegexFromTemplate(template).test(name);
+ });
+ }
+ });
+ for (let template of wkt.templates) {
+ wkt.removeEntity(template);
+ }
+ return wkt.getText();
+ }
+
+ static effCharCount(text: string) {
+ return text
+ .replace(/\[\[:?(?:[^|\]]+?\|)?([^\]|]+?)\]\]/g, '$1')
+ .replace(/''/g, '')
+ .length;
+ }
+
+
+ /**
+ * Do away with some of the more bizarre stuff from page extracts that aren't worth
+ * checking for on a per-page basis
+ * Minimise the amount of removals done here, since if the extract was cut off, it may
+ * happen one of the regexes below will match across two different extracts.
+ * @param {string} content
+ */
+ static finalSanitise(content: string) {
+ return content.replace(/\[\[Category:.*?\]\]/gi, '')
+ // these are just bad
+ .replace(/__[A-Z]+__/g, '')
+ // Openings of any unclosed ref tags
+		.replace(/<ref[^<]*?(>|(?=\n))/gi, '')
+ // Harvard referencing
+ .replace(/\{\{[sS]fnp?\|.*?\}\}/g, '')
+ // shortcut for named ref invocation
+ .replace(/\{\{r\|.*?\}\}/gi, '')
+ // inline parenthetical referencing
+ .replace(/\{\{[hH]arv\|.*?\}\}/g, '')
+ // pronunciation
+ .replace(/\{\{IPA.*?\}\}/g, '')
+ // audio
+ .replace(/\{\{[aA]udio\|.*?\}\}/g, '');
+ }
+}
diff --git a/botbase.ts b/botbase.ts
index 6a0764a..5afdf04 100644
--- a/botbase.ts
+++ b/botbase.ts
@@ -59,7 +59,8 @@ export const bot = new mwn({
bot.initOAuth();
-export const TextExtractor = require('./TextExtractor')(bot);
+import TextExtractor from "./TextExtractor";
+export { TextExtractor };
// Deprecated exports, import from ./db or ./utils directly
export {mysql, db, enwikidb, toolsdb} from './db';
diff --git a/old/g13-watch/old-sqlite/save-to-db.js b/old/g13-watch/old-sqlite/save-to-db.js
index d0219d0..746e0a1 100644
--- a/old/g13-watch/old-sqlite/save-to-db.js
+++ b/old/g13-watch/old-sqlite/save-to-db.js
@@ -1,8 +1,7 @@
// start job using: npm run start
-const {bot, log, emailOnError} = require('../botbase');
+const {bot, log, emailOnError, TextExtractor} = require('../botbase');
const EventSource = require('eventsource');
-const TextExtractor = require('../TextExtractor')(bot);
const sqlite3 = require('sqlite3');
const sqlite = require('sqlite');
diff --git a/old/g13-watch/save-to-db.ts b/old/g13-watch/save-to-db.ts
index 78811f6..6cfd27b 100644
--- a/old/g13-watch/save-to-db.ts
+++ b/old/g13-watch/save-to-db.ts
@@ -3,10 +3,9 @@
// start job using: npm run start
-import {fs, bot, log, mysql, argv} from '../../botbase';
+import {fs, bot, log, mysql, argv, TextExtractor} from '../../botbase';
const {preprocessDraftForExtract} = require('../../reports/commons');
-const TextExtractor = require('../../TextExtractor')(bot);
const auth = require('../../.auth');
function logError(err) {
diff --git a/reports/peer-review.js b/reports/peer-review.js
index 36dfba7..98d9ab9 100644
--- a/reports/peer-review.js
+++ b/reports/peer-review.js
@@ -1,5 +1,4 @@
-const {bot, mwn, log, emailOnError} = require('../botbase');
-const TextExtractor = require('../TextExtractor')(bot);
+const {bot, mwn, log, emailOnError, TextExtractor} = require('../botbase');
(async function() {
diff --git a/reports/unreferenced-blps.js b/reports/unreferenced-blps.js
index 676cc13..a59bfc6 100644
--- a/reports/unreferenced-blps.js
+++ b/reports/unreferenced-blps.js
@@ -1,5 +1,4 @@
-const {bot, mwn, log, fs} = require('../botbase');
-const TextExtractor = require('../TextExtractor')(bot);
+const {bot, mwn, log, fs, TextExtractor} = require('../botbase');
const wd = new mwn({
...bot.options,
]