From b41b13d9a787883b7b14bc701a21f4c5f14500dd Mon Sep 17 00:00:00 2001
From: Siddharth VP
Date: Sat, 12 Jun 2021 18:57:18 +0530
Subject: [PATCH] TextExtractor

---
 TextExtractor.js                              | 176 ------------------
 TextExtractorTest.js => TextExtractor.test.js |   9 +-
 TextExtractor.ts                              | 169 +++++++++++++++++
 botbase.ts                                    |   3 +-
 old/g13-watch/old-sqlite/save-to-db.js        |   3 +-
 old/g13-watch/save-to-db.ts                   |   3 +-
 reports/peer-review.js                        |   3 +-
 reports/unreferenced-blps.js                  |   3 +-
 8 files changed, 178 insertions(+), 191 deletions(-)
 delete mode 100644 TextExtractor.js
 rename TextExtractorTest.js => TextExtractor.test.js (77%)
 create mode 100644 TextExtractor.ts

diff --git a/TextExtractor.js b/TextExtractor.js
deleted file mode 100644
index 542ac8c..0000000
--- a/TextExtractor.js
+++ /dev/null
@@ -1,176 +0,0 @@
-/**
- * @param {mwn} bot
- */
-module.exports = function(bot) {
-
-	class TextExtractor {
-
-		/**
-		 * Get wikitext extract. If you want plain text or HTML extracts, consider using
-		 * the TextExtracts API instead.
-		 * @param {string} pagetext - full page text
-		 * @param {number} [charLimit] - cut off the extract at this many readable characters, or wherever
-		 * the sentence ends after this limit
-		 * @param {number} [hardUpperLimit] - cut off the extract at this many readable characters even if
-		 * the sentence hasn't ended
-		 * @param {Function} [preprocessHook] - optional function to work on the text at the
-		 * beginning
-		 */
-		static getExtract(pagetext, charLimit, hardUpperLimit, preprocessHook) {
-
-			if (!pagetext) {
-				return '';
-			}
-			let extract = pagetext;
-
-			if (preprocessHook) {
-				extract = preprocessHook(extract);
-			}
-
-			// Remove images. Can't be done correctly with just regex as there could be wikilinks
-			// in the captions.
-			extract = this.removeImages(extract);
-
-			// Remove templates beginning on a new line, such as infoboxes.
-			// These occasionally contain parameters with part of the content
-			// beginning on a newline not starting with a | or * or # or !
-			// thus can't be handled with the line regex.
-			extract = this.removeTemplatesOnNewlines(extract);
-
-			// Remove some other templates too
-			extract = this.removeTemplates(extract, ['efn', 'refn']);
-
-			extract = extract
-				.replace(/<!--.*?-->/sg, '')
-				// remove refs, including named ref definitions and named ref invocations
-				.replace(/<ref.*?(?:\/>|<\/ref>)/sgi, '')
-				// the magic
-				.replace(/^\s*[-{|}=*#:<!].*$/mg, '');
-
-			if (charLimit) {
-				var sentenceEnd = /\.\s/g;
-
-				if (this.effCharCount(extract) > charLimit) {
-					var match = sentenceEnd.exec(extract);
-					while (match) {
-						if (this.effCharCount(extract.slice(0, match.index)) > charLimit) {
-							extract = extract.slice(0, match.index + 1);
-							break;
-						} else {
-							match = sentenceEnd.exec(extract);
-						}
-					}
-				}
-			}
-
-			if (hardUpperLimit) {
-				if (this.effCharCount(extract) > hardUpperLimit) {
-					extract = extract.slice(0, hardUpperLimit) + ' ...';
-				}
-			}
-
-			return extract;
-		}
-
-		static removeImages(text) {
-			var wkt = new bot.wikitext(text);
-			wkt.parseLinks();
-			wkt.files.forEach(file => {
-				wkt.removeEntity(file);
-			});
-			return wkt.getText();
-		}
-
-		static removeTemplatesOnNewlines(text) {
-			var templateOnNewline = /^\{\{/m; // g is omitted for a reason, the text is changing.
-			var match = templateOnNewline.exec(text);
-			while (match) {
-				var template = new bot.wikitext(text.slice(match.index)).parseTemplates({count: 1})[0];
-				if (template) {
-					text = text.replace(template.wikitext, '');
-				} else { // just get rid of that line, otherwise we'd enter an infinite loop
-					text = text.replace(/^\{\{.*$/m, '');
-				}
-				match = templateOnNewline.exec(text);
-			}
-			return text;
-		}
-
-		/**
-		 * @param {string} text
-		 * @param {string[]} templates
-		 */
-		static removeTemplates(text, templates) {
-			var wkt = new bot.wikitext(text);
-			// TODO: Things to think about: how to generate ONE regex that matches all the given
-			// templates and which is as efficient as possible? That is, for 'efn' and 'refn'
-			// the regex generated should be /[rR]?[eE]?fn/ (as efn is a substring of refn)
-			// Can this be solved using the longest common subsequence problem?
-			// Or maybe use tries?
-			const makeRegexFromTemplate = function(template) {
-				return new RegExp('^[' + template[0].toLowerCase() + template[0].toUpperCase() + ']' + template.slice(1) + '$', 'g');
-			}
-			wkt.parseTemplates({
-				namePredicate: name => {
-					return templates.some(template => {
-						return makeRegexFromTemplate(template).test(name);
-					});
-				}
-			});
-			for (let template of wkt.templates) {
-				wkt.removeEntity(template);
-			}
-			return wkt.getText();
-		}
-
-		static effCharCount(text) {
-			return text
-				.replace(/\[\[:?(?:[^|\]]+?\|)?([^\]|]+?)\]\]/g, '$1')
-				.replace(/''/g, '')
-				.length;
-		}
-
-
-		/**
-		 * Do away with some of the more bizarre stuff from page extracts that aren't worth
-		 * checking for on a per-page basis
-		 * Minimise the amount of removals done here, since if the extract was cut off, it may
-		 * happen one of the regexes below will match across two different extracts.
-		 * @param {string} content
-		 */
-		static finalSanitise(content) {
-			return content.replace(/\[\[Category:.*?\]\]/gi, '')
-				// these are just bad
-				.replace(/__[A-Z]+__/g, '')
-				// Openings of any unclosed ref tags
-				.replace(/<ref[^<]*?(>|(?=\n))/gi, '')
-				// Harvard referencing
-				.replace(/\{\{[sS]fnp?\|.*?\}\}/g, '')
-				// shortcut for named ref invocation
-				.replace(/\{\{r\|.*?\}\}/gi, '')
-				// inline parenthetical referencing
-				.replace(/\{\{[hH]arv\|.*?\}\}/g, '')
-				// pronunciation
-				.replace(/\{\{IPA.*?\}\}/g, '')
-				// audio
-				.replace(/\{\{[aA]udio\|.*?\}\}/g, '');
-		}
-	}
-
-	return TextExtractor;
-
-};
diff --git a/TextExtractorTest.js b/TextExtractor.test.js
similarity index 77%
rename from TextExtractorTest.js
rename to TextExtractor.test.js
index 95e5138..e4b1612 100644
--- a/TextExtractorTest.js
+++ b/TextExtractor.test.js
@@ -1,8 +1,5 @@
-/* globals it, before */
-
 const assert = require('assert');
-const {bot} = require('./botbase');
-const TE = require('./TextExtractor')(bot);
+const {bot, TextExtractor} = require('./botbase');
 before(function() {
 	return bot.getSiteInfo();
 });
@@ -22,7 +19,7 @@ it('removes templates on new lines', function() {
 
 Arthur was an fine tailor.
 `;
 
-	assert.strictEqual(TE.removeTemplatesOnNewlines(text), '\n\nArthur was an fine tailor.\n');
+	assert.strictEqual(TextExtractor.removeTemplatesOnNewlines(text), '\n\nArthur was an fine tailor.\n');
 
 });
@@ -31,7 +28,7 @@ it('runs preprocessHook', function () {
 
 ==References==`;
 
-	let extract = TE.getExtract(text, 250, 500, function(text) {
+	let extract = TextExtractor.getExtract(text, 250, 500, function(text) {
 		let wkt = new bot.wikitext(text);
 		wkt.parseTemplates({
 			namePredicate: name => {
diff --git a/TextExtractor.ts b/TextExtractor.ts
new file mode 100644
index 0000000..3cc5138
--- /dev/null
+++ b/TextExtractor.ts
@@ -0,0 +1,169 @@
+import {bot} from './botbase';
+
+export default class TextExtractor {
+
+	/**
+	 * Get wikitext extract. If you want plain text or HTML extracts, consider using
+	 * the TextExtracts API instead.
+	 * @param {string} pagetext - full page text
+	 * @param {number} [charLimit] - cut off the extract at this many readable characters, or wherever
+	 * the sentence ends after this limit
+	 * @param {number} [hardUpperLimit] - cut off the extract at this many readable characters even if
+	 * the sentence hasn't ended
+	 * @param {Function} [preprocessHook] - optional function to work on the text at the
+	 * beginning
+	 */
+	static getExtract(pagetext: string, charLimit: number, hardUpperLimit: number, preprocessHook: ((text: string) => string)) {
+
+		if (!pagetext) {
+			return '';
+		}
+		let extract = pagetext;
+
+		if (preprocessHook) {
+			extract = preprocessHook(extract);
+		}
+
+		// Remove images. Can't be done correctly with just regex as there could be wikilinks
+		// in the captions.
+		extract = this.removeImages(extract);
+
+		// Remove templates beginning on a new line, such as infoboxes.
+		// These occasionally contain parameters with part of the content
+		// beginning on a newline not starting with a | or * or # or !
+		// thus can't be handled with the line regex.
+		extract = this.removeTemplatesOnNewlines(extract);
+
+		// Remove some other templates too
+		extract = this.removeTemplates(extract, ['efn', 'refn']);
+
+		extract = extract
+			.replace(/<!--.*?-->/sg, '')
+			// remove refs, including named ref definitions and named ref invocations
+			.replace(/<ref.*?(?:\/>|<\/ref>)/sgi, '')
+			// the magic
+			.replace(/^\s*[-{|}=*#:<!].*$/mg, '');
+
+		if (charLimit) {
+			var sentenceEnd = /\.\s/g;
+
+			if (this.effCharCount(extract) > charLimit) {
+				var match = sentenceEnd.exec(extract);
+				while (match) {
+					if (this.effCharCount(extract.slice(0, match.index)) > charLimit) {
+						extract = extract.slice(0, match.index + 1);
+						break;
+					} else {
+						match = sentenceEnd.exec(extract);
+					}
+				}
+			}
+		}
+
+		if (hardUpperLimit) {
+			if (this.effCharCount(extract) > hardUpperLimit) {
+				extract = extract.slice(0, hardUpperLimit) + ' ...';
+			}
+		}
+
+		return extract;
+	}
+
+	static removeImages(text: string) {
+		var wkt = new bot.wikitext(text);
+		wkt.parseLinks();
+		wkt.files.forEach(file => {
+			wkt.removeEntity(file);
+		});
+		return wkt.getText();
+	}
+
+	static removeTemplatesOnNewlines(text: string) {
+		var templateOnNewline = /^\{\{/m; // g is omitted for a reason, the text is changing.
+		var match = templateOnNewline.exec(text);
+		while (match) {
+			var template = new bot.wikitext(text.slice(match.index)).parseTemplates({count: 1})[0];
+			if (template) {
+				text = text.replace(template.wikitext, '');
+			} else { // just get rid of that line, otherwise we'd enter an infinite loop
+				text = text.replace(/^\{\{.*$/m, '');
+			}
+			match = templateOnNewline.exec(text);
+		}
+		return text;
+	}
+
+	/**
+	 * @param {string} text
+	 * @param {string[]} templates
+	 */
+	static removeTemplates(text: string, templates: string[]) {
+		var wkt = new bot.wikitext(text);
+		// TODO: Things to think about: how to generate ONE regex that matches all the given
+		// templates and which is as efficient as possible? That is, for 'efn' and 'refn'
+		// the regex generated should be /[rR]?[eE]?fn/ (as efn is a substring of refn)
+		// Can this be solved using the longest common subsequence problem?
+		// Or maybe use tries?
+		const makeRegexFromTemplate = function(template) {
+			return new RegExp('^[' + template[0].toLowerCase() + template[0].toUpperCase() + ']' + template.slice(1) + '$', 'g');
+		}
+		wkt.parseTemplates({
+			namePredicate: name => {
+				return templates.some(template => {
+					return makeRegexFromTemplate(template).test(name);
+				});
+			}
+		});
+		for (let template of wkt.templates) {
+			wkt.removeEntity(template);
+		}
+		return wkt.getText();
+	}
+
+	static effCharCount(text: string) {
+		return text
+			.replace(/\[\[:?(?:[^|\]]+?\|)?([^\]|]+?)\]\]/g, '$1')
+			.replace(/''/g, '')
+			.length;
+	}
+
+
+	/**
+	 * Do away with some of the more bizarre stuff from page extracts that aren't worth
+	 * checking for on a per-page basis
+	 * Minimise the amount of removals done here, since if the extract was cut off, it may
+	 * happen one of the regexes below will match across two different extracts.
+	 * @param {string} content
+	 */
+	static finalSanitise(content: string) {
+		return content.replace(/\[\[Category:.*?\]\]/gi, '')
+			// these are just bad
+			.replace(/__[A-Z]+__/g, '')
+			// Openings of any unclosed ref tags
+			.replace(/<ref[^<]*?(>|(?=\n))/gi, '')
+			// Harvard referencing
+			.replace(/\{\{[sS]fnp?\|.*?\}\}/g, '')
+			// shortcut for named ref invocation
+			.replace(/\{\{r\|.*?\}\}/gi, '')
+			// inline parenthetical referencing
+			.replace(/\{\{[hH]arv\|.*?\}\}/g, '')
+			// pronunciation
+			.replace(/\{\{IPA.*?\}\}/g, '')
+			// audio
+			.replace(/\{\{[aA]udio\|.*?\}\}/g, '');
+	}
+}
diff --git a/botbase.ts b/botbase.ts
index 6a0764a..5afdf04 100644
--- a/botbase.ts
+++ b/botbase.ts
@@ -59,7 +59,8 @@ export const bot = new mwn({
 
 bot.initOAuth();
 
-export const TextExtractor = require('./TextExtractor')(bot);
+import TextExtractor from "./TextExtractor";
+export { TextExtractor };
 
 // Deprecated exports, import from ./db or ./utils directly
 export {mysql, db, enwikidb, toolsdb} from './db';
diff --git a/old/g13-watch/old-sqlite/save-to-db.js b/old/g13-watch/old-sqlite/save-to-db.js
index d0219d0..746e0a1 100644
--- a/old/g13-watch/old-sqlite/save-to-db.js
+++ b/old/g13-watch/old-sqlite/save-to-db.js
@@ -1,8 +1,7 @@
 // start job using: npm run start
 
-const {bot, log, emailOnError} = require('../botbase');
+const {bot, log, emailOnError, TextExtractor} = require('../botbase');
 const EventSource = require('eventsource');
-const TextExtractor = require('../TextExtractor')(bot);
 
 const sqlite3 = require('sqlite3');
 const sqlite = require('sqlite');
diff --git a/old/g13-watch/save-to-db.ts b/old/g13-watch/save-to-db.ts
index 78811f6..6cfd27b 100644
--- a/old/g13-watch/save-to-db.ts
+++ b/old/g13-watch/save-to-db.ts
@@ -3,10 +3,9 @@
 
 // start job using: npm run start
 
-import {fs, bot, log, mysql, argv} from '../../botbase';
+import {fs, bot, log, mysql, argv, TextExtractor} from '../../botbase';
 const {preprocessDraftForExtract} = require('../../reports/commons');
-const TextExtractor = require('../../TextExtractor')(bot);
 const auth = require('../../.auth');
 
 function logError(err) {
diff --git a/reports/peer-review.js b/reports/peer-review.js
index 36dfba7..98d9ab9 100644
--- a/reports/peer-review.js
+++ b/reports/peer-review.js
@@ -1,5 +1,4 @@
-const {bot, mwn, log, emailOnError} = require('../botbase');
-const TextExtractor = require('../TextExtractor')(bot);
+const {bot, mwn, log, emailOnError, TextExtractor} = require('../botbase');
 
 (async function() {
 
diff --git a/reports/unreferenced-blps.js b/reports/unreferenced-blps.js
index 676cc13..a59bfc6 100644
--- a/reports/unreferenced-blps.js
+++ b/reports/unreferenced-blps.js
@@ -1,5 +1,4 @@
-const {bot, mwn, log, fs} = require('../botbase');
-const TextExtractor = require('../TextExtractor')(bot);
+const {bot, mwn, log, fs, TextExtractor} = require('../botbase');
 
 const wd = new mwn({
 	...bot.options,
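
Usage after this change, for reference: a minimal TypeScript sketch based only on the botbase exports and call signatures that appear in the patch above. The sample wikitext, the 250/500 limits, and the pass-through preprocess hook are illustrative assumptions, not part of the commit.

import { bot, TextExtractor } from './botbase';

(async function () {
	// Wikitext parsing needs site info loaded first, as the test file's before() hook does.
	await bot.getSiteInfo();

	// Illustrative input; callers such as the reports above pass full page text here.
	const pagetext = "{{Infobox person\n|name = Arthur\n}}\n'''Arthur''' was a tailor.<ref>Example</ref>";

	// Aim for ~250 readable characters with a hard cap at 500; the optional hook runs on the raw text first.
	const extract = TextExtractor.getExtract(pagetext, 250, 500, (text) => text);

	console.log(TextExtractor.finalSanitise(extract));
})();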