TextExtractor

siddharthvp · Jun 12, 2021 · b41b13d · b41b13d
1 parent 0eaa22b
commit b41b13d
Show file tree

Hide file tree

Showing 8 changed files with 178 additions and 191 deletions.
diff --git a/TextExtractor.js b/TextExtractor.js
diff --git a/TextExtractorTest.js → TextExtractor.test.js b/TextExtractorTest.js → TextExtractor.test.js
@@ -1,8 +1,5 @@
-/* globals it, before */
-
 const assert = require('assert');
-const {bot} = require('./botbase');
-const TE = require('./TextExtractor')(bot);
+const {bot, TextExtractor} = require('./botbase');
 
 before(function() {
 	return bot.getSiteInfo();
@@ -22,7 +19,7 @@ it('removes templates on new lines', function() {
 Arthur was an fine tailor.
 `;
 
-	assert.strictEqual(TE.removeTemplatesOnNewlines(text), '\n\nArthur was an fine tailor.\n');
+	assert.strictEqual(TextExtractor.removeTemplatesOnNewlines(text), '\n\nArthur was an fine tailor.\n');
 
 });
 
@@ -31,7 +28,7 @@ it('runs preprocessHook', function () {
 	
 ==References==`;
 
-	let extract = TE.getExtract(text, 250, 500, function(text) {
+	let extract = TextExtractor.getExtract(text, 250, 500, function(text) {
 		let wkt = new bot.wikitext(text);
 		wkt.parseTemplates({
 			namePredicate: name => {

diff --git a/TextExtractor.ts b/TextExtractor.ts
@@ -0,0 +1,169 @@
+import {bot} from './botbase';
+
+export default class TextExtractor {
+
+	/**
+	 * Get wikitext extract. If you want plain text or HTML extracts, consider using
+	 * the TextExtracts API instead.
+	 * @param {string} pagetext - full page text
+	 * @param {number} [charLimit] - cut off the extract at this many readable characters, or wherever
+	 * the sentence ends after this limit
+	 * @param {number} [hardUpperLimit] - cut off the extract at this many readable characters even if
+	 * the sentence hasn't ended
+	 * @param {Function} [preprocessHook] - optional function to work on the text at the
+	 * beginning
+	 */
+	static getExtract(pagetext: string, charLimit: number, hardUpperLimit: number, preprocessHook: ((text: string) => string)) {
+
+		if (!pagetext) {
+			return '';
+		}
+		let extract = pagetext;
+
+		if (preprocessHook) {
+			extract = preprocessHook(extract);
+		}
+
+		// Remove images. Can't be done correctly with just regex as there could be wikilinks
+		// in the captions.
+		extract = this.removeImages(extract);
+
+		// Remove templates beginning on a new line, such as infoboxes.
+		// These occasionally contain parameters with part of the content
+		// beginning on a newline not starting with a | or * or # or !
+		// thus can't be handled with the line regex.
+		extract = this.removeTemplatesOnNewlines(extract);
+
+		// Remove some other templates too
+		extract = this.removeTemplates(extract, ['efn', 'refn']);
+
+		extract = extract
+			.replace(/<!--.*?-->/sg, '')
+			// remove refs, including named ref definitions and named ref invocations
+			.replace(/<ref.*?(?:\/>|<\/ref>)/sgi, '')
+			// the magic
+			.replace(/^\s*[-{|}=*#:<!].*$/mg, '')
+			// trim left to prepare for next step
+			.trimLeft()
+			// keep only the first paragraph
+			.replace(/\n\n.*/s, '')
+			// unbold
+			.replace(/'''(.*?)'''/g, '$1')
+			.replace(/\(\{\{[Ll]ang-.*?\}\}\)/, '')
+			.trim();
+
+		if (charLimit) {
+			// We consider a period followed by a space or newline NOT followed by a lowercase char
+			// as a sentence ending. A lowercase char after period+space generally indicates an abbreviation.
+			// XXX: this still results in issues with names like Arthur A. Kempod.
+			//  (?![^[]*?\]\]) so that this is not a period within a link
+			//  (?![^{]*?\}\}) so that this is not a period within a template - doesn't work if there
+			//      is a nested template after the period.
+			var sentenceEnd = /\.\s(?![a-z])(?![^[]*?\]\])(?![^{]*?\}\})/g;
+
+			if (extract.length > charLimit) {
+				var match = sentenceEnd.exec(extract);
+				while (match) {
+					if (this.effCharCount(extract.slice(0, match.index)) > charLimit) {
+						extract = extract.slice(0, match.index + 1);
+						break;
+					} else {
+						match = sentenceEnd.exec(extract);
+					}
+				}
+			}
+		}
+
+		if (hardUpperLimit) {
+			if (this.effCharCount(extract) > hardUpperLimit) {
+				extract = extract.slice(0, hardUpperLimit) + ' ...';
+			}
+		}
+
+		return extract;
+	}
+
+	static removeImages(text: string) {
+		var wkt = new bot.wikitext(text);
+		wkt.parseLinks();
+		wkt.files.forEach(file => {
+			wkt.removeEntity(file);
+		});
+		return wkt.getText();
+	}
+
+	static removeTemplatesOnNewlines(text: string) {
+		var templateOnNewline = /^\{\{/m; // g is omitted for a reason, the text is changing.
+		var match = templateOnNewline.exec(text);
+		while (match) {
+			var template = new bot.wikitext(text.slice(match.index)).parseTemplates({count: 1})[0];
+			if (template) {
+				text = text.replace(template.wikitext, '');
+			} else { // just get rid of that line, otherwise we'd enter an infinite loop
+				text = text.replace(/^\{\{.*$/m, '');
+			}
+			match = templateOnNewline.exec(text);
+		}
+		return text;
+	}
+
+	/**
+	 * @param {string} text
+	 * @param {string[]} templates
+	 */
+	static removeTemplates(text: string, templates: string[]) {
+		var wkt = new bot.wikitext(text);
+		// TODO: Things to think about: how to generate ONE regex that matches all the given
+		// templates and which is as efficient as possible? That is, for 'efn' and 'refn'
+		// the regex generated should be /[rR]?[eE]?fn/ (as efn is a substring of refn)
+		// Can this be solved using the longest common subsequence problem?
+		// Or maybe use tries?
+		const makeRegexFromTemplate = function(template) {
+			return new RegExp('^[' + template[0].toLowerCase() + template[0].toUpperCase() + ']' + template.slice(1) + '$', 'g');
+		}
+		wkt.parseTemplates({
+			namePredicate: name => {
+				return templates.some(template => {
+					return makeRegexFromTemplate(template).test(name);
+				});
+			}
+		});
+		for (let template of wkt.templates) {
+			wkt.removeEntity(template);
+		}
+		return wkt.getText();
+	}
+
+	static effCharCount(text: string) {
+		return text
+			.replace(/\[\[:?(?:[^|\]]+?\|)?([^\]|]+?)\]\]/g, '$1')
+			.replace(/''/g, '')
+			.length;
+	}
+
+
+	/**
+	 * Do away with some of the more bizarre stuff from page extracts that isn't worth
+	 * checking for on a per-page basis.
+	 * Minimise the amount of removals done here, since if the extract was cut off,
+	 * one of the regexes below may match across two different extracts.
+	 * @param {string} content
+	 */
+	static finalSanitise(content: string) {
+		return content.replace(/\[\[Category:.*?\]\]/gi, '')
+			// these are just bad
+			.replace(/__[A-Z]+__/g, '')
+			// Openings of any unclosed ref tags
+			.replace(/<ref[^<]*?(>|(?=\n))/gi, '')
+			// Harvard referencing
+			.replace(/\{\{[sS]fnp?\|.*?\}\}/g, '')
+			// shortcut for named ref invocation
+			.replace(/\{\{r\|.*?\}\}/gi, '')
+			// inline parenthetical referencing
+			.replace(/\{\{[hH]arv\|.*?\}\}/g, '')
+			// pronunciation
+			.replace(/\{\{IPA.*?\}\}/g, '')
+			// audio
+			.replace(/\{\{[aA]udio\|.*?\}\}/g, '');
+	}
+}
diff --git a/botbase.ts b/botbase.ts
@@ -59,7 +59,8 @@ export const bot = new mwn({
 
 bot.initOAuth();
 
-export const TextExtractor = require('./TextExtractor')(bot);
+import TextExtractor from "./TextExtractor";
+export { TextExtractor };
 
 // Deprecated exports, import from ./db or ./utils directly
 export {mysql, db, enwikidb, toolsdb} from './db';

diff --git a/old/g13-watch/old-sqlite/save-to-db.js b/old/g13-watch/old-sqlite/save-to-db.js
@@ -1,8 +1,7 @@
 // start job using: npm run start
 
-const {bot, log, emailOnError} = require('../botbase');
+const {bot, log, emailOnError, TextExtractor} = require('../botbase');
 const EventSource = require('eventsource');
-const TextExtractor = require('../TextExtractor')(bot);
 const sqlite3 = require('sqlite3');
 const sqlite = require('sqlite');