Skip to content

Commit

Permalink
TextExtractor
Browse files Browse the repository at this point in the history
  • Loading branch information
siddharthvp committed Jun 12, 2021
1 parent 0eaa22b commit b41b13d
Show file tree
Hide file tree
Showing 8 changed files with 178 additions and 191 deletions.
176 changes: 0 additions & 176 deletions TextExtractor.js

This file was deleted.

9 changes: 3 additions & 6 deletions TextExtractorTest.js → TextExtractor.test.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
/* globals it, before */

const assert = require('assert');
const {bot} = require('./botbase');
const TE = require('./TextExtractor')(bot);
const {bot, TextExtractor} = require('./botbase');

before(function() {
return bot.getSiteInfo();
Expand All @@ -22,7 +19,7 @@ it('removes templates on new lines', function() {
Arthur was an fine tailor.
`;

assert.strictEqual(TE.removeTemplatesOnNewlines(text), '\n\nArthur was an fine tailor.\n');
assert.strictEqual(TextExtractor.removeTemplatesOnNewlines(text), '\n\nArthur was an fine tailor.\n');

});

Expand All @@ -31,7 +28,7 @@ it('runs preprocessHook', function () {
==References==`;

let extract = TE.getExtract(text, 250, 500, function(text) {
let extract = TextExtractor.getExtract(text, 250, 500, function(text) {
let wkt = new bot.wikitext(text);
wkt.parseTemplates({
namePredicate: name => {
Expand Down
169 changes: 169 additions & 0 deletions TextExtractor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import {bot} from './botbase';

/**
 * Static helpers for pulling a short lead-paragraph extract, as wikitext, out of
 * a full page's wikitext. All methods are stateless; wikitext parsing is
 * delegated to `bot.wikitext`.
 */
export default class TextExtractor {

	/**
	 * Get wikitext extract. If you want plain text or HTML extracts, consider using
	 * the TextExtracts API instead.
	 *
	 * @param pagetext - full page text
	 * @param charLimit - cut off the extract at this many readable characters, or wherever
	 * the sentence ends after this limit. Omit (or pass 0) for no sentence-based cut-off.
	 * @param hardUpperLimit - cut off the extract at this many readable characters even if
	 * the sentence hasn't ended. Omit (or pass 0) for no hard cut-off.
	 * @param preprocessHook - optional function to work on the text at the beginning
	 * @returns the extract, or an empty string if `pagetext` is empty
	 */
	static getExtract(
		pagetext: string,
		charLimit?: number,
		hardUpperLimit?: number,
		preprocessHook?: (text: string) => string
	): string {

		if (!pagetext) {
			return '';
		}
		let extract = pagetext;

		if (preprocessHook) {
			extract = preprocessHook(extract);
		}

		// Remove images. Can't be done correctly with just regex as there could be wikilinks
		// in the captions.
		extract = this.removeImages(extract);

		// Remove templates beginning on a new line, such as infoboxes.
		// These occasionally contain parameters with part of the content
		// beginning on a newline not starting with a | or * or # or !
		// thus can't be handled with the line regex.
		extract = this.removeTemplatesOnNewlines(extract);

		// Remove some other templates too
		extract = this.removeTemplates(extract, ['efn', 'refn']);

		extract = extract
			// remove HTML comments
			.replace(/<!--.*?-->/sg, '')
			// remove refs, including named ref definitions and named ref invocations
			.replace(/<ref.*?(?:\/>|<\/ref>)/sgi, '')
			// the magic: drop every line that starts with markup (tables, headings,
			// lists, indents, leftover tags/templates)
			.replace(/^\s*[-{|}=*#:<!].*$/mg, '')
			// trim leading whitespace to prepare for next step
			.trimStart()
			// keep only the first paragraph
			.replace(/\n\n.*/s, '')
			// unbold
			.replace(/'''(.*?)'''/g, '$1')
			// drop the first parenthesised {{lang-xx}} template
			.replace(/\(\{\{[Ll]ang-.*?\}\}\)/, '')
			.trim();

		if (charLimit) {
			// We consider a period followed by a space or newline NOT followed by a lowercase char
			// as a sentence ending. Lowercase chars after period+space is generally use of an abbreviation
			// XXX: this still results in issues with name like Arthur A. Kempod.
			// (?![^[]*?\]\]) so that this is not a period within a link
			// (?![^{]*?\}\}) so that this is not a period within a template - doesn't work if there
			// is a nested templates after the period.
			const sentenceEnd = /\.\s(?![a-z])(?![^[]*?\]\])(?![^{]*?\}\})/g;

			if (extract.length > charLimit) {
				let match = sentenceEnd.exec(extract);
				while (match) {
					// Stop at the first sentence end lying beyond the readable-character limit.
					if (this.effCharCount(extract.slice(0, match.index)) > charLimit) {
						extract = extract.slice(0, match.index + 1);
						break;
					}
					match = sentenceEnd.exec(extract);
				}
			}
		}

		if (hardUpperLimit) {
			// NOTE(review): the check uses the readable-character count but the slice uses the
			// raw string index, so the cut may land before the limit or inside link markup.
			// Behaviour kept as-is; confirm whether that is intended.
			if (this.effCharCount(extract) > hardUpperLimit) {
				extract = extract.slice(0, hardUpperLimit) + ' ...';
			}
		}

		return extract;
	}

	/**
	 * Remove every file (image) link found by the wikitext link parser.
	 *
	 * @param text - wikitext to process
	 * @returns the wikitext with file links removed
	 */
	static removeImages(text: string): string {
		const wkt = new bot.wikitext(text);
		wkt.parseLinks();
		for (const file of wkt.files) {
			wkt.removeEntity(file);
		}
		return wkt.getText();
	}

	/**
	 * Remove templates that begin at the start of a line (such as infoboxes).
	 * If no template can be parsed from a line starting with `{{`, that whole
	 * line is dropped instead.
	 *
	 * @param text - wikitext to process
	 * @returns the wikitext without templates that start a line
	 */
	static removeTemplatesOnNewlines(text: string): string {
		const templateOnNewline = /^\{\{/m; // g is omitted for a reason, the text is changing.
		let match = templateOnNewline.exec(text);
		while (match) {
			const start = match.index;
			const template = new bot.wikitext(text.slice(start)).parseTemplates({count: 1})[0];
			const wikitext: string = template ? template.wikitext : '';
			// Look for the template only from the matched line start onwards, so that an
			// identical template appearing earlier (inline) in the text is left untouched.
			const at = wikitext ? text.indexOf(wikitext, start) : -1;
			if (at !== -1) {
				text = text.slice(0, at) + text.slice(at + wikitext.length);
			} else { // just get rid of that line, otherwise we'd enter an infinite loop
				text = text.replace(/^\{\{.*$/m, '');
			}
			match = templateOnNewline.exec(text);
		}
		return text;
	}

	/**
	 * Remove all uses of the given templates. Names are matched exactly, except
	 * that the first character may be in either case.
	 *
	 * @param text - wikitext to process
	 * @param templates - names of the templates to remove
	 * @returns the wikitext with the matching templates removed
	 */
	static removeTemplates(text: string, templates: string[]): string {
		const wkt = new bot.wikitext(text);
		// Split each wanted name once, up front, into the two accepted first characters
		// and the case-sensitive remainder. Plain string comparison is used so that
		// names containing regex metacharacters are matched literally.
		const wanted = templates
			.filter(template => template.length > 0)
			.map(template => ({
				lower: template[0].toLowerCase(),
				upper: template[0].toUpperCase(),
				rest: template.slice(1)
			}));
		wkt.parseTemplates({
			namePredicate: (name: string) => {
				const head = name.charAt(0);
				const tail = name.slice(1);
				return wanted.some(w => tail === w.rest && (head === w.lower || head === w.upper));
			}
		});
		for (const template of wkt.templates) {
			wkt.removeEntity(template);
		}
		return wkt.getText();
	}

	/**
	 * Count the "readable" characters of a piece of wikitext: wikilinks count
	 * only for their displayed text and `''` italic markers are not counted.
	 *
	 * @param text - wikitext to measure
	 * @returns the number of characters left after stripping that markup
	 */
	static effCharCount(text: string): number {
		return text
			.replace(/\[\[:?(?:[^|\]]+?\|)?([^\]|]+?)\]\]/g, '$1')
			.replace(/''/g, '')
			.length;
	}


	/**
	 * Do away with some of the more bizarre stuff from page extracts that aren't worth
	 * checking for on a per-page basis
	 * Minimise the amount of removals done here, since if the extract was cut off, it may
	 * happen one of the regexes below will match across two different extracts.
	 *
	 * @param content - text holding one or more extracts
	 * @returns the content with the markup listed below removed
	 */
	static finalSanitise(content: string): string {
		return content.replace(/\[\[Category:.*?\]\]/gi, '')
			// these are just bad
			.replace(/__[A-Z]+__/g, '')
			// Openings of any unclosed ref tags
			.replace(/<ref[^<]*?(>|(?=\n))/gi, '')
			// Harvard referencing
			.replace(/\{\{[sS]fnp?\|.*?\}\}/g, '')
			// shortcut for named ref invocation
			.replace(/\{\{r\|.*?\}\}/gi, '')
			// inline parenthetical referencing
			.replace(/\{\{[hH]arv\|.*?\}\}/g, '')
			// pronunciation
			.replace(/\{\{IPA.*?\}\}/g, '')
			// audio
			.replace(/\{\{[aA]udio\|.*?\}\}/g, '');
	}
}
3 changes: 2 additions & 1 deletion botbase.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ export const bot = new mwn({

bot.initOAuth();

export const TextExtractor = require('./TextExtractor')(bot);
import TextExtractor from "./TextExtractor";
export { TextExtractor };

// Deprecated exports, import from ./db or ./utils directly
export {mysql, db, enwikidb, toolsdb} from './db';
Expand Down
3 changes: 1 addition & 2 deletions old/g13-watch/old-sqlite/save-to-db.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
// start job using: npm run start

const {bot, log, emailOnError} = require('../botbase');
const {bot, log, emailOnError, TextExtractor} = require('../botbase');
const EventSource = require('eventsource');
const TextExtractor = require('../TextExtractor')(bot);
const sqlite3 = require('sqlite3');
const sqlite = require('sqlite');

Expand Down
Loading

0 comments on commit b41b13d

Please sign in to comment.