-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
0eaa22b
commit b41b13d
Showing
8 changed files
with
178 additions
and
191 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
import {bot} from './botbase'; | ||
|
||
/**
 * Static helpers for pulling a short lead-paragraph extract, in wikitext form,
 * out of the full source of a wiki page.
 */
export default class TextExtractor {

	/**
	 * Get wikitext extract. If you want plain text or HTML extracts, consider using
	 * the TextExtracts API instead.
	 *
	 * @param pagetext - full page text
	 * @param charLimit - cut off the extract at this many readable characters, or wherever
	 * the sentence ends after this limit
	 * @param hardUpperLimit - cut off the extract at this many readable characters even if
	 * the sentence hasn't ended
	 * @param preprocessHook - optional function to work on the text at the beginning
	 * @returns the extract, or an empty string if `pagetext` is empty
	 */
	static getExtract(
		pagetext: string,
		charLimit?: number,
		hardUpperLimit?: number,
		preprocessHook?: (text: string) => string
	): string {
		if (!pagetext) {
			return '';
		}
		let extract = pagetext;

		if (preprocessHook) {
			extract = preprocessHook(extract);
		}

		// Remove images. Can't be done correctly with just regex as there could be wikilinks
		// in the captions.
		extract = this.removeImages(extract);

		// Remove templates beginning on a new line, such as infoboxes.
		// These occasionally contain parameters with part of the content
		// beginning on a newline not starting with a | or * or # or !
		// thus can't be handled with the line regex.
		extract = this.removeTemplatesOnNewlines(extract);

		// Remove some other templates too
		extract = this.removeTemplates(extract, ['efn', 'refn']);

		extract = extract
			// remove HTML comments
			.replace(/<!--.*?-->/sg, '')
			// remove refs, including named ref definitions and named ref invocations
			.replace(/<ref.*?(?:\/>|<\/ref>)/sgi, '')
			// the magic: drop every line that starts with wiki markup
			// (tables, headings, lists, indents, tags, leftover template lines)
			.replace(/^\s*[-{|}=*#:<!].*$/mg, '')
			// trim left to prepare for next step
			.trimStart()
			// keep only the first paragraph
			.replace(/\n\n.*/s, '')
			// unbold
			.replace(/'''(.*?)'''/g, '$1')
			// drop the first parenthesised {{lang-xx}} template
			.replace(/\(\{\{[Ll]ang-.*?\}\}\)/, '')
			.trim();

		// extract.length is an upper bound on the readable length, so it is a cheap pre-check.
		if (charLimit && extract.length > charLimit) {
			// We consider a period followed by a space or newline NOT followed by a lowercase char
			// as a sentence ending. Lowercase chars after period+space is generally use of an abbreviation
			// XXX: this still results in issues with names like Arthur A. Kempod.
			// (?![^[]*?\]\]) so that this is not a period within a link
			// (?![^{]*?\}\}) so that this is not a period within a template - doesn't work if there
			// is a nested template after the period.
			const sentenceEnd = /\.\s(?![a-z])(?![^[]*?\]\])(?![^{]*?\}\})/g;

			let match: RegExpExecArray | null;
			while ((match = sentenceEnd.exec(extract)) !== null) {
				if (this.effCharCount(extract.slice(0, match.index)) > charLimit) {
					// keep the period itself
					extract = extract.slice(0, match.index + 1);
					break;
				}
			}
		}

		if (hardUpperLimit && this.effCharCount(extract) > hardUpperLimit) {
			// Note: the cut is made on raw characters, so the result never has more than
			// hardUpperLimit readable characters, but it may end inside a link.
			extract = extract.slice(0, hardUpperLimit) + ' ...';
		}

		return extract;
	}

	/**
	 * Remove every file (image) link found by the wikitext parser.
	 *
	 * @param text - wikitext to process
	 * @returns the text with the file links removed
	 */
	static removeImages(text: string): string {
		const wkt = new bot.wikitext(text);
		wkt.parseLinks();
		for (const file of wkt.files) {
			wkt.removeEntity(file);
		}
		return wkt.getText();
	}

	/**
	 * Remove every template whose opening braces are the first characters of a line.
	 *
	 * @param text - wikitext to process
	 * @returns the text with those templates removed
	 */
	static removeTemplatesOnNewlines(text: string): string {
		// g is omitted for a reason: the text changes on every iteration.
		const templateOnNewline = /^\{\{/m;
		let match = templateOnNewline.exec(text);
		while (match) {
			const start = match.index;
			const template = new bot.wikitext(text.slice(start)).parseTemplates({count: 1})[0];
			if (template && template.wikitext && text.startsWith(template.wikitext, start)) {
				// Remove by position. A plain text.replace(template.wikitext, '') would delete
				// the first identical template anywhere in the text, which may be an inline
				// one located before this line.
				text = text.slice(0, start) + text.slice(start + template.wikitext.length);
			} else {
				// No template starts here (e.g. unclosed braces): just get rid of that line,
				// otherwise we'd enter an infinite loop.
				text = text.replace(/^\{\{.*$/m, '');
			}
			match = templateOnNewline.exec(text);
		}
		return text;
	}

	/**
	 * Remove all uses of the given templates.
	 *
	 * @param text - wikitext to process
	 * @param templates - template names to remove; the first letter is matched
	 * case-insensitively, the rest of the name must match exactly
	 * @returns the text with the matching templates removed
	 */
	static removeTemplates(text: string, templates: string[]): string {
		const lowerFirst = (name: string): string => name.charAt(0).toLowerCase() + name.slice(1);
		// Normalise once up front; a set lookup needs no regex and therefore no escaping
		// of special characters in template names.
		const wanted = new Set(templates.map(lowerFirst));

		const wkt = new bot.wikitext(text);
		wkt.parseTemplates({
			namePredicate: (name: string) => wanted.has(lowerFirst(name))
		});
		for (const template of wkt.templates) {
			wkt.removeEntity(template);
		}
		return wkt.getText();
	}

	/**
	 * Count the characters of the text, with wikilinks reduced to their
	 * displayed label and italic/bold quote markup ignored.
	 *
	 * @param text - wikitext to measure
	 * @returns the resulting character count
	 */
	static effCharCount(text: string): number {
		return text
			.replace(/\[\[:?(?:[^|\]]+?\|)?([^\]|]+?)\]\]/g, '$1')
			.replace(/''/g, '')
			.length;
	}

	/**
	 * Do away with some of the more bizarre stuff from page extracts that aren't worth
	 * checking for on a per-page basis.
	 * Minimise the amount of removals done here, since if the extract was cut off, it may
	 * happen that one of the regexes below will match across two different extracts.
	 *
	 * @param content - text containing one or more extracts
	 * @returns the sanitised text
	 */
	static finalSanitise(content: string): string {
		return content.replace(/\[\[Category:.*?\]\]/gi, '')
			// these are just bad
			.replace(/__[A-Z]+__/g, '')
			// Openings of any unclosed ref tags
			.replace(/<ref[^<]*?(>|(?=\n))/gi, '')
			// Harvard referencing
			.replace(/\{\{[sS]fnp?\|.*?\}\}/g, '')
			// shortcut for named ref invocation
			.replace(/\{\{r\|.*?\}\}/gi, '')
			// inline parenthetical referencing
			.replace(/\{\{[hH]arv\|.*?\}\}/g, '')
			// pronunciation
			.replace(/\{\{IPA.*?\}\}/g, '')
			// audio
			.replace(/\{\{[aA]udio\|.*?\}\}/g, '');
	}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.