-
Notifications
You must be signed in to change notification settings - Fork 4
/
TextExtractor.ts
148 lines (130 loc) · 4.8 KB
/
TextExtractor.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import {bot} from './botbase';
export default class TextExtractor {

	/**
	 * Get wikitext extract. If you want plain text or HTML extracts, consider using
	 * the TextExtracts API instead.
	 *
	 * Processing order: optional preprocess hook, image removal, removal of templates
	 * that start a line, removal of a fixed set of inline templates, regex cleanup that
	 * keeps only the first paragraph, and finally the optional length limits.
	 *
	 * @param pagetext - full page text
	 * @param [charLimit] - cut off the extract at this many readable characters, or wherever
	 * the sentence ends after this limit
	 * @param [hardUpperLimit] - cut off the extract at this many readable characters even if
	 * the sentence hasn't ended
	 * @param [preprocessHook] - optional function to work on the text at the
	 * beginning
	 * @returns the extract, or an empty string if `pagetext` is empty
	 */
	static getExtract(pagetext: string, charLimit?: number, hardUpperLimit?: number, preprocessHook?: ((text: string) => string)): string {
		if (!pagetext) {
			return '';
		}
		let extract = pagetext;

		if (preprocessHook) {
			extract = preprocessHook(extract);
		}

		// Remove images. Can't be done correctly with just regex as there could be wikilinks
		// in the captions.
		extract = this.removeImages(extract);

		// Remove templates beginning on a new line, such as infoboxes.
		// These occasionally contain parameters with part of the content
		// beginning on a newline not starting with a | or * or # or !
		// thus can't be handled with the line regex.
		extract = this.removeTemplatesOnNewlines(extract);

		// Remove some other templates too
		// Matches r, efn, refn, sfn, sfnb, sfnp, harv, harvp, respell, audio, and IPA.* family
		extract = this.removeTemplates(extract, /^(r|sfn[bp]?|harvp?|r?efn|respell|IPA.*|audio)$/i);

		extract = extract
			// remove HTML comments
			.replace(/<!--.*?-->/sg, '')
			// remove refs, including named ref definitions and named ref invocations
			.replace(/<ref.*?(?:\/>|<\/ref>)/sgi, '')
			// the magic: drop every line whose first non-space character is one of - { | } = * # : < !
			.replace(/^\s*[-{|}=*#:<!].*$/mg, '')
			// trim the start to prepare for next step
			.trimStart()
			// keep only the first paragraph
			.replace(/\n\n.*/s, '')
			// unbold
			.replace(/'''(.*?)'''/g, '$1')
			// cleanup side-effects from removing IPA/audio templates
			.replace(/\((?:\s*[,;])+\s*/g, '(')
			.replace(/ ?\(\s*\)/g, '')
			.trim();

		if (charLimit && extract.length > charLimit) {
			// We consider a period followed by a space or newline NOT followed by a lowercase char
			// as a sentence ending. Lowercase chars after period+space is generally use of an abbreviation
			// XXX: this still results in issues with name like Arthur A. Kempod.
			// (?![^[]*?\]\]) so that this is not a period within a link
			// (?![^{]*?\}\}) so that this is not a period within a template - doesn't work if there
			// is a nested template after the period.
			const sentenceEnd = /\.\s(?![a-z])(?![^[]*?\]\])(?![^{]*?\}\})/g;

			// Walk the sentence endings in order and cut at the first one that lies
			// beyond charLimit readable characters; the period itself is kept.
			let match: RegExpExecArray | null;
			while ((match = sentenceEnd.exec(extract)) !== null) {
				if (this.effCharCount(extract.slice(0, match.index)) > charLimit) {
					extract = extract.slice(0, match.index + 1);
					break;
				}
			}
		}

		if (hardUpperLimit && this.effCharCount(extract) > hardUpperLimit) {
			// NOTE: the check uses the readable character count, but the cut is made on the raw
			// wikitext, so the result may hold fewer readable characters than the limit and
			// may end in the middle of a link.
			extract = extract.slice(0, hardUpperLimit) + ' ...';
		}

		return extract;
	}

	/**
	 * Remove every entity that the wikitext parser lists under `files` after parsing links.
	 * @param text - wikitext to process
	 * @returns the text as reported by the parser after the removals
	 */
	static removeImages(text: string): string {
		const wkt = new bot.wikitext(text);
		wkt.parseLinks();
		wkt.files.forEach(file => {
			wkt.removeEntity(file);
		});
		return wkt.getText();
	}

	/**
	 * Remove templates whose opening braces are the first characters of a line.
	 * If the parser cannot produce a template for such a line, the whole line is dropped.
	 * @param text - wikitext to process
	 * @returns the text with those templates (or lines) removed
	 */
	static removeTemplatesOnNewlines(text: string): string {
		const templateOnNewline = /^\{\{/m; // g is omitted for a reason, the text is changing.
		let result = text;
		let match = templateOnNewline.exec(result);
		while (match) {
			// Assumes parseTemplates returns the parsed templates, each exposing its source
			// as `.wikitext` - confirm against the bot.wikitext implementation.
			const template = new bot.wikitext(result.slice(match.index)).parseTemplates({count: 1})[0];
			const templateText: string = template ? template.wikitext : '';

			// Locate the template at or after the matched line start. A plain
			// replace(templateText, '') would delete the first identical occurrence anywhere
			// in the text, which may be an unrelated inline template earlier on.
			const start = templateText ? result.indexOf(templateText, match.index) : -1;

			if (start !== -1) {
				result = result.slice(0, start) + result.slice(start + templateText.length);
			} else { // just get rid of that line, otherwise we'd enter an infinite loop
				result = result.replace(/^\{\{.*$/m, '');
			}
			match = templateOnNewline.exec(result);
		}
		return result;
	}

	/**
	 * Remove all templates whose name is accepted by the given regex.
	 * @param text - wikitext to process
	 * @param templateNameRegex - tested against each template name via the parser's
	 * `namePredicate` option
	 * @returns the text as reported by the parser after the removals
	 */
	static removeTemplates(text: string, templateNameRegex: RegExp): string {
		const wkt = new bot.wikitext(text);
		wkt.parseTemplates({
			namePredicate: name => {
				// A regex with the g or y flag keeps state between test() calls;
				// reset it so that every name is tested from the beginning.
				templateNameRegex.lastIndex = 0;
				return templateNameRegex.test(name);
			}
		});
		for (const template of wkt.templates) {
			wkt.removeEntity(template);
		}
		return wkt.getText();
	}

	/**
	 * Count the readable characters in a piece of wikitext: wikilinks are reduced to
	 * their displayed text and pairs of apostrophes (italic/bold markup) are dropped
	 * before taking the length.
	 * @param text - wikitext to measure
	 * @returns the length of the text after those reductions
	 */
	static effCharCount(text: string): number {
		return text
			// [[target|label]] -> label, [[target]] -> target (a leading colon is dropped)
			.replace(/\[\[:?(?:[^|\]]+?\|)?([^\]|]+?)\]\]/g, '$1')
			.replace(/''/g, '')
			.length;
	}

	/**
	 * Do away with some of the more bizarre stuff from page extracts that aren't worth
	 * checking for on a per-page basis
	 * Minimise the amount of removals done here, since if the extract was cut off, it may
	 * happen one of the regexes below will match across two different extracts.
	 * @param content - text to sanitise
	 * @returns the sanitised text
	 */
	static finalSanitise(content: string): string {
		return content
			// category links
			.replace(/\[\[Category:.*?\]\]/gi, '')
			// behaviour switches such as __NOTOC__; these are just bad
			.replace(/__[A-Z]+__/g, '')
			// Openings of ref tags, up to the closing > or the end of the line.
			// Meant for unclosed refs, but any <ref ...> opening still present is stripped.
			.replace(/<ref[^<]*?(>|(?=\n))/gi, '')
			// remove categories added via {{post-nomials}}
			.replace(/(\|country=[A-Z]{3})-cats/g, '$1');
	}

}