gather_wordfreq.py — forked from IlyaSemenov/wikipedia-word-frequency
(executable file, 64 lines / 52 loc, 1.47 KB)
#!/usr/bin/env python3
from collections import defaultdict
import math
import pickle
import re
import shlex
import subprocess
import sys
# A word must appear in at least this many distinct articles to be kept.
MIN_ARTICLES = 3
# Normalize typographic characters before tokenizing:
# en-dash (U+2013) -> ASCII hyphen, curly apostrophe (U+2019) -> ASCII '.
line_trans = str.maketrans('–’', "-\'")
# Split a line into candidate tokens: any char that is not a word char,
# hyphen, or apostrophe acts as a separator.
words_split_re = re.compile(r'[^\w\-\']')
# A token counts as a word only if it both starts and ends with a word char
# (rejects tokens with leading/trailing hyphens or apostrophes).
is_word_re = re.compile(r'^\w.*\w$')
# Reject any token containing a digit.
not_is_word_re = re.compile(r'.*\d.*')
# Require at least one dump-file argument; otherwise print usage and exit
# with a non-zero status.
if len(sys.argv) < 2:
    sys.stderr.write("Usage: %s dumps/*.bz2\n" % sys.argv[0])
    sys.exit(1)
# collect data
# word_uses[word] -> total occurrences across all dumps.
# word_docs[word] -> set of doc ids the word was seen in, deliberately capped
#   at MIN_ARTICLES entries: we only need to know whether the threshold was
#   reached, not the full document list.
word_uses = defaultdict(int)
word_docs = {}
doc_no = 0
for fn in sys.argv[1:]:
    sys.stderr.write("Processing %s\n" % fn)
    # shlex.quote protects against spaces and shell metacharacters in the
    # filename (the unquoted interpolation was shell-injectable). The
    # pipeline still requires a POSIX shell with bzcat and WikiExtractor.
    with subprocess.Popen(
        "bzcat %s | wikiextractor/WikiExtractor.py --no-templates -o - -" % shlex.quote(fn),
        stdout=subprocess.PIPE,
        shell=True
    ) as proc:
        # Iterate the pipe line by line; iteration ends at EOF.
        for line in proc.stdout:
            # WikiExtractor wraps each article in <doc ...> ... </doc>
            # markers; both markers bump doc_no, so ids are not consecutive,
            # but all lines of one article share a single doc_no, which is
            # all word_docs needs.
            if line.startswith(b'<'):
                doc_no += 1
                continue
            line = line.decode('utf-8')
            line = line.translate(line_trans)  # normalize dash/apostrophe
            line = line.lower()
            for word in filter(None, words_split_re.split(line)):
                if is_word_re.match(word) and not not_is_word_re.match(word):
                    word_uses[word] += 1
                    if word not in word_docs:
                        word_docs[word] = {doc_no}
                    elif len(word_docs[word]) < MIN_ARTICLES:
                        word_docs[word].add(doc_no)
# Drop words that were seen in fewer than MIN_ARTICLES distinct articles.
# Collect the victims first so we never mutate the dict while iterating it.
rare_words = [w for w in word_uses if len(word_docs[w]) < MIN_ARTICLES]
for w in rare_words:
    del word_uses[w]
# Emit one "word count" pair per line to stdout, most frequent first
# (sorted() is stable, so ties keep dict insertion order, matching the
# original list-sort behavior).
for word, count in sorted(word_uses.items(), key=lambda item: item[1], reverse=True):
    print("%s %d" % (word, count))