extractpages.py
from __future__ import print_function

try:
    from lxml import etree
except ImportError:
    import xml.etree.ElementTree as etree

import re


def _get_namespace(tag):
    namespace = re.match("^{(.*?)}", tag).group(1)
    if not namespace.startswith("http://www.mediawiki.org/xml/export-"):
        raise ValueError("%s not recognized as MediaWiki database dump"
                         % namespace)
    return namespace
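
# For reference (not part of the original module): ElementTree reports
# namespaced tags in Clark notation, so the root element of a dump looks
# something like
#     {http://www.mediawiki.org/xml/export-0.10/}mediawiki
# (the version number varies between dumps); _get_namespace() extracts the
# URI between the braces.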


def extract_pages(f):
    """Extract pages from Wikimedia database dump.

    Parameters
    ----------
    f : file-like or str
        Handle on Wikimedia article dump. May be any type supported by
        etree.iterparse.

    Returns
    -------
    pages : iterable over (int, string, string)
        Generates (page_id, title, content) triples.
        In Python 2.x, may produce either str or unicode strings.
    """
    elems = (elem for _, elem in etree.iterparse(f, events=["end"]))

    # We can't rely on the namespace for database dumps, since it changes
    # every time a small modification to the format is made. So, determine
    # it from the first element we find, which will be part of the metadata,
    # and construct element paths.
    elem = next(elems)
    namespace = _get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    page_tag = "{%(ns)s}page" % ns_mapping
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    id_path = "./{%(ns)s}id" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping

    for elem in elems:
        if elem.tag == page_tag:
            text = elem.find(text_path).text
            if text is None:
                # Empty text element; skip this page.
                continue

            yield (int(elem.find(id_path).text),
                   elem.find(title_path).text,
                   text)

            # Prune the element tree, as per
            # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
            # We do this only for <page>s, since we need to inspect the
            # ./revision/text element. That shouldn't matter since the pages
            # comprise the bulk of the file.
            elem.clear()
            if hasattr(elem, "getprevious"):
                # lxml only: unlink elem from its parent
                while elem.getprevious() is not None:
                    del elem.getparent()[0]
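
# Usage sketch (an assumption, not part of the original module): Wikipedia
# dumps are normally distributed bz2-compressed, and etree.iterparse accepts
# any file-like object, so a compressed dump can be streamed without
# unpacking it first. The dump file name below is hypothetical.
#
#     import bz2
#     with bz2.BZ2File("enwiki-latest-pages-articles.xml.bz2") as f:
#         for page_id, title, text in extract_pages(f):
#             print(page_id, title)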


if __name__ == "__main__":
    # Test; will write article info + prefix of content to stdout
    import sys

    if len(sys.argv) > 1:
        print("usage: %s; will read from standard input" % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)

    for pageid, title, text in extract_pages(sys.stdin):
        title = title.encode("utf-8")
        text = text[:40].replace("\n", "_").encode("utf-8")
        print("%d '%s' (%s)" % (pageid, title, text))