get_wikinews.py

import urllib.request, bz2, io, os, sys, re, shutil
# from dateparser.search import search_dates # This one is worse than datefinder
from datefinder import find_dates # Maybe try sutime instead? (but needs Java)
from collections import OrderedDict
from multiprocessing import cpu_count
from glob import glob
from random import shuffle, seed
from lib.WikiExtractor import process_dump
seed(42)
script_dir = os.path.dirname(os.path.realpath(__file__)) + os.sep


class Document:
    def __init__(self):
        self.title = ""
        self.text = ""
        self.url = ""
        self.lines = []
        self.docnum = 0

    def serialize(self, out_dir=None):
        if out_dir is None:
            out_dir = script_dir + "out" + os.sep + "wikinews" + os.sep
        docname = 'autogum_wikinews_doc' + str(self.docnum)
        # TODO: more metadata
        header = '<text id="' + docname + '" title="' + self.title + '">\n'
        output = header + self.text.strip() + "\n</text>\n"
        with io.open(out_dir + docname + ".xml", 'w', encoding="utf8", newline="\n") as f:
            f.write(output)
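
# Illustrative sketch (made-up title, not part of the pipeline): for a Document with docnum=0
# and title "Example title", serialize() writes out/wikinews/autogum_wikinews_doc0.xml containing roughly:
#   <text id="autogum_wikinews_doc0" title="Example title">
#   ...reformatted article body...
#   </text>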


def get_dump(dump_file="enwikinews-latest-pages-articles.xml"):
    """Downloads dump file"""
    dump_url = "https://dumps.wikimedia.org/enwikinews/latest/enwikinews-latest-pages-articles.xml.bz2"
    req = urllib.request.urlopen(dump_url)
    CHUNK = 16 * 1024
    decompressor = bz2.BZ2Decompressor()
    with io.open(dump_file, 'wb') as fp:
        while True:
            chunk = req.read(CHUNK)
            if not chunk:
                break
            fp.write(decompressor.decompress(chunk))
    req.close()
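
# Usage sketch: get_dump() streams the bz2 archive in 16 KB chunks through an incremental
# BZ2Decompressor, so the full dump is never held in memory, e.g.:
#   get_dump(script_dir + "enwikinews-latest-pages-articles.xml")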


def reformat_text(text):
    def leap(year):
        year = int(year)
        if year % 4 > 0:
            return False
        elif year % 100 > 0:
            return True
        elif year % 400 > 0:
            return False
        else:
            return True

    # Remove XML except hyperlinks and headings
    text = text.replace("&gt;", ">").replace("&lt;", "<").replace("&amp;", "&").replace("&nbsp;", " ")
    text = re.sub(r"<(/a|a [^<>]*)>", r"#@Q\1Q@#", text)
    text = re.sub(r"<(/?h[0-9])>", r"#@Q\1Q@#", text)
    text = re.sub('<[^<>]+>', '', text)
    text = text.replace("#@Q", "<").replace("Q@#", ">")

    # Mark up
    text = re.sub(r'<h[0-9]>([^\n<>]+)</h[0-9]>', r"<head>\1</head>", text)
    text = re.sub(r'(<a href)', r"<ref target", text)
    text = re.sub(r'(target="[^"]+)%3A', r"\1:", text)
    text = re.sub(r'(</?)a([ >])', r"\1ref\2", text)
    text = re.sub(r'BULLET::::-?([^\n]+)', r"<item>\1</item>", text)
    text = re.sub(r'^([^<\s][^\n]*)$', r'<p>\1</p>', text, flags=re.MULTILINE)
    text = re.sub(r'♡❤♡([^❤♡]+)♡❤♡', r'<hi rend="bold">\1</hi>', text)
    text = re.sub(r'♡❤([^❤♡]+)♡❤', r'<hi rend="italic">\1</hi>', text)
    text = re.sub(r' +', r" ", text)
    pats = [(r'((<item>.*?</item>\n?)+)', r'<list type="unordered">\n\1</list>\n')]
    text = text.replace("&quot;", '"')
    for pat in pats:
        text = re.sub(pat[0], pat[1], text, flags=re.MULTILINE | re.DOTALL)
    # dates = find_dates(text, source=True, strict=True)  # Non-strict finds garbage
    # unique_dates = set([])
    # if dates is not None:
    #     for d in dates:
    #         date_spec, date_text = d
    #         if len(date_text) < 4:
    #             continue
    #         year, month, day = date_spec.year, date_spec.month, date_spec.day
    #         if year == 0:
    #             year = "xxxx"
    #         else:
    #             year = str(year)
    #         if month == 0 and day == 0:
    #             when = '<date when="' + year + '">'
    #             unique_dates.add((date_text, when))
    #             continue
    #         elif month == 0:
    #             continue
    #         month = str(month)
    #         if len(month) == 1:
    #             month = "0" + month
    #         if day == 0:
    #             start = year + "-" + month + "-01"
    #             end = "31" if month in ["01", "03", "05", "07", "08", "10", "12"] else "30"
    #             if month == "02":
    #                 if leap(year):
    #                     end = "29"
    #                 else:
    #                     end = "28"
    #             end = year + "-" + month + "-" + end
    #             when = '<date notBefore="' + start + '" notAfter="' + end + '">'
    #         else:  # Full date
    #             day = str(day)
    #             if len(day) == 1:
    #                 day = "0" + day
    #             when = '<date when="' + year + "-" + month + "-" + day + '">'
    #         unique_dates.add((date_text, when))
    # for t, d in unique_dates:
    #     text = text.replace(t, d + t + "</date>")

    return text
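
# Illustrative example (made-up input, not from the dump): reformat_text() maps WikiExtractor-style
# HTML such as
#   <h2>Sources</h2>
#   BULLET::::- <a href="https://example.org">Some source</a>
# to TEI-flavoured markup along the lines of
#   <head>Sources</head>
#   <list type="unordered">
#   <item> <ref target="https://example.org">Some source</ref></item>
#   </list>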


if __name__ == "__main__":
    dump_file = "enwikinews-latest-pages-articles.xml"

    if not os.path.exists(script_dir + dump_file):
        sys.stderr.write("o Downloading data\n")
        get_dump(script_dir + dump_file)  # write the dump next to this script, where the cache check above looks
    else:
        sys.stderr.write("o Found cached download data\n")

    if not os.path.exists(script_dir + "data"):
        os.mkdir(script_dir + "data")
    if not os.path.exists(script_dir + "data" + os.sep + "wikinews"):
        os.mkdir(script_dir + "data" + os.sep + "wikinews")

    # Set file size for WikiExtractor output
    file_size = "1M"
    power = 'kmg'.find(file_size[-1].lower()) + 1
    file_size = int(file_size[:-1]) * 1024 ** power
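    # e.g. "1M": 'kmg'.find('m') + 1 == 2, so file_size becomes 1 * 1024 ** 2 == 1048576 bytes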

    # Get Wiki content
    if not os.path.exists(script_dir + "data" + os.sep + "wikinews" + os.sep + "AA"):
        sys.stderr.write("o Extracting data with WikiExtractor\n")
        template_file = script_dir + "lib" + os.sep + "wikinews_templ.txt"
        default_process_count = max(1, cpu_count() - 1)
        default_process_count = 1  # overrides the line above; extraction is forced to run single-process
        # If templates file does not exist yet, this will start a LONG pass of all data to harvest templates, then run
        # the extraction
        from types import SimpleNamespace
        options = SimpleNamespace(toHTML=True, keepLists=True)
        process_dump(script_dir + dump_file, template_file, script_dir + "data" + os.sep + "wikinews" + os.sep, file_size,
                     False, default_process_count)
    else:
        sys.stderr.write("o Found processed files in data" + os.sep + "wikinews" + os.sep + "AA\n")

    # Sample documents from 50 randomly selected files
    all_docs = []
    files = glob(script_dir + "data" + os.sep + "wikinews" + os.sep + "**" + os.sep + "wiki*", recursive=True)
    # shuffle(files)
    files = files[:1]

    for file_ in files:
        docs = io.open(file_, encoding="utf8").read().split("</doc>")
        for doc in docs:
            if "<doc" not in doc:
                continue
            current_doc = Document()
            m = re.search('<doc id="([^"]*)" url="([^"]*)" title="([^"]*)"', doc)
            current_doc.url = m.group(2)
            current_doc.title = m.group(3)
            text = re.sub('<doc[^<>]*?>', '', doc)
            text = reformat_text(text)
            current_doc.text = text
            all_docs.append(current_doc)

    if not os.path.exists(script_dir + "out"):
        os.mkdir(script_dir + "out")
    if not os.path.exists(script_dir + "out" + os.sep + "wikinews"):
        os.mkdir(script_dir + "out" + os.sep + "wikinews")
    else:
        shutil.rmtree(script_dir + "out" + os.sep + "wikinews")
        os.mkdir(script_dir + "out" + os.sep + "wikinews")

    out_dir = script_dir + "out" + os.sep + "wikinews" + os.sep
    for i, doc in enumerate(all_docs):
        doc.docnum = i
        doc.serialize(out_dir)

    sys.stdout.write("\no Wrote " + str(len(all_docs)) + " documents\n")
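
# Usage sketch (assumption: run from the repository root so lib.WikiExtractor resolves):
#   python get_wikinews.py
# Downloads/caches the dump, extracts it under data/wikinews/, and writes
# autogum_wikinews_doc*.xml files to out/wikinews/.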