From 1cad2b1fab2a0c1f11e07c8098f8da3d9a4cece9 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Tue, 20 Aug 2024 16:22:58 +0100 Subject: [PATCH 1/3] [SP] add wrans download --- pyscraper/sp_2024/__main__.py | 65 ++++++++++++++-- pyscraper/sp_2024/download.py | 140 +++++++++++++++++++++++++++++++++- 2 files changed, 199 insertions(+), 6 deletions(-) diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py index 44b7f589..9e7db18b 100644 --- a/pyscraper/sp_2024/__main__.py +++ b/pyscraper/sp_2024/__main__.py @@ -6,12 +6,14 @@ from __future__ import annotations -from .download import fetch_debates_for_dates -from .parse import tidy_up_html -from .convert import convert_xml_to_twfy -import click -from pathlib import Path import datetime +from pathlib import Path + +import click + +from .convert import convert_xml_to_twfy +from .download import fetch_debates_for_dates, fetch_wrans_for_dates +from .parse import tidy_up_html file_dir = Path(__file__).parent parldata = Path(file_dir, "..", "..", "..", "parldata") @@ -113,5 +115,58 @@ def debates( convert_xml_to_twfy(file, output_dir, verbose=verbose) +@cli.command() +@click.option( + "--start-date", help="isodate to start fetching wrans from", required=True +) +@click.option("--end-date", help="isodate to end fetching wrans at", required=True) +@click.option( + "--download", + is_flag=True, + help="Download the wrans, pair with 'override' to redownload all files", +) +@click.option("--parse", is_flag=True, help="Parse the downloaded wrans") +@click.option("--convert", is_flag=True, help="Convert the parsed wrans") +@click.option("--verbose", is_flag=True, help="Print verbose output") +@click.option("--override", is_flag=True, help="Override existing files") +@click.option( + "--partial-file-name", help="Only parse/convert files that match this string" +) +def wrans( + start_date: str, + end_date: str, + download: bool = False, + parse: bool = False, + convert: bool = False, + verbose: bool = False, + override: bool = False, + partial_file_name: str | None = None, +): + """ + Download written answers from Scottish Parliament between a start and end date. 
+ """ + + start = datetime.date.fromisoformat(start_date) + end = datetime.date.fromisoformat(end_date) + + # if none of the flags are set, error that at least one flag must be set + if not any([download, parse, convert]): + click.echo("At least one of the flags must be set") + return + + # iterate through downloaded files if we're downloading them + # otherwise go find the relevant files based on name + if download: + file_iterator = fetch_wrans_for_dates( + start.isoformat(), + end.isoformat(), + verbose=verbose, + cache_dir=download_dir, + override=override, + ) + for file in file_iterator: + pass + + if __name__ == "__main__": cli(prog_name="python -m pyscraper.sp_2024") diff --git a/pyscraper/sp_2024/download.py b/pyscraper/sp_2024/download.py index ae3b00dd..9c8caa72 100644 --- a/pyscraper/sp_2024/download.py +++ b/pyscraper/sp_2024/download.py @@ -9,9 +9,10 @@ from __future__ import annotations +import re +from datetime import datetime from itertools import groupby from pathlib import Path -import re from typing import Iterator, NamedTuple from urllib.parse import parse_qs, urlparse @@ -28,6 +29,8 @@ item_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament/{slug}?meeting={id}&iob={iob}" major_heading_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament/{slug}?meeting={id}" +wrans_search_url = "/chamber-and-committees/questions-and-answers?dtDateFrom={start_date}&dtDateTo={end_date}&chkAnswered=true&chkAnswered=false&chkUnAnswered=true&chkUnAnswered=false&chkHolding=true&chkHolding=false&chkChamber=false&chkFmq=false&chkGeneral=false&chkPortfolio=false&chkSpcb=true&chkSpcb=false&chkTopical=false&chkWritten=true&chkWritten=false&chkGiq=true&chkGiq=false&page={page}" + def get_meeting_urls(start_date: str, end_date: str, page: int = 1): """ @@ -237,3 +240,138 @@ def fetch_debates_for_dates( if verbose: print(f"Fetching debates for {grouping.committee_date_slug}") yield grouping.save_xml(cache_dir=cache_dir, override=override) + + +def get_wrans_urls(start_date: str, end_date: str, page: int = 1): + date_url = scot_prefix + wrans_search_url.format( + start_date=start_date, end_date=end_date, page=page + ) + response = requests.get(date_url, headers={"User-Agent": user_agent}) + soup = BeautifulSoup(response.text, "html.parser") + meeting_urls = [ + a["href"] + for a in soup.select("h2 > a[href]") + if "/questions-and-answers/question" in a["href"] + ] + + question_count = len(meeting_urls) + + return question_count, meeting_urls + + +def get_wrans_groupings(start_date: str, end_date: str): + keep_fetching = True + search_page = 1 + question_urls = [] + + while keep_fetching: + question_count, page_result_urls = get_wrans_urls( + start_date, end_date, search_page + ) + question_urls.extend(page_result_urls) + if question_count < 10: + keep_fetching = False + else: + search_page += 1 + + all_questions = [] + groupings = [] + + for url in question_urls: + url = scot_prefix + url + response = requests.get(url, headers={"User-Agent": user_agent}) + raw_html = response.text + + parsed = parse_qs(urlparse(url).query) + question_ref = parsed["ref"][0] + + soup = BeautifulSoup(raw_html, "html.parser") + + questions = soup.select("main > div.basic-content") + if len(questions) != 1: + continue + question = questions[0] + for svg in question.find_all("svg"): + svg.decompose() + + info_box = question.find("ul") + date = None + if info_box is not None: + for li in info_box.find_all("li"): + text = li.text + match = re.search(r"Answered 
by.*on (\d+ \w+ \d+)", text) + if match: + date = match.group(1) + + # only want answered questions + if date is not None: + date = datetime.strptime(date, "%d %B %Y").date().isoformat() + all_questions.append( + { + "date": date, + "ref": question_ref, + "content": str(question), + "url": url, + } + ) + + def get_question_date(question: dict): + return question["date"] + + for d, items in groupby(all_questions, key=get_question_date): + groupings.append(WransGrouping(date=d, items=list(items))) + + return groupings + + +class WransGrouping(NamedTuple): + date: str + items: list[dict] + committee_date_slug = "wrans" + + def construct_xml(self): + root = etree.Element( + "wrans", + date=self.date, + id=self.date, + ) + + for item in self.items: + el = etree.Element("spwrans", id=item["ref"], url=item["url"]) + raw_html = etree.Element("raw_html") + raw_html.append(etree.fromstring(item["content"])) + el.append(raw_html) + root.append(el) + + etree.indent(root, space=" ") + + return root + + def save_xml(self, cache_dir: Path, override: bool = False) -> Path: + """ + Generated interim xml file and save it to the cache directory + """ + filename = cache_dir / f"{self.date}-{self.committee_date_slug}.xml" + if filename.exists() is False or override: + xml = self.construct_xml() + with filename.open("wb") as f: + f.write(etree.tostring(xml)) + return filename + + +def fetch_wrans_for_dates( + start_date: str, + end_date: str, + cache_dir: Path, + verbose: bool = False, + override: bool = False, +): + """ + Fetch Written Answers for a given date range + """ + # get_wrans_groupings(start_date, end_date) + cache_dir.mkdir(parents=True, exist_ok=True) + for grouping in get_wrans_groupings(start_date, end_date): + if verbose: + print(f"Fetching wrans for {grouping.committee_date_slug}") + yield grouping.save_xml(cache_dir=cache_dir, override=override) From 666f1de5ba6251c79c2edfa1a3ec3918cd187fe2 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Tue, 20 Aug 2024 18:34:57 +0100 Subject: [PATCH 2/3] [SP] parse wrans into interim XML --- pyscraper/sp_2024/__main__.py | 8 ++ pyscraper/sp_2024/parse_wrans.py | 163 +++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 pyscraper/sp_2024/parse_wrans.py diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py index 9e7db18b..f990f0b0 100644 --- a/pyscraper/sp_2024/__main__.py +++ b/pyscraper/sp_2024/__main__.py @@ -14,6 +14,7 @@ from .convert import convert_xml_to_twfy from .download import fetch_debates_for_dates, fetch_wrans_for_dates from .parse import tidy_up_html +from .parse_wrans import tidy_up_wrans_html file_dir = Path(__file__).parent parldata = Path(file_dir, "..", "..", "..", "parldata") @@ -167,6 +168,13 @@ def wrans( for file in file_iterator: pass + if parse: + file_iterator = cache_dir_iterator(download_dir, start, end, partial_file_name) + for file in file_iterator: + if verbose: + print(f"Parsing up {file}") + tidy_up_wrans_html(file, parsed_dir) + if __name__ == "__main__": cli(prog_name="python -m pyscraper.sp_2024") diff --git a/pyscraper/sp_2024/parse_wrans.py b/pyscraper/sp_2024/parse_wrans.py new file mode 100644 index 00000000..254b28fa --- /dev/null +++ b/pyscraper/sp_2024/parse_wrans.py @@ -0,0 +1,163 @@ +""" +This module contains tools to convert the unstructured HTML of the debates into structured XML. +This is not the TWFY style XML - but tries to retain all information from the original. 
+""" + +from __future__ import annotations + +import re +from pathlib import Path + +from bs4 import BeautifulSoup, Tag + +# HTML elements we accept moving from raw_html to parsed +acceptable_elements = [ + "a", + "abbr", + "acronym", + "address", + "b", + "big", + "blockquote", + "br", + "caption", + "center", + "cite", + "col", + "colgroup", + "dd", + "dir", + "div", + "dl", + "dt", + "em", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "i", + "img", + "li", + "ol", + "p", + "pre", + "q", + "s", + "small", + "span", + "strike", + "strong", + "sub", + "sup", + "table", + "tbody", + "td", + "tfoot", + "th", + "thead", + "title", + "tr", + "tt", + "u", + "ul", + "timestamp", +] + + +def process_raw_html(raw_html: Tag, wrans_item_url: str) -> BeautifulSoup: + """ + Given the question html, convert it to a structured xml format + This isn't yet matching TWFY schema or using the right IDs. + The goal is to make a structured file that's a bit easier to work with. + """ + + # Deal with timestamps that are not inside anything first + raw_html = str(raw_html) + soup = BeautifulSoup(raw_html, "html.parser") + + # convert a structure where there's a question with a question and a reply inside + + details = soup.find("ul") + speaker_re = re.compile( + r"Asked by:\s*([^,]*),\s*MSP for\s*([^,]*),(.*)", re.MULTILINE + ) + responder_re = re.compile(r".*Answered by\s*(\w.*)\s*on", re.MULTILINE | re.DOTALL) + lodged_re = re.compile(r"Date lodged:\s*(\d+ \w+ \d+)", re.MULTILINE) + speaker = None + seat = None + party = None + responder = None + lodged = None + + parsed = soup.new_tag("parsed") + + for li in details.find_all("li"): + text = li.text.strip() + + speaker_match = re.match(speaker_re, text) + responder_match = re.match(responder_re, text) + lodged_match = re.match(lodged_re, text) + + if speaker_match: + speaker = speaker_match.group(1) + seat = speaker_match.group(2) + party = speaker_match.group(3) + elif responder_match: + responder = responder_match.group(1) + elif lodged_match: + lodged = lodged_match.group(1) + + li.decompose() + + for h in soup.find_all("h3"): + div = h.find_next("div") + text = div.find_all("p") + tag = None + if h.strong.string.strip() == "Question": + tag = soup.new_tag("question") + tag["speaker_name"] = speaker.strip() + tag["speaker_seat"] = seat.strip() + tag["speaker_party"] = party.strip() + tag["lodged"] = lodged.strip() + elif h.strong.string.strip() == "Answer": + tag = soup.new_tag("answer") + tag["speaker_name"] = responder.strip() + + if tag: + tag.extend(text) + parsed.append(tag) + + soup.find("raw_html").replace_with(parsed) + + return soup + + +def tidy_up_wrans_html(xml_path: Path, output_dir: Path): + """ + For each subsection there is a raw_html child + This function will convert the raw_html element to a parsed child. + This can be rerun on already downloaded data. 
+ """ + + with xml_path.open("r") as f: + xml = f.read() + + soup = BeautifulSoup(xml, "html.parser") + + for item in soup.find_all("spwrans"): + wrans_item_url = item.get("url") + + # process html + raw_html = item.find("raw_html") + parsed_data = process_raw_html(raw_html, wrans_item_url=wrans_item_url) + # replace raw_html with parsed + item.find("raw_html").decompose() + item.append(parsed_data.find("parsed")) + + # dump the soup to a file + output_dir.mkdir(parents=True, exist_ok=True) + output_file = output_dir / xml_path.name + with output_file.open("w") as f: + f.write(soup.prettify()) From a3ec5cc72e43b7e1b4cf0d2b9d49a4abd3588b42 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 21 Aug 2024 13:56:38 +0100 Subject: [PATCH 3/3] [SP] convert wrans intermediate XML to PW format --- pyscraper/sp_2024/__main__.py | 9 ++ pyscraper/sp_2024/convert_wrans.py | 154 +++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 pyscraper/sp_2024/convert_wrans.py diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py index f990f0b0..6d01632d 100644 --- a/pyscraper/sp_2024/__main__.py +++ b/pyscraper/sp_2024/__main__.py @@ -12,6 +12,7 @@ import click from .convert import convert_xml_to_twfy +from .convert_wrans import convert_wrans_xml_to_twfy from .download import fetch_debates_for_dates, fetch_wrans_for_dates from .parse import tidy_up_html from .parse_wrans import tidy_up_wrans_html @@ -22,6 +23,7 @@ download_dir = parldata / "cmpages" / "sp_2024" / "raw" parsed_dir = parldata / "cmpages" / "sp_2024" / "parsed" output_dir = parldata / "scrapedxml" / "sp-new" +output_dir_wrans = parldata / "scrapedxml" / "sp-written" @click.group() @@ -175,6 +177,13 @@ def wrans( print(f"Parsing up {file}") tidy_up_wrans_html(file, parsed_dir) + if convert: + file_iterator = cache_dir_iterator(parsed_dir, start, end, partial_file_name) + for file in file_iterator: + if verbose: + print(f"Converting {file} to TheyWorkForYou format") + convert_wrans_xml_to_twfy(file, output_dir_wrans, verbose=verbose) + if __name__ == "__main__": cli(prog_name="python -m pyscraper.sp_2024") diff --git a/pyscraper/sp_2024/convert_wrans.py b/pyscraper/sp_2024/convert_wrans.py new file mode 100644 index 00000000..8fc0fe31 --- /dev/null +++ b/pyscraper/sp_2024/convert_wrans.py @@ -0,0 +1,154 @@ +""" +Convert the structured data from Scottish Parliament to +the XML format used by TheyWorkForYou + +Link to TWFY IDs for members. 
+""" + +import datetime +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +from lxml import etree + +from .resolvenames import get_unique_person_id, is_member_vote + + +@dataclass +class IDFactory: + iso_date: str + ref: str = "" + base_id: str = "uk.org.publicwhip/spwa/" + q_num: int = -1 + + def _current_id(self) -> str: + return f"{self.base_id}{self.iso_date}.{self.latest_major}.{self.latest_minor}" + + def set_ref(self, ref): + self.ref = ref + + def get_next_major_id(self) -> str: + return f"{self.base_id}{self.iso_date}.mh" + + def get_next_minor_id(self) -> str: + self.q_num = 0 + return f"{self.base_id}{self.iso_date}.{self.ref}.h" + + def get_next_q_id(self) -> str: + return f"{self.base_id}{self.iso_date}.{self.ref}.q{self.q_num}" + + def get_next_r_id(self) -> str: + id = f"{self.base_id}{self.iso_date}.{self.ref}.r{self.q_num}" + self.q_num += 1 + return id + + +def convert_wrans_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False): + """ + Convert from the loose structured xml format to the + TWFY xml format + """ + if verbose: + print(f"Converting {file_path}") + + # get source as an xml tree + with file_path.open("r") as f: + source = etree.fromstring(f.read()) + + # root of the tree is a publicwhip object + root = etree.Element("publicwhip") + + iso_date = source.get("date") + + # get the date in format Thursday 9 June 2005 + date_str = datetime.date.fromisoformat(iso_date).strftime("%A %d %B %Y") + + dest_path = output_dir / f"spwa{iso_date}.xml" + dest_path.parent.mkdir(parents=True, exist_ok=True) + + id_factory = IDFactory(iso_date=iso_date) + + # there is only questions for today + major_heading = etree.Element("major-heading") + major_heading.set("id", id_factory.get_next_major_id()) + major_heading.set("nospeaker", "True") + # major_heading.set("url", item.get("url")) + major_heading.text = f"Written Questions for {date_str}" + root.append(major_heading) + + # iterate through the questions + for item in source.iter("spwrans"): + id_factory.set_ref(item.get("id")) + + # each question is a minor heading using the id as the title because + # we don't have anything else to use + minor_heading = etree.Element("minor-heading") + minor_heading.set("id", id_factory.get_next_minor_id()) + minor_heading.text = f"Question {item.get('id')}" + root.append(minor_heading) + + missing_speakers = [] + for subitem in item.find("parsed"): + if subitem.tag == "question": + speaker_name = subitem.get("speaker_name") + person_id = get_unique_person_id(speaker_name, iso_date) + if ( + person_id is None + and speaker_name not in missing_speakers + and verbose + ): + print(f"Could not find person id for {speaker_name}") + missing_speakers.append(speaker_name) + speech = etree.Element("ques") + speech.set("id", id_factory.get_next_q_id()) + speech.set("url", item.get("url") or "") + speech.set("speakername", speaker_name) + speech.set("person_id", person_id or "unknown") + for child in subitem: + speech.append(child) + root.append(speech) + + elif subitem.tag == "answer": + speaker_name = subitem.get("speaker_name") + person_id = get_unique_person_id(speaker_name, iso_date) + if ( + person_id is None + and speaker_name not in missing_speakers + and verbose + ): + print(f"Could not find person id for {speaker_name}") + missing_speakers.append(speaker_name) + speech = etree.Element("reply") + speech.set("id", id_factory.get_next_r_id()) + speech.set("url", item.get("url") or "") + speech.set("speakername", speaker_name) + 
speech.set("person_id", person_id or "unknown") + for child in subitem: + speech.append(child) + root.append(speech) + + # write the new xml to a file + etree.indent(root, space=" ") + + with dest_path.open("wb") as f: + f.write(etree.tostring(root, pretty_print=True)) + + +def convert_to_twfy( + cache_dir: Path, + output_dir: Path, + partial_file_name: Optional[str] = None, + verbose: bool = False, +): + """ + Given a cache directory, parse the raw_html elements in the xml files + This updates the 'parsed' element under each agenda-item. + """ + if partial_file_name: + xmls = list(cache_dir.glob(f"{partial_file_name}*")) + else: + xmls = list(cache_dir.glob("*.xml")) + for xml in xmls: + convert_wrans_xml_to_twfy(xml, output_dir, verbose=verbose)