From 1cad2b1fab2a0c1f11e07c8098f8da3d9a4cece9 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Tue, 20 Aug 2024 16:22:58 +0100 Subject: [PATCH 1/3] [SP] add wrans download --- pyscraper/sp_2024/__main__.py | 65 ++++++++++++++-- pyscraper/sp_2024/download.py | 140 +++++++++++++++++++++++++++++++++- 2 files changed, 199 insertions(+), 6 deletions(-) diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py index 44b7f589..9e7db18b 100644 --- a/pyscraper/sp_2024/__main__.py +++ b/pyscraper/sp_2024/__main__.py @@ -6,12 +6,14 @@ from __future__ import annotations -from .download import fetch_debates_for_dates -from .parse import tidy_up_html -from .convert import convert_xml_to_twfy -import click -from pathlib import Path import datetime +from pathlib import Path + +import click + +from .convert import convert_xml_to_twfy +from .download import fetch_debates_for_dates, fetch_wrans_for_dates +from .parse import tidy_up_html file_dir = Path(__file__).parent parldata = Path(file_dir, "..", "..", "..", "parldata") @@ -113,5 +115,58 @@ def debates( convert_xml_to_twfy(file, output_dir, verbose=verbose) +@cli.command() +@click.option( + "--start-date", help="isodate to start fetching wrans from", required=True +) +@click.option("--end-date", help="isodate to end fetching wrans at", required=True) +@click.option( + "--download", + is_flag=True, + help="Download the wrans, pair with 'override' to redownload all files", +) +@click.option("--parse", is_flag=True, help="Parse the downloaded wrans") +@click.option("--convert", is_flag=True, help="Convert the parsed wrans") +@click.option("--verbose", is_flag=True, help="Print verbose output") +@click.option("--override", is_flag=True, help="Override existing files") +@click.option( + "--partial-file-name", help="Only parse/convert files that match this string" +) +def wrans( + start_date: str, + end_date: str, + download: bool = False, + parse: bool = False, + convert: bool = False, + verbose: bool = False, + override: bool = False, + partial_file_name: str | None = None, +): + """ + Download written answers from Scottish Parliament between a start and end date. 
+ """ + + start = datetime.date.fromisoformat(start_date) + end = datetime.date.fromisoformat(end_date) + + # if none of the flags are set, error that at least one flag must be set + if not any([download, parse, convert]): + click.echo("At least one of the flags must be set") + return + + # iterate through downloaded files if we're downloading them + # otherwise go find the relevant files based on name + if download: + file_iterator = fetch_wrans_for_dates( + start.isoformat(), + end.isoformat(), + verbose=verbose, + cache_dir=download_dir, + override=override, + ) + for file in file_iterator: + pass + + if __name__ == "__main__": cli(prog_name="python -m pyscraper.sp_2024") diff --git a/pyscraper/sp_2024/download.py b/pyscraper/sp_2024/download.py index ae3b00dd..9c8caa72 100644 --- a/pyscraper/sp_2024/download.py +++ b/pyscraper/sp_2024/download.py @@ -9,9 +9,10 @@ from __future__ import annotations +import re +from datetime import datetime from itertools import groupby from pathlib import Path -import re from typing import Iterator, NamedTuple from urllib.parse import parse_qs, urlparse @@ -28,6 +29,8 @@ item_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament/{slug}?meeting={id}&iob={iob}" major_heading_url = "/chamber-and-committees/official-report/search-what-was-said-in-parliament/{slug}?meeting={id}" +wrans_search_url = "/chamber-and-committees/questions-and-answers?dtDateFrom={start_date}&dtDateTo={end_date}&chkAnswered=true&chkAnswered=false&chkUnAnswered=true&chkUnAnswered=false&chkHolding=true&chkHolding=false&chkChamber=false&chkFmq=false&chkGeneral=false&chkPortfolio=false&chkSpcb=true&chkSpcb=false&chkTopical=false&chkWritten=true&chkWritten=false&chkGiq=true&chkGiq=false&page={page}" + def get_meeting_urls(start_date: str, end_date: str, page: int = 1): """ @@ -237,3 +240,138 @@ def fetch_debates_for_dates( if verbose: print(f"Fetching debates for {grouping.committee_date_slug}") yield grouping.save_xml(cache_dir=cache_dir, override=override) + + +def get_wrans_urls(start_date: str, end_date: str, page: int = 1): + date_url = scot_prefix + wrans_search_url.format( + start_date=start_date, end_date=end_date, page=page + ) + response = requests.get(date_url, headers={"User-Agent": user_agent}) + soup = BeautifulSoup(response.text, "html.parser") + meeting_urls = [ + a["href"] + for a in soup.select("h2 > a[href]") + if "/questions-and-answers/question" in a["href"] + ] + + question_count = len(meeting_urls) + + return question_count, meeting_urls + + +def get_wrans_groupings(start_date: str, end_date: str): + keep_fetching = True + search_page = 1 + question_urls = [] + + while keep_fetching: + question_count, page_result_urls = get_wrans_urls( + start_date, end_date, search_page + ) + question_urls.extend(page_result_urls) + if question_count < 10: + keep_fetching = False + else: + search_page += 1 + + all_questions = [] + groupings = [] + + for url in question_urls: + url = scot_prefix + url + response = requests.get(url, headers={"User-Agent": user_agent}) + raw_html = response.text + + parsed = parse_qs(urlparse(url).query) + question_ref = parsed["ref"][0] + + soup = BeautifulSoup(raw_html, "html.parser") + + questions = soup.select("main > div.basic-content") + if len(questions) != 1: + continue + question = questions[0] + for svg in question.find_all("svg"): + svg.decompose() + + info_box = question.find("ul") + date = None + if info_box is not None: + for li in info_box.find_all("li"): + text = li.text + match = re.search(r"Answered 
by.*on (\d+ \w+ \d+)", text) + if match: + date = match.group(1) + + # only want answered questions + if date is not None: + date = datetime.strptime(date, "%d %B %Y").date().isoformat() + all_questions.append( + { + "date": date, + "ref": question_ref, + "content": str(question), + "url": url, + } + ) + + def get_question_date(question: dict): + return question["date"] + + for d, items in groupby(all_questions, key=get_question_date): + groupings.append(WransGrouping(date=d, items=list(items))) + + return groupings + + +class WransGrouping(NamedTuple): + date: str + items: list[dict] + committee_date_slug = "wrans" + + def construct_xml(self): + root = etree.Element( + "wrans", + date=self.date, + id=self.date, + ) + + for item in self.items: + el = etree.Element("spwrans", id=item["ref"], url=item["url"]) + raw_html = etree.Element("raw_html") + raw_html.append(etree.fromstring(item["content"])) + el.append(raw_html) + root.append(el) + + etree.indent(root, space=" ") + + return root + + def save_xml(self, cache_dir: Path, override: bool = False) -> Path: + """ + Generated interim xml file and save it to the cache directory + """ + filename = cache_dir / f"{self.date}-{self.committee_date_slug}.xml" + if filename.exists() is False or override: + xml = self.construct_xml() + with filename.open("wb") as f: + f.write(etree.tostring(xml)) + return filename + + +def fetch_wrans_for_dates( + start_date: str, + end_date: str, + cache_dir: Path, + verbose: bool = False, + override: bool = False, +): + """ + Fetch Written Answers for a given date range + """ + # get_wrans_groupings(start_date, end_date) + cache_dir.mkdir(parents=True, exist_ok=True) + for grouping in get_wrans_groupings(start_date, end_date): + if verbose: + print(f"Fetching wrans for {grouping.committee_date_slug}") + yield grouping.save_xml(cache_dir=cache_dir, override=override) From 666f1de5ba6251c79c2edfa1a3ec3918cd187fe2 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Tue, 20 Aug 2024 18:34:57 +0100 Subject: [PATCH 2/3] [SP] parse wrans into interim XML --- pyscraper/sp_2024/__main__.py | 8 ++ pyscraper/sp_2024/parse_wrans.py | 163 +++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 pyscraper/sp_2024/parse_wrans.py diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py index 9e7db18b..f990f0b0 100644 --- a/pyscraper/sp_2024/__main__.py +++ b/pyscraper/sp_2024/__main__.py @@ -14,6 +14,7 @@ from .convert import convert_xml_to_twfy from .download import fetch_debates_for_dates, fetch_wrans_for_dates from .parse import tidy_up_html +from .parse_wrans import tidy_up_wrans_html file_dir = Path(__file__).parent parldata = Path(file_dir, "..", "..", "..", "parldata") @@ -167,6 +168,13 @@ def wrans( for file in file_iterator: pass + if parse: + file_iterator = cache_dir_iterator(download_dir, start, end, partial_file_name) + for file in file_iterator: + if verbose: + print(f"Parsing up {file}") + tidy_up_wrans_html(file, parsed_dir) + if __name__ == "__main__": cli(prog_name="python -m pyscraper.sp_2024") diff --git a/pyscraper/sp_2024/parse_wrans.py b/pyscraper/sp_2024/parse_wrans.py new file mode 100644 index 00000000..254b28fa --- /dev/null +++ b/pyscraper/sp_2024/parse_wrans.py @@ -0,0 +1,163 @@ +""" +This module contains tools to convert the unstructured HTML of the debates into structured XML. +This is not the TWFY style XML - but tries to retain all information from the original. 
+""" + +from __future__ import annotations + +import re +from pathlib import Path + +from bs4 import BeautifulSoup, Tag + +# HTML elements we accept moving from raw_html to parsed +acceptable_elements = [ + "a", + "abbr", + "acronym", + "address", + "b", + "big", + "blockquote", + "br", + "caption", + "center", + "cite", + "col", + "colgroup", + "dd", + "dir", + "div", + "dl", + "dt", + "em", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "i", + "img", + "li", + "ol", + "p", + "pre", + "q", + "s", + "small", + "span", + "strike", + "strong", + "sub", + "sup", + "table", + "tbody", + "td", + "tfoot", + "th", + "thead", + "title", + "tr", + "tt", + "u", + "ul", + "timestamp", +] + + +def process_raw_html(raw_html: Tag, wrans_item_url: str) -> BeautifulSoup: + """ + Given the question html, convert it to a structured xml format + This isn't yet matching TWFY schema or using the right IDs. + The goal is to make a structured file that's a bit easier to work with. + """ + + # Deal with timestamps that are not inside anything first + raw_html = str(raw_html) + soup = BeautifulSoup(raw_html, "html.parser") + + # convert a structure where there's a question with a question and a reply inside + + details = soup.find("ul") + speaker_re = re.compile( + r"Asked by:\s*([^,]*),\s*MSP for\s*([^,]*),(.*)", re.MULTILINE + ) + responder_re = re.compile(r".*Answered by\s*(\w.*)\s*on", re.MULTILINE | re.DOTALL) + lodged_re = re.compile(r"Date lodged:\s*(\d+ \w+ \d+)", re.MULTILINE) + speaker = None + seat = None + party = None + responder = None + lodged = None + + parsed = soup.new_tag("parsed") + + for li in details.find_all("li"): + text = li.text.strip() + + speaker_match = re.match(speaker_re, text) + responder_match = re.match(responder_re, text) + lodged_match = re.match(lodged_re, text) + + if speaker_match: + speaker = speaker_match.group(1) + seat = speaker_match.group(2) + party = speaker_match.group(3) + elif responder_match: + responder = responder_match.group(1) + elif lodged_match: + lodged = lodged_match.group(1) + + li.decompose() + + for h in soup.find_all("h3"): + div = h.find_next("div") + text = div.find_all("p") + tag = None + if h.strong.string.strip() == "Question": + tag = soup.new_tag("question") + tag["speaker_name"] = speaker.strip() + tag["speaker_seat"] = seat.strip() + tag["speaker_party"] = party.strip() + tag["lodged"] = lodged.strip() + elif h.strong.string.strip() == "Answer": + tag = soup.new_tag("answer") + tag["speaker_name"] = responder.strip() + + if tag: + tag.extend(text) + parsed.append(tag) + + soup.find("raw_html").replace_with(parsed) + + return soup + + +def tidy_up_wrans_html(xml_path: Path, output_dir: Path): + """ + For each subsection there is a raw_html child + This function will convert the raw_html element to a parsed child. + This can be rerun on already downloaded data. 
+ """ + + with xml_path.open("r") as f: + xml = f.read() + + soup = BeautifulSoup(xml, "html.parser") + + for item in soup.find_all("spwrans"): + wrans_item_url = item.get("url") + + # process html + raw_html = item.find("raw_html") + parsed_data = process_raw_html(raw_html, wrans_item_url=wrans_item_url) + # replace raw_html with parsed + item.find("raw_html").decompose() + item.append(parsed_data.find("parsed")) + + # dump the soup to a file + output_dir.mkdir(parents=True, exist_ok=True) + output_file = output_dir / xml_path.name + with output_file.open("w") as f: + f.write(soup.prettify()) From a3ec5cc72e43b7e1b4cf0d2b9d49a4abd3588b42 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 21 Aug 2024 13:56:38 +0100 Subject: [PATCH 3/3] [SP] convert wrans intermediate XML to PW format --- pyscraper/sp_2024/__main__.py | 9 ++ pyscraper/sp_2024/convert_wrans.py | 154 +++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 pyscraper/sp_2024/convert_wrans.py diff --git a/pyscraper/sp_2024/__main__.py b/pyscraper/sp_2024/__main__.py index f990f0b0..6d01632d 100644 --- a/pyscraper/sp_2024/__main__.py +++ b/pyscraper/sp_2024/__main__.py @@ -12,6 +12,7 @@ import click from .convert import convert_xml_to_twfy +from .convert_wrans import convert_wrans_xml_to_twfy from .download import fetch_debates_for_dates, fetch_wrans_for_dates from .parse import tidy_up_html from .parse_wrans import tidy_up_wrans_html @@ -22,6 +23,7 @@ download_dir = parldata / "cmpages" / "sp_2024" / "raw" parsed_dir = parldata / "cmpages" / "sp_2024" / "parsed" output_dir = parldata / "scrapedxml" / "sp-new" +output_dir_wrans = parldata / "scrapedxml" / "sp-written" @click.group() @@ -175,6 +177,13 @@ def wrans( print(f"Parsing up {file}") tidy_up_wrans_html(file, parsed_dir) + if convert: + file_iterator = cache_dir_iterator(parsed_dir, start, end, partial_file_name) + for file in file_iterator: + if verbose: + print(f"Converting {file} to TheyWorkForYou format") + convert_wrans_xml_to_twfy(file, output_dir_wrans, verbose=verbose) + if __name__ == "__main__": cli(prog_name="python -m pyscraper.sp_2024") diff --git a/pyscraper/sp_2024/convert_wrans.py b/pyscraper/sp_2024/convert_wrans.py new file mode 100644 index 00000000..8fc0fe31 --- /dev/null +++ b/pyscraper/sp_2024/convert_wrans.py @@ -0,0 +1,154 @@ +""" +Convert the structured data from Scottish Parliament to +the XML format used by TheyWorkForYou + +Link to TWFY IDs for members. 
+""" + +import datetime +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +from lxml import etree + +from .resolvenames import get_unique_person_id, is_member_vote + + +@dataclass +class IDFactory: + iso_date: str + ref: str = "" + base_id: str = "uk.org.publicwhip/spwa/" + q_num: int = -1 + + def _current_id(self) -> str: + return f"{self.base_id}{self.iso_date}.{self.latest_major}.{self.latest_minor}" + + def set_ref(self, ref): + self.ref = ref + + def get_next_major_id(self) -> str: + return f"{self.base_id}{self.iso_date}.mh" + + def get_next_minor_id(self) -> str: + self.q_num = 0 + return f"{self.base_id}{self.iso_date}.{self.ref}.h" + + def get_next_q_id(self) -> str: + return f"{self.base_id}{self.iso_date}.{self.ref}.q{self.q_num}" + + def get_next_r_id(self) -> str: + id = f"{self.base_id}{self.iso_date}.{self.ref}.r{self.q_num}" + self.q_num += 1 + return id + + +def convert_wrans_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False): + """ + Convert from the loose structured xml format to the + TWFY xml format + """ + if verbose: + print(f"Converting {file_path}") + + # get source as an xml tree + with file_path.open("r") as f: + source = etree.fromstring(f.read()) + + # root of the tree is a publicwhip object + root = etree.Element("publicwhip") + + iso_date = source.get("date") + + # get the date in format Thursday 9 June 2005 + date_str = datetime.date.fromisoformat(iso_date).strftime("%A %d %B %Y") + + dest_path = output_dir / f"spwa{iso_date}.xml" + dest_path.parent.mkdir(parents=True, exist_ok=True) + + id_factory = IDFactory(iso_date=iso_date) + + # there is only questions for today + major_heading = etree.Element("major-heading") + major_heading.set("id", id_factory.get_next_major_id()) + major_heading.set("nospeaker", "True") + # major_heading.set("url", item.get("url")) + major_heading.text = f"Written Questions for {date_str}" + root.append(major_heading) + + # iterate through the questions + for item in source.iter("spwrans"): + id_factory.set_ref(item.get("id")) + + # each question is a minor heading using the id as the title because + # we don't have anything else to use + minor_heading = etree.Element("minor-heading") + minor_heading.set("id", id_factory.get_next_minor_id()) + minor_heading.text = f"Question {item.get('id')}" + root.append(minor_heading) + + missing_speakers = [] + for subitem in item.find("parsed"): + if subitem.tag == "question": + speaker_name = subitem.get("speaker_name") + person_id = get_unique_person_id(speaker_name, iso_date) + if ( + person_id is None + and speaker_name not in missing_speakers + and verbose + ): + print(f"Could not find person id for {speaker_name}") + missing_speakers.append(speaker_name) + speech = etree.Element("ques") + speech.set("id", id_factory.get_next_q_id()) + speech.set("url", item.get("url") or "") + speech.set("speakername", speaker_name) + speech.set("person_id", person_id or "unknown") + for child in subitem: + speech.append(child) + root.append(speech) + + elif subitem.tag == "answer": + speaker_name = subitem.get("speaker_name") + person_id = get_unique_person_id(speaker_name, iso_date) + if ( + person_id is None + and speaker_name not in missing_speakers + and verbose + ): + print(f"Could not find person id for {speaker_name}") + missing_speakers.append(speaker_name) + speech = etree.Element("reply") + speech.set("id", id_factory.get_next_r_id()) + speech.set("url", item.get("url") or "") + speech.set("speakername", speaker_name) + 
speech.set("person_id", person_id or "unknown") + for child in subitem: + speech.append(child) + root.append(speech) + + # write the new xml to a file + etree.indent(root, space=" ") + + with dest_path.open("wb") as f: + f.write(etree.tostring(root, pretty_print=True)) + + +def convert_to_twfy( + cache_dir: Path, + output_dir: Path, + partial_file_name: Optional[str] = None, + verbose: bool = False, +): + """ + Given a cache directory, parse the raw_html elements in the xml files + This updates the 'parsed' element under each agenda-item. + """ + if partial_file_name: + xmls = list(cache_dir.glob(f"{partial_file_name}*")) + else: + xmls = list(cache_dir.glob("*.xml")) + for xml in xmls: + convert_wrans_xml_to_twfy(xml, output_dir, verbose=verbose)