Merge pull request #128 from cdonnay/update_scot

Update load_scottish
mggg · Jul 3, 2024 · 2cfe619 · 2cfe619
2 parents 0ef4ead + 25a934a
commit 2cfe619
Show file tree

Hide file tree

Showing 17 changed files with 202 additions and 141 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,4 +9,5 @@ dist/
 extra_data/
 .venv
 .docs_venv
-docs/_build
+docs/_build
+.dev
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Updated tutorial notebooks; larger focus on slate models, updated notebooks to match current codebase.
 - Removed the seq-RCV transfer rule since it is a dummy function, replaced with lambda function.
 - Update plot MDS to have aspect ratio 1, remove axes labels since they are meaningless in MDS.
+- Updated all BLT files in the scot-elex repo to be true CSV files, and updated `load_scottish` accordingly.
 
 ## Fixed
 - Fixed bug by which slate-PlackettLuce could not generate ballots when some candidate had 0 support.

diff --git a/pyproject.toml b/pyproject.toml
@@ -18,7 +18,7 @@ matplotlib = "^3.7.2"
 pandas = "^1.5.3"
 apportionment = "^1.0"
 scikit-learn = "^1.3.2"
-
+numpy = "^1.26.0"
 
 
 [tool.poetry.group.dev.dependencies]

diff --git a/src/votekit/cvr_loaders.py b/src/votekit/cvr_loaders.py
@@ -1,5 +1,6 @@
 from fractions import Fraction
 import os
+import csv
 import pandas as pd
 from pandas.errors import EmptyDataError, DataError
 import pathlib
@@ -89,87 +90,94 @@ def load_csv(
     return PreferenceProfile(ballots=ballots)
 
 
-def load_scottish(fpath: str) -> tuple[PreferenceProfile, int]:
+def load_scottish(
+    fpath: str,
+) -> tuple[PreferenceProfile, int, list[str], dict[str, str], str]:
     """
-    Given a file path, loads cvr from format used for Scottish election data in
-    (this repo)[https://github.com/mggg/scot-elex].
+    Given a file path, loads cast vote record from format used for Scottish election data
+    in [this repo](https://github.com/mggg/scot-elex).
 
     Args:
-        fpath (str): Path to cvr file.
+        fpath (str): Path to Scottish election csv file.
 
     Raises:
         FileNotFoundError: If fpath is invalid.
         EmptyDataError: If dataset is empty.
         DataError: If there is missing or incorrect metadata or candidate data.
 
     Returns:
-        tuple: A tuple ``(PreferenceProfile, seats)`` representing the election and the
-            number of seats in the election.
+        tuple: A tuple ``(PreferenceProfile, seats, cand_list, cand_to_party, ward)``
+            representing the election, the number of seats in the election, the candidate
+            names, a dictionary mapping candidates to their party, and the ward. The
+            candidate names are also stored in the PreferenceProfile object.
     """
-    ballots = []
-    names = []
-    name_map = {}
-    numbers = True
-    cands_included = False
 
     if not os.path.isfile(fpath):
         raise FileNotFoundError(f"File with path {fpath} cannot be found")
     if os.path.getsize(fpath) == 0:
-        raise EmptyDataError("Dataset cannot be empty")
+        raise EmptyDataError(f"CSV at {fpath} is empty.")
+
+    # Convert the ballot rows to ints while leaving the candidates as strings
+    def convert_row(row):
+        return [int(item) if item.isdigit() else item for item in row]
+
+    data = []
+    with open(fpath, "r") as f:
+        reader = csv.reader(f)
+        for row in reader:
+            # This just removes any empty strings that are hanging out since
+            # we don't need to preserve columns
+            filtered_row = list(filter(lambda x: x != "", row))
+
+            # only save non-empty rows
+            if len(filtered_row) > 0:
+                data.append(convert_row(filtered_row))
+
+    if len(data[0]) != 2:
+        raise DataError(
+            "The metadata in the first row should be number of \
+                            candidates, seats."
+        )
+
+    cand_num, seats = data[0][0], data[0][1]
+    ward = data[-1][0]
+
+    num_to_cand = {}
+    cand_to_party = {}
+
+    data_cand_num = len([r for r in data if "Candidate" in str(r[0])])
+    if data_cand_num != cand_num:
+        raise DataError(
+            "Incorrect number of candidates in either first row metadata \
+                        or in candidate list at end of csv file."
+        )
+
+    # record candidate names, which are up until the final row
+    for i, line in enumerate(data[len(data) - (cand_num + 1) : -1]):
+        if "Candidate" not in line[0]:
+            raise DataError(
+                f"The number of candidates on line 1 is {cand_num}, which\
+                            does not match the metadata."
+            )
+        cand = line[1]
+        party = line[2]
+
+        # candidates are 1 indexed
+        num_to_cand[i + 1] = cand
+        cand_to_party[cand] = party
+
+    cand_list = list(cand_to_party.keys())
+
+    ballots = [Ballot()] * len(data[1 : len(data) - (cand_num + 1)])
+
+    for i, line in enumerate(data[1 : len(data) - (cand_num + 1)]):
+        ballot_weight = Fraction(line[0])
+        cand_ordering = line[1:]
+        ranking = tuple([frozenset({num_to_cand[n]}) for n in cand_ordering])
+
+        ballots[i] = Ballot(ranking=ranking, weight=ballot_weight)
 
-    with open(fpath, "r") as file:
-        for i, line in enumerate(file):
-            s = line.rstrip("\n").rstrip()
-            if i == 0:
-                # first number is number of candidates, second is number of seats to elect
-                metadata = [int(data) for data in s.split(" ")]
-                if len(metadata) != 2:
-                    raise DataError(
-                        "metadata (first line) should have two parameters"
-                        " (number of candidates, number of seats)"
-                    )
-                seats = metadata[1]
-            # read in ballots, cleaning out rankings labeled '0' (designating end of line)
-            elif numbers:
-                ballot = [int(vote) for vote in s.split(" ")]
-                num_votes = ballot[0]
-                # ballots terminate with a single row with the character '0'
-                if num_votes == 0:
-                    numbers = False
-                else:
-                    ranking = [rank for rank in list(ballot[1:]) if rank != 0]
-                    b = (ranking, num_votes)
-                    ballots.append(b)  # this is converted to the PP format later
-            # read in candidates
-            elif "(" in s:
-                cands_included = True
-                name_parts = s.strip('"').split(" ")
-                first_name = " ".join(name_parts[:-2])
-                last_name = name_parts[-2]
-                party = name_parts[-1].strip("(").strip(")")
-                names.append(str((first_name, last_name, party)))
-            else:
-                if len(names) != metadata[0]:
-                    err_message = (
-                        f"Number of candidates listed, {len(names)}," + f" differs from"
-                        f"number of candidates recorded in metadata, {metadata[0]}"
-                    )
-                    raise DataError(err_message)
-                # read in election location (do we need this?)
-                # location = s.strip("\"")
-                if not cands_included:
-                    raise DataError("Candidates missing from file")
-                # map candidate numbers onto their names and convert ballots to PP format
-                for i, name in enumerate(names):
-                    name_map[i + 1] = name
-                clean_ballots = [
-                    Ballot(
-                        ranking=tuple(
-                            [frozenset({name_map[cand]}) for cand in ballot[0]]
-                        ),
-                        weight=Fraction(ballot[1]),
-                    )
-                    for ballot in ballots
-                ]
-
-        return PreferenceProfile(ballots=clean_ballots, candidates=names), seats
+    profile = PreferenceProfile(
+        ballots=ballots, candidates=cand_list
+    ).condense_ballots()
+    return (profile, seats, cand_list, cand_to_party, ward)
diff --git a/tests/data/csv/scot_bad_metadata.csv b/tests/data/csv/scot_bad_metadata.csv
@@ -0,0 +1 @@
+1,2,3,
diff --git a/tests/data/csv/scot_blank_rows.csv b/tests/data/csv/scot_blank_rows.csv
@@ -0,0 +1,13 @@
+3,1,
+126,1,
+
+9,1,2,
+10,1,2,3,
+1,3,2,1,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+
+"Wardy McWard Ward",
+
+
diff --git a/tests/data/csv/scot_candidate_overcount.csv b/tests/data/csv/scot_candidate_overcount.csv
@@ -0,0 +1,7 @@
+9,4,
+2,9,8,7,10,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+"Wardy McWard Ward",
+
diff --git a/tests/data/csv/scot_candidate_undercount.csv b/tests/data/csv/scot_candidate_undercount.csv
@@ -0,0 +1,6 @@
+2,4,
+2,1,2,3,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+"Wardy McWard Ward",
diff --git a/tests/data/txt/empty.blt → tests/data/csv/scot_empty.csv b/tests/data/txt/empty.blt → tests/data/csv/scot_empty.csv
diff --git a/tests/data/csv/scot_wardy_mc_ward.csv b/tests/data/csv/scot_wardy_mc_ward.csv
@@ -0,0 +1,9 @@
+3,1,
+126,1,
+9,1,2,
+10,1,2,3,
+1,3,2,1,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+"Wardy McWard Ward",
diff --git a/tests/data/txt/bad_metadata.blt b/tests/data/txt/bad_metadata.blt
diff --git a/tests/data/txt/candidate_metadata_conflict.blt b/tests/data/txt/candidate_metadata_conflict.blt
diff --git a/tests/data/txt/edinburgh17-01_abridged.blt b/tests/data/txt/edinburgh17-01_abridged.blt
diff --git a/tests/data/txt/scottish_mini.txt b/tests/data/txt/scottish_mini.txt
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
@@ -24,13 +24,14 @@ def test_load_clean_completion():
 
     # load CVR -> PP representation
     BASE_DIR = Path(__file__).resolve().parent
-    BLT_DIR = BASE_DIR / "data/txt/"
+    CSV_DIR = BASE_DIR / "data/csv/"
 
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
-    print(pp)
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        CSV_DIR / "scot_wardy_mc_ward.csv"
+    )
 
     # apply rules to get new PP
-    cleaned_pp = clean.remove_noncands(pp, ["Graham HUTCHISON (C)"])
+    cleaned_pp = clean.remove_noncands(pp, ["Paul"])
 
     # write intermediate output for inspection
     # cleaned_pp.save("cleaned.cvr")

diff --git a/tests/test_elections.py b/tests/test_elections.py
@@ -15,46 +15,53 @@
 
 BASE_DIR = Path(__file__).resolve().parent
 DATA_DIR = BASE_DIR / "data/csv/"
-BLT_DIR = BASE_DIR / "data/txt/"
 
 
 test_profile = load_csv(DATA_DIR / "test_election_A.csv")
 mn_profile = load_csv("src/votekit/data/mn_2013_cast_vote_record.csv")
 
 
 def test_droop_default_parameter():
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        DATA_DIR / "scot_wardy_mc_ward.csv"
+    )
 
     election = STV(pp, fractional_transfer, seats=seats)
 
-    droop_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / (4 + 1)) + 1
+    droop_quota = int((126 + 9 + 10 + 1) / (1 + 1)) + 1
 
     assert election.threshold == droop_quota
 
 
 def test_droop_inputed_parameter():
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        DATA_DIR / "scot_wardy_mc_ward.csv"
+    )
 
     election = STV(pp, fractional_transfer, seats=seats, quota="Droop")
 
-    droop_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / (4 + 1)) + 1
+    droop_quota = int((126 + 9 + 10 + 1) / (1 + 1)) + 1
 
     assert election.threshold == droop_quota
 
 
 def test_quota_misspelled_parameter():
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        DATA_DIR / "scot_wardy_mc_ward.csv"
+    )
 
     with pytest.raises(ValueError):
         _ = STV(pp, fractional_transfer, seats=seats, quota="droops")
 
 
 def test_hare_quota():
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        DATA_DIR / "scot_wardy_mc_ward.csv"
+    )
 
     election = STV(pp, fractional_transfer, seats=seats, quota="hare")
 
-    hare_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / 4)
+    hare_quota = int((126 + 9 + 10 + 1) / 1)
 
     assert election.threshold == hare_quota