revamp load_scottish, fix tests

mggg · Jun 30, 2024 · 8b37d9a · 8b37d9a
1 parent 0ef4ead
commit 8b37d9a
Show file tree

Hide file tree

Showing 13 changed files with 146 additions and 135 deletions.
diff --git a/src/votekit/cvr_loaders.py b/src/votekit/cvr_loaders.py
@@ -89,87 +89,86 @@ def load_csv(
     return PreferenceProfile(ballots=ballots)
 
 
-def load_scottish(fpath: str) -> tuple[PreferenceProfile, int]:
+def load_scottish(
+    fpath: str,
+) -> tuple[PreferenceProfile, int, list[str], dict[str, str], str]:
     """
-    Given a file path, loads cvr from format used for Scottish election data in
-    (this repo)[https://github.com/mggg/scot-elex].
+    Given a file path, loads cast vote record from format used for Scottish election data
+    in (this repo)[https://github.com/mggg/scot-elex].
 
     Args:
-        fpath (str): Path to cvr file.
+        fpath (str): Path to Scottish election csv file.
 
     Raises:
         FileNotFoundError: If fpath is invalid.
         EmptyDataError: If dataset is empty.
         DataError: If there is missing or incorrect metadata or candidate data.
 
     Returns:
-        tuple: A tuple ``(PreferenceProfile, seats)`` representing the election and the
-            number of seats in the election.
+        tuple: A tuple ``(PreferenceProfile, seats, cand_list, cand_to_party, ward)``
+            representing the election, the number of seats in the election, the candidate
+            names, a dictionary mapping candidates to their party, and the ward. The
+            candidate names are also stored in the PreferenceProfile object.
     """
-    ballots = []
-    names = []
-    name_map = {}
-    numbers = True
-    cands_included = False
 
     if not os.path.isfile(fpath):
         raise FileNotFoundError(f"File with path {fpath} cannot be found")
     if os.path.getsize(fpath) == 0:
-        raise EmptyDataError("Dataset cannot be empty")
+        raise EmptyDataError(f"CSV at {fpath} is empty.")
 
     with open(fpath, "r") as file:
-        for i, line in enumerate(file):
-            s = line.rstrip("\n").rstrip()
-            if i == 0:
-                # first number is number of candidates, second is number of seats to elect
-                metadata = [int(data) for data in s.split(" ")]
-                if len(metadata) != 2:
-                    raise DataError(
-                        "metadata (first line) should have two parameters"
-                        " (number of candidates, number of seats)"
-                    )
-                seats = metadata[1]
-            # read in ballots, cleaning out rankings labeled '0' (designating end of line)
-            elif numbers:
-                ballot = [int(vote) for vote in s.split(" ")]
-                num_votes = ballot[0]
-                # ballots terminate with a single row with the character '0'
-                if num_votes == 0:
-                    numbers = False
-                else:
-                    ranking = [rank for rank in list(ballot[1:]) if rank != 0]
-                    b = (ranking, num_votes)
-                    ballots.append(b)  # this is converted to the PP format later
-            # read in candidates
-            elif "(" in s:
-                cands_included = True
-                name_parts = s.strip('"').split(" ")
-                first_name = " ".join(name_parts[:-2])
-                last_name = name_parts[-2]
-                party = name_parts[-1].strip("(").strip(")")
-                names.append(str((first_name, last_name, party)))
-            else:
-                if len(names) != metadata[0]:
-                    err_message = (
-                        f"Number of candidates listed, {len(names)}," + f" differs from"
-                        f"number of candidates recorded in metadata, {metadata[0]}"
-                    )
-                    raise DataError(err_message)
-                # read in election location (do we need this?)
-                # location = s.strip("\"")
-                if not cands_included:
-                    raise DataError("Candidates missing from file")
-                # map candidate numbers onto their names and convert ballots to PP format
-                for i, name in enumerate(names):
-                    name_map[i + 1] = name
-                clean_ballots = [
-                    Ballot(
-                        ranking=tuple(
-                            [frozenset({name_map[cand]}) for cand in ballot[0]]
-                        ),
-                        weight=Fraction(ballot[1]),
-                    )
-                    for ballot in ballots
-                ]
-
-        return PreferenceProfile(ballots=clean_ballots, candidates=names), seats
+        lines = list(file)
+
+        # drop the trailing field after the final comma (it holds only the newline)
+        row_0 = lines[0].split(",")[:-1]
+
+        if len(row_0) != 2:
+            raise DataError(
+                "The metadata in the first row should be number of "
+                "candidates, seats."
+            )
+
+        cand_num, seats = int(row_0[0]), int(row_0[1])
+        ward = lines[-1].split(",")[0].strip('"')
+
+        num_to_cand = {}
+        cand_to_party = {}
+
+        # record candidate names, which are up until the final row
+        for i, line in enumerate(lines[len(lines) - (cand_num + 1) : -1]):
+            parsed_line = line.split(",")
+            if "Candidate" not in parsed_line[0]:
+                raise DataError(
+                    f"The number of candidates on line 1 is {cand_num}, which "
+                    "does not match the candidate rows at the end of the file."
+                )
+            cand = parsed_line[1].strip('"')
+            party = parsed_line[2].strip('"')
+
+            # candidates are 1 indexed
+            num_to_cand[str(i + 1)] = cand
+            cand_to_party[cand] = party
+
+        cand_list = list(cand_to_party.keys())
+
+        if len(cand_list) != cand_num:
+            raise DataError(
+                "Incorrect number of candidates in either first row metadata "
+                "or in candidate list at end of csv file."
+            )
+        ballots = [Ballot()] * len(lines[1 : len(lines) - (cand_num + 1)])
+
+        for i, line in enumerate(lines[1 : len(lines) - (cand_num + 1)]):
+            # remove trailing newline and blank string after final comma
+            parsed_line = line.strip("\n").split(",")[:-1]
+
+            ballot_weight = Fraction(parsed_line[0])
+            cand_ordering = parsed_line[1:]
+            ranking = tuple([frozenset({num_to_cand[n]}) for n in cand_ordering])
+
+            ballots[i] = Ballot(ranking=ranking, weight=ballot_weight)
+
+        profile = PreferenceProfile(
+            ballots=ballots, candidates=cand_list
+        ).condense_ballots()
+        return (profile, seats, cand_list, cand_to_party, ward)
diff --git a/tests/data/csv/scot_bad_metadata.csv b/tests/data/csv/scot_bad_metadata.csv
@@ -0,0 +1 @@
+1,2,3,
diff --git a/tests/data/csv/scot_candidate_overcount.csv b/tests/data/csv/scot_candidate_overcount.csv
@@ -0,0 +1,7 @@
+2,4,
+2,9,8,7,10,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+"Wardy McWard Ward",
+
diff --git a/tests/data/csv/scot_candidate_undercount.csv b/tests/data/csv/scot_candidate_undercount.csv
@@ -0,0 +1,7 @@
+9,4,
+2,9,8,7,10,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+"Wardy McWard Ward",
+
diff --git a/tests/data/txt/empty.blt → tests/data/csv/scot_empty.csv b/tests/data/txt/empty.blt → tests/data/csv/scot_empty.csv
diff --git a/tests/data/csv/scot_wardy_mc_ward.csv b/tests/data/csv/scot_wardy_mc_ward.csv
@@ -0,0 +1,9 @@
+3,1,
+126,1,
+9,1,2,
+10,1,2,3,
+1,3,2,1,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+"Wardy McWard Ward",
diff --git a/tests/data/txt/bad_metadata.blt b/tests/data/txt/bad_metadata.blt
diff --git a/tests/data/txt/candidate_metadata_conflict.blt b/tests/data/txt/candidate_metadata_conflict.blt
diff --git a/tests/data/txt/edinburgh17-01_abridged.blt b/tests/data/txt/edinburgh17-01_abridged.blt
diff --git a/tests/data/txt/scottish_mini.txt b/tests/data/txt/scottish_mini.txt
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
@@ -24,13 +24,14 @@ def test_load_clean_completion():
 
     # load CVR -> PP representation
     BASE_DIR = Path(__file__).resolve().parent
-    BLT_DIR = BASE_DIR / "data/txt/"
+    CSV_DIR = BASE_DIR / "data/csv/"
 
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
-    print(pp)
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        CSV_DIR / "scot_wardy_mc_ward.csv"
+    )
 
     # apply rules to get new PP
-    cleaned_pp = clean.remove_noncands(pp, ["Graham HUTCHISON (C)"])
+    cleaned_pp = clean.remove_noncands(pp, ["Paul"])
 
     # write intermediate output for inspection
     # cleaned_pp.save("cleaned.cvr")

diff --git a/tests/test_elections.py b/tests/test_elections.py
@@ -15,46 +15,53 @@
 
 BASE_DIR = Path(__file__).resolve().parent
 DATA_DIR = BASE_DIR / "data/csv/"
-BLT_DIR = BASE_DIR / "data/txt/"
 
 
 test_profile = load_csv(DATA_DIR / "test_election_A.csv")
 mn_profile = load_csv("src/votekit/data/mn_2013_cast_vote_record.csv")
 
 
 def test_droop_default_parameter():
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        DATA_DIR / "scot_wardy_mc_ward.csv"
+    )
 
     election = STV(pp, fractional_transfer, seats=seats)
 
-    droop_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / (4 + 1)) + 1
+    droop_quota = int((126 + 9 + 10 + 1) / (1 + 1)) + 1
 
     assert election.threshold == droop_quota
 
 
 def test_droop_inputed_parameter():
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        DATA_DIR / "scot_wardy_mc_ward.csv"
+    )
 
     election = STV(pp, fractional_transfer, seats=seats, quota="Droop")
 
-    droop_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / (4 + 1)) + 1
+    droop_quota = int((126 + 9 + 10 + 1) / (1 + 1)) + 1
 
     assert election.threshold == droop_quota
 
 
 def test_quota_misspelled_parameter():
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        DATA_DIR / "scot_wardy_mc_ward.csv"
+    )
 
     with pytest.raises(ValueError):
         _ = STV(pp, fractional_transfer, seats=seats, quota="droops")
 
 
 def test_hare_quota():
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        DATA_DIR / "scot_wardy_mc_ward.csv"
+    )
 
     election = STV(pp, fractional_transfer, seats=seats, quota="hare")
 
-    hare_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / 4)
+    hare_quota = int((126 + 9 + 10 + 1) / 1)
 
     assert election.threshold == hare_quota
 

diff --git a/tests/test_loaders.py b/tests/test_loaders.py
@@ -10,7 +10,6 @@
 
 BASE_DIR = Path(__file__).resolve().parent
 CSV_DIR = BASE_DIR / "data/csv/"
-BLT_DIR = BASE_DIR / "data/txt/"
 
 
 def is_equal(b1: list[Ballot], b2: list[Ballot]) -> bool:
@@ -161,21 +160,51 @@ def test_same_name():
 #     # print(p)
 
 
-def test_blt_seats_parse():
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
-    assert seats == 4
+def test_blt_parse():
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        CSV_DIR / "scot_wardy_mc_ward.csv"
+    )
+
+    assert seats == 1
+    assert isinstance(pp, PreferenceProfile)
+    assert cand_list == ["Paul", "George", "Ringo"]
+    assert cand_to_party == {
+        "Paul": "Orange (O)",
+        "George": "Yellow (Y)",
+        "Ringo": "Red (R)",
+    }
+    assert ward == "Wardy McWard Ward"
+    assert int(pp.num_ballots()) == 146
+    assert Ballot(ranking=tuple([frozenset({"Paul"})]), weight=126) in pp.ballots
+    assert (
+        Ballot(
+            ranking=tuple(
+                [frozenset({"Ringo"}), frozenset({"George"}), frozenset({"Paul"})]
+            ),
+            weight=1,
+        )
+        in pp.ballots
+    )
+
+
+def test_bad_file_path_blt():
+    with pytest.raises(FileNotFoundError):
+        load_scottish("")
 
 
 def test_empty_file_blt():
     with pytest.raises(EmptyDataError):
-        pp, seats = load_scottish(BLT_DIR / "empty.blt")
+        load_scottish(CSV_DIR / "scot_empty.csv")
 
 
 def test_bad_metadata_blt():
     with pytest.raises(DataError):
-        pp, seats = load_scottish(BLT_DIR / "bad_metadata.blt")
+        load_scottish(CSV_DIR / "scot_bad_metadata.csv")
 
 
 def test_incorrect_metadata_blt():
     with pytest.raises(DataError):
-        pp, seats = load_scottish(BLT_DIR / "candidate_metadata_conflict.blt")
+        load_scottish(CSV_DIR / "scot_candidate_overcount.csv")
+
+    with pytest.raises(DataError):
+        load_scottish(CSV_DIR / "scot_candidate_undercount.csv")