From 8b37d9adc92c4194d834326fab7022227e481677 Mon Sep 17 00:00:00 2001 From: Chris Donnay Date: Sun, 30 Jun 2024 11:18:40 -0400 Subject: [PATCH] revamp load_scottish, fix tests --- src/votekit/cvr_loaders.py | 133 +++++++++--------- tests/data/csv/scot_bad_metadata.csv | 1 + tests/data/csv/scot_candidate_overcount.csv | 7 + tests/data/csv/scot_candidate_undercount.csv | 7 + .../{txt/empty.blt => csv/scot_empty.csv} | 0 tests/data/csv/scot_wardy_mc_ward.csv | 9 ++ tests/data/txt/bad_metadata.blt | 1 - .../data/txt/candidate_metadata_conflict.blt | 20 --- tests/data/txt/edinburgh17-01_abridged.blt | 21 --- tests/data/txt/scottish_mini.txt | 7 - tests/test_e2e.py | 9 +- tests/test_elections.py | 23 +-- tests/test_loaders.py | 43 +++++- 13 files changed, 146 insertions(+), 135 deletions(-) create mode 100644 tests/data/csv/scot_bad_metadata.csv create mode 100644 tests/data/csv/scot_candidate_overcount.csv create mode 100644 tests/data/csv/scot_candidate_undercount.csv rename tests/data/{txt/empty.blt => csv/scot_empty.csv} (100%) create mode 100644 tests/data/csv/scot_wardy_mc_ward.csv delete mode 100644 tests/data/txt/bad_metadata.blt delete mode 100644 tests/data/txt/candidate_metadata_conflict.blt delete mode 100644 tests/data/txt/edinburgh17-01_abridged.blt delete mode 100644 tests/data/txt/scottish_mini.txt diff --git a/src/votekit/cvr_loaders.py b/src/votekit/cvr_loaders.py index 01089d8c..a23a85f1 100644 --- a/src/votekit/cvr_loaders.py +++ b/src/votekit/cvr_loaders.py @@ -89,13 +89,15 @@ def load_csv( return PreferenceProfile(ballots=ballots) -def load_scottish(fpath: str) -> tuple[PreferenceProfile, int]: +def load_scottish( + fpath: str, +) -> tuple[PreferenceProfile, int, list[str], dict[str, str], str]: """ - Given a file path, loads cvr from format used for Scottish election data in - (this repo)[https://github.com/mggg/scot-elex]. + Given a file path, loads cast vote record from format used for Scottish election data + in (this repo)[https://github.com/mggg/scot-elex]. Args: - fpath (str): Path to cvr file. + fpath (str): Path to Scottish election csv file. Raises: FileNotFoundError: If fpath is invalid. @@ -103,73 +105,70 @@ def load_scottish(fpath: str) -> tuple[PreferenceProfile, int]: DataError: If there is missing or incorrect metadata or candidate data. Returns: - tuple: A tuple ``(PreferenceProfile, seats)`` representing the election and the - number of seats in the election. + tuple: A tuple ``(PreferenceProfile, seats, cand_list, cand_to_party, ward)`` + representing the election, the number of seats in the election, the candidate + names, a dictionary mapping candidates to their party, and the ward. The + candidate names are also stored in the PreferenceProfile object. """ - ballots = [] - names = [] - name_map = {} - numbers = True - cands_included = False if not os.path.isfile(fpath): raise FileNotFoundError(f"File with path {fpath} cannot be found") if os.path.getsize(fpath) == 0: - raise EmptyDataError("Dataset cannot be empty") + raise EmptyDataError(f"CSV at {fpath} is empty.") with open(fpath, "r") as file: - for i, line in enumerate(file): - s = line.rstrip("\n").rstrip() - if i == 0: - # first number is number of candidates, second is number of seats to elect - metadata = [int(data) for data in s.split(" ")] - if len(metadata) != 2: - raise DataError( - "metadata (first line) should have two parameters" - " (number of candidates, number of seats)" - ) - seats = metadata[1] - # read in ballots, cleaning out rankings labeled '0' (designating end of line) - elif numbers: - ballot = [int(vote) for vote in s.split(" ")] - num_votes = ballot[0] - # ballots terminate with a single row with the character '0' - if num_votes == 0: - numbers = False - else: - ranking = [rank for rank in list(ballot[1:]) if rank != 0] - b = (ranking, num_votes) - ballots.append(b) # this is converted to the PP format later - # read in candidates - elif "(" in s: - cands_included = True - name_parts = s.strip('"').split(" ") - first_name = " ".join(name_parts[:-2]) - last_name = name_parts[-2] - party = name_parts[-1].strip("(").strip(")") - names.append(str((first_name, last_name, party))) - else: - if len(names) != metadata[0]: - err_message = ( - f"Number of candidates listed, {len(names)}," + f" differs from" - f"number of candidates recorded in metadata, {metadata[0]}" - ) - raise DataError(err_message) - # read in election location (do we need this?) - # location = s.strip("\"") - if not cands_included: - raise DataError("Candidates missing from file") - # map candidate numbers onto their names and convert ballots to PP format - for i, name in enumerate(names): - name_map[i + 1] = name - clean_ballots = [ - Ballot( - ranking=tuple( - [frozenset({name_map[cand]}) for cand in ballot[0]] - ), - weight=Fraction(ballot[1]), - ) - for ballot in ballots - ] - - return PreferenceProfile(ballots=clean_ballots, candidates=names), seats + lines = list(file) + + # remove errant blank character at end of line + row_0 = lines[0].split(",")[:-1] + + if len(row_0) != 2: + raise DataError( + "The metadata in the first show should be number of \ + candidates, seats." + ) + + cand_num, seats = int(row_0[0]), int(row_0[1]) + ward = lines[-1].split(",")[0].strip('"') + + num_to_cand = {} + cand_to_party = {} + + # record candidate names, which are up until the final row + for i, line in enumerate(lines[len(lines) - (cand_num + 1) : -1]): + parsed_line = line.split(",") + if "Candidate" not in parsed_line[0]: + raise DataError( + f"The number of candidates on line 1 is {cand_num}, which\ + does not match the metadata." + ) + cand = parsed_line[1].strip('"') + party = parsed_line[2].strip('"') + + # candidates are 1 indexed + num_to_cand[str(i + 1)] = cand + cand_to_party[cand] = party + + cand_list = list(cand_to_party.keys()) + + if len(cand_list) != cand_num: + raise DataError( + "Incorrect number of candidates in either first row metadata \ + or in candidate list at end of csv file." + ) + ballots = [Ballot()] * len(lines[1 : len(lines) - (cand_num + 1)]) + + for i, line in enumerate(lines[1 : len(lines) - (cand_num + 1)]): + # remove carriage return and blank string after final comma + parsed_line = line.strip("\n").split(",")[:-1] + + ballot_weight = Fraction(parsed_line[0]) + cand_ordering = parsed_line[1:] + ranking = tuple([frozenset({num_to_cand[n]}) for n in cand_ordering]) + + ballots[i] = Ballot(ranking=ranking, weight=ballot_weight) + + profile = PreferenceProfile( + ballots=ballots, candidates=cand_list + ).condense_ballots() + return (profile, seats, cand_list, cand_to_party, ward) diff --git a/tests/data/csv/scot_bad_metadata.csv b/tests/data/csv/scot_bad_metadata.csv new file mode 100644 index 00000000..1e96900a --- /dev/null +++ b/tests/data/csv/scot_bad_metadata.csv @@ -0,0 +1 @@ +1,2,3, \ No newline at end of file diff --git a/tests/data/csv/scot_candidate_overcount.csv b/tests/data/csv/scot_candidate_overcount.csv new file mode 100644 index 00000000..bfdc0dbd --- /dev/null +++ b/tests/data/csv/scot_candidate_overcount.csv @@ -0,0 +1,7 @@ +2,4, +2,9,8,7,10, +"Candidate 1","Paul","Orange (O)", +"Candidate 2","George","Yellow (Y)", +"Candidate 3","Ringo","Red (R)", +"Wardy McWard Ward", + diff --git a/tests/data/csv/scot_candidate_undercount.csv b/tests/data/csv/scot_candidate_undercount.csv new file mode 100644 index 00000000..750f1473 --- /dev/null +++ b/tests/data/csv/scot_candidate_undercount.csv @@ -0,0 +1,7 @@ +9,4, +2,9,8,7,10, +"Candidate 1","Paul","Orange (O)", +"Candidate 2","George","Yellow (Y)", +"Candidate 3","Ringo","Red (R)", +"Wardy McWard Ward", + diff --git a/tests/data/txt/empty.blt b/tests/data/csv/scot_empty.csv similarity index 100% rename from tests/data/txt/empty.blt rename to tests/data/csv/scot_empty.csv diff --git a/tests/data/csv/scot_wardy_mc_ward.csv b/tests/data/csv/scot_wardy_mc_ward.csv new file mode 100644 index 00000000..21c13f90 --- /dev/null +++ b/tests/data/csv/scot_wardy_mc_ward.csv @@ -0,0 +1,9 @@ +3,1, +126,1, +9,1,2, +10,1,2,3, +1,3,2,1, +"Candidate 1","Paul","Orange (O)", +"Candidate 2","George","Yellow (Y)", +"Candidate 3","Ringo","Red (R)", +"Wardy McWard Ward", diff --git a/tests/data/txt/bad_metadata.blt b/tests/data/txt/bad_metadata.blt deleted file mode 100644 index 703ca85b..00000000 --- a/tests/data/txt/bad_metadata.blt +++ /dev/null @@ -1 +0,0 @@ -1 2 3 \ No newline at end of file diff --git a/tests/data/txt/candidate_metadata_conflict.blt b/tests/data/txt/candidate_metadata_conflict.blt deleted file mode 100644 index e6c8c420..00000000 --- a/tests/data/txt/candidate_metadata_conflict.blt +++ /dev/null @@ -1,20 +0,0 @@ -9 4 -2 9 8 7 10 0 -1 9 8 7 10 6 5 4 3 1 2 0 -1 9 8 7 3 6 5 2 4 10 1 0 -1 9 8 7 4 10 6 0 -2 9 8 7 6 0 -1 9 8 7 6 4 10 0 -0 -"Daniel FRASER (Libtn)" -"Graham HUTCHISON (C)" -"Otto INGLIS (UKIP)" -"Kevin LANG (LD)" -"John LONGSTAFF (Ind)" -"Iain MCKINNON-WADDELL (Grn)" -"Pamela MITCHELL (SNP)" -"Bruce WHITEHEAD (Lab)" -"Norrie WORK (SNP)" -"Louise YOUNG (LD)" -"Ward 1 - Almond" - diff --git a/tests/data/txt/edinburgh17-01_abridged.blt b/tests/data/txt/edinburgh17-01_abridged.blt deleted file mode 100644 index 4b84ee5a..00000000 --- a/tests/data/txt/edinburgh17-01_abridged.blt +++ /dev/null @@ -1,21 +0,0 @@ -10 4 -8 1 0 -14 10 0 -1 10 1 2 8 4 0 -13 2 1 0 -1 3 10 5 0 -1 4 10 2 8 1 6 7 9 3 5 0 -2 9 8 7 6 0 -0 -"Daniel FRASER (Libtn)" -"Graham HUTCHISON (C)" -"Otto INGLIS (UKIP)" -"Kevin LANG (LD)" -"John LONGSTAFF (Ind)" -"Iain MCKINNON-WADDELL (Grn)" -"Pamela MITCHELL (SNP)" -"Bruce WHITEHEAD (Lab)" -"Norrie WORK (SNP)" -"Louise YOUNG (LD)" -"Ward 1 - Almond" - diff --git a/tests/data/txt/scottish_mini.txt b/tests/data/txt/scottish_mini.txt deleted file mode 100644 index c8974743..00000000 --- a/tests/data/txt/scottish_mini.txt +++ /dev/null @@ -1,7 +0,0 @@ -100 1 0 -10 1 2 0 -8 1 2 3 0 -1 1 2 3 4 0 -1 1 2 3 4 5 6 7 8 9 0 -1 1 2 3 6 7 9 0 -1 1 2 3 7 4 5 8 6 9 0 \ No newline at end of file diff --git a/tests/test_e2e.py b/tests/test_e2e.py index fea9e134..a3882144 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -24,13 +24,14 @@ def test_load_clean_completion(): # load CVR -> PP representation BASE_DIR = Path(__file__).resolve().parent - BLT_DIR = BASE_DIR / "data/txt/" + CSV_DIR = BASE_DIR / "data/csv/" - pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt") - print(pp) + pp, seats, cand_list, cand_to_party, ward = load_scottish( + CSV_DIR / "scot_wardy_mc_ward.csv" + ) # apply rules to get new PP - cleaned_pp = clean.remove_noncands(pp, ["Graham HUTCHISON (C)"]) + cleaned_pp = clean.remove_noncands(pp, ["Paul"]) # write intermediate output for inspection # cleaned_pp.save("cleaned.cvr") diff --git a/tests/test_elections.py b/tests/test_elections.py index a916e3a9..99aec68a 100644 --- a/tests/test_elections.py +++ b/tests/test_elections.py @@ -15,7 +15,6 @@ BASE_DIR = Path(__file__).resolve().parent DATA_DIR = BASE_DIR / "data/csv/" -BLT_DIR = BASE_DIR / "data/txt/" test_profile = load_csv(DATA_DIR / "test_election_A.csv") @@ -23,38 +22,46 @@ def test_droop_default_parameter(): - pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt") + pp, seats, cand_list, cand_to_party, ward = load_scottish( + DATA_DIR / "scot_wardy_mc_ward.csv" + ) election = STV(pp, fractional_transfer, seats=seats) - droop_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / (4 + 1)) + 1 + droop_quota = int((126 + 9 + 10 + 1) / (1 + 1)) + 1 assert election.threshold == droop_quota def test_droop_inputed_parameter(): - pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt") + pp, seats, cand_list, cand_to_party, ward = load_scottish( + DATA_DIR / "scot_wardy_mc_ward.csv" + ) election = STV(pp, fractional_transfer, seats=seats, quota="Droop") - droop_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / (4 + 1)) + 1 + droop_quota = int((126 + 9 + 10 + 1) / (1 + 1)) + 1 assert election.threshold == droop_quota def test_quota_misspelled_parameter(): - pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt") + pp, seats, cand_list, cand_to_party, ward = load_scottish( + DATA_DIR / "scot_wardy_mc_ward.csv" + ) with pytest.raises(ValueError): _ = STV(pp, fractional_transfer, seats=seats, quota="droops") def test_hare_quota(): - pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt") + pp, seats, cand_list, cand_to_party, ward = load_scottish( + DATA_DIR / "scot_wardy_mc_ward.csv" + ) election = STV(pp, fractional_transfer, seats=seats, quota="hare") - hare_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / 4) + hare_quota = int((126 + 9 + 10 + 1) / 1) assert election.threshold == hare_quota diff --git a/tests/test_loaders.py b/tests/test_loaders.py index a94a118d..157961a3 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -10,7 +10,6 @@ BASE_DIR = Path(__file__).resolve().parent CSV_DIR = BASE_DIR / "data/csv/" -BLT_DIR = BASE_DIR / "data/txt/" def is_equal(b1: list[Ballot], b2: list[Ballot]) -> bool: @@ -161,21 +160,51 @@ def test_same_name(): # # print(p) -def test_blt_seats_parse(): - pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt") - assert seats == 4 +def test_blt_parse(): + pp, seats, cand_list, cand_to_party, ward = load_scottish( + CSV_DIR / "scot_wardy_mc_ward.csv" + ) + + assert seats == 1 + assert isinstance(pp, PreferenceProfile) + assert cand_list == ["Paul", "George", "Ringo"] + assert cand_to_party == { + "Paul": "Orange (O)", + "George": "Yellow (Y)", + "Ringo": "Red (R)", + } + assert ward == "Wardy McWard Ward" + assert int(pp.num_ballots()) == 146 + assert Ballot(ranking=tuple([frozenset({"Paul"})]), weight=126) in pp.ballots + assert ( + Ballot( + ranking=tuple( + [frozenset({"Ringo"}), frozenset({"George"}), frozenset({"Paul"})] + ), + weight=1, + ) + in pp.ballots + ) + + +def test_bad_file_path_blt(): + with pytest.raises(FileNotFoundError): + load_scottish("") def test_empty_file_blt(): with pytest.raises(EmptyDataError): - pp, seats = load_scottish(BLT_DIR / "empty.blt") + load_scottish(CSV_DIR / "scot_empty.csv") def test_bad_metadata_blt(): with pytest.raises(DataError): - pp, seats = load_scottish(BLT_DIR / "bad_metadata.blt") + load_scottish(CSV_DIR / "scot_bad_metadata.csv") def test_incorrect_metadata_blt(): with pytest.raises(DataError): - pp, seats = load_scottish(BLT_DIR / "candidate_metadata_conflict.blt") + load_scottish(CSV_DIR / "scot_candidate_overcount.csv") + + with pytest.raises(DataError): + load_scottish(CSV_DIR / "scot_candidate_undercount.csv")