diff --git a/src/votekit/cvr_loaders.py b/src/votekit/cvr_loaders.py index a23a85f..428c320 100644 --- a/src/votekit/cvr_loaders.py +++ b/src/votekit/cvr_loaders.py @@ -1,5 +1,6 @@ from fractions import Fraction import os +import csv import pandas as pd from pandas.errors import EmptyDataError, DataError import pathlib @@ -116,59 +117,67 @@ def load_scottish( if os.path.getsize(fpath) == 0: raise EmptyDataError(f"CSV at {fpath} is empty.") - with open(fpath, "r") as file: - lines = list(file) - - # remove errant blank character at end of line - row_0 = lines[0].split(",")[:-1] - - if len(row_0) != 2: - raise DataError( - "The metadata in the first show should be number of \ + # Convert the ballot rows to ints while leaving the candidates as strings + def convert_row(row): + return [int(item) if item.isdigit() else item for item in row] + + data = [] + with open(fpath, "r") as f: + reader = csv.reader(f) + for row in reader: + # This just removes any empty strings that are hanging out since + # we don't need to preserve columns + filtered_row = list(filter(lambda x: x != "", row)) + + # only save non-empty rows + if len(filtered_row) > 0: + data.append(convert_row(filtered_row)) + + if len(data[0]) != 2: + raise DataError( + "The metadata in the first row should be number of \ candidates, seats." - ) - - cand_num, seats = int(row_0[0]), int(row_0[1]) - ward = lines[-1].split(",")[0].strip('"') - - num_to_cand = {} - cand_to_party = {} + ) - # record candidate names, which are up until the final row - for i, line in enumerate(lines[len(lines) - (cand_num + 1) : -1]): - parsed_line = line.split(",") - if "Candidate" not in parsed_line[0]: - raise DataError( - f"The number of candidates on line 1 is {cand_num}, which\ - does not match the metadata." - ) - cand = parsed_line[1].strip('"') - party = parsed_line[2].strip('"') + cand_num, seats = data[0][0], data[0][1] + ward = data[-1][0] - # candidates are 1 indexed - num_to_cand[str(i + 1)] = cand - cand_to_party[cand] = party + num_to_cand = {} + cand_to_party = {} - cand_list = list(cand_to_party.keys()) + data_cand_num = len([r for r in data if "Candidate" in str(r[0])]) + if data_cand_num != cand_num: + raise DataError( + "Incorrect number of candidates in either first row metadata \ + or in candidate list at end of csv file." + ) - if len(cand_list) != cand_num: + # record candidate names, which are up until the final row + for i, line in enumerate(data[len(data) - (cand_num + 1) : -1]): + if "Candidate" not in line[0]: raise DataError( - "Incorrect number of candidates in either first row metadata \ - or in candidate list at end of csv file." + f"The number of candidates on line 1 is {cand_num}, which\ + does not match the metadata." ) - ballots = [Ballot()] * len(lines[1 : len(lines) - (cand_num + 1)]) + cand = line[1] + party = line[2] + + # candidates are 1 indexed + num_to_cand[i + 1] = cand + cand_to_party[cand] = party + + cand_list = list(cand_to_party.keys()) - for i, line in enumerate(lines[1 : len(lines) - (cand_num + 1)]): - # remove carriage return and blank string after final comma - parsed_line = line.strip("\n").split(",")[:-1] + ballots = [Ballot()] * len(data[1 : len(data) - (cand_num + 1)]) - ballot_weight = Fraction(parsed_line[0]) - cand_ordering = parsed_line[1:] - ranking = tuple([frozenset({num_to_cand[n]}) for n in cand_ordering]) + for i, line in enumerate(data[1 : len(data) - (cand_num + 1)]): + ballot_weight = Fraction(line[0]) + cand_ordering = line[1:] + ranking = tuple([frozenset({num_to_cand[n]}) for n in cand_ordering]) - ballots[i] = Ballot(ranking=ranking, weight=ballot_weight) + ballots[i] = Ballot(ranking=ranking, weight=ballot_weight) - profile = PreferenceProfile( - ballots=ballots, candidates=cand_list - ).condense_ballots() - return (profile, seats, cand_list, cand_to_party, ward) + profile = PreferenceProfile( + ballots=ballots, candidates=cand_list + ).condense_ballots() + return (profile, seats, cand_list, cand_to_party, ward) diff --git a/tests/data/csv/scot_blank_rows.csv b/tests/data/csv/scot_blank_rows.csv new file mode 100644 index 0000000..ecd69c4 --- /dev/null +++ b/tests/data/csv/scot_blank_rows.csv @@ -0,0 +1,13 @@ +3,1, +126,1, + +9,1,2, +10,1,2,3, +1,3,2,1, +"Candidate 1","Paul","Orange (O)", +"Candidate 2","George","Yellow (Y)", +"Candidate 3","Ringo","Red (R)", + +"Wardy McWard Ward", + + diff --git a/tests/data/csv/scot_candidate_overcount.csv b/tests/data/csv/scot_candidate_overcount.csv index bfdc0db..750f147 100644 --- a/tests/data/csv/scot_candidate_overcount.csv +++ b/tests/data/csv/scot_candidate_overcount.csv @@ -1,4 +1,4 @@ -2,4, +9,4, 2,9,8,7,10, "Candidate 1","Paul","Orange (O)", "Candidate 2","George","Yellow (Y)", diff --git a/tests/data/csv/scot_candidate_undercount.csv b/tests/data/csv/scot_candidate_undercount.csv index 750f147..475412e 100644 --- a/tests/data/csv/scot_candidate_undercount.csv +++ b/tests/data/csv/scot_candidate_undercount.csv @@ -1,7 +1,6 @@ -9,4, -2,9,8,7,10, +2,4, +2,1,2,3, "Candidate 1","Paul","Orange (O)", "Candidate 2","George","Yellow (Y)", "Candidate 3","Ringo","Red (R)", -"Wardy McWard Ward", - +"Wardy McWard Ward", \ No newline at end of file diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 157961a..e51c037 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -160,7 +160,7 @@ def test_same_name(): # # print(p) -def test_blt_parse(): +def test_scot_csv_parse(): pp, seats, cand_list, cand_to_party, ward = load_scottish( CSV_DIR / "scot_wardy_mc_ward.csv" ) @@ -187,22 +187,49 @@ def test_blt_parse(): ) -def test_bad_file_path_blt(): +def test_scot_csv_blank_rows(): + pp, seats, cand_list, cand_to_party, ward = load_scottish( + CSV_DIR / "scot_blank_rows.csv" + ) + + assert seats == 1 + assert isinstance(pp, PreferenceProfile) + assert cand_list == ["Paul", "George", "Ringo"] + assert cand_to_party == { + "Paul": "Orange (O)", + "George": "Yellow (Y)", + "Ringo": "Red (R)", + } + assert ward == "Wardy McWard Ward" + assert int(pp.num_ballots()) == 146 + assert Ballot(ranking=tuple([frozenset({"Paul"})]), weight=126) in pp.ballots + assert ( + Ballot( + ranking=tuple( + [frozenset({"Ringo"}), frozenset({"George"}), frozenset({"Paul"})] + ), + weight=1, + ) + in pp.ballots + ) + + +def test_bad_file_path_scot_csv(): with pytest.raises(FileNotFoundError): load_scottish("") -def test_empty_file_blt(): +def test_empty_file_scot_csv(): with pytest.raises(EmptyDataError): load_scottish(CSV_DIR / "scot_empty.csv") -def test_bad_metadata_blt(): +def test_bad_metadata_scot_csv(): with pytest.raises(DataError): load_scottish(CSV_DIR / "scot_bad_metadata.csv") -def test_incorrect_metadata_blt(): +def test_incorrect_metadata_scot_csv(): with pytest.raises(DataError): load_scottish(CSV_DIR / "scot_candidate_overcount.csv")