Skip to content

Commit

Permalink
revamp load_scottish, fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
cdonnay committed Jun 30, 2024
1 parent 0ef4ead commit 8b37d9a
Show file tree
Hide file tree
Showing 13 changed files with 146 additions and 135 deletions.
133 changes: 66 additions & 67 deletions src/votekit/cvr_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,87 +89,86 @@ def load_csv(
return PreferenceProfile(ballots=ballots)


def load_scottish(
    fpath: str,
) -> tuple[PreferenceProfile, int, list[str], dict[str, str], str]:
    """
    Given a file path, loads cast vote record from format used for Scottish election data
    in the mggg/scot-elex repository (https://github.com/mggg/scot-elex).

    The csv is expected to be laid out as follows, where every row may end with a
    trailing comma:

    - first row: number of candidates, number of seats;
    - ballot rows: the ballot weight followed by the (1-indexed) candidate numbers in
      ranked order;
    - one row per candidate: a ``"Candidate <n>"`` label, the candidate name and the
      party;
    - final row: the ward name.

    Blank lines (for example a trailing newline at the end of the file) are ignored.

    Args:
        fpath (str): Path to Scottish election csv file.

    Raises:
        FileNotFoundError: If fpath is invalid.
        EmptyDataError: If dataset is empty.
        DataError: If there is missing or incorrect metadata or candidate data, or if a
            ballot row cannot be parsed.

    Returns:
        tuple: A tuple ``(PreferenceProfile, seats, cand_list, cand_to_party, ward)``
            representing the election, the number of seats in the election, the candidate
            names, a dictionary mapping candidates to their party, and the ward. The
            candidate names are also stored in the PreferenceProfile object.
    """
    # imported locally: this loader is the only user of the csv module here
    import csv

    if not os.path.isfile(fpath):
        raise FileNotFoundError(f"File with path {fpath} cannot be found")
    if os.path.getsize(fpath) == 0:
        raise EmptyDataError(f"CSV at {fpath} is empty.")

    rows = []
    # newline="" lets the csv module handle line endings and quoted fields itself
    with open(fpath, "r", newline="") as file:
        for row in csv.reader(file):
            # a trailing comma produces an empty final field; drop it
            while row and not row[-1].strip():
                row.pop()
            # skip blank lines so that the ward is always the last row
            if row:
                rows.append(row)

    if not rows or len(rows[0]) != 2:
        raise DataError(
            "The metadata in the first row should be number of candidates, seats."
        )
    try:
        cand_num, seats = int(rows[0][0]), int(rows[0][1])
    except ValueError as err:
        raise DataError(
            "The metadata in the first row should be two integers: number of "
            "candidates, seats."
        ) from err

    # besides the metadata row there must be one row per candidate plus the ward row;
    # checking this up front avoids a negative slice index silently wrapping around
    if cand_num < 0 or len(rows) < cand_num + 2:
        raise DataError(
            f"The number of candidates on line 1 is {cand_num}, which does not match "
            "the candidate list at the end of the csv file."
        )

    ward = rows[-1][0]
    first_cand_row = len(rows) - (cand_num + 1)
    ballot_rows = rows[1:first_cand_row]
    cand_rows = rows[first_cand_row:-1]

    num_to_cand = {}
    cand_to_party = {}

    # candidates are 1 indexed, in the order they are listed
    for number, row in enumerate(cand_rows, start=1):
        if len(row) < 2 or "Candidate" not in row[0]:
            raise DataError(
                f"The number of candidates on line 1 is {cand_num}, but row {row} "
                "in the candidate section is not a candidate row."
            )
        cand = row[1]
        # an empty party field is dropped together with the trailing comma
        party = row[2] if len(row) > 2 else ""

        num_to_cand[str(number)] = cand
        cand_to_party[cand] = party

    cand_list = list(cand_to_party)

    # duplicate names collapse in the dictionary, so this also catches repeated names
    if len(cand_list) != cand_num:
        raise DataError(
            "Incorrect number of candidates in either first row metadata "
            "or in candidate list at end of csv file."
        )

    ballots = []
    for row in ballot_rows:
        # a candidate row among the ballots means more candidates are listed than the
        # metadata declares
        if "Candidate" in row[0]:
            raise DataError(
                f"The number of candidates on line 1 is {cand_num}, but more "
                "candidates are listed at the end of the csv file."
            )
        try:
            ballot_weight = Fraction(row[0])
            ranking = tuple(frozenset({num_to_cand[n.strip()]}) for n in row[1:])
        except (ValueError, ZeroDivisionError, KeyError) as err:
            raise DataError(
                f"Ballot row {row} has an invalid weight or refers to a candidate "
                "number that is not in the candidate list."
            ) from err

        ballots.append(Ballot(ranking=ranking, weight=ballot_weight))

    profile = PreferenceProfile(
        ballots=ballots, candidates=cand_list
    ).condense_ballots()
    return (profile, seats, cand_list, cand_to_party, ward)
1 change: 1 addition & 0 deletions tests/data/csv/scot_bad_metadata.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1,2,3,
7 changes: 7 additions & 0 deletions tests/data/csv/scot_candidate_overcount.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
2,4,
2,9,8,7,10,
"Candidate 1","Paul","Orange (O)",
"Candidate 2","George","Yellow (Y)",
"Candidate 3","Ringo","Red (R)",
"Wardy McWard Ward",

7 changes: 7 additions & 0 deletions tests/data/csv/scot_candidate_undercount.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
9,4,
2,9,8,7,10,
"Candidate 1","Paul","Orange (O)",
"Candidate 2","George","Yellow (Y)",
"Candidate 3","Ringo","Red (R)",
"Wardy McWard Ward",

File renamed without changes.
9 changes: 9 additions & 0 deletions tests/data/csv/scot_wardy_mc_ward.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
3,1,
126,1,
9,1,2,
10,1,2,3,
1,3,2,1,
"Candidate 1","Paul","Orange (O)",
"Candidate 2","George","Yellow (Y)",
"Candidate 3","Ringo","Red (R)",
"Wardy McWard Ward",
1 change: 0 additions & 1 deletion tests/data/txt/bad_metadata.blt

This file was deleted.

20 changes: 0 additions & 20 deletions tests/data/txt/candidate_metadata_conflict.blt

This file was deleted.

21 changes: 0 additions & 21 deletions tests/data/txt/edinburgh17-01_abridged.blt

This file was deleted.

7 changes: 0 additions & 7 deletions tests/data/txt/scottish_mini.txt

This file was deleted.

9 changes: 5 additions & 4 deletions tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,14 @@ def test_load_clean_completion():

# load CVR -> PP representation
BASE_DIR = Path(__file__).resolve().parent
BLT_DIR = BASE_DIR / "data/txt/"
CSV_DIR = BASE_DIR / "data/csv/"

pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
print(pp)
pp, seats, cand_list, cand_to_party, ward = load_scottish(
CSV_DIR / "scot_wardy_mc_ward.csv"
)

# apply rules to get new PP
cleaned_pp = clean.remove_noncands(pp, ["Graham HUTCHISON (C)"])
cleaned_pp = clean.remove_noncands(pp, ["Paul"])

# write intermediate output for inspection
# cleaned_pp.save("cleaned.cvr")
Expand Down
23 changes: 15 additions & 8 deletions tests/test_elections.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,46 +15,53 @@

# Directory containing this test module; fixture paths are built from it.
BASE_DIR = Path(__file__).resolve().parent
# Fixture directory for the csv files under data/csv/.
DATA_DIR = BASE_DIR / "data/csv/"
# Fixture directory for the files under data/txt/.
BLT_DIR = BASE_DIR / "data/txt/"


# Profiles loaded once, when this module is imported.
test_profile = load_csv(DATA_DIR / "test_election_A.csv")
# NOTE(review): relative path, so this presumably requires running the tests
# from the repository root -- confirm.
mn_profile = load_csv("src/votekit/data/mn_2013_cast_vote_record.csv")


def test_droop_default_parameter():
    """Check that STV's threshold is the Droop quota when no quota is passed."""
    # only the profile and the seat count are needed here
    pp, seats, *_ = load_scottish(DATA_DIR / "scot_wardy_mc_ward.csv")

    election = STV(pp, fractional_transfer, seats=seats)

    # floor(total ballot weight / (seats + 1)) + 1; the fixture has ballot
    # weights 126, 9, 10 and 1, and a single seat
    droop_quota = (126 + 9 + 10 + 1) // (1 + 1) + 1

    assert election.threshold == droop_quota


def test_droop_inputed_parameter():
    """Check that STV's threshold is the Droop quota when quota="Droop" is passed."""
    # only the profile and the seat count are needed here
    pp, seats, *_ = load_scottish(DATA_DIR / "scot_wardy_mc_ward.csv")

    election = STV(pp, fractional_transfer, seats=seats, quota="Droop")

    # floor(total ballot weight / (seats + 1)) + 1; the fixture has ballot
    # weights 126, 9, 10 and 1, and a single seat
    droop_quota = (126 + 9 + 10 + 1) // (1 + 1) + 1

    assert election.threshold == droop_quota


def test_quota_misspelled_parameter():
    """Check that constructing STV with the quota name "droops" raises ValueError."""
    # only the profile and the seat count are needed here
    pp, seats, *_ = load_scottish(DATA_DIR / "scot_wardy_mc_ward.csv")

    with pytest.raises(ValueError):
        STV(pp, fractional_transfer, seats=seats, quota="droops")


def test_hare_quota():
    """Check that STV's threshold is the Hare quota when quota="hare" is passed."""
    # only the profile and the seat count are needed here
    pp, seats, *_ = load_scottish(DATA_DIR / "scot_wardy_mc_ward.csv")

    election = STV(pp, fractional_transfer, seats=seats, quota="hare")

    # floor(total ballot weight / seats); the fixture has ballot weights
    # 126, 9, 10 and 1, and a single seat
    hare_quota = (126 + 9 + 10 + 1) // 1

    assert election.threshold == hare_quota

Expand Down
43 changes: 36 additions & 7 deletions tests/test_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

# Directory containing this test module; fixture paths are built from it.
BASE_DIR = Path(__file__).resolve().parent
# Fixture directory for the csv files under data/csv/.
CSV_DIR = BASE_DIR / "data/csv/"
# Fixture directory for the files under data/txt/.
BLT_DIR = BASE_DIR / "data/txt/"


def is_equal(b1: list[Ballot], b2: list[Ballot]) -> bool:
Expand Down Expand Up @@ -161,21 +160,51 @@ def test_same_name():
# # print(p)


def test_blt_parse():
    """Check every element returned by load_scottish for the small ward fixture."""
    pp, seats, cand_list, cand_to_party, ward = load_scottish(
        CSV_DIR / "scot_wardy_mc_ward.csv"
    )

    assert seats == 1
    assert isinstance(pp, PreferenceProfile)
    assert cand_list == ["Paul", "George", "Ringo"]
    assert cand_to_party == {
        "Paul": "Orange (O)",
        "George": "Yellow (Y)",
        "Ringo": "Red (R)",
    }
    assert ward == "Wardy McWard Ward"
    # total ballot weight in the fixture: 126 + 9 + 10 + 1
    assert int(pp.num_ballots()) == 146
    assert Ballot(ranking=(frozenset({"Paul"}),), weight=126) in pp.ballots
    assert (
        Ballot(
            ranking=(
                frozenset({"Ringo"}),
                frozenset({"George"}),
                frozenset({"Paul"}),
            ),
            weight=1,
        )
        in pp.ballots
    )


def test_bad_file_path_blt():
    """Check that load_scottish raises FileNotFoundError for an empty path string."""
    missing_path = ""
    with pytest.raises(FileNotFoundError):
        load_scottish(missing_path)


def test_empty_file_blt():
    """Check that load_scottish raises EmptyDataError for the empty csv fixture."""
    empty_csv = CSV_DIR / "scot_empty.csv"
    with pytest.raises(EmptyDataError):
        load_scottish(empty_csv)


def test_bad_metadata_blt():
    """Check that a first row with three values instead of two raises DataError."""
    bad_metadata_csv = CSV_DIR / "scot_bad_metadata.csv"
    with pytest.raises(DataError):
        load_scottish(bad_metadata_csv)


def test_incorrect_metadata_blt():
    """Check that a candidate count disagreeing with the candidate list raises DataError."""
    # the overcount fixture is checked first, then the undercount fixture
    for fixture in ("scot_candidate_overcount.csv", "scot_candidate_undercount.csv"):
        with pytest.raises(DataError):
            load_scottish(CSV_DIR / fixture)

0 comments on commit 8b37d9a

Please sign in to comment.