Skip to content

Commit

Permalink
Merge pull request #128 from cdonnay/update_scot
Browse files Browse the repository at this point in the history
Update load_scottish
  • Loading branch information
cdonnay authored Jul 3, 2024
2 parents 0ef4ead + 25a934a commit 2cfe619
Show file tree
Hide file tree
Showing 17 changed files with 202 additions and 141 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ dist/
extra_data/
.venv
.docs_venv
docs/_build
docs/_build
.dev
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Updated tutorial notebooks; larger focus on slate models, updated notebooks to match current codebase.
- Removed the seq-RCV transfer rule since it is a dummy function, replaced with lambda function.
- Update plot MDS to have aspect ratio 1, remove axes labels since they are meaningless in MDS.
- Updated all BLT files in the scot-elex repo to be true CSV files, and updated `load_scottish` accordingly.

## Fixed
- Fixed bug by which slate-PlackettLuce could not generate ballots when some candidate had 0 support.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ matplotlib = "^3.7.2"
pandas = "^1.5.3"
apportionment = "^1.0"
scikit-learn = "^1.3.2"

numpy = "^1.26.0"


[tool.poetry.group.dev.dependencies]
Expand Down
144 changes: 76 additions & 68 deletions src/votekit/cvr_loaders.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from fractions import Fraction
import os
import csv
import pandas as pd
from pandas.errors import EmptyDataError, DataError
import pathlib
Expand Down Expand Up @@ -89,87 +90,94 @@ def load_csv(
return PreferenceProfile(ballots=ballots)


def load_scottish(fpath: str) -> tuple[PreferenceProfile, int]:
def load_scottish(
fpath: str,
) -> tuple[PreferenceProfile, int, list[str], dict[str, str], str]:
"""
Given a file path, loads cvr from format used for Scottish election data in
(this repo)[https://github.com/mggg/scot-elex].
Given a file path, loads cast vote record from format used for Scottish election data
in (this repo)[https://github.com/mggg/scot-elex].
Args:
fpath (str): Path to cvr file.
fpath (str): Path to Scottish election csv file.
Raises:
FileNotFoundError: If fpath is invalid.
EmptyDataError: If dataset is empty.
DataError: If there is missing or incorrect metadata or candidate data.
Returns:
tuple: A tuple ``(PreferenceProfile, seats)`` representing the election and the
number of seats in the election.
tuple: A tuple ``(PreferenceProfile, seats, cand_list, cand_to_party, ward)``
representing the election, the number of seats in the election, the candidate
names, a dictionary mapping candidates to their party, and the ward. The
candidate names are also stored in the PreferenceProfile object.
"""
ballots = []
names = []
name_map = {}
numbers = True
cands_included = False

if not os.path.isfile(fpath):
raise FileNotFoundError(f"File with path {fpath} cannot be found")
if os.path.getsize(fpath) == 0:
raise EmptyDataError("Dataset cannot be empty")
raise EmptyDataError(f"CSV at {fpath} is empty.")

# Convert the ballot rows to ints while leaving the candidates as strings
def convert_row(row):
    """
    Convert the numeric entries of one parsed csv row to ints.

    Entries made up solely of ASCII digits become ``int``; every other
    entry (for example candidate, party, or ward strings) is returned
    unchanged.

    Args:
        row (list[str]): String entries of a single csv row.

    Returns:
        list: The row with digit-only entries converted to ``int``.
    """
    # ``str.isdigit`` on its own also accepts characters such as "²" that
    # ``int`` cannot parse, so additionally require the entry to be ASCII.
    return [
        int(item) if item.isascii() and item.isdigit() else item
        for item in row
    ]

data = []
with open(fpath, "r") as f:
reader = csv.reader(f)
for row in reader:
# This just removes any empty strings that are hanging out since
# we don't need to preserve columns
filtered_row = list(filter(lambda x: x != "", row))

# only save non-empty rows
if len(filtered_row) > 0:
data.append(convert_row(filtered_row))

if len(data[0]) != 2:
raise DataError(
"The metadata in the first row should be number of \
candidates, seats."
)

cand_num, seats = data[0][0], data[0][1]
ward = data[-1][0]

num_to_cand = {}
cand_to_party = {}

data_cand_num = len([r for r in data if "Candidate" in str(r[0])])
if data_cand_num != cand_num:
raise DataError(
"Incorrect number of candidates in either first row metadata \
or in candidate list at end of csv file."
)

# record candidate names, which are up until the final row
for i, line in enumerate(data[len(data) - (cand_num + 1) : -1]):
if "Candidate" not in line[0]:
raise DataError(
f"The number of candidates on line 1 is {cand_num}, which\
does not match the metadata."
)
cand = line[1]
party = line[2]

# candidates are 1 indexed
num_to_cand[i + 1] = cand
cand_to_party[cand] = party

cand_list = list(cand_to_party.keys())

ballots = [Ballot()] * len(data[1 : len(data) - (cand_num + 1)])

for i, line in enumerate(data[1 : len(data) - (cand_num + 1)]):
ballot_weight = Fraction(line[0])
cand_ordering = line[1:]
ranking = tuple([frozenset({num_to_cand[n]}) for n in cand_ordering])

ballots[i] = Ballot(ranking=ranking, weight=ballot_weight)

with open(fpath, "r") as file:
for i, line in enumerate(file):
s = line.rstrip("\n").rstrip()
if i == 0:
# first number is number of candidates, second is number of seats to elect
metadata = [int(data) for data in s.split(" ")]
if len(metadata) != 2:
raise DataError(
"metadata (first line) should have two parameters"
" (number of candidates, number of seats)"
)
seats = metadata[1]
# read in ballots, cleaning out rankings labeled '0' (designating end of line)
elif numbers:
ballot = [int(vote) for vote in s.split(" ")]
num_votes = ballot[0]
# ballots terminate with a single row with the character '0'
if num_votes == 0:
numbers = False
else:
ranking = [rank for rank in list(ballot[1:]) if rank != 0]
b = (ranking, num_votes)
ballots.append(b) # this is converted to the PP format later
# read in candidates
elif "(" in s:
cands_included = True
name_parts = s.strip('"').split(" ")
first_name = " ".join(name_parts[:-2])
last_name = name_parts[-2]
party = name_parts[-1].strip("(").strip(")")
names.append(str((first_name, last_name, party)))
else:
if len(names) != metadata[0]:
err_message = (
f"Number of candidates listed, {len(names)}," + f" differs from"
f"number of candidates recorded in metadata, {metadata[0]}"
)
raise DataError(err_message)
# read in election location (do we need this?)
# location = s.strip("\"")
if not cands_included:
raise DataError("Candidates missing from file")
# map candidate numbers onto their names and convert ballots to PP format
for i, name in enumerate(names):
name_map[i + 1] = name
clean_ballots = [
Ballot(
ranking=tuple(
[frozenset({name_map[cand]}) for cand in ballot[0]]
),
weight=Fraction(ballot[1]),
)
for ballot in ballots
]

return PreferenceProfile(ballots=clean_ballots, candidates=names), seats
profile = PreferenceProfile(
ballots=ballots, candidates=cand_list
).condense_ballots()
return (profile, seats, cand_list, cand_to_party, ward)
1 change: 1 addition & 0 deletions tests/data/csv/scot_bad_metadata.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1,2,3,
13 changes: 13 additions & 0 deletions tests/data/csv/scot_blank_rows.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
3,1,
126,1,

9,1,2,
10,1,2,3,
1,3,2,1,
"Candidate 1","Paul","Orange (O)",
"Candidate 2","George","Yellow (Y)",
"Candidate 3","Ringo","Red (R)",

"Wardy McWard Ward",


7 changes: 7 additions & 0 deletions tests/data/csv/scot_candidate_overcount.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
9,4,
2,9,8,7,10,
"Candidate 1","Paul","Orange (O)",
"Candidate 2","George","Yellow (Y)",
"Candidate 3","Ringo","Red (R)",
"Wardy McWard Ward",

6 changes: 6 additions & 0 deletions tests/data/csv/scot_candidate_undercount.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
2,4,
2,1,2,3,
"Candidate 1","Paul","Orange (O)",
"Candidate 2","George","Yellow (Y)",
"Candidate 3","Ringo","Red (R)",
"Wardy McWard Ward",
File renamed without changes.
9 changes: 9 additions & 0 deletions tests/data/csv/scot_wardy_mc_ward.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
3,1,
126,1,
9,1,2,
10,1,2,3,
1,3,2,1,
"Candidate 1","Paul","Orange (O)",
"Candidate 2","George","Yellow (Y)",
"Candidate 3","Ringo","Red (R)",
"Wardy McWard Ward",
1 change: 0 additions & 1 deletion tests/data/txt/bad_metadata.blt

This file was deleted.

20 changes: 0 additions & 20 deletions tests/data/txt/candidate_metadata_conflict.blt

This file was deleted.

21 changes: 0 additions & 21 deletions tests/data/txt/edinburgh17-01_abridged.blt

This file was deleted.

7 changes: 0 additions & 7 deletions tests/data/txt/scottish_mini.txt

This file was deleted.

9 changes: 5 additions & 4 deletions tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,14 @@ def test_load_clean_completion():

# load CVR -> PP representation
BASE_DIR = Path(__file__).resolve().parent
BLT_DIR = BASE_DIR / "data/txt/"
CSV_DIR = BASE_DIR / "data/csv/"

pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
print(pp)
pp, seats, cand_list, cand_to_party, ward = load_scottish(
CSV_DIR / "scot_wardy_mc_ward.csv"
)

# apply rules to get new PP
cleaned_pp = clean.remove_noncands(pp, ["Graham HUTCHISON (C)"])
cleaned_pp = clean.remove_noncands(pp, ["Paul"])

# write intermediate output for inspection
# cleaned_pp.save("cleaned.cvr")
Expand Down
23 changes: 15 additions & 8 deletions tests/test_elections.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,46 +15,53 @@

BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / "data/csv/"
BLT_DIR = BASE_DIR / "data/txt/"


test_profile = load_csv(DATA_DIR / "test_election_A.csv")
mn_profile = load_csv("src/votekit/data/mn_2013_cast_vote_record.csv")


def test_droop_default_parameter():
pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
pp, seats, cand_list, cand_to_party, ward = load_scottish(
DATA_DIR / "scot_wardy_mc_ward.csv"
)

election = STV(pp, fractional_transfer, seats=seats)

droop_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / (4 + 1)) + 1
droop_quota = int((126 + 9 + 10 + 1) / (1 + 1)) + 1

assert election.threshold == droop_quota


def test_droop_inputed_parameter():
pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
pp, seats, cand_list, cand_to_party, ward = load_scottish(
DATA_DIR / "scot_wardy_mc_ward.csv"
)

election = STV(pp, fractional_transfer, seats=seats, quota="Droop")

droop_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / (4 + 1)) + 1
droop_quota = int((126 + 9 + 10 + 1) / (1 + 1)) + 1

assert election.threshold == droop_quota


def test_quota_misspelled_parameter():
pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
pp, seats, cand_list, cand_to_party, ward = load_scottish(
DATA_DIR / "scot_wardy_mc_ward.csv"
)

with pytest.raises(ValueError):
_ = STV(pp, fractional_transfer, seats=seats, quota="droops")


def test_hare_quota():
pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
pp, seats, cand_list, cand_to_party, ward = load_scottish(
DATA_DIR / "scot_wardy_mc_ward.csv"
)

election = STV(pp, fractional_transfer, seats=seats, quota="hare")

hare_quota = int((8 + 14 + 1 + 13 + 1 + 1 + 2) / 4)
hare_quota = int((126 + 9 + 10 + 1) / 1)

assert election.threshold == hare_quota

Expand Down
Loading

0 comments on commit 2cfe619

Please sign in to comment.