Skip to content

Commit

Permalink
Merge branch 'main' into update_CS
Browse files Browse the repository at this point in the history
  • Loading branch information
cdonnay committed Jul 3, 2024
2 parents dcdd6a0 + 2cfe619 commit eb16eab
Show file tree
Hide file tree
Showing 18 changed files with 202 additions and 152 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Updated tutorial notebooks; larger focus on slate models, updated notebooks to match current codebase.
- Removed the seq-RCV transfer rule since it is a dummy function; replaced it with a lambda function.
- Updated plot MDS to have aspect ratio 1 and removed the axis labels, since they are meaningless in MDS.
- Updated all BLT files in the scot-elex repo to be true CSV files, and updated `load_scottish` accordingly.

## Fixed
- Fixed bug by which slate-PlackettLuce could not generate ballots when some candidate had 0 support.
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

`VoteKit` is a Swiss army knife for computational social choice research.

**Helpful links:** [Source Repository](https://github.com/mggg/VoteKit) | [Documentation](https://mggg.github.io/VoteKit/) | [Issues](https://github.com/mggg/VoteKit/issues) | [MGGG.org](https://mggg.org/)
**Helpful links:** [Source Repository](https://github.com/mggg/VoteKit) | [Documentation](https://votekit.readthedocs.io/en/latest/) | [Issues](https://github.com/mggg/VoteKit/issues) | [MGGG.org](https://mggg.org/)


[![PyPI badge](https://badge.fury.io/py/votekit.svg)](https://badge.fury.io/py/votekit)
Expand Down
13 changes: 2 additions & 11 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,19 @@ fastjsonschema==2.19.1
fonttools==4.53.0
idna==3.7
imagesize==1.4.1
ipykernel==6.29.4
ipython==8.25.0
jedi==0.19.1
Jinja2==3.1.4
joblib==1.4.2
jsonschema==4.22.0
jsonschema-specifications==2023.12.1
jupyter_client==8.6.2
jupyter_core==5.7.2
jupyterlab_pygments==0.3.0
kiwisolver==1.4.5
MarkupSafe==2.1.5
matplotlib==3.9.0
matplotlib-inline==0.1.7
mistune==3.0.2
mypy==1.10.0
mypy-extensions==1.0.0
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==3.3
networkx==3.2
numpy==1.26.4
packaging==24.0
pandas==1.5.3
Expand Down Expand Up @@ -90,6 +81,6 @@ traitlets==5.14.3
types-pytz==2024.1.0.20240417
typing_extensions==4.12.1
urllib3==2.2.1
votekit @ file:///Users/cdonnay/PycharmProjects/VoteKit
votekit==2.0.0
wcwidth==0.2.13
webencodings==0.5.1
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ scikit-learn = "^1.3.2"
numpy = "^1.26.0"



[tool.poetry.group.dev.dependencies]
ruff = "^0.0.275"
black = "^23.3.0"
Expand Down
144 changes: 76 additions & 68 deletions src/votekit/cvr_loaders.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from fractions import Fraction
import os
import csv
import pandas as pd
from pandas.errors import EmptyDataError, DataError
import pathlib
Expand Down Expand Up @@ -89,87 +90,94 @@ def load_csv(
return PreferenceProfile(ballots=ballots)


def load_scottish(fpath: str) -> tuple[PreferenceProfile, int]:
def load_scottish(
fpath: str,
) -> tuple[PreferenceProfile, int, list[str], dict[str, str], str]:
"""
Given a file path, loads cvr from format used for Scottish election data in
(this repo)[https://github.com/mggg/scot-elex].
Given a file path, loads cast vote record from format used for Scottish election data
in (this repo)[https://github.com/mggg/scot-elex].
Args:
fpath (str): Path to cvr file.
fpath (str): Path to Scottish election csv file.
Raises:
FileNotFoundError: If fpath is invalid.
EmptyDataError: If dataset is empty.
DataError: If there is missing or incorrect metadata or candidate data.
Returns:
tuple: A tuple ``(PreferenceProfile, seats)`` representing the election and the
number of seats in the election.
tuple: A tuple ``(PreferenceProfile, seats, cand_list, cand_to_party, ward)``
representing the election, the number of seats in the election, the candidate
names, a dictionary mapping candidates to their party, and the ward. The
candidate names are also stored in the PreferenceProfile object.
"""
ballots = []
names = []
name_map = {}
numbers = True
cands_included = False

if not os.path.isfile(fpath):
raise FileNotFoundError(f"File with path {fpath} cannot be found")
if os.path.getsize(fpath) == 0:
raise EmptyDataError("Dataset cannot be empty")
raise EmptyDataError(f"CSV at {fpath} is empty.")

# Convert the ballot rows to ints while leaving the candidates as strings
def convert_row(row: list) -> list:
    """
    Convert the numeric fields of a CSV row to ints, leaving all other fields
    as they are.

    A field is converted when, ignoring surrounding whitespace, it consists
    solely of decimal digits. ``str.isdecimal`` is used instead of
    ``str.isdigit`` because ``isdigit`` also accepts characters such as
    superscript digits, which ``int`` cannot parse and which would raise a
    ``ValueError``.

    Args:
        row (list[str]): Fields of a single CSV row.

    Returns:
        list: The row with every decimal field replaced by its int value and
            every other field unchanged.
    """
    # int() itself ignores surrounding whitespace, so only the check needs strip().
    return [int(item) if item.strip().isdecimal() else item for item in row]

data = []
with open(fpath, "r") as f:
reader = csv.reader(f)
for row in reader:
# This just removes any empty strings that are hanging out since
# we don't need to preserve columns
filtered_row = list(filter(lambda x: x != "", row))

# only save non-empty rows
if len(filtered_row) > 0:
data.append(convert_row(filtered_row))

if len(data[0]) != 2:
raise DataError(
"The metadata in the first row should be number of \
candidates, seats."
)

cand_num, seats = data[0][0], data[0][1]
ward = data[-1][0]

num_to_cand = {}
cand_to_party = {}

data_cand_num = len([r for r in data if "Candidate" in str(r[0])])
if data_cand_num != cand_num:
raise DataError(
"Incorrect number of candidates in either first row metadata \
or in candidate list at end of csv file."
)

# record candidate names, which are up until the final row
for i, line in enumerate(data[len(data) - (cand_num + 1) : -1]):
if "Candidate" not in line[0]:
raise DataError(
f"The number of candidates on line 1 is {cand_num}, which\
does not match the metadata."
)
cand = line[1]
party = line[2]

# candidates are 1 indexed
num_to_cand[i + 1] = cand
cand_to_party[cand] = party

cand_list = list(cand_to_party.keys())

ballots = [Ballot()] * len(data[1 : len(data) - (cand_num + 1)])

for i, line in enumerate(data[1 : len(data) - (cand_num + 1)]):
ballot_weight = Fraction(line[0])
cand_ordering = line[1:]
ranking = tuple([frozenset({num_to_cand[n]}) for n in cand_ordering])

ballots[i] = Ballot(ranking=ranking, weight=ballot_weight)

with open(fpath, "r") as file:
for i, line in enumerate(file):
s = line.rstrip("\n").rstrip()
if i == 0:
# first number is number of candidates, second is number of seats to elect
metadata = [int(data) for data in s.split(" ")]
if len(metadata) != 2:
raise DataError(
"metadata (first line) should have two parameters"
" (number of candidates, number of seats)"
)
seats = metadata[1]
# read in ballots, cleaning out rankings labeled '0' (designating end of line)
elif numbers:
ballot = [int(vote) for vote in s.split(" ")]
num_votes = ballot[0]
# ballots terminate with a single row with the character '0'
if num_votes == 0:
numbers = False
else:
ranking = [rank for rank in list(ballot[1:]) if rank != 0]
b = (ranking, num_votes)
ballots.append(b) # this is converted to the PP format later
# read in candidates
elif "(" in s:
cands_included = True
name_parts = s.strip('"').split(" ")
first_name = " ".join(name_parts[:-2])
last_name = name_parts[-2]
party = name_parts[-1].strip("(").strip(")")
names.append(str((first_name, last_name, party)))
else:
if len(names) != metadata[0]:
err_message = (
f"Number of candidates listed, {len(names)}," + f" differs from"
f"number of candidates recorded in metadata, {metadata[0]}"
)
raise DataError(err_message)
# read in election location (do we need this?)
# location = s.strip("\"")
if not cands_included:
raise DataError("Candidates missing from file")
# map candidate numbers onto their names and convert ballots to PP format
for i, name in enumerate(names):
name_map[i + 1] = name
clean_ballots = [
Ballot(
ranking=tuple(
[frozenset({name_map[cand]}) for cand in ballot[0]]
),
weight=Fraction(ballot[1]),
)
for ballot in ballots
]

return PreferenceProfile(ballots=clean_ballots, candidates=names), seats
profile = PreferenceProfile(
ballots=ballots, candidates=cand_list
).condense_ballots()
return (profile, seats, cand_list, cand_to_party, ward)
1 change: 1 addition & 0 deletions tests/data/csv/scot_bad_metadata.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1,2,3,
13 changes: 13 additions & 0 deletions tests/data/csv/scot_blank_rows.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
3,1,
126,1,

9,1,2,
10,1,2,3,
1,3,2,1,
"Candidate 1","Paul","Orange (O)",
"Candidate 2","George","Yellow (Y)",
"Candidate 3","Ringo","Red (R)",

"Wardy McWard Ward",


7 changes: 7 additions & 0 deletions tests/data/csv/scot_candidate_overcount.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
9,4,
2,9,8,7,10,
"Candidate 1","Paul","Orange (O)",
"Candidate 2","George","Yellow (Y)",
"Candidate 3","Ringo","Red (R)",
"Wardy McWard Ward",

6 changes: 6 additions & 0 deletions tests/data/csv/scot_candidate_undercount.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
2,4,
2,1,2,3,
"Candidate 1","Paul","Orange (O)",
"Candidate 2","George","Yellow (Y)",
"Candidate 3","Ringo","Red (R)",
"Wardy McWard Ward",
File renamed without changes.
9 changes: 9 additions & 0 deletions tests/data/csv/scot_wardy_mc_ward.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
3,1,
126,1,
9,1,2,
10,1,2,3,
1,3,2,1,
"Candidate 1","Paul","Orange (O)",
"Candidate 2","George","Yellow (Y)",
"Candidate 3","Ringo","Red (R)",
"Wardy McWard Ward",
1 change: 0 additions & 1 deletion tests/data/txt/bad_metadata.blt

This file was deleted.

20 changes: 0 additions & 20 deletions tests/data/txt/candidate_metadata_conflict.blt

This file was deleted.

21 changes: 0 additions & 21 deletions tests/data/txt/edinburgh17-01_abridged.blt

This file was deleted.

7 changes: 0 additions & 7 deletions tests/data/txt/scottish_mini.txt

This file was deleted.

9 changes: 5 additions & 4 deletions tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,14 @@ def test_load_clean_completion():

# load CVR -> PP representation
BASE_DIR = Path(__file__).resolve().parent
BLT_DIR = BASE_DIR / "data/txt/"
CSV_DIR = BASE_DIR / "data/csv/"

pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
print(pp)
pp, seats, cand_list, cand_to_party, ward = load_scottish(
CSV_DIR / "scot_wardy_mc_ward.csv"
)

# apply rules to get new PP
cleaned_pp = clean.remove_noncands(pp, ["Graham HUTCHISON (C)"])
cleaned_pp = clean.remove_noncands(pp, ["Paul"])

# write intermediate output for inspection
# cleaned_pp.save("cleaned.cvr")
Expand Down
Loading

0 comments on commit eb16eab

Please sign in to comment.