Merge branch 'main' into update_CS

mggg · Jul 3, 2024 · eb16eab · eb16eab
2 parents dcdd6a0 + 2cfe619
commit eb16eab
Show file tree

Hide file tree

Showing 18 changed files with 202 additions and 152 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Updated tutorial notebooks; larger focus on slate models, updated notebooks to match current codebase.
 - Removed the seq-RCV transfer rule since it is a dummy function, replaced with lambda function.
 - Update plot MDS to have aspect ratio 1, remove axes labels since they are meaningless in MDS.
+- Updated all BLT files in the scot-elex repo to be true CSV files; updated `load_scottish` accordingly.
 
 ## Fixed
 - Fixed bug by which slate-PlackettLuce could not generate ballots when some candidate had 0 support.

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 `VoteKit` is a Swiss army knife for computational social choice research.
 
-**Helpful links:** [Source Repository](https://github.com/mggg/VoteKit) | [Documentation](https://mggg.github.io/VoteKit/) | [Issues](https://github.com/mggg/VoteKit/issues) | [MGGG.org](https://mggg.org/)
+**Helpful links:** [Source Repository](https://github.com/mggg/VoteKit) | [Documentation](https://votekit.readthedocs.io/en/latest/) | [Issues](https://github.com/mggg/VoteKit/issues) | [MGGG.org](https://mggg.org/)
 
 
 [![PyPI badge](https://badge.fury.io/py/votekit.svg)](https://badge.fury.io/py/votekit)

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -21,28 +21,19 @@ fastjsonschema==2.19.1
 fonttools==4.53.0
 idna==3.7
 imagesize==1.4.1
-ipykernel==6.29.4
-ipython==8.25.0
 jedi==0.19.1
 Jinja2==3.1.4
 joblib==1.4.2
 jsonschema==4.22.0
 jsonschema-specifications==2023.12.1
-jupyter_client==8.6.2
-jupyter_core==5.7.2
-jupyterlab_pygments==0.3.0
 kiwisolver==1.4.5
 MarkupSafe==2.1.5
 matplotlib==3.9.0
 matplotlib-inline==0.1.7
 mistune==3.0.2
 mypy==1.10.0
 mypy-extensions==1.0.0
-nbclient==0.10.0
-nbconvert==7.16.4
-nbformat==5.10.4
-nest-asyncio==1.6.0
-networkx==3.3
+networkx==3.2
 numpy==1.26.4
 packaging==24.0
 pandas==1.5.3
@@ -90,6 +81,6 @@ traitlets==5.14.3
 types-pytz==2024.1.0.20240417
 typing_extensions==4.12.1
 urllib3==2.2.1
-votekit @ file:///Users/cdonnay/PycharmProjects/VoteKit
+votekit==2.0.0
 wcwidth==0.2.13
 webencodings==0.5.1
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,7 +21,6 @@ scikit-learn = "^1.3.2"
 numpy = "^1.26.0"
 
 
-
 [tool.poetry.group.dev.dependencies]
 ruff = "^0.0.275"
 black = "^23.3.0"

diff --git a/src/votekit/cvr_loaders.py b/src/votekit/cvr_loaders.py
@@ -1,5 +1,6 @@
 from fractions import Fraction
 import os
+import csv
 import pandas as pd
 from pandas.errors import EmptyDataError, DataError
 import pathlib
@@ -89,87 +90,94 @@ def load_csv(
     return PreferenceProfile(ballots=ballots)
 
 
-def load_scottish(fpath: str) -> tuple[PreferenceProfile, int]:
+def load_scottish(
+    fpath: str,
+) -> tuple[PreferenceProfile, int, list[str], dict[str, str], str]:
     """
-    Given a file path, loads cvr from format used for Scottish election data in
-    (this repo)[https://github.com/mggg/scot-elex].
+    Given a file path, loads cast vote record from format used for Scottish election data
+    in `this repo <https://github.com/mggg/scot-elex>`_.
 
     Args:
-        fpath (str): Path to cvr file.
+        fpath (str): Path to Scottish election csv file.
 
     Raises:
         FileNotFoundError: If fpath is invalid.
         EmptyDataError: If dataset is empty.
         DataError: If there is missing or incorrect metadata or candidate data.
 
     Returns:
-        tuple: A tuple ``(PreferenceProfile, seats)`` representing the election and the
-            number of seats in the election.
+        tuple: A tuple ``(PreferenceProfile, seats, cand_list, cand_to_party, ward)``
+            representing the election, the number of seats in the election, the candidate
+            names, a dictionary mapping candidates to their party, and the ward. The
+            candidate names are also stored in the PreferenceProfile object.
     """
-    ballots = []
-    names = []
-    name_map = {}
-    numbers = True
-    cands_included = False
 
     if not os.path.isfile(fpath):
         raise FileNotFoundError(f"File with path {fpath} cannot be found")
     if os.path.getsize(fpath) == 0:
-        raise EmptyDataError("Dataset cannot be empty")
+        raise EmptyDataError(f"CSV at {fpath} is empty.")
+
+    # Convert all-digit fields to ints; leave every other field (e.g. candidate names) as strings
+    def convert_row(row):
+        return [int(item) if item.isdigit() else item for item in row]
+
+    data = []
+    with open(fpath, "r") as f:
+        reader = csv.reader(f)
+        for row in reader:
+            # Drop empty fields (e.g. those produced by trailing commas);
+            # column positions do not need to be preserved
+            filtered_row = list(filter(lambda x: x != "", row))
+
+            # only save non-empty rows
+            if len(filtered_row) > 0:
+                data.append(convert_row(filtered_row))
+
+    if len(data[0]) != 2:
+        raise DataError(
+            "The metadata in the first row should be number of \
+                            candidates, seats."
+        )
+
+    cand_num, seats = data[0][0], data[0][1]
+    ward = data[-1][0]
+
+    num_to_cand = {}
+    cand_to_party = {}
+
+    data_cand_num = len([r for r in data if "Candidate" in str(r[0])])
+    if data_cand_num != cand_num:
+        raise DataError(
+            "Incorrect number of candidates in either first row metadata \
+                        or in candidate list at end of csv file."
+        )
+
+    # record candidate names from the cand_num rows just before the final (ward) row
+    for i, line in enumerate(data[len(data) - (cand_num + 1) : -1]):
+        if "Candidate" not in line[0]:
+            raise DataError(
+                f"The number of candidates on line 1 is {cand_num}, which\
+                            does not match the metadata."
+            )
+        cand = line[1]
+        party = line[2]
+
+        # candidates are 1 indexed
+        num_to_cand[i + 1] = cand
+        cand_to_party[cand] = party
+
+    cand_list = list(cand_to_party.keys())
+
+    ballots = [Ballot()] * len(data[1 : len(data) - (cand_num + 1)])
+
+    for i, line in enumerate(data[1 : len(data) - (cand_num + 1)]):
+        ballot_weight = Fraction(line[0])
+        cand_ordering = line[1:]
+        ranking = tuple([frozenset({num_to_cand[n]}) for n in cand_ordering])
+
+        ballots[i] = Ballot(ranking=ranking, weight=ballot_weight)
 
-    with open(fpath, "r") as file:
-        for i, line in enumerate(file):
-            s = line.rstrip("\n").rstrip()
-            if i == 0:
-                # first number is number of candidates, second is number of seats to elect
-                metadata = [int(data) for data in s.split(" ")]
-                if len(metadata) != 2:
-                    raise DataError(
-                        "metadata (first line) should have two parameters"
-                        " (number of candidates, number of seats)"
-                    )
-                seats = metadata[1]
-            # read in ballots, cleaning out rankings labeled '0' (designating end of line)
-            elif numbers:
-                ballot = [int(vote) for vote in s.split(" ")]
-                num_votes = ballot[0]
-                # ballots terminate with a single row with the character '0'
-                if num_votes == 0:
-                    numbers = False
-                else:
-                    ranking = [rank for rank in list(ballot[1:]) if rank != 0]
-                    b = (ranking, num_votes)
-                    ballots.append(b)  # this is converted to the PP format later
-            # read in candidates
-            elif "(" in s:
-                cands_included = True
-                name_parts = s.strip('"').split(" ")
-                first_name = " ".join(name_parts[:-2])
-                last_name = name_parts[-2]
-                party = name_parts[-1].strip("(").strip(")")
-                names.append(str((first_name, last_name, party)))
-            else:
-                if len(names) != metadata[0]:
-                    err_message = (
-                        f"Number of candidates listed, {len(names)}," + f" differs from"
-                        f"number of candidates recorded in metadata, {metadata[0]}"
-                    )
-                    raise DataError(err_message)
-                # read in election location (do we need this?)
-                # location = s.strip("\"")
-                if not cands_included:
-                    raise DataError("Candidates missing from file")
-                # map candidate numbers onto their names and convert ballots to PP format
-                for i, name in enumerate(names):
-                    name_map[i + 1] = name
-                clean_ballots = [
-                    Ballot(
-                        ranking=tuple(
-                            [frozenset({name_map[cand]}) for cand in ballot[0]]
-                        ),
-                        weight=Fraction(ballot[1]),
-                    )
-                    for ballot in ballots
-                ]
-
-        return PreferenceProfile(ballots=clean_ballots, candidates=names), seats
+    profile = PreferenceProfile(
+        ballots=ballots, candidates=cand_list
+    ).condense_ballots()
+    return (profile, seats, cand_list, cand_to_party, ward)
diff --git a/tests/data/csv/scot_bad_metadata.csv b/tests/data/csv/scot_bad_metadata.csv
@@ -0,0 +1 @@
+1,2,3,
diff --git a/tests/data/csv/scot_blank_rows.csv b/tests/data/csv/scot_blank_rows.csv
@@ -0,0 +1,13 @@
+3,1,
+126,1,
+
+9,1,2,
+10,1,2,3,
+1,3,2,1,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+
+"Wardy McWard Ward",
+
+
diff --git a/tests/data/csv/scot_candidate_overcount.csv b/tests/data/csv/scot_candidate_overcount.csv
@@ -0,0 +1,7 @@
+9,4,
+2,9,8,7,10,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+"Wardy McWard Ward",
+
diff --git a/tests/data/csv/scot_candidate_undercount.csv b/tests/data/csv/scot_candidate_undercount.csv
@@ -0,0 +1,6 @@
+2,4,
+2,1,2,3,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+"Wardy McWard Ward",
diff --git a/tests/data/txt/empty.blt → tests/data/csv/scot_empty.csv b/tests/data/txt/empty.blt → tests/data/csv/scot_empty.csv
diff --git a/tests/data/csv/scot_wardy_mc_ward.csv b/tests/data/csv/scot_wardy_mc_ward.csv
@@ -0,0 +1,9 @@
+3,1,
+126,1,
+9,1,2,
+10,1,2,3,
+1,3,2,1,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+"Wardy McWard Ward",
diff --git a/tests/data/txt/bad_metadata.blt b/tests/data/txt/bad_metadata.blt
diff --git a/tests/data/txt/candidate_metadata_conflict.blt b/tests/data/txt/candidate_metadata_conflict.blt
diff --git a/tests/data/txt/edinburgh17-01_abridged.blt b/tests/data/txt/edinburgh17-01_abridged.blt
diff --git a/tests/data/txt/scottish_mini.txt b/tests/data/txt/scottish_mini.txt
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
@@ -24,13 +24,14 @@ def test_load_clean_completion():
 
     # load CVR -> PP representation
     BASE_DIR = Path(__file__).resolve().parent
-    BLT_DIR = BASE_DIR / "data/txt/"
+    CSV_DIR = BASE_DIR / "data/csv/"
 
-    pp, seats = load_scottish(BLT_DIR / "edinburgh17-01_abridged.blt")
-    print(pp)
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        CSV_DIR / "scot_wardy_mc_ward.csv"
+    )
 
     # apply rules to get new PP
-    cleaned_pp = clean.remove_noncands(pp, ["Graham HUTCHISON (C)"])
+    cleaned_pp = clean.remove_noncands(pp, ["Paul"])
 
     # write intermediate output for inspection
     # cleaned_pp.save("cleaned.cvr")