use csv module

mggg · Jul 3, 2024 · b9852a3
1 parent d775874
commit b9852a3
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 55 deletions.
diff --git a/src/votekit/cvr_loaders.py b/src/votekit/cvr_loaders.py
@@ -1,5 +1,6 @@
 from fractions import Fraction
 import os
+import csv
 import pandas as pd
 from pandas.errors import EmptyDataError, DataError
 import pathlib
@@ -116,59 +117,67 @@ def load_scottish(
     if os.path.getsize(fpath) == 0:
         raise EmptyDataError(f"CSV at {fpath} is empty.")
 
-    with open(fpath, "r") as file:
-        lines = list(file)
-
-        # remove errant blank character at end of line
-        row_0 = lines[0].split(",")[:-1]
-
-        if len(row_0) != 2:
-            raise DataError(
-                "The metadata in the first show should be number of \
+    # Convert the ballot rows to ints while leaving the candidates as strings
+    def convert_row(row):
+        return [int(item) if item.isdigit() else item for item in row]
+
+    data = []
+    with open(fpath, "r") as f:
+        reader = csv.reader(f)
+        for row in reader:
+            # This just removes any empty strings that are hanging out since
+            # we don't need to preserve columns
+            filtered_row = list(filter(lambda x: x != "", row))
+
+            # only save non-empty rows
+            if len(filtered_row) > 0:
+                data.append(convert_row(filtered_row))
+
+    if len(data[0]) != 2:
+        raise DataError(
+            "The metadata in the first row should be number of \
                             candidates, seats."
-            )
-
-        cand_num, seats = int(row_0[0]), int(row_0[1])
-        ward = lines[-1].split(",")[0].strip('"')
-
-        num_to_cand = {}
-        cand_to_party = {}
+        )
 
-        # record candidate names, which are up until the final row
-        for i, line in enumerate(lines[len(lines) - (cand_num + 1) : -1]):
-            parsed_line = line.split(",")
-            if "Candidate" not in parsed_line[0]:
-                raise DataError(
-                    f"The number of candidates on line 1 is {cand_num}, which\
-                                does not match the metadata."
-                )
-            cand = parsed_line[1].strip('"')
-            party = parsed_line[2].strip('"')
+    cand_num, seats = data[0][0], data[0][1]
+    ward = data[-1][0]
 
-            # candidates are 1 indexed
-            num_to_cand[str(i + 1)] = cand
-            cand_to_party[cand] = party
+    num_to_cand = {}
+    cand_to_party = {}
 
-        cand_list = list(cand_to_party.keys())
+    data_cand_num = len([r for r in data if "Candidate" in str(r[0])])
+    if data_cand_num != cand_num:
+        raise DataError(
+            "Incorrect number of candidates in either first row metadata \
+                        or in candidate list at end of csv file."
+        )
 
-        if len(cand_list) != cand_num:
+    # record candidate names, which are up until the final row
+    for i, line in enumerate(data[len(data) - (cand_num + 1) : -1]):
+        if "Candidate" not in line[0]:
             raise DataError(
-                "Incorrect number of candidates in either first row metadata \
-                            or in candidate list at end of csv file."
+                f"The number of candidates on line 1 is {cand_num}, which\
+                            does not match the metadata."
             )
-        ballots = [Ballot()] * len(lines[1 : len(lines) - (cand_num + 1)])
+        cand = line[1]
+        party = line[2]
+
+        # candidates are 1 indexed
+        num_to_cand[i + 1] = cand
+        cand_to_party[cand] = party
+
+    cand_list = list(cand_to_party.keys())
 
-        for i, line in enumerate(lines[1 : len(lines) - (cand_num + 1)]):
-            # remove carriage return and blank string after final comma
-            parsed_line = line.strip("\n").split(",")[:-1]
+    ballots = [Ballot()] * len(data[1 : len(data) - (cand_num + 1)])
 
-            ballot_weight = Fraction(parsed_line[0])
-            cand_ordering = parsed_line[1:]
-            ranking = tuple([frozenset({num_to_cand[n]}) for n in cand_ordering])
+    for i, line in enumerate(data[1 : len(data) - (cand_num + 1)]):
+        ballot_weight = Fraction(line[0])
+        cand_ordering = line[1:]
+        ranking = tuple([frozenset({num_to_cand[n]}) for n in cand_ordering])
 
-            ballots[i] = Ballot(ranking=ranking, weight=ballot_weight)
+        ballots[i] = Ballot(ranking=ranking, weight=ballot_weight)
 
-        profile = PreferenceProfile(
-            ballots=ballots, candidates=cand_list
-        ).condense_ballots()
-        return (profile, seats, cand_list, cand_to_party, ward)
+    profile = PreferenceProfile(
+        ballots=ballots, candidates=cand_list
+    ).condense_ballots()
+    return (profile, seats, cand_list, cand_to_party, ward)
diff --git a/tests/data/csv/scot_blank_rows.csv b/tests/data/csv/scot_blank_rows.csv
@@ -0,0 +1,13 @@
+3,1,
+126,1,
+
+9,1,2,
+10,1,2,3,
+1,3,2,1,
+"Candidate 1","Paul","Orange (O)",
+"Candidate 2","George","Yellow (Y)",
+"Candidate 3","Ringo","Red (R)",
+
+"Wardy McWard Ward",
+
+
diff --git a/tests/data/csv/scot_candidate_overcount.csv b/tests/data/csv/scot_candidate_overcount.csv
@@ -1,4 +1,4 @@
-2,4,
+9,4,
 2,9,8,7,10,
 "Candidate 1","Paul","Orange (O)",
 "Candidate 2","George","Yellow (Y)",

diff --git a/tests/data/csv/scot_candidate_undercount.csv b/tests/data/csv/scot_candidate_undercount.csv
@@ -1,7 +1,6 @@
-9,4,
-2,9,8,7,10,
+2,4,
+2,1,2,3,
 "Candidate 1","Paul","Orange (O)",
 "Candidate 2","George","Yellow (Y)",
 "Candidate 3","Ringo","Red (R)",
-"Wardy McWard Ward",
-
+"Wardy McWard Ward",
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
@@ -160,7 +160,7 @@ def test_same_name():
 #     # print(p)
 
 
-def test_blt_parse():
+def test_scot_csv_parse():
     pp, seats, cand_list, cand_to_party, ward = load_scottish(
         CSV_DIR / "scot_wardy_mc_ward.csv"
     )
@@ -187,22 +187,49 @@ def test_blt_parse():
     )
 
 
-def test_bad_file_path_blt():
+def test_scot_csv_blank_rows():
+    pp, seats, cand_list, cand_to_party, ward = load_scottish(
+        CSV_DIR / "scot_blank_rows.csv"
+    )
+
+    assert seats == 1
+    assert isinstance(pp, PreferenceProfile)
+    assert cand_list == ["Paul", "George", "Ringo"]
+    assert cand_to_party == {
+        "Paul": "Orange (O)",
+        "George": "Yellow (Y)",
+        "Ringo": "Red (R)",
+    }
+    assert ward == "Wardy McWard Ward"
+    assert int(pp.num_ballots()) == 146
+    assert Ballot(ranking=tuple([frozenset({"Paul"})]), weight=126) in pp.ballots
+    assert (
+        Ballot(
+            ranking=tuple(
+                [frozenset({"Ringo"}), frozenset({"George"}), frozenset({"Paul"})]
+            ),
+            weight=1,
+        )
+        in pp.ballots
+    )
+
+
+def test_bad_file_path_scot_csv():
     with pytest.raises(FileNotFoundError):
         load_scottish("")
 
 
-def test_empty_file_blt():
+def test_empty_file_scot_csv():
     with pytest.raises(EmptyDataError):
         load_scottish(CSV_DIR / "scot_empty.csv")
 
 
-def test_bad_metadata_blt():
+def test_bad_metadata_scot_csv():
     with pytest.raises(DataError):
         load_scottish(CSV_DIR / "scot_bad_metadata.csv")
 
 
-def test_incorrect_metadata_blt():
+def test_incorrect_metadata_scot_csv():
     with pytest.raises(DataError):
         load_scottish(CSV_DIR / "scot_candidate_overcount.csv")