-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_results_dataframe.py
156 lines (138 loc) · 5.64 KB
/
create_results_dataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Creates results.df, which contains various measurements of various clustering methods
# run on all 1000+ Scottish elections
import pandas as pd
import glob
import os
from joblib import Parallel, delayed
from datetime import datetime
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from Clustering_Functions import *
def process_election_with_method(method, election):
    """Cluster one election with the named clustering method.

    Parameters
    ----------
    method : str
        One of "meanBC", "meanBA", "meanH" (k-means on Borda-pessimistic,
        Borda-average, or head-to-head proxies), "medoBC", "medoBA", "medoH"
        (k-medoids on the same three proxies), or "slate" (slate clustering).
    election : dict
        Ballot -> weight mapping (as produced by csv_parse).

    Returns
    -------
    C : the 2-clustering produced by the chosen method (indexable by 0 and 1).
    centers : dict
        {0: center_of_cluster_0, 1: center_of_cluster_1}; methods that return
        a list of centers are normalized to this dict form.

    Raises
    ------
    Exception
        If `method` is not one of the recognized names.
    """
    if method == "meanBC":
        C, centers = kmeans(election, proxy="Borda", borda_style="pes", return_centroids=True)
    elif method == "meanBA":
        C, centers = kmeans(election, proxy="Borda", borda_style="avg", return_centroids=True)
    elif method == "meanH":
        C, centers = kmeans(election, proxy="HH", return_centroids=True)
    elif method == "medoBC":
        C, centers = kmedoids(election, proxy="Borda", borda_style="pes", verbose=False, return_medoids=True)
    elif method == "medoBA":
        C, centers = kmedoids(election, proxy="Borda", borda_style="avg", verbose=False, return_medoids=True)
    elif method == "medoH":
        C, centers = kmedoids(election, proxy="HH", verbose=False, return_medoids=True)
    elif method == 'slate':
        # Note the reversed return order for Slate_cluster.
        centers, C = Slate_cluster(election, verbose=False, return_slates=True)
    else:
        raise Exception("unknown method")
    # isinstance (not type(...)==list) is the idiomatic type check.
    if isinstance(centers, list):
        centers = {0: centers[0], 1: centers[1]}  # convert list to dict.
    return C, centers
def compute_scores(C, election, num_cands):
    """Score a 2-clustering of an election with three standard cluster metrics.

    Each ballot is mapped to its head-to-head ("HH") proxy vector, repeated
    once per voter who cast it, and labeled 0 or 1 according to which cluster
    of C contains the ballot.

    Parameters
    ----------
    C : clustering indexable by 0 and 1; C[0] contains the ballots of cluster 0.
    election : dict mapping ballot -> integer weight (number of voters).
    num_cands : int, number of candidates (fixes the proxy dimension).

    Returns
    -------
    (sil, cal, dav) : silhouette score (manhattan metric),
        Calinski-Harabasz score, and Davies-Bouldin score.
    """
    labels = []
    XH = []  # list of ballot proxies, one copy per voter (with repetitions)
    for ballot, weight in election.items():
        # Proxy and label are per-ballot invariants: compute once, then
        # repeat for every voter who cast this ballot.
        proxy = HH_proxy(ballot, num_cands=num_cands)
        label = 0 if ballot in C[0] else 1
        for _ in range(weight):
            XH.append(proxy)
            labels.append(label)
    sil = silhouette_score(XH, labels, metric="manhattan")
    cal = calinski_harabasz_score(XH, labels)
    dav = davies_bouldin_score(XH, labels)
    return sil, cal, dav
def process_election_file(full_filename):
    """Run every clustering method (twice each) on one election file.

    Parameters
    ----------
    full_filename : str
        Path to one Scottish-election CSV (parsed by csv_parse).

    Returns
    -------
    list of rows, one per method, matching results_df's columns:
    [filename, num_cands, num_voters, num_unique_ballots, avg_ballot_len,
     ballot_lengths, parties, method, block_size, sil, cal, dav, centers,
     clustering, method_closeness]
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {full_filename}", flush=True)
    filename = os.path.basename(full_filename)
    num_cands, election, cand_names, location = csv_parse(full_filename)

    # Basic election statistics.
    all_ballots = list(election.keys())
    num_unique_ballots = len(all_ballots)
    num_voters = sum(election.values())
    avg_ballot_len = (
        sum(len(ballot) * election[ballot] for ballot in all_ballots) / num_voters
    )

    # Histogram of ballot lengths over unique ballots (not voter-weighted).
    ballot_lengths = {n: 0 for n in range(num_cands + 1)}
    for ballot in all_ballots:
        ballot_lengths[len(ballot)] += 1

    # Map 1-based candidate number -> party (third field of each cand_names entry).
    parties = {count + 1: cand[2] for count, cand in enumerate(cand_names)}

    results = []
    Clusterings = dict()
    method_list = ["meanBC", "meanBA", "meanH", "medoBC", "medoBA", "medoH", "slate"]
    for method in method_list:
        for trial in range(2):
            C, centers = process_election_with_method(method, election)
            Clusterings[(method, trial)] = C
            if trial == 1:
                sil, cal, dav = compute_scores(C, election, num_cands)
            # build a dictionary storing the closenesses of the clusterings
            # formed by the 7 different methods; this dictionary is only
            # included in the last dataframe row (slate) for this election.
            if method == 'slate' and trial == 1:
                method_closeness = dict()
                for m1 in method_list:
                    for m2 in method_list:
                        if m1 == m2:
                            # Diagonal: closeness between a method's own two trials.
                            method_closeness[(m1, m1)] = Clustering_closeness(
                                election, Clusterings[(m1, 0)], Clusterings[(m1, 1)]
                            )
                        else:
                            method_closeness[(m1, m2)] = Clustering_closeness(
                                election, Clusterings[(m1, 0)], Clusterings[(m2, 0)]
                            )
            else:
                method_closeness = None
            # Fraction of voters falling in cluster 0.
            block_size = sum(C[0].values()) / num_voters
            if trial == 1:
                results.append(
                    [
                        filename,
                        num_cands,
                        num_voters,
                        num_unique_ballots,
                        avg_ballot_len,
                        ballot_lengths,
                        parties,
                        method,
                        block_size,
                        sil,
                        cal,
                        dav,
                        centers,
                        {0: C[0], 1: C[1]},
                        method_closeness
                    ])
    return results
if __name__ == "__main__":
    start_time = datetime.now()
    print(f"[{start_time.strftime('%H:%M:%S')}] Start time", flush=True)
    print()

    # All Scottish-election CSVs, searched recursively under scot-elex/.
    filename_list = glob.glob("scot-elex/**/*.csv")

    # Fan the per-election work out across worker processes; each call
    # yields one list of result rows for that election.
    n_jobs = 128
    per_file_rows = Parallel(n_jobs=n_jobs)(
        delayed(process_election_file)(file) for file in filename_list
    )

    # Flatten the per-file lists of rows into a single list of rows.
    results = [row for rows in per_file_rows for row in rows]

    # Assemble the final dataframe and persist it to disk.
    results_df = pd.DataFrame(
        results,
        columns=[
            "filename",
            "num_cands",
            "num_voters",
            "num_unique_ballots",
            "avg_ballot_len",
            "ballot_lengths",
            "parties",
            "method",
            "block_size",
            "sil",
            "cal",
            "dav",
            "centers",
            "clustering",
            "method_closeness"
        ],
    )
    results_df.to_pickle("results.pkl")

    print()
    print(f"[{datetime.now().strftime('%H:%M:%S')}] End Time", flush=True)
    print(f"[{datetime.now()-start_time}] Elapsed", flush=True)