""" Script to generate DataFrame containing feature data and labels.
Each row in the resulting DataFrame contains features and labels. Row only generated if (1) h2h data exists, and (2)
if both sumo wrestlers have the requisite feature info.
Usage:
(1) Set up pickle files with relevant DataFrames (basic profile info, and head-to-head info)
(2) Set up pickle file save name (bottom of script)
(3) Run feature_generation.py
"""
import pandas as pd
import time
import random
from ml_fxns import map_rank, check_makuuchi, calculate_age, calculate_active_years, calculate_h2h_wp
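# --- Illustrative sketch (assumption): what calculate_h2h_wp is taken to do --- #
# The real helper lives in ml_fxns and is NOT shown here; this hypothetical stand-in only documents the behaviour
# the rest of this script relies on (inferred from the call site below): given the bout date, a DataFrame of the
# pair's previous bouts, and the first rikishi's ID, return each rikishi's historical head-to-head win percentage,
# defaulting to 50/50 when there is no prior history ('shiro' = win).
def _sketch_h2h_wp(date, rel_df, ID1):
    # 'date' is kept only to mirror the ml_fxns signature; the caller already filters rel_df by date
    if rel_df.empty:
        return 0.5, 0.5  # no previous bouts between the pair: assume an even matchup
    wins1 = ((rel_df['ID1'] == ID1) & (rel_df['outcome1'] == u'shiro')).sum() + \
            ((rel_df['ID2'] == ID1) & (rel_df['outcome2'] == u'shiro')).sum()
    wp1 = float(wins1) / len(rel_df.index)
    return wp1, 1.0 - wp1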
# --- Import Data --- #
rikishi_df = pd.read_pickle('data/all_rikishi.pkl')
h2h_df = pd.read_pickle('data/recent_full_h2h.pkl')
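# NOTE (added, inferred from the column accesses in this script rather than from the pickle files themselves):
# rikishi_df is expected to contain at least: rikishi_ID, shikona, sumo_stable, bday, debut, height, weight,
# highest_rank; h2h_df is expected to contain at least: ID1, ID2, rank1, rank2, outcome1, outcome2, kimarite, date.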
# --- Apply Requisite Conditions Used to Extract H2H Data --- #
# only sumos that satisfied these conditions were used to store head-to-head data into h2h_df DataFrame
logical = rikishi_df['shikona'].map(lambda x: x is not None) & \
          rikishi_df['sumo_stable'].map(lambda x: x is not None) & \
          rikishi_df['bday'].map(lambda x: pd.notnull(x)) & \
          rikishi_df['debut'].map(lambda x: pd.notnull(x))
filtered_df = rikishi_df[logical]
# ----- Remove Rows Without Requisite Features ----- #
# eliminate missing data
logical = (pd.notnull(filtered_df['height'])) & (pd.notnull(filtered_df['weight'])) & \
          (pd.notnull(filtered_df['bday'])) & (pd.notnull(filtered_df['debut'])) & \
          (~filtered_df['highest_rank'].isnull()) & (filtered_df['highest_rank'].apply(check_makuuchi))
filtered_df = filtered_df[logical]
filtered_df[['height', 'weight']] = filtered_df[['height', 'weight']].astype('float64') # cast correct data types
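# optional sanity check (illustrative, not part of the original pipeline): how many rikishi survive the filters
print "Rikishi with complete profiles: %d" % len(filtered_df.index)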
# --- Pre-process H2H DataFrame: Filter out NoneTypes, Select Win Outcomes, & Sort Chronologically --- #
# NOTE: datatypes (including NoneTypes) for rank/kimarite were cast to unicode for convenience
# remove nonetypes
logical = (h2h_df['rank1'] != u'None') & (h2h_df['rank2'] != u'None') & (h2h_df['kimarite'] != u'None')
h2h_df = h2h_df[logical]
# remove outcomes that were draws or defaults
logical = ((h2h_df['outcome1'] == u'shiro') | (h2h_df['outcome1'] == u'kuro')) & \
          ((h2h_df['outcome2'] == u'shiro') | (h2h_df['outcome2'] == u'kuro'))
h2h_df = h2h_df[logical]
# sort chronologically (for easier readability in resulting feature df)
h2h_df = h2h_df.sort_values(by='date', ascending=True) # debut 2000 to 2017
# ================ Loop Over H2H DF ================ #
checkpoint1 = 0
checkpoint2 = len(h2h_df.index)
compiled_dicts = [] # list for storing all the dicts generated
counter = 1
start = time.time()
for ind, row in h2h_df.iterrows():
    if counter % 1000 == 0:
        print "Counter: %4d" % counter  # print a counter every once in a while to show progress
    # watch out for this 'if' statement; the ind is the DataFrame label, NOT necessarily consecutive index
    if (ind >= checkpoint1) & (ind <= checkpoint2):
        # --- Extract Info --- #
        ID1 = row['ID1']
        ID2 = row['ID2']
        rikishi1 = filtered_df[filtered_df['rikishi_ID'] == ID1]
        rikishi2 = filtered_df[filtered_df['rikishi_ID'] == ID2]
        # possible to get h2h_df rikishi WITHOUT min feature requirements to uniquely identify in filtered_df
        if (not rikishi1.empty) and (not rikishi2.empty):
            print "# --- Generating Features: Row: %2d" % ind + " --- #"
            # --- Continue Extracting Relevant Info --- #
            date = row['date']
            rank1 = row['rank1']
            rank2 = row['rank2']
            outcome1 = row['outcome1']
            outcome2 = row['outcome2']
            # --- Generate Feature Values (ALWAYS Sumo1 Minus Sumo2) --- #
            height_diff = rikishi1['height'].item() - rikishi2['height'].item()  # diff in height (cm)
            weight_diff = rikishi1['weight'].item() - rikishi2['weight'].item()  # diff in weight (kg)
            bday1 = rikishi1['bday'].item()
            bday2 = rikishi2['bday'].item()
            age1 = calculate_age(date, bday1)
            age2 = calculate_age(date, bday2)
            age_diff = age1 - age2  # diff in age in yrs
            debut1 = rikishi1['debut'].item()
            debut2 = rikishi2['debut'].item()
            active_years1 = calculate_active_years(date, debut1)
            active_years2 = calculate_active_years(date, debut2)
            active_diff = active_years1 - active_years2  # difference in active no. of years
            num_rank1 = map_rank(rank1)
            num_rank2 = map_rank(rank2)
            rank_diff = num_rank1 - num_rank2  # difference in rank
            # calculate historical h2h win percentage
            # obtain all previous matches by boolean indexing; the h2h date must be less than the current date
            logical = ((h2h_df['ID1'] == ID1) & (h2h_df['ID2'] == ID2)) | \
                      ((h2h_df['ID1'] == ID2) & (h2h_df['ID2'] == ID1))
            rel_df = h2h_df[logical]
            logical = (rel_df['date'] < date)  # build the mask on rel_df so it aligns with the filtered rows
            rel_df = rel_df[logical]
            h2h_wp1, h2h_wp2 = calculate_h2h_wp(date, rel_df, ID1)  # if rel_df is empty, then say h2h_wp is 50/50
            h2h_diff = h2h_wp1 - h2h_wp2
            # ----------- Generate Feature Vectors and Labels ------------- #
            # order in which sumos are stored must be randomized, else the data would be biased b/c of the order in
            # which head-to-head data were scraped
            order = random.randint(1, 2)  # randomize order in which features and sumos are stored
            feature_dict = {}  # initialize dictionary
            # --- Keep Order of Sumos --- #
            if order == 1:  # keep everything as is (no switching signs)
                if (outcome1 == u'shiro') and (outcome2 == u'kuro'):  # win by 1st sumo, loss by 2nd sumo
                    feature_dict = {'height_diff': height_diff, 'weight_diff': weight_diff, 'age_diff': age_diff,
                                    'active_diff': active_diff, 'rank_diff': rank_diff, 'h2h_diff': h2h_diff,
                                    'ID1': ID1, 'ID2': ID2, 'label': 1}
                elif (outcome1 == u'kuro') and (outcome2 == u'shiro'):  # loss by 1st sumo, win by 2nd sumo
                    feature_dict = {'height_diff': height_diff, 'weight_diff': weight_diff, 'age_diff': age_diff,
                                    'active_diff': active_diff, 'rank_diff': rank_diff, 'h2h_diff': h2h_diff,
                                    'ID1': ID1, 'ID2': ID2, 'label': 0}
            # --- Switch Order of Sumos --- #
            elif order == 2:  # then switch signs of features, order of sumos, but NOT labels
                if (outcome2 == u'shiro') and (outcome1 == u'kuro'):  # win by "1st sumo", loss by 2nd sumo
                    feature_dict = {'height_diff': -height_diff, 'weight_diff': -weight_diff, 'age_diff': -age_diff,
                                    'active_diff': -active_diff, 'rank_diff': -rank_diff, 'h2h_diff': -h2h_diff,
                                    'ID1': ID2, 'ID2': ID1, 'label': 1}
                elif (outcome2 == u'kuro') and (outcome1 == u'shiro'):  # loss by "1st sumo", win by 2nd sumo
                    feature_dict = {'height_diff': -height_diff, 'weight_diff': -weight_diff, 'age_diff': -age_diff,
                                    'active_diff': -active_diff, 'rank_diff': -rank_diff, 'h2h_diff': -h2h_diff,
                                    'ID1': ID2, 'ID2': ID1, 'label': 0}
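            # (added note on intent): in both branches the label encodes whether the rikishi stored under 'ID1' won,
            # and the sign flip keeps every *_diff feature equal to (stored ID1 value minus stored ID2 value)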
            # --- Collect Data --- #
            if feature_dict:
                compiled_dicts.append(feature_dict)
            else:
                print "Empty feature dict! Cannot append to compiled dicts..."
    counter += 1
# ----- Store Data into DataFrame ---- #
cols = ['height_diff', 'weight_diff', 'age_diff', 'active_diff', 'rank_diff', 'h2h_diff',
        'ID1', 'ID2', 'label']
feature_df = pd.DataFrame(compiled_dicts, columns=cols)
end = time.time()
print "Time: %2.2f" % (end - start)
# --- Uncomment And Change Filename When Ready to Store --- #
# feature_df.to_pickle('data/recent_feature_df.pkl')
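# --- Illustrative follow-up (assumption, not part of the original workflow) --- #
# A minimal sketch of how the generated features could be split into a feature matrix and label vector for a
# downstream model; the column names come from the `cols` list above.
feature_cols = ['height_diff', 'weight_diff', 'age_diff', 'active_diff', 'rank_diff', 'h2h_diff']
X = feature_df[feature_cols].values
y = feature_df['label'].values
print "Feature matrix shape: %s, label vector shape: %s" % (str(X.shape), str(y.shape))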