# takeOne.py
import mm_predictor # stat-analysis library
from sklearn import cross_validation, linear_model
import csv
import random
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from tpot import TPOTClassifier
from sklearn.metrics import make_scorer
def getWinnersList(tourney_data, id_to_name=None):
    """Group main-bracket winners by how many games they won.

    The first four rows of ``tourney_data`` are skipped (the caller treats
    them as the 'first four' games); the remaining rows are counted per
    ``WTeamID``.

    Args:
        tourney_data: DataFrame of tournament games with a ``WTeamID`` column.
        id_to_name: Optional mapping from team ID to team name. When omitted,
            the module-level ``team_id_map`` is used, exactly as before.

    Returns:
        A list of lists in which entry ``k`` holds the name of every team
        with more than ``k`` wins, in the order the grouped team IDs are
        produced. The list is as long as the largest win count (the
        previous implementation stopped at seven entries). Empty when
        there are no rows past the first four.

    Raises:
        KeyError: if a winning team ID is missing from the name mapping.
    """
    if id_to_name is None:
        # Backward-compatible fallback: this global is not defined in this
        # file, so it must be provided by whoever sets up the module.
        id_to_name = team_id_map

    # Series indexed by WTeamID whose values are the number of wins.
    wins_per_team = tourney_data[4:].groupby('WTeamID').size()

    winners = []
    for team_id, wins in wins_per_team.items():
        team_name = id_to_name[team_id]
        for round_idx in range(int(wins)):
            # Rounds are visited in increasing order, so at most one new
            # bucket is ever needed at a time.
            if round_idx == len(winners):
                winners.append([])
            winners[round_idx].append(team_name)
    return winners
def _survivingTeam(tourney, team_id):
    """Follow 'eliminated by' links from team_id to the team still alive."""
    while tourney[team_id] > 0:
        team_id = tourney[team_id]
    return team_id


def _predictedToBeat(teamTeamWinChanceMap, first_name, second_name):
    """Return True when second_name is listed under first_name in the map."""
    return (first_name in teamTeamWinChanceMap) and (second_name in teamTeamWinChanceMap[first_name])


def calcBracketScore(teamTeamWinChanceMap, tourney_data):
    """Score a predicted bracket against one season's tournament results.

    The actual games in ``tourney_data`` are replayed in order. For each
    game, the teams that the *prediction* has advancing from either side
    are looked up, the predicted winner of that pairing is advanced, and
    points are awarded when that team also really won that many games.

    Args:
        teamTeamWinChanceMap: Nested mapping keyed by team name. Team A is
            treated as beating team B when ``B in teamTeamWinChanceMap[A]``;
            otherwise B is treated as the winner. (The stored values are not
            read here.)
        tourney_data: DataFrame of tournament games with ``WTeamID`` and
            ``LTeamID`` columns. The first four rows are treated as the
            'first four' games, which are not scored.

    Returns:
        The total score: ``10 * 2**k`` for every correctly predicted winner,
        where ``k`` is the zero-based number of prior wins of that team in
        the predicted bracket.

    Notes:
        Reads ``./ncaa-data/TourneySeeds.csv`` and relies on the module-level
        ``team_id_map`` (team ID -> name), which is not defined in this file.
        Prints one or two debugging lines per game.
    """
    winners = getWinnersList(tourney_data)

    # First four rows is 'first four' and not in actual bracket
    firstFour = tourney_data[:4]
    mainTourney = tourney_data[4:]

    tourney_seeds = pd.read_csv('./ncaa-data/TourneySeeds.csv')
    # tourney[id] > 0: team `id` lost (in the prediction) to team tourney[id].
    # tourney[id] < 0: team `id` is still alive; it starts at -1 and drops by
    #                  one for every predicted main-bracket win.
    # The list is indexed directly by team ID, so it needs max_id + 1 slots;
    # sizing it to max_id made the largest ID raise IndexError.
    tourney = [-1] * (int(tourney_seeds['Team'].max()) + 1)

    # Look at first four teams: only record who knocks out whom, no scoring.
    for _, row in firstFour.iterrows():
        index1 = int(row['WTeamID'])
        index2 = int(row['LTeamID'])
        if _predictedToBeat(teamTeamWinChanceMap, team_id_map[index1], team_id_map[index2]):
            tourney[index2] = index1
        else:
            tourney[index1] = index2

    # MAIN TOURNAMENT
    score = 0
    for _, row in mainTourney.iterrows():
        # Replace each actual participant with whoever the prediction has
        # coming out of that participant's side of the bracket.
        index1 = _survivingTeam(tourney, int(row['WTeamID']))
        index2 = _survivingTeam(tourney, int(row['LTeamID']))
        team1Name = team_id_map[index1]
        team2Name = team_id_map[index2]

        if _predictedToBeat(teamTeamWinChanceMap, team1Name, team2Name):
            advancing, eliminated, label = index1, index2, '1'
        else:
            advancing, eliminated, label = index2, index1, '2'

        tourney[eliminated] = advancing
        tourney[advancing] = tourney[advancing] - 1
        # -2 after the first win -> round 0, -3 -> round 1, and so on.
        roundIndex = abs(tourney[advancing]) - 2

        if team_id_map[advancing] in winners[roundIndex]:
            points = 2 ** roundIndex * 10
            score += points
            print(team1Name + ' vs ' + team2Name + ', team ' + label + ' wins')
            # Always report the advancing team's points; the old team-2
            # branch read the eliminated team's slot and printed nonsense.
            print('Score ' + str(points))
        else:
            # Debugging output only; safe to drop once no longer needed.
            print('Incorrect: Predicted ' + team1Name + ' vs ' + team2Name + ', team' + label + ' wins')
    return score
if __name__ == '__main__':
    # mm_predictor keeps stat & elo dictionaries that must be set up first.
    mm_predictor.init()

    # Read regular-season and tournament game results from disk.
    season_data = pd.read_csv('./ncaa-data/RegularSeasonDetailedResults.csv')
    tourney_data = pd.read_csv('./ncaa-data/NCAATourneyDetailedResults.csv')

    # Drop every 2017 tournament game before combining the two tables.
    not_2017 = tourney_data.Season != 2017
    tourney_data = tourney_data[not_2017]
    aggregated_data = pd.concat([season_data, tourney_data])

    X, Y = mm_predictor.analyze_teams_diff(aggregated_data)
    print("Fitting on {} samples".format(len(X)))

    # TODO Use TPOT or xgboost
    # (XGBClassifier and linear_model.LogisticRegression were the other
    # candidates considered here.)
    tpot = TPOTClassifier(
        generations=50,
        population_size=30,
        max_time_mins=600,
        random_state=42,
        verbosity=2,
    )
    features = np.array(X)
    labels = np.array(Y)
    tpot.fit(features, labels)

    # Write the best pipeline found out as a standalone Python script.
    tpot.export('tpot_orig_pipeline.py')