-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_data.py
63 lines (53 loc) · 2.09 KB
/
preprocess_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
def add_coordinate_bins(df, n_bins_x=10, n_bins_y=10):
    """Return a copy of *df* with discretised ``x_bin`` / ``y_bin`` columns.

    Each coordinate column is passed to ``pd.cut`` with an integer bin count,
    so the bin edges are derived from the values present in that column, and
    the resulting categories are labelled ``0 .. n_bins - 1``.

    Args:
        df: Frame containing numeric ``x`` and ``y`` columns.
        n_bins_x: Number of bins for the ``x`` column.
        n_bins_y: Number of bins for the ``y`` column.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    x_binned = pd.cut(df['x'], bins=n_bins_x, labels=range(n_bins_x))
    y_binned = pd.cut(df['y'], bins=n_bins_y, labels=range(n_bins_y))
    return df.assign(x_bin=x_binned, y_bin=y_binned)
def add_team_as_dummy(df):
    """Return *df* with ``home_team`` and a boolean ``team`` column added.

    For every ``(game_id, period_id)`` group, the first ``team_id`` in the
    group is taken as the reference team and stored as ``home_team``.
    ``team`` is True on rows whose ``team_id`` equals that reference.

    NOTE(review): the code only establishes that ``home_team`` is the first
    team_id seen in each period; whether that is really the home side depends
    on the input data and should be confirmed.

    Args:
        df: Frame with ``game_id``, ``period_id`` and ``team_id`` columns.

    Returns:
        The merged DataFrame (the result of ``DataFrame.merge``, so the
        original index is not carried over).
    """
    first_team = df.groupby(['game_id', 'period_id'])['team_id'].first()
    reference = first_team.reset_index().rename(columns={'team_id': 'home_team'})

    # No explicit ``on``: merge joins on the columns both frames share,
    # which for the expected input are game_id and period_id.
    merged = df.merge(reference)
    is_reference_team = merged.team_id == merged.home_team
    return merged.assign(team=is_reference_team)
def get_action_type_names(df, action_types: dict):
    """Return *df* with an ``action_type`` column looked up from ``type_id``.

    ``type_id`` is converted to ``str`` before the lookup, so the keys of
    *action_types* are expected to be strings. Ids that have no entry in the
    mapping produce a missing value.

    Args:
        df: Frame with a ``type_id`` column.
        action_types: Mapping from stringified type id to action name.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    type_keys = df['type_id'].astype(str)
    return df.assign(action_type=type_keys.map(action_types))
def get_action_tokens(df):
    """Return *df* with an ``action_token`` string column.

    The token is the comma-joined text ``team,action_type,x_bin,y_bin``.
    ``team``, ``x_bin`` and ``y_bin`` are cast to ``str`` first;
    ``action_type`` is used as-is.

    Args:
        df: Frame with ``team``, ``action_type``, ``x_bin`` and ``y_bin``.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    team_part = df.team.astype(str)
    x_part = df.x_bin.astype(str)
    y_part = df.y_bin.astype(str)
    tokens = team_part + "," + df.action_type + "," + x_part + "," + y_part
    return df.assign(action_token=tokens)
if __name__ == "__main__":
    # Script entry point: read the raw actions, tokenise them, split the
    # matches into train / val / test, and write the result to df_clean.csv.
    import json

    from numpy import select
    from numpy.random import choice, seed

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open('action_types.json', 'r', encoding='utf-8') as f:
        action_types = json.load(f)

    df = (
        pd.read_csv("WSL_actions.csv", index_col=0)
        .pipe(add_coordinate_bins, n_bins_x=10, n_bins_y=10)
        .pipe(add_team_as_dummy)
        .pipe(get_action_type_names, action_types)
        .pipe(get_action_tokens)
        .assign(
            # One integer id per distinct game_id.
            match_id=lambda d: d.groupby(['game_id']).ngroup(),
            action_token=lambda d: pd.Categorical(d.action_token),
        )
        [['match_id', 'action_token']]
    )

    # The split is made per match, so every action of a match ends up in the
    # same dataset. The order of the random draws below must not change,
    # otherwise the seeded split changes too.
    seed(42)
    match_ids = df['match_id'].unique()
    train_groups = choice(
        match_ids, int(0.8 * df['match_id'].nunique()), replace=False
    )
    validation_candidates = list(set(match_ids) - set(train_groups))
    val_groups = choice(
        validation_candidates,
        int(len(validation_candidates) * 0.5),
        replace=False,
    )

    # Reproducibility guard: with this seed and input the first sampled
    # training match is expected to be 96. An explicit raise is used instead
    # of ``assert`` so the check still runs under ``python -O``.
    if train_groups[0] != 96:
        raise RuntimeError(
            "Unexpected train/validation split: first train match is "
            f"{train_groups[0]}, expected 96"
        )

    # Every match that is in neither train nor val falls through to 'test',
    # so no separate list of test matches is needed.
    dataset = select(
        [df.match_id.isin(train_groups), df.match_id.isin(val_groups)],
        ['train', 'val'],
        'test',
    )
    df.assign(dataset=dataset).to_csv("df_clean.csv")