check_data_advalibility.py
"""Verify that every StatsBomb AMF tracking game is downloadable as a
GitHub release asset; any game that cannot be reached is re-parsed from
the raw data and saved locally."""
import argparse
import json
import logging
import os
from datetime import datetime
from random import randint

import numpy as np
import pandas as pd
from tqdm import tqdm

from parse_statsbomb_data import (
    get_json_from_web_gz,
    get_list_of_statsbomb_games,
    parse_statsbomb_amf_tracking_data,
)


def check_data_availability(
    check_csv: bool = False,
    check_parquet: bool = False,
    check_all_seasons: bool = True
):
    """Check that each game's tracking data can be read from the
    corresponding GitHub release asset.

    Exactly one of ``check_csv`` or ``check_parquet`` should be True.
    When ``check_all_seasons`` is False, one season is sampled at random
    and only that season's games are checked.
    """
    if not check_csv and not check_parquet:
        raise ValueError(
            "Please specify whether you want to check the `.csv` files "
            "or the `.parquet` files."
        )
    games_df = get_list_of_statsbomb_games()

    if not check_all_seasons:
        # Sample one season at random and restrict the check to it.
        seasons_arr = np.unique(games_df["season"].to_numpy())
        s = seasons_arr[randint(0, len(seasons_arr) - 1)]
        games_df = games_df[games_df["season"] == s]
        del seasons_arr, s

    game_ids_arr = games_df["nfl_game_id"].to_numpy()
    url_arr = games_df["url"].to_numpy()
    for i in tqdm(range(len(game_ids_arr))):
        game_id = game_ids_arr[i]
        game_url = url_arr[i]
        print(f"\nChecking game {game_id}")

        if check_csv:
            try:
                df = pd.read_csv(
                    "https://github.com/sportsdataverse/"
                    "amf-location-data/releases/download/"
                    f"amf_tracking_csv/{game_id}.csv"
                )
                print(df)
                print(f"\n{game_id} is verified to exist in `.csv` form.")
            except Exception as e:
                logging.warning(
                    f"NFL game ID #{game_id} could not be reached.\n"
                    f"Unhandled exception: {e}"
                )
                logging.info("Attempting to re-download this game.")
                # Re-parse the raw tracking data and save a fresh copy.
                json_data = get_json_from_web_gz(game_url)
                parsed_df = parse_statsbomb_amf_tracking_data(json_data)
                parsed_df.to_csv(f"statsbomb/{game_id}.csv", index=False)
        elif check_parquet:
            try:
                df = pd.read_parquet(
                    "https://github.com/sportsdataverse/"
                    "amf-location-data/releases/download/"
                    f"amf_tracking_parquet/{game_id}.parquet"
                )
                print(df)
                print(
                    f"\n{game_id} is verified to exist in `.parquet` form."
                )
            except Exception as e:
                logging.warning(
                    f"NFL game ID #{game_id} could not be reached.\n"
                    f"Unhandled exception: {e}"
                )
                logging.info("Attempting to re-download this game.")
                # Re-parse the raw tracking data and save a fresh copy.
                json_data = get_json_from_web_gz(game_url)
                parsed_df = parse_statsbomb_amf_tracking_data(json_data)
                parsed_df.to_parquet(
                    f"statsbomb/{game_id}.parquet",
                    index=False
                )
        del game_id, game_url
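
# A minimal usage sketch for calling the checker directly (hypothetical;
# not part of the CLI below): spot-check only the parquet assets for one
# randomly sampled season, e.g. while iterating locally:
#
#     check_data_availability(check_parquet=True, check_all_seasons=False)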


if __name__ == "__main__":
    now = datetime.now()

    # Run a full (all-seasons) check during the first eight days of each
    # month; otherwise spot-check a single random season.
    check_all_seasons = now.day <= 8

    # Ensure the output directory exists before any re-downloads are saved.
    os.makedirs("statsbomb", exist_ok=True)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-csv", action="store_true", help="Check the `.csv` release assets."
    )
    parser.add_argument(
        "-parquet",
        action="store_true",
        help="Check the `.parquet` release assets.",
    )
    args = parser.parse_args()

    csv_flag = args.csv
    parquet_flag = args.parquet

    if csv_flag and parquet_flag:
        raise ValueError(
            "Please specify either the `.csv` files or the "
            "`.parquet` files to check, not both."
        )
    elif csv_flag:
        check_data_availability(
            check_csv=True,
            check_all_seasons=check_all_seasons
        )
    elif parquet_flag:
        check_data_availability(
            check_parquet=True,
            check_all_seasons=check_all_seasons
        )
    # Record when (and how) the last availability check ran.
    timestamp_json = {
        "last_check": {"month": now.month, "day": now.day, "year": now.year},
        "check_all_seasons": check_all_seasons,
        "check_csv": csv_flag,
        "check_parquet": parquet_flag,
    }
    with open("statsbomb/last_check_timestamp.json", "w") as f:
        json.dump(timestamp_json, f)
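
# Example invocations, matching the single-dash flags defined above:
#
#     python check_data_advalibility.py -csv
#     python check_data_advalibility.py -parquet
#
# Passing both flags raises a ValueError; passing neither performs no
# check and only writes the timestamp file.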