-
Notifications
You must be signed in to change notification settings - Fork 1
/
condense_scraped_data.py
191 lines (160 loc) · 6.25 KB
/
condense_scraped_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/python
import csv
import pprint
import os.path
import re
import simplejson as json
import gzip
from deck_info import DeckInfo
# Pre-compiled patterns used to pull numbers out of the scraped game pages.
# Raw strings keep `\d` from being treated as an (invalid) string escape.
ALL_DIGITS = re.compile(r'\d+')                      # first run of digits
POINTS_RE = re.compile(r'points = (-?\d+)p')         # possibly negative score
CHIPS_RE = re.compile(r'(\d+) chips')
GAME_NUM_RE = re.compile(r'Game #(\d+)')
GOAL_NUM_RE = re.compile(r'g(\d+)')                  # goal id such as "g12"
CARD_NUM_RE = re.compile(r'has (\d+) card on hand')
# Maps the numeric goal id found in goal image names to a readable goal name.
GOAL_NUM_TO_NAME = {
    # Goals whose names start with "First".
    1: 'First 5 vps',
    2: 'First 6 pt dev',
    3: 'First all phase powers',
    4: 'First discard',
    5: 'First all worlds',
    6: 'First 3 aliens',
    # Goals whose names start with "Most".
    10: 'Most Prod worlds',
    11: 'Most Developments',
    12: 'Most Military',
    13: 'Most Rares or Novelties',
}
class BogusGoalNum(Exception):
    """Error for a goal number that has no entry in GOAL_NUM_TO_NAME."""

    def __init__(self, msg):
        # Hand the message to the base Exception so str(err) returns it.
        super(BogusGoalNum, self).__init__(msg)
class CompleteButRejectedGame(Exception):
    """Signals that a game page was read but must be left out of the output."""
def ParseGoals(snip_with_goals):
    """Return the names of the goals referenced in a snippet of page HTML.

    The snippet is split on 'rftg/'; the text before the first occurrence is
    dropped and every remaining chunk is searched for a goal id of the form
    'g<digits>'.  Chunks without such an id are skipped.

    Args:
        snip_with_goals: HTML text expected to contain the goal images.

    Returns:
        A list of goal names taken from GOAL_NUM_TO_NAME, in order of
        appearance (duplicates are kept).

    Raises:
        BogusGoalNum: if a goal id is found that is not in GOAL_NUM_TO_NAME.
    """
    goal_names = []
    for goal_chunk in snip_with_goals.split('rftg/')[1:]:
        goal_num_match = GOAL_NUM_RE.search(goal_chunk)
        if goal_num_match is None:
            continue
        goal_num = int(goal_num_match.group(1))
        if goal_num not in GOAL_NUM_TO_NAME:
            # Single-argument print() emits the same text on Python 2 and 3.
            print('bogus goal num? %s' % goal_num)
            raise BogusGoalNum('Bogus goal number ' + str(goal_num))
        goal_names.append(GOAL_NUM_TO_NAME[goal_num])
    return goal_names
def ParseGame(page_contents, card_id_to_name):
    """Condense one scraped game page into a dict of per-game results.

    Args:
        page_contents: Full HTML text of a game page.
        card_id_to_name: Mapping from card image id (string) to a row/dict
            that has at least a 'name' key.

    Returns:
        A dict with the keys 'player_list', 'game_id', 'advanced' and
        'expansion', plus 'goals' when the page header lists available goals.
        Every entry of 'player_list' has 'name', 'points', 'chips', 'cards'
        and 'goods', plus 'goals' when goals are in use and 'hand' when a
        matching hand size was found on the page.
        Returns None when a status block is neither 'Game End' nor 'Frozen'.

    Raises:
        CompleteButRejectedGame: if the page has no 'Status:' block, a status
            block is 'Frozen', a player has 5 points or fewer, or at most one
            player was parsed.
        BogusGoalNum: propagated from ParseGoals for an unknown goal number.
        ValueError: if ': Game #' does not occur in the page.
    """
    page_contents = page_contents.replace('<br/>', '<br>')

    # Look at the 50 characters on either side of the game-number marker.
    # The start is clamped so that a marker near the beginning of the page
    # cannot turn into a negative (wrap-around) slice index.
    version_loc = page_contents.index(': Game #')
    context_start = max(0, version_loc - 50)
    version_with_context = page_contents[context_start:version_loc + 50]
    version = int('Gathering Storm' in version_with_context)

    using_goals = 'Available goals' in version_with_context
    goals = set()
    if using_goals:
        end_goal_loc = page_contents.find('Chosen Actions')
        goals.update(ParseGoals(page_contents[context_start:end_goal_loc]))

    game_num = GAME_NUM_RE.search(version_with_context).group(1)
    ret = {'player_list': [],
           'game_id': 'http://genie.game-host.org/game.htm?gid=' + game_num}

    if 'Status:' not in page_contents:
        print('could not find status')
        raise CompleteButRejectedGame()

    # Ignore everything from 'Comments' onwards.  When the marker is absent
    # the page is kept whole (slicing with find()'s -1 would silently drop
    # the final character).
    comments_loc = page_contents.find('Comments')
    if comments_loc != -1:
        page_contents = page_contents[:comments_loc]

    for status in page_contents.split('Status:')[1:]:
        status_lines = status.split('<br>')
        if len(status_lines) < 3:
            # Only a warning: a short block that is not 'Game End' still
            # takes the early-exit path below.
            print('confused, could not under status lines %s'
                  % (status_lines,))

        game_end_line = status_lines[0]
        if 'Game End' not in game_end_line:
            if 'Frozen' in game_end_line:
                raise CompleteButRejectedGame()
            # The result dict stores the game under 'game_id'; there is no
            # 'game_no' key.
            print("Don't think game %s is over" % ret['game_id'])
            return None

        name_points = status_lines[1]
        player_result = {
            'name': name_points[:name_points.find("'")],
            'points': int(POINTS_RE.search(name_points).group(1)),
            'chips': int(CHIPS_RE.search(name_points).group(1)),
        }
        if player_result['points'] <= 5:
            raise CompleteButRejectedGame()

        card_goal_list_imgs = status_lines[2].split('src=')
        # Each chunk carrying the 5px border style is counted as one good.
        goods = sum(1 for chunk in card_goal_list_imgs
                    if 'border-width:5px' in chunk)

        cards = []
        won_goals = []
        # The first chunk is the text before the first 'src=', so skip it.
        for card_or_goal in card_goal_list_imgs[1:]:
            # The image id is the second token once '/' and '.' are blanked.
            img = card_or_goal.replace('/', ' ').replace('.', ' ').split()[1]
            if img[0] == 'g':
                won_goals.append(GOAL_NUM_TO_NAME[int(img[1:])])
            else:
                card_name = card_id_to_name[img]['name']
                cards.append(card_name)
                version = max(version, DeckInfo.Version(card_name))

        player_result['cards'] = cards
        if using_goals:
            goals.update(won_goals)
            player_result['goals'] = won_goals
        player_result['goods'] = goods
        ret['player_list'].append(player_result)

    if len(ret['player_list']) <= 1:
        print('insufficient players')
        raise CompleteButRejectedGame()

    if using_goals:
        ret['goals'] = list(goals)

    # Hand sizes are paired with players in page order; zip() stops at the
    # shorter sequence, so extra players simply get no 'hand' entry.
    hand_sizes = [int(x) for x in CARD_NUM_RE.findall(page_contents)]
    for player_result, hand_size in zip(ret['player_list'], hand_sizes):
        player_result['hand'] = hand_size

    ret['advanced'] = int('Action1:' in page_contents and
                          'Action2:' in page_contents)
    # More than one known goal implies at least expansion level 1.
    ret['expansion'] = max(version, int(len(goals) > 1))
    return ret
def ReadImageIDFile():
    """Load card_names.csv and index its rows by card image id.

    The id is the first run of digits found in each row's
    'BGG img number ID' column.

    Returns:
        A dict mapping the id (a string of digits) to the csv.DictReader row
        it came from.  If several rows share an id, the last one wins.

    Raises:
        AttributeError: if a row's 'BGG img number ID' contains no digits.
    """
    cards_by_id = {}
    # The context manager closes the CSV file once every row has been read.
    with open('card_names.csv', 'r') as card_file:
        for row in csv.DictReader(card_file):
            card_id = ALL_DIGITS.search(row['BGG img number ID']).group()
            cards_by_id[card_id] = row
    return cards_by_id
def main():
    """Parse every scraped game under data/ and write the condensed results.

    Files whose name contains 'xml' are ignored and files ending in 'dead'
    are skipped.  A game that raises BogusGoalNum or CompleteButRejectedGame
    gets an empty '<file>.dead' marker written next to it.  Any other
    exception is reported and the file name is collected in the error list.
    The parsed games are written to condensed_games.json and a gzip copy is
    written to condensed_games.json.gz.
    """
    cards_by_id = ReadImageIDFile()
    games = []
    error_sources = []
    known_errors = []
    data_sources = [fn for fn in os.listdir('data') if 'xml' not in fn]

    for game_data_fn in data_sources:
        print(game_data_fn)
        if game_data_fn.endswith('dead'):
            continue
        full_game_fn = 'data/' + game_data_fn
        write_dead = False
        try:
            with open(full_game_fn, 'r') as game_file:
                page_contents = game_file.read()
            game = ParseGame(page_contents, cards_by_id)
            if game and game['player_list']:
                games.append(game)
        except BogusGoalNum:
            known_errors.append(game_data_fn)
            write_dead = True
        except CompleteButRejectedGame:
            print('Rejecting %s' % game_data_fn)
            write_dead = True
        except Exception as e:
            # Best-effort batch job: report the failure and keep going.
            error_sources.append(game_data_fn)
            print('error %s %s' % (e, game_data_fn))
        if write_dead:
            # Create (or truncate) the empty marker file and close it at once.
            with open(full_game_fn + '.dead', 'w'):
                pass

    print('games with errors %s' % (error_sources,))
    print('games with known errors %s' % (known_errors,))

    # Close the JSON file before reading it back so the data is fully on disk.
    with open('condensed_games.json', 'w') as json_file:
        json.dump(games, json_file, indent=True)
    # Read as bytes: GzipFile.write() expects binary data.
    with open('condensed_games.json', 'rb') as json_file:
        json_bytes = json_file.read()
    gz_file = gzip.GzipFile('condensed_games.json.gz', 'w')
    try:
        gz_file.write(json_bytes)
    finally:
        # Closing writes out the rest of the gzip stream.
        gz_file.close()


if __name__ == '__main__':
    main()