-
Notifications
You must be signed in to change notification settings - Fork 22
/
scrape.py
85 lines (68 loc) · 2.91 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/python
# taken from
# http://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
import datetime
import time
import os
import urllib
import utils
# Command-line parser supplied by the project-local utils module; presumably
# it accepts the date-range options later checked with utils.includes_day --
# confirm against utils.
parser = utils.incremental_date_range_cmd_line_parser()
# if the size of the game log is less than this assume we got an error page
# (the value is compared against byte counts: os.stat().st_size of a saved
# bundle and len() of freshly downloaded contents)
SMALL_FILE_SIZE = 5000
# Work from inside static/scrape_data so that the relative per-day paths built
# by the rest of the script resolve there. utils.ensure_exists presumably
# creates the directory when it is missing -- confirm against utils.
utils.ensure_exists('static/scrape_data')
os.chdir('static/scrape_data')
# maybe I should just adopt the isotropic format for consistency?
# Both templates are filled in with %-formatting from a mapping that provides
# the named integer fields year, month and day.
ISOTROPIC_FORMAT = '%(year)d%(month)02d/%(day)02d/all.tar.bz2'
COUNCILROOM_FORMAT = '%(year)d%(month)02d%(day)02d/%(year)d%(month)02d%(day)02d.all.tar.bz2'
def FormatDate(fmt, date):
    """Fill a %-style template with the year, month and day of a date.

    Args:
        fmt: Template string using the named fields ``%(year)d``,
            ``%(month)d`` and ``%(day)d``; fields may be repeated or omitted.
        date: Object exposing integer ``year``, ``month`` and ``day``
            attributes, for example a ``datetime.date``.

    Returns:
        The template with the date fields substituted.
    """
    # Read the fields from the ``date`` argument. The previous body ignored it
    # and used the global loop variable ``cur_date``, which only worked while
    # the main loop happened to be running with the same value.
    return fmt % {'year': date.year, 'month': date.month, 'day': date.day}
def IsotropicGamesCollectionUrl(cur_date):
    """Return the dominion.isotropic.org URL of the game bundle for cur_date.

    The path below the host is ISOTROPIC_FORMAT rendered by FormatDate.
    """
    relative_path = FormatDate(ISOTROPIC_FORMAT, cur_date)
    return 'http://dominion.isotropic.org/gamelog/' + relative_path
def CouncilroomGamesCollectionUrl(cur_date):
    """Return the councilroom.com URL of the game bundle for cur_date.

    The path below the host is COUNCILROOM_FORMAT rendered by FormatDate.
    """
    relative_path = FormatDate(COUNCILROOM_FORMAT, cur_date)
    return 'http://councilroom.com/static/scrape_data/' + relative_path
def RemoveSmallFileIfExists(fn):
    """Delete the file at path fn when it exists and is small.

    A file is small when its size in bytes is at most SMALL_FILE_SIZE.
    Missing files and larger files are left untouched.
    """
    if not os.path.exists(fn):
        return
    if os.stat(fn).st_size > SMALL_FILE_SIZE:
        return
    # A single parenthesised argument prints the same space-separated line as
    # the two-item print statement did.
    print(' '.join(('removing small existing file', fn)))
    os.unlink(fn)
def _log(*parts):
    """Print parts separated by single spaces, like a print statement."""
    print(' '.join(str(part) for part in parts))


def _fetch(url):
    """Return the full response body served at url, closing the connection."""
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


# Walk every day from the first scraped date up to today, download the day's
# game bundle when no sufficiently large copy is stored yet, and unpack it
# into a directory named after the date (YYYYMMDD).
# NOTE(review): block nesting was reconstructed from unindented text; confirm
# that the sleep and the extraction belong only to the download path.
args = parser.parse_args()

for cur_date in utils.daterange(datetime.date(2010, 10, 15),
                                datetime.date.today()):
    str_date = time.strftime("%Y%m%d", cur_date.timetuple())
    if not utils.includes_day(args, str_date):
        _log('skipping', str_date, 'because not in cmd line arg daterange')
        continue

    directory = str_date
    _log(str_date)
    games_short_name = str_date + '.all.tar.bz2'
    saved_games_bundle = directory + '/' + games_short_name

    if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE):
        _log('skipping because exists', str_date, saved_games_bundle,
             'and not small (size=', os.stat(saved_games_bundle).st_size, ')')
        continue

    if not os.path.exists(directory):
        os.mkdir(directory)
    RemoveSmallFileIfExists(saved_games_bundle)

    # Try each mirror in turn and stop at the first one that serves a bundle
    # larger than an error page.
    urls_by_priority = [CouncilroomGamesCollectionUrl(cur_date),
                        IsotropicGamesCollectionUrl(cur_date)]
    for url in urls_by_priority:
        _log('getting', saved_games_bundle, 'at', url)
        contents = _fetch(url)
        if len(contents) > SMALL_FILE_SIZE:
            _log('yay, success from', url, 'no more requests for',
                 str_date, 'needed')
            # The bundle is bz2-compressed binary data: write it in binary
            # mode and close the handle deterministically.
            with open(saved_games_bundle, 'wb') as bundle_file:
                bundle_file.write(contents)
            break
        _log('request to', url, 'failed to find large file')

    # Pause between days so the remote hosts are not hammered.
    time.sleep(5)

    # Unpack inside the day's directory, then return to the data root.
    os.chdir(directory)
    cmd = 'tar -xjvf ' + games_short_name
    _log(cmd)
    os.system(cmd)
    os.system('chmod -R 755 .')
    os.chdir('..')