-
Notifications
You must be signed in to change notification settings - Fork 6
/
oz_footballers_abroad.py
50 lines (38 loc) · 1.39 KB
/
oz_footballers_abroad.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# -*- coding: utf-8 -*-
import scraperwiki
from collections import defaultdict
def table_to_list(table):
    """Flatten an HTML table element into a list of rows.

    The table is first expanded into a (row, column) grid by
    ``table_to_2d_dict`` and the grid is then read back in index order
    by ``iter_2d_dict``.

    :param table: table element passed straight to ``table_to_2d_dict``.
    :returns: list of rows, each row being a list of cell values.
    """
    grid = table_to_2d_dict(table)
    rows = []
    # Drain the row generator so callers can index and slice the result.
    rows.extend(iter_2d_dict(grid))
    return rows
def table_to_2d_dict(table):
    """Expand a table element into a grid indexed as ``grid[row][col]``.

    Rows are taken from ``table.xpath('./tr')`` and cells from
    ``row.xpath('./td|./th')``.  A cell carrying ``colspan`` / ``rowspan``
    attributes (each defaulting to 1) has its text copied into every grid
    slot it covers, so each row ends up with one entry per visual column.

    :param table: element exposing ``xpath``; its cells must expose
        ``get`` and ``text_content`` (as lxml HTML elements do).
    :returns: ``defaultdict`` of ``defaultdict`` mapping row index ->
        column index -> cell text.
    """
    # ``unicode`` only exists on Python 2; fall back to ``str`` so that the
    # default factory below does not raise NameError on Python 3.
    try:
        text_type = unicode  # noqa: F821 - Python 2 builtin
    except NameError:
        text_type = str

    result = defaultdict(lambda: defaultdict(text_type))
    for row_i, row in enumerate(table.xpath('./tr')):
        for col_i, col in enumerate(row.xpath('./td|./th')):
            colspan = int(col.get('colspan', 1))
            rowspan = int(col.get('rowspan', 1))
            col_data = col.text_content()
            # Slide right past slots already filled by an earlier cell's
            # colspan on this row or a rowspan from a row above.  The
            # ``row_i in result`` test avoids creating an empty row entry.
            while row_i in result and col_i in result[row_i]:
                col_i += 1
            # Copy the text into every slot the cell spans.
            for i in range(row_i, row_i + rowspan):
                for j in range(col_i, col_i + colspan):
                    result[i][j] = col_data
    return result
def iter_2d_dict(dct):
    """Yield the rows of a two-level mapping as lists, in key order.

    :param dct: mapping of row key -> mapping of column key -> value.
    :returns: generator producing one list per row; rows are visited in
        ascending row-key order and values within a row in ascending
        column-key order.
    """
    for _row_key, columns in sorted(dct.items()):
        yield [value for _col_key, value in sorted(columns.items())]
import lxml.html
# Fetch the index page and parse it into an element tree.
html = scraperwiki.scrape('http://www.ozfootball.net/ark/Abroad/index.html')
doc = lxml.html.fromstring(html)

for table_el in doc.xpath('//table'):
    rows = table_to_list(table_el)
    header_rows, data_rows = rows[:1], rows[1:]
    # Column names come from the first row: surrounding whitespace is
    # trimmed and dots are removed.
    keys = [name.strip().replace('.', '')
            for header in header_rows
            for name in header]
    for cells in data_rows:
        # One record per remaining row; zip() pairs names with cells and
        # stops at the shorter of the two.
        scraperwiki.sqlite.save(['SURNAME', 'FIRST'], dict(zip(keys, cells)), verbose=0)