-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlobbyists.py~
44 lines (27 loc) · 1.47 KB
/
lobbyists.py~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/python
import urllib
import xlrd
#source_uri = '''http://ec.europa.eu/transparencyregister/public/consultation/statistics.do?action=getLobbyistsExcel&fileType=XLS_NEW'''
#data = urllib.urlretrieve(source_uri)
#xlrd doesn't like strings, only file names. so it's either use urllib2 and StringIO or urllib.urlretrieve
book = xlrd.open_workbook('./full_export_new.xls')
#remember urlretrieve returns an HTTPMessage
sn = (book.sheet_names())[0]
#get the sheet name
sheet = book.sheet_by_name(sn)
#open the sheet. cor, xlrd is picky.
column_names = sheet.row_values(0)
#this is how you get the headers. as I say, picky
client_lobbyist_mapping = {}
for row_number in range(sheet.nrows): #nrows returns count of rows
r = sheet.row_values(row_number) #address each row by its index number
row = dict(zip(column_names,r)) #make a dict with the column names
if row['Clients']:
if u'No clients' not in row['Clients']:
for client in (row['Clients']).split(','):
if client != 'S.A.': #French and Spanish people put commas in front of this
client_lobbyist_mapping[(client)] = row['(Organisation) name']
for k, v in client_lobbyist_mapping.items():
print 'client: ', k, ' ', 'lobbyist: ', v
#we should now have a mapping of clients to lobbyists in a Python dict.
#this will break if the EComm changes the column names. doing it the other way by index would break if they changed the order of #columns, but then both are certain to happen at least once so pick your poison.