-
Notifications
You must be signed in to change notification settings - Fork 0
/
congress_scraper.py
112 lines (103 loc) · 4.42 KB
/
congress_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from bs4 import BeautifulSoup
import urllib2
import re
import socket
import time
import lxml.html
import itertools
from cp_namer import proper_name
import parseattempt as pa
class Bill(object):
    """A single piece of legislation: its sponsor, congress number, date,
    cosponsors, title, summary and (optionally) its latest major action."""

    def __init__(self, sponsor, congress_num, date, cosponsors=None, title='', summary=''):
        # BUGFIX: the original used a mutable default (cosponsors=[]), which is
        # shared across every Bill built without an explicit list -- one bill's
        # cosponsors leaked into all the others. Use a None sentinel instead.
        self.sponsor = sponsor
        self.congress_num = congress_num
        self.date = date
        self.cosponsors = [] if cosponsors is None else cosponsors
        self.title = title
        self.summary = summary

    def addCosponsor(self, cosponsor):
        # Record one additional cosponsor name for this bill.
        self.cosponsors.append(cosponsor)

    def addAction(self, action):
        # Record the "Latest Major Action" text scraped for this bill.
        self.action = action
#def partisan_score(congress_person):
def plus_dot(name):
    """Return *name* with a "." inserted after its first three characters
    (e.g. "SenSmith" -> "Sen.Smith")."""
    prefix, rest = name[:3], name[3:]
    return prefix + "." + rest
# Mapping of bills to senators that own them
bill_dict = {}
bill_list = []
congress_people = set([])
timeout = 30
socket.setdefaulttimeout(timeout)  # cap every network operation at 30 seconds
printing = True  # Make false if you want to suppress output.
# congress.gov member listing (URL-encoded JSON query), 500 members per page;
# the page number is appended by the helper.
base_url = 'https://beta.congress.gov/members?q=%7B%22congress%22%3A%22all%22%7D&pageSize=500&page='
# Dictionary of congress people built by the parseattempt helper module.
congress_people_dictionary = pa.produce_congress_dict(base_url)
print congress_people_dictionary
# THOMAS browse page: appending a congress number yields that congress's bill index.
base_url = "http://thomas.loc.gov/home/Browse.php?n=bills&c="
# currently on the 113th congress, so last param should be 114
congress_range = range(101, 102)  # NOTE(review): currently only the 101st congress
urllist = [str(base_url + str(congress_num)) for congress_num in congress_range]
for i in range(len(urllist)):
directory_page = pa.url_to_page_contents(urllist[i])
senate_regex = '\/cgi-bin\/query\/L\?c' + str(congress_range[i]) + \
'\:\.\/list\/c' + str(congress_range[i]) + 's.lst:\d*'
house_regex = '\/cgi-bin\/query\/L\?c' + str(congress_range[i]) + \
'\:\.\/list\/c' + str(congress_range[i]) + 'h.lst:\d*'
senate_groups = re.findall(senate_regex, directory_page)
house_groups = re.findall(house_regex, directory_page)
base_next_level_url = "http://thomas.loc.gov"
senate_bill_table_urls = [base_next_level_url + a for a in senate_groups]
house_bill_table_urls = [base_next_level_url + a for a in house_groups]
for senate_url in senate_bill_table_urls[0:1]:
if printing: print "*** ABOUT TO VISIT: " + senate_url + " ***\n"
# Visit the page of bills 101-200 for example
senate_numbered_page = pa.url_to_page_contents(senate_url)
final_regex_s_nums = '\/cgi-bin\/query\/z\?c' + str(congress_range[i]) + \
':S\.(\d*)\.\S*:'
senate_bill_nums = re.findall(final_regex_s_nums, senate_numbered_page)
senate_bill_nums = ["%05.0f" % int(senate_num) for senate_num in senate_bill_nums]
senate_final_urls = []
for number in senate_bill_nums:
senate_final_urls.append('http://thomas.loc.gov/cgi-bin/bdquery/z?d' \
+ str(congress_range[i]) + ':SN'+ str(number) +':@@@L&summ2=m&')
for url_ind in range(len(senate_final_urls[0:14])):
if printing: print "Bill url is: ", senate_final_urls[url_ind]
# Explore the all information individual bill page
bill_page = pa.url_to_page_contents(senate_final_urls[url_ind])
bill_soup = BeautifulSoup(bill_page)
title = bill_soup.findAll(attrs={"name":"dc.title"})[0]
title = str(title).split("\"")[1]
summary = bill_soup.findAll(attrs={"name":"dc.description"})[0]
summary = str(summary).split("\"")[1]
date = bill_soup.findAll(attrs={"name":"dc.date"})[0]
date = str(date).split("\"")[1]
if printing: print "Bill title is: ", title
if printing: print "Bill date is: ", date
if printing: print "Bill summary is:", summary
sponsor_index = 0
cosponsor_index = 0
for aref in bill_soup.find_all('a'):
match_spons = re.search("FLD003",str(aref))
match_cospons = re.search("FLD004",str(aref))
if match_spons and sponsor_index == 0:
if printing: print "Bill number ", senate_bill_nums[url_ind], \
"sponsored by: ", aref.string
sponsor_index += 1
# now that we have a sponsor, we can use the Bill defualt constructor
new_bill = Bill(str(aref.string), congress_range[i], date, [],
title, summary)
bill_list.append(new_bill)
if match_cospons:
if printing: print "Bill number ", senate_bill_nums[url_ind], "cosponsored by: ", aref.string
cosponsor_index += 1
new_bill.addCosponsor(str(aref.string))
for action in bill_soup.find_all('b'):
if re.search("Latest Major Action:",str(action)):
new_bill.addAction(action.nextSibling)
for house_url in house_bill_table_urls:
if printing: print "*** ABOUT TO VISIT: " + house_url + " ***"
for bill in bill_list:
try:
if printing: print congress_people_dictionary[plus_dot(bill.sponsor)]
except:
#suck one
pass