scrape_incident_logs.py
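"""Download daily incident-log PDFs from the MU Police blotter pages.

A short summary of what the script below does: it fetches each calendar page
listed in calendar_urls, extracts the links to the daily PDF reports, and
saves each report into a local pdfs/ directory (assumed to already exist) as
YYYY-MM-DD.pdf, skipping any file that has already been downloaded.
"""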
import requests
from bs4 import BeautifulSoup
from time import sleep
import os.path
import re

def extract_pdf_urls(url, html):
    """Yield the full urls of the pdf reports linked from a calendar page.

    url  -- url of the page the html came from (needed to form each full pdf_url)
    html -- html content from which the pdf_urls will be extracted
    """
    # create a BeautifulSoup object from the response content, html5lib is the parser
    soup = BeautifulSoup(html, 'html5lib')
    # iterate over each "td" tag whose bgcolor is one of the two calendar colors;
    # sometimes (but not always) the bgcolor values include a leading '#', hence the regex
    for td in soup.find_all('td', attrs={'bgcolor': re.compile(r"#?(CAE1C1|F0FFC6)")}):
        # look for an "a" tag in the td; find() returns None when there isn't one
        link = td.find("a")
        if link is not None:
            # combine the page url with the href attribute to form the full pdf_url
            yield url + link['href']
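# For example (the filename here is hypothetical), a cell linking to '010514.pdf'
# on the 2014 archive page would yield
# 'http://mupolice.missouri.edu/blotter/archive/2014/010514.pdf'.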
# calendar pages that link to the daily pdf reports
calendar_urls = [
    'http://mupolice.missouri.edu/blotter/',
    'http://mupolice.missouri.edu/blotter/archive/2014/',
    'http://mupolice.missouri.edu/blotter/archive/2013/',
]
# collect all the links to the pdfs
pdf_urls = []
# set up a requests session
with requests.session() as requests_session:
    for cal_url in calendar_urls:
        r = None
        # until we have a response...
        while r is None:
            # try requesting the calendar page
            try:
                r = requests_session.get(cal_url)
            except requests.exceptions.RequestException:
                print("Request failed.")
                # if the request fails, reset the session before trying again
                requests_session = requests.session()
        # extract the full pdf urls from the page and add them to the global list
        pdf_urls.extend(extract_pdf_urls(cal_url, r.content))

# iterate over the collected pdf_urls
for pdf_url in pdf_urls:
    # split the pdf_url at '/' and take the last item, the date-stamped filename
    date = pdf_url.split('/')[-1]
    month = date[0:2]
    day = date[2:4]
    year = '20{}'.format(date[4:6])
    # store the files in the "pdfs/" directory, name format "YYYY-MM-DD.pdf"
    file_name = 'pdfs/{0}-{1}-{2}.pdf'.format(year, month, day)
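    # e.g. a (hypothetical) filename '010514.pdf' becomes 'pdfs/2014-01-05.pdf'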
    # skip the download if we already have the file
    if not os.path.isfile(file_name):
        print('  Downloading to {}'.format(file_name))
        # pause three seconds between requests to be polite to the server
        sleep(3)
        r = None
        # until we have a response...
        while r is None:
            # try requesting the pdf (a plain request, outside the session above)
            try:
                r = requests.get(pdf_url)
            except requests.exceptions.RequestException:
                # if the request fails, report it and try again
                print("Request failed.")
        # open the file in binary mode...
        with open(file_name, 'wb') as f:
            # ...and write the raw bytes of the pdf to disk
            f.write(r.content)
print('fin.')