bis_speeches.py
import re

import scraperwiki
from BeautifulSoup import BeautifulSoup
base_url = 'http://www.bis.gov.uk'
starturl = 'http://www.bis.gov.uk/news/speeches?pp=50'


def strip_tags(value):
    "Return the given HTML with all tags stripped."
    return re.sub(r'<[^>]*?>', '', value)
def parse_speech(url):
    """Scrape a single speech page and save the record to the datastore."""
    data = {}
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    content = soup.find('div', {'id': 'mainColumn'})
    title = content.h1.text
    # Drop the "By" prefix from the byline paragraph
    author = content.find('p', {'class': 'detail'}).text[2:]
    position = content.p.nextSibling.text
    date_and_place = content.find('p', {'class': 'detail alternate'})
    date = date_and_place.text.partition(',')[0]
    place = date_and_place.text.partition(',')[2]
    # Keep only bare single-letter tags (p, b, i, ...) for the speech body
    body = content.findAll(lambda tag: len(tag.name) == 1 and not tag.attrs)
    # Re-join the body, otherwise it shows up with array cruft in the JSON
    body = ''.join(str(tag) for tag in body)
    data['title'] = title
    data['body'] = body
    data['minister_name'] = author
    data['minister_position'] = position
    data['date'] = date
    data['where'] = place
    data['source_url'] = url
    data['department'] = 'Business, Innovation and Skills'
    print "Save: " + str(data)
    # "title" and "source_url" together form the unique key for the row
    scraperwiki.sqlite.save(["title", "source_url"], data)


def scrape_list(soup):
    """Scrape every speech linked from a listing page."""
    content = soup.find('ul', {'id': 'listing'})
    for item in content.findAll('li'):
        url = base_url + item.h2.a['href']
        print 'Process: ' + url
        parse_speech(url)


def scrape_next_link(url):
    """Scrape one listing page, then follow its 'next' link recursively."""
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    scrape_list(soup)
    next_link = soup.find("li", {"class": "next"})
    if next_link:
        scrape_next_link(base_url + next_link.a['href'])


scrape_next_link(starturl)
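
# A minimal sketch of reading the saved rows back, assuming the ScraperWiki
# Classic Python 2 library: sqlite.save() writes to the default "swdata"
# table, so the speeches can be queried with sqlite.select(). The column
# names match the keys saved in parse_speech() above; "limit 5" is purely
# illustrative.
for row in scraperwiki.sqlite.select("* from swdata limit 5"):
    print row['date'] + ': ' + row['title']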