# scraper.py
import re
import sqlite3

import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse

# Configuration
USE_CACHE = False            # read pages from the local _cache/ folder instead of the live wiki
SHOW_PREVIEW = True
DATABASE_NAME = 'data.sqlite'
BASE_URL = 'https://make.opendata.ch'
START_PAGE = BASE_URL + '/wiki/event:home'
FILTER_EVENTS = r'.*\/event:.*'
FILTER_PROJECTS = r'.*\/project:.*'
# Matches a <span> whose title attribute starts with "2", i.e. a timestamp
DATESTAMP_REGEXP = re.compile('^2')

# Columns of the scraped dataset
fields = [
    'event',
    'event_url',
    'updated',
    'title',
    'url',
    'text',
    'html',
]
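# Note: these names are used verbatim as the column names of the "data" table
# created in run(), and the values bound to its INSERT statement below follow
# the same order.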


def run():
    # Start from a clean slate: drop and recreate the "data" table,
    # giving every column the plain "text" type
    conn = sqlite3.connect(DATABASE_NAME)
    c = conn.cursor()
    c.execute('DROP TABLE IF EXISTS data')
    fieldlist = ' text, '.join(fields) + ' text'
    c.execute('CREATE TABLE data (' + fieldlist + ')')
    conn.commit()

    # Either load previously saved pages from the local cache...
    if USE_CACHE:
        for page_count in range(0, 2):
            print("Collecting page %d" % page_count)
            with open('_cache/%d.html' % page_count, 'r') as f:
                cache_data = f.read()
            soup = BeautifulSoup(cache_data, 'html.parser')
            rows = soup.select('tbody tr')
            # save() is expected to write the cached rows to the database;
            # it is not defined in this file
            save(c, rows)
            conn.commit()
    # ...or scrape the live wiki, starting from the event index page
    else:
        print("Refreshing start page %s" % START_PAGE)
        page = requests.get(START_PAGE)
        soup = BeautifulSoup(page.content, 'html.parser')
        # Event pages are linked as event:2xxx, i.e. one page per edition
        events = soup.select('a[href*="event:2"]')
        for e in events:
            name = e.get_text()
            url = e.get('href')
            if url.endswith('event:home'):
                continue
            if not url.startswith('http'):
                url = BASE_URL + url
            print('Retrieving event: %s / %s' % (name, url))
            event_page = requests.get(url)
            event_soup = BeautifulSoup(event_page.content, 'html.parser')
            # Collect the project pages linked from this event
            projects = event_soup.select('a[href*="wiki/project:"]')
            project_list = []
            for p in projects:
                p_name = p.get_text()
                p_url = p.get('href')
                if p_url.endswith('project:home'):
                    continue
                if not p_url.startswith('http'):
                    p_url = BASE_URL + p_url
                # Skip projects that were already seen on this event page
                if p_url not in project_list:
                    project_list.append(p_url)
                    print("Downloading", p_url)
                    # Fetch the project page and extract its metadata
                    p_page = requests.get(p_url)
                    p_soup = BeautifulSoup(p_page.content, 'html.parser')
                    # Last-modified timestamp, if the page shows one
                    datestamp = p_soup.find('span', {'title': DATESTAMP_REGEXP})
                    if datestamp:
                        datestamp = parse(datestamp.get_text()).isoformat()
                    else:
                        datestamp = '?'
                    # Main page body (DokuWiki puts it in div.dw-content)
                    content = p_soup.find('div', {'class': 'dw-content'})
                    text = ''
                    if content is not None:
                        text = content.get_text()
                        # Keep the inner HTML as a string for the "html" column
                        content = content.decode_contents()
                    else:
                        content = ''
                    print(p_name, datestamp, len(text))
                    # One row per project, values in the same order as `fields`
                    c.execute(
                        'INSERT INTO data (' + ','.join(fields) + ') '
                        'VALUES (' + ','.join('?' * len(fields)) + ')',
                        [
                            name,
                            url,
                            datestamp,
                            p_name,
                            p_url,
                            text,
                            content,
                        ]
                    )
                    conn.commit()
    conn.close()


run()
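
# Example (a suggestion, not executed by the scraper): once the run has
# finished, the results can be inspected with the sqlite3 command line tool:
#
#   sqlite3 data.sqlite "SELECT event, title, updated FROM data LIMIT 5"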