#!/usr/bin/env python3
'''Scrape Dutch news sites to make a large diachronic corpus'''
import argparse
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup
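
# Usage sketch (assumes the 'FILL THIS IN' base URLs in scrape_newspaper below
# have been replaced with the real Trouw/Volkskrant archive URLs):
#   python3 scrape_news.py trouw -y 2005    # one year of Trouw
#   python3 scrape_news.py volkskrant       # all years, 1994-2016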

def get_dates_from_archive(newspaper, archive_url, year):
    '''Scrapes the archive overview page for a given year and newspaper; returns all dates for which an archive is available'''
    dates = []
    year_url = archive_url + year
    year_page = requests.get(year_url, cookies=cookiewall_cookie)
    year_soup = BeautifulSoup(year_page.content, 'html.parser')
    for link in year_soup.find_all('a'):
        try:
            if newspaper == 'trouw':
                if re.search('archiveDay=', link['href']):
                    dates.append(link['href'][-8:])  # Date is the last 8 characters of the link
            if newspaper == 'volkskrant':
                if re.search('archief/detail/', link['href']):
                    dates.append(link['href'][-9:-1])
        except KeyError:  # When a link has no 'href', just skip it
            pass
    return dates
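
# A minimal usage sketch (the URL here is hypothetical; the real base URLs are
# configured in scrape_newspaper below):
#   dates = get_dates_from_archive('trouw', 'http://www.trouw.nl/archief/', '2005')
#   # -> a list of 8-character date strings sliced from the archive links
#   #    (format assumed to be YYYYMMDD)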

def get_articles_for_date_trouw(base_date_url, date):
    '''Scrapes the archive page for a certain date for Trouw, returns urls for all articles of that date'''
    article_urls = []
    date_page = requests.get(base_date_url + date, cookies=cookiewall_cookie)
    date_soup = BeautifulSoup(date_page.content, 'html.parser')
    for link in date_soup.find_all('a'):
        try:
            # Article links sit in a <dd> inside a <dl> inside <div class="articleOverview">
            if link.parent.parent.parent.name == 'div' and link.parent.parent.parent['class'][0] == 'articleOverview':
                if link.parent.parent.name == 'dl':
                    if link.parent.name == 'dd':
                        article_url = link['href']
                        if re.match('http://www.trouw.nl/tr/nl/.*/archief/article/detail/', article_url):
                            article_urls.append(article_url)
        except AttributeError:
            pass
    return article_urls
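
# Roughly equivalent and arguably more readable, assuming BeautifulSoup's
# CSS-selector support:
#   date_soup.select('div.articleOverview > dl > dd > a')
# (Not an exact match: the checks above compare only the first class on the
# div, while a CSS class selector matches any of its classes.)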

def get_articles_for_date_volkskrant(base_date_url, date):
    '''Scrapes the archive page for a certain date for Volkskrant, returns urls for all articles of that date'''
    article_urls = []
    date_page = requests.get(base_date_url + date)
    date_soup = BeautifulSoup(date_page.content, 'html.parser')
    next_page = True
    while next_page:
        next_page = False
        for link in date_soup.find_all('a'):
            if link.parent.name == 'article':
                article_urls.append(link['href'])
        next_links = date_soup.find_all('a', class_='pager--next')
        if next_links:  # If there is a next page, fetch and parse it
            next_page = True
            date_page = requests.get(next_links[0]['href'])
            date_soup = BeautifulSoup(date_page.content, 'html.parser')
    return article_urls
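
# The loop above is a straightforward follow-the-pager pattern: collect every
# link whose direct parent is an <article> tag, then, as long as a
# 'pager--next' link is present, fetch the next page and repeat. Duplicate
# URLs that appear on multiple pages are removed later, in scrape_newspaper,
# via list(set(article_urls)).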

def get_text_from_article(newspaper, article_url):
    '''Scrapes the article page of a given newspaper to get all the content text, returns that text'''
    article_text = ''
    try:
        article_page = requests.get(article_url, cookies=cookiewall_cookie)
    except requests.exceptions.TooManyRedirects:  # Sometimes the article page just redirects, then skip it
        return ''
    except requests.exceptions.ConnectionError:  # Unstable connection? Wait for a bit, then skip it
        time.sleep(5)
        return ''
    article_soup = BeautifulSoup(article_page.content, 'html.parser')
    # Newspaper-specific header elements
    headers = []
    if newspaper == 'trouw':
        headers = article_soup.find_all('h1', id='articleDetailTitle')
    if newspaper == 'volkskrant':
        headers = article_soup.find_all('h1', class_='article__title')
    # Scrape header text
    for header in headers:
        article_text += header.text.strip() + '\n'
    # Scrape text body
    for paragraph in article_soup.find_all('p'):
        try:
            if paragraph.parent.name == 'div':
                is_correct_parent = False
                # Newspaper-specific parent containers
                if newspaper == 'trouw':
                    is_correct_parent = re.match('art_box', paragraph.parent['id'])
                if newspaper == 'volkskrant':
                    is_correct_parent = re.match('article__intro', paragraph.parent['class'][0]) or re.match('article__body', paragraph.parent['class'][0])
                if is_correct_parent:
                    if paragraph.text:
                        text = re.sub('\n+', '\n', paragraph.text)  # Remove extra newlines and spaces
                        text = re.sub(' +', ' ', text)
                        article_text += text.strip() + '\n'
        except KeyError:
            pass
    return article_text
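
# For a single article this returns the headline on the first line, followed
# by the cleaned paragraph text, one paragraph per line; an empty string is
# returned when the request redirects endlessly or the connection drops.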

def scrape_newspaper(newspaper, years):
    '''Scrape article texts from the Trouw or Volkskrant archive, store them per year'''
    if newspaper == 'trouw':
        base_year_url = 'FILL THIS IN'
        base_date_url = 'FILL THIS IN'
    else:
        base_year_url = 'FILL THIS IN'
        base_date_url = 'FILL THIS IN'
    # Cycle through the years specified
    year_range = [str(year) for year in years]
    for year in year_range:
        logfile = open('working/{1}_{0}.log'.format(year, newspaper), 'w', buffering=1)  # Line-buffered, so the log can be followed while running
        # Scrape or load dates
        time0 = time.time()
        logfile.write('Scraping {1} for year {0}\n'.format(year, newspaper.capitalize()))
        fn = 'working/{1}_dates_{0}.json'.format(year, newspaper)
        if os.path.exists(fn):
            logfile.write('Loading dates from {0}\n'.format(fn))
            with open(fn, 'r') as f:
                dates = json.load(f)
        else:
            logfile.write('Scraping dates for year {0}\n'.format(year))
            dates = get_dates_from_archive(newspaper, base_year_url, year)
            with open(fn, 'w') as of:
                json.dump(dates, of)
        time1 = time.time()
        logfile.write('Found {0} dates for year {1}, took {2} seconds\n'.format(len(dates), year, time1 - time0))
        # Scrape or load article urls
        fn = 'working/{1}_article_urls_{0}.json'.format(year, newspaper)
        if os.path.exists(fn):
            logfile.write('Loading article urls from {0}\n'.format(fn))
            with open(fn, 'r') as f:
                article_urls = json.load(f)
        else:
            logfile.write('Scraping article URLs for year {0}\n'.format(year))
            article_urls = []
            for date in dates:
                if newspaper == 'trouw':
                    article_urls += get_articles_for_date_trouw(base_date_url, date)
                if newspaper == 'volkskrant':
                    article_urls += get_articles_for_date_volkskrant(base_date_url, date)
            with open(fn, 'w') as of:
                json.dump(article_urls, of)
        article_urls = list(set(article_urls))  # Remove duplicates
        time2 = time.time()
        logfile.write('Found {0} article URLs for year {1}, took {2} seconds\n'.format(len(article_urls), year, time2 - time1))
        # Scrape text from article pages
        logfile.write('Scraping article text for year {0}\n'.format(year))
        fn = 'working/{1}_{0}'.format(year, newspaper)
        with open(fn, 'w', encoding='utf-8') as of:
            prev_time = time.time()
            for idx, article_url in enumerate(article_urls):
                if idx % 100 == 0 and idx > 1:
                    logfile.write('Scraping article {0} of {1}. Previous 100 took {2} seconds.\n'.format(idx, len(article_urls), time.time() - prev_time))
                    prev_time = time.time()
                of.write(get_text_from_article(newspaper, article_url))
        time3 = time.time()
        logfile.write('Done! Took {0} seconds\n'.format(time3 - time2))
        logfile.close()
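
# Per year, a run leaves four files behind in working/:
#   working/{newspaper}_{year}.log               -- progress log (line-buffered)
#   working/{newspaper}_dates_{year}.json        -- cached archive dates
#   working/{newspaper}_article_urls_{year}.json -- cached article URLs
#   working/{newspaper}_{year}                   -- the scraped article text
# The JSON files double as a resume mechanism: if they exist, they are loaded
# instead of re-scraped. Note that working/ must already exist; open() does
# not create directories.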

if __name__ == '__main__':
    # Parse arguments
    parser = argparse.ArgumentParser(description='Arguments for newspaper scraping')
    parser.add_argument('newspaper', metavar='trouw|volkskrant', type=str, help='Specify whether to scrape Trouw or Volkskrant')
    parser.add_argument('-y', '--year', metavar='YEAR', type=str, help='Specify which year of Trouw (1994-2016) or Volkskrant (1994-2016) to scrape. If this argument is not given, all years will be scraped')
    args = parser.parse_args()
    # Get years to scrape
    if args.year:
        years = [args.year]
    else:
        years = range(1994, 2017)
    # Define the cookie needed to get past the cookie wall while scraping
    cookiewall_cookie = {'nl_cookiewall_version': '1'}
    if args.newspaper.lower() == 'trouw':
        scrape_newspaper('trouw', years)
    elif args.newspaper.lower() in ('volkskrant', 'vk'):
        scrape_newspaper('volkskrant', years)
    else:
        print('{0} is not a valid option for scraping'.format(args.newspaper))