forked from SvanBoxel/weather_analysis_us
-
Notifications
You must be signed in to change notification settings - Fork 0
/
wunderground_scraper.py
executable file
·53 lines (38 loc) · 1.84 KB
/
wunderground_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# coding: utf-8
from datetime import datetime, timedelta
from urllib.request import urlopen
import os
def scrape_station(station):
    """
    Download the daily-history web pages from wunderground.com for a station.

    One HTML page is fetched per day and saved as
    ``<station>/<year>-<month>-<day>.html`` (month and day are not
    zero-padded), relative to the current working directory.

    You can look up your city's weather station by performing a search for
    it on wunderground.com then clicking on the "History" section.
    The 4-letter name of the station will appear on that page.

    Args:
        station: Station code (e.g. ``'KNYC'``); used both in the request
            URL and as the name of the output directory.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.
        UnicodeDecodeError: If a fetched page is not valid UTF-8.
        OSError: If the output directory or a file cannot be written.
    """
    # Scrape between July 1, 2014 and July 1, 2015 (the end date itself is
    # not fetched).
    # You can change the dates here if you prefer to scrape a different range
    current_date = datetime(year=2014, month=7, day=1)
    end_date = datetime(year=2015, month=7, day=1)

    # Make sure a directory exists for the station web pages. exist_ok lets
    # an interrupted scrape be re-run without failing on the existing folder.
    os.makedirs(station, exist_ok=True)

    # Use .format(station, YYYY, M, D)
    lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'

    # "<" rather than "!=" so an edited range whose start is already past
    # the end terminates instead of looping forever.
    while current_date < end_date:

        # Progress indicator: print once at the start of every month
        if current_date.day == 1:
            print(current_date)

        formatted_lookup_URL = lookup_URL.format(station,
                                                 current_date.year,
                                                 current_date.month,
                                                 current_date.day)
        # Close the HTTP response promptly instead of leaking one
        # connection per day fetched.
        with urlopen(formatted_lookup_URL) as response:
            html = response.read().decode('utf-8')

        out_file_name = '{}/{}-{}-{}.html'.format(station, current_date.year,
                                                  current_date.month,
                                                  current_date.day)

        # Write with the same encoding the page was decoded with, rather
        # than the platform default, so non-ASCII characters survive.
        with open(out_file_name, 'w', encoding='utf-8') as out_file:
            out_file.write(html)

        current_date += timedelta(days=1)
# Download the history pages for each of the ten stations used in this article
for station in (
    'KCLT', 'KCQT', 'KHOU', 'KIND', 'KJAX',
    'KMDW', 'KNYC', 'KPHL', 'KPHX', 'KSEA',
):
    scrape_station(station)