-
Notifications
You must be signed in to change notification settings - Fork 0
/
HS_latlong.py
109 lines (89 loc) · 3.38 KB
/
HS_latlong.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Scrape high school lats/longs from Wikipedia and emit a JavaScript file of
# Google Maps marker data (var highschools = {...}) for a registration map.
#
# Input : a Wufoo-exported registration CSV (school name in column index 5).
# Output: OUTPUTFILE, consumed by a Google Maps page.

# Edit to your exported Wufoo form CSV
CSVDATA = "blueprint.csv"
# JS output file
OUTPUTFILE = "highschools.js"

# Dependencies
import csv
import re

import bs4
import wikipedia

# Accumulators
totalrows = 0              # rows read from the CSV (includes header rows)
errorcount = 0             # schools we failed to locate
highschools = []           # distinct raw school names, in CSV order
highschool_locations = {}  # raw name -> "lat,-long" | zip code | "City ST"
highschool_count = {}      # raw name -> number of registrants

# Compiled once, case-insensitive (the original chained explicit
# capitalization variants; IGNORECASE covers them all).
HOMESCHOOL_RE = re.compile(r'home\s?school', re.IGNORECASE)
SCHOOL_WORD_RE = re.compile(r'school|highschool|academy|college|institute',
                            re.IGNORECASE)

# --- Read registrations and tally distinct schools --------------------------
# BUG FIX: 'rb' was a Python 2 idiom; csv.reader on Python 3 needs a
# text-mode file opened with newline=''.
with open(CSVDATA, newline='') as csvfile:
    registrationreader = csv.reader(csvfile, delimiter=",", quotechar='"')
    for row in registrationreader:
        totalrows += 1
        highschoolname = row[5]
        # Skip homeschoolers -- there is no campus to geolocate.
        if HOMESCHOOL_RE.search(highschoolname):
            continue
        if highschoolname not in highschools:
            highschool_count[highschoolname] = 1
            # BUG FIX: keep the RAW name here. The original appended the
            # quoted '"X School"' query string instead, so those schools'
            # locations were keyed differently from their counts and never
            # made it into the output file. The query is now built at
            # scrape time (below).
            highschools.append(highschoolname)
        else:
            highschool_count[highschoolname] += 1

# --- Scrape each school's location from Wikipedia ---------------------------
# The first two entries come from the CSV header/preamble rows; skip them.
for highschool in highschools[2:]:
    # Names lacking a school-ish word get an explicit quoted '... School'
    # query so Wikipedia search finds the campus page.
    if SCHOOL_WORD_RE.search(highschool):
        query = highschool
    else:
        # NOTE(review): [1:-1] strips the first and last character --
        # presumably leftover quote-stripping from the export; kept as-is.
        query = '"' + highschool[1:-1] + ' School"'
    try:
        wikisearch = wikipedia.search(query)
        wikipage = wikipedia.page(wikisearch[0]).html()
    except Exception:
        continue  # no result / disambiguation / network hiccup: skip

    # Best effort, in decreasing precision.
    # 1) Decimal lat/long embedded in the page HTML.
    #    BUG FIX: the original printed longitude[1] (IndexError when fewer
    #    than two matches) and its fallback referenced `wikiparse` before
    #    assignment (NameError), so fallbacks 2 and 3 were dead code.
    latitude = re.findall(r'[3-5][0-9]\.[0-9]{3,10}', wikipage)
    longitude = re.findall(r'[7-9,1][0-9]{1,2}\.[0-9]{3,10}', wikipage)
    if latitude and longitude:
        highschool_locations[highschool] = latitude[0] + ",-" + longitude[0]
        print(highschool + " // " + latitude[0] + ", " + longitude[0])
        continue

    wikiparse = bs4.BeautifulSoup(wikipage, "html.parser")
    # 2) A space-prefixed 5-digit zip code anywhere in the page text.
    zipmatches = re.findall(r' [0-9]{5}', wikiparse.prettify())
    if zipmatches:
        zipcode = zipmatches[0][1:]
        highschool_locations[highschool] = zipcode
        print(highschool + " // " + zipcode)
        continue

    # 3) "City ST" from the infobox geo-microformat spans.
    try:
        locality = wikiparse.find('span', attrs={'class': 'locality'}).text
        region = wikiparse.find('span', attrs={'class': 'region'}).text
        highschool_locations[highschool] = locality + " " + region
        print(highschool + " // " + locality + " " + region)
    except AttributeError:  # find() returned None: span absent
        errorcount += 1
        print("Couldn't find " + highschool + " on Wikipedia.")

# --- Emit the Google Maps marker data ---------------------------------------
# Per school:
#   highschools['Name'] = { center: new google.maps.LatLng(LAT,LONG), count: N };
# NOTE(review): zip-code / "City ST" fallback values are written into
# LatLng() verbatim, as the original did -- the map page presumably
# tolerates or post-processes them; confirm downstream.
print("Scraping finished. Writing file...")
with open(OUTPUTFILE, 'w') as outputfile:
    outputfile.write("var highschools = {};\n\n")
    for highschool, count in highschool_count.items():
        location = highschool_locations.get(highschool)
        if location:
            outputfile.write(
                "highschools['" + highschool.replace("'", "")
                + "'] = { center: new google.maps.LatLng("
                + str(location) + "), count: " + str(count) + " };\n")
print("\nDone!")
print("Total registrants = " + str(totalrows))
print("Total errors = " + str(errorcount))