-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdirectory.py
194 lines (165 loc) · 5.85 KB
/
directory.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
'''
directory.py
DevX, last updated 10/19/2014
This python script goes over the current stalkernet database (as of 10/19/2014) and then scrapes the
information from each page, placing the information into a .csv file.
You will need to install the requests python library in order to use this program.
Get pip and do the following command to get the library:
$ pip install requests
Run the commands below in the terminal to run the file, and make sure this file is alone in the directory.
It creates everything it will need for operation.
$ cd path/to/this/file
$ python directory.py YEAR
where YEAR is the graduation year of the current senior class (for example, 2015).
'''
# yearless, majorless:
# [u'Michael Shin', u'Jake Reardon', u'Woosik Koong', u'Sipho Mhlanga', u'Chessy Cantrell']
import argparse
import csv
import os
import pickle
import re
import string
import urllib
from collections import defaultdict

import requests
# Global constants.
# Base URL of the campus directory search page queried by add_results().
url = 'http://apps.carleton.edu/campus/directory/'

# Regular expressions to capture data elements from HTML.
# All patterns are raw strings so that backslashes reach the regex engine
# unchanged. add_results() strips newlines from the page before matching,
# which is why '.' is enough to span a whole <li> entry.
re_person = r'<li class="person">(.*?)</li>'
re_name = r'<h2>(.*?)</h2>'
# Name wrapped in a profile link. The previous version used [...] character
# classes where groups were intended; its trailing "[</a>]*" consumed any
# final "a" characters of the name (e.g. "Mhlanga" was captured as "Mhlang").
# Headings without a profile link do not match here; data_of() falls back
# to re_name for those.
re_name_profile = r'<h2><a href="/profiles/[a-z]+[0-9]?/">(.*?)</a></h2>'
re_year = r'<span class="affiliation">(.*?)</span>'
re_major = r'<span class="major">(.*?)</span>'
re_concentration = r'<span class="concentration">(.*?)</span>'
re_dorm = r'<p class="location">.*?<a .*?>(.*?)</a>'
re_email = r'<div class="email">.*?<a .*?>(.*?)</a>'
re_phone = r'<div class="telephone">.*?<a .*?>(.*?)</a>'
re_address = r'<div class="homeAddress">(.*?)</div>'
# NOTE(review): the copy of this file that was reviewed had this pattern
# mangled by e-mail obfuscation ("[email protected]"). It is reconstructed
# here as a campus e-mail address, mirroring the username shape used in
# re_name_profile -- confirm against the upstream file.
re_emailcheck = r'[a-z]+[0-9]?@carleton\.edu'
re_status = r'<p class="status">(.*?)</p>'
# this was causing problems for me, but you said it was okay so we shall see
re_photo = r'src="/stock/ldapimage.php?\?id=(.*?)&source=campus_directory"'
def main():
    """Load or scrape the student directory, then export photos and a CSV.

    Parses the required YEAR command-line argument (graduation year of the
    current senior class). The directory is read from ``directory.pickle``
    when that file can be opened; otherwise it is scraped with
    get_directory() and cached there. get_people() then downloads photos
    into ``stalkernet_images/`` and writes ``stalkernet_data.csv``.

    Side effects: sets the module-level names ``d``, ``years`` and
    ``output_file``, which get_directory() and get_people() read.
    """
    global d, output_file, years

    parser = argparse.ArgumentParser(description='Scrape Stalkernet for metadata and pictures of all current Carleton students')
    parser.add_argument('year', metavar='YEAR', type=int, help='The graduation year of the current senior class')
    args = parser.parse_args()
    # The four class years currently enrolled, seniors first.
    years = [args.year + offset for offset in range(4)]

    try:
        # NOTE: unpickling can execute arbitrary code. Only load a cache
        # file that this script wrote itself.
        # Pickle data is binary, so the cache is opened in binary mode.
        with open('directory.pickle', 'rb') as f:
            d = pickle.load(f)
        print("Directory data loaded.")
    except IOError:
        print("Directory data not found, beginning scrape.")
        d = get_directory()
        with open('directory.pickle', 'wb') as f:
            pickle.dump(d, f)
        print("Directory data saved.")

    if not os.path.exists('stalkernet_images'):
        os.makedirs('stalkernet_images')

    # get_people() writes to the module-level output_file; make sure it is
    # closed (and flushed) even if a download or write fails part-way.
    output_file = open('stalkernet_data.csv', 'w')
    try:
        get_people(d)
    finally:
        output_file.close()
def get_people(d):
    """Download every student's photo and write one CSV row per student.

    Args:
        d: Mapping whose values are record dicts with the keys 'name',
            'major' (a list of strings), 'year', 'dorm', 'floor',
            'roomNumber', 'photo' and 'address' -- the shape returned by
            data_of().

    Rows go to the module-level ``output_file`` (opened by main()) in the
    column order name, majors, year, dorm, floor, roomNumber, photo id,
    address, followed by one empty trailing column that mirrors the
    trailing comma of the previous hand-built format. Fields are quoted by
    the csv module, so commas or quotes inside a value no longer corrupt
    the row; None values are written as empty fields.

    Photos are saved as ``stalkernet_images/<photo id>.jpg``. Records
    without a photo id are still written to the CSV but nothing is
    downloaded for them.
    """
    # urlretrieve lives in urllib.request on Python 3 and urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    writer = csv.writer(output_file, lineterminator='\n')
    for person in d.values():
        photo_id = person['photo']
        if photo_id is not None:
            urlretrieve(
                "https://apps.carleton.edu/stock/ldapimage.php?id=%s&source=campus_directory" % photo_id,
                "stalkernet_images/%s.jpg" % photo_id)
        writer.writerow([
            person['name'],
            ' / '.join(person['major']),
            person['year'],
            person['dorm'],
            person['floor'],
            person['roomNumber'],
            photo_id,
            person['address'],
            '',  # keeps the trailing empty column of the old format
        ])
def name_of(x):
    """Return the text of the first <h2> heading in ``x`` with tags removed.

    Args:
        x: HTML fragment for one person entry.

    Raises:
        AttributeError: if ``x`` contains no <h2>...</h2> heading.
    """
    heading = re.search(re_name, x)
    inner_html = heading.group(1)
    # Drop any markup (such as a profile link) nested inside the heading.
    return re.sub('<.*?>', '', inner_html)
def _first_group(pattern, text):
    """Return the first capture group of ``pattern`` in ``text``, or None."""
    match = re.search(pattern, text)
    return match.group(1) if match else None


def data_of(x):
    """Extract one student's record from the HTML of a directory entry.

    Args:
        x: HTML fragment for one person entry (newlines already removed).

    Returns:
        A dict with the keys 'name', 'major' (list of strings), 'year'
        (int or None), 'dorm', 'floor', 'roomNumber', 'photo' and
        'address', or None when the student should be skipped: the
        location/status is "Off Campus Program", "On Leave",
        "Early Finish" or missing, or the room number cannot be parsed.
        Students whose location reads as an e-mail address are recorded
        with dorm "Northfield Option", floor -1 and roomNumber "-1".
    """
    # Prefer the heading that wraps the name in a profile link; fall back
    # to the plain heading when there is no such link.
    name = _first_group(re_name_profile, x)
    if name is None:
        name = _first_group(re_name, x)

    major = re.findall(re_major, x)

    year_text = _first_group(re_year, x)
    try:
        year = int(year_text) if year_text is not None else None
    except ValueError:
        # The affiliation text is not always a number.
        year = None

    # Location first, then the status line, then give up.
    dorm = _first_group(re_dorm, x)
    if dorm is None:
        dorm = _first_group(re_status, x)
    if dorm is None:
        dorm = "Unknown"

    lives_in_dorm = True
    if re.match(re_emailcheck, dorm) is not None:
        dorm = "Northfield Option"
        lives_in_dorm = False

    address = _first_group(re_address, x)
    photo = _first_group(re_photo, x)

    if dorm in ("Off Campus Program", "On Leave", "Unknown", "Early Finish"):
        return None

    if lives_in_dorm:
        # The location is "<building name> <room>"; the first character of
        # the room is taken as the floor. Some Northfield Option students
        # are listed differently and do not fit this shape, so entries that
        # cannot be parsed are skipped.
        parts = dorm.split()
        try:
            floor = int(parts[-1][0])
        except (IndexError, ValueError):
            return None
        roomNumber = parts[-1]
        dorm = ' '.join(parts[:-1])
    else:
        floor = -1
        roomNumber = "-1"

    return {
        'name': name,
        'major': major,
        'year': year,
        'dorm': dorm,
        'floor': floor,
        'roomNumber': roomNumber,
        'photo': photo,
        'address': address,
    }
def get_directory():
    """Scrape the directory for every class year and first-name initial.

    Runs one student search per (year, letter) pair, using the
    module-level ``years`` list and the letters a-z, and collects the
    results through add_results().

    Returns:
        The dict populated by add_results().
    """
    directory = {}
    lookups = [(grad_year, letter)
               for grad_year in years
               for letter in string.ascii_lowercase]
    for grad_year, letter in lookups:
        print("Looking up students from the class of %i whose names begin with %s" % (grad_year, letter.upper()))
        query = dict(search_for='student', year=grad_year, first_name=letter)
        add_results(directory, query)
    return directory
def add_results(d, payload):
    """Fetch one directory search page and merge its people into ``d``.

    Args:
        d: Dict to update in place. Records are keyed by the display name
            returned by name_of(), so two people with the same name
            overwrite each other.
        payload: Query parameters for the directory search request.

    Raises:
        requests.exceptions.RequestException: if the request times out,
            fails, or returns an HTTP error status. Failing loudly keeps
            an error page from being treated as "no students found" and
            cached as a complete directory.
    """
    # Without a timeout a stalled connection would hang the scrape forever.
    r = requests.get(url, params=payload, timeout=60)
    r.raise_for_status()
    # The patterns rely on '.' spanning a whole entry, and '.' does not
    # match newlines, so flatten the page first.
    s = r.text.replace('\n', '')
    for x in re.findall(re_person, s):
        data = data_of(x)
        if data is not None:
            d[name_of(x)] = data
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()