scraper.py
'''
This file is the script this project uses to gather
the Anderson Cooper transcript dataset that will be
used to train cooperbot.
'''
import requests
import re
from bs4 import BeautifulSoup as bs
# Output file that will accumulate every line spoken by Cooper.
f = open("coopertext.txt", "w")

# CNN transcript URLs follow /TRANSCRIPTS/yymm/dd/acd.tt.html, so loop over
# years 2004-2016, every month and day, and transcript numbers 01-99.
for year in range(4, 17):
    for month in range(1, 13):
        for day in range(1, 32):
            for time in range(1, 100):
                y = str(year)
                m = str(month)
                d = str(day)
                t = str(time)
                # Zero-pad each component to two digits to match the URL format.
                if year < 10:
                    y = "0" + y
                if month < 10:
                    m = "0" + m
                if day < 10:
                    d = "0" + d
                if time < 10:
                    t = "0" + t
                url = "http://transcripts.cnn.com/TRANSCRIPTS/" + y + m + "/" + d + "/acd." + t + ".html"
                try:
                    webpage = requests.get(url)
                except requests.exceptions.RequestException:
                    continue
                if webpage.status_code == 404:
                    # No transcript with this number exists; move on to the next day.
                    print("whoops, 404 on " + url)
                    break
                parser = bs(webpage.content, "html.parser")
                # Transcript text lives in <p class="cnnBodyText"> elements.
                data = parser.find_all("p", {"class": "cnnBodyText"})
                for paragraph in data:
                    try:
                        text = paragraph.text
                        # Speaker tags look like "COOPER:" or "UNIDENTIFIED MALE:";
                        # collect the tags, then split the paragraph on them.
                        speakers = re.findall('[A-Za-z %s(),-]+:', text)
                        segments = re.split('[A-Za-z %s(),-]+:', text)
                        # Drop empty or whitespace-only pieces so speakers and
                        # segments line up index for index.
                        i = 0
                        while i < len(segments):
                            if segments[i].isspace() or segments[i] == '':
                                segments.remove(segments[i])
                            else:
                                i += 1
                        # Keep only the lines spoken by Cooper.
                        for i in range(len(speakers)):
                            if "COOPER" in speakers[i]:
                                f.write(segments[i] + "\n")
                                print(segments[i])
                    except Exception as e:
                        print(e)
                        print("failed to get text")
f.close()
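
To make the speaker-splitting step concrete, here is a minimal sketch of the same regex logic applied to a made-up transcript paragraph. The `extract_cooper_lines` helper and the sample text are illustrative only, not part of the script above.

def extract_cooper_lines(text):
    import re
    # Same pattern the scraper uses: a speaker tag such as "COOPER:" or
    # "UNIDENTIFIED MALE:" followed by what that speaker said.
    pattern = '[A-Za-z %s(),-]+:'
    speakers = re.findall(pattern, text)
    segments = [s for s in re.split(pattern, text) if s.strip()]
    # Pair each speaker tag with its segment and keep only Cooper's lines.
    return [seg.strip() for tag, seg in zip(speakers, segments) if "COOPER" in tag]

# Hypothetical paragraph in the style of a CNN transcript.
sample = "COOPER: Good evening. UNIDENTIFIED MALE: Thank you, Anderson. COOPER: Let's get started."
print(extract_cooper_lines(sample))
# Prints: ["Good evening.", "Let's get started."]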