-
Notifications
You must be signed in to change notification settings - Fork 0
/
text.py
109 lines (96 loc) · 3.7 KB
/
text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
class IMooc(object):
    """Scraper for course pages on www.mooc.cn.

    Index pages are read from ``INDEX_URL + <page number>``; every course
    id found there is appended to ``baseUrl`` to fetch the course page,
    from which the title, brief and time are extracted with regular
    expressions and written to a text file.
    """

    # Browser-like User-Agent sent with every request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    # Prefix of the paginated course listing; the page number is appended.
    INDEX_URL = 'http://www.mooc.cn/tag/classic/page/'

    # Patterns are compiled once; re.S lets '.' span line breaks.
    TITLE_RE = re.compile('<title>(.*?)</title>', re.S)
    BRIEF_RE = re.compile('<div class="course-excerpt">.*?>(.*?)</p>', re.S)
    TIME_RE = re.compile('<div class="coursetime">(.*?)</div>', re.S)
    PING_RE = re.compile('<em class="join-strong-gw join-strong-bg">(.*?)</em>', re.S)
    VIEWS_ID_RE = re.compile('<h1 class="courselist-title".*?<a target="_blank" href="http://www.mooc.cn/course/(.*?)"', re.S)

    def __init__(self, baseUrl):
        """Store ``baseUrl``, the prefix to which course ids are appended."""
        self.baseUrl = baseUrl

    def _fetch(self, url):
        """Return the raw response body of ``url``, or None on URLError."""
        # The original built the headers but never attached them to the
        # request, so the User-Agent was silently dropped.
        request = urllib2.Request(url, headers=self.HEADERS)
        try:
            response = urllib2.urlopen(request)
            try:
                return response.read()
            finally:
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print(u"连接mooc失败,错误原因 %s" % (e.reason,))
            return None

    @staticmethod
    def _firstMatch(pattern, page):
        """Return the stripped first group of ``pattern`` in ``page``.

        Returns None when ``page`` is None (a failed fetch) or when the
        pattern does not match.
        """
        if page is None:
            return None
        found = pattern.search(page)
        return found.group(1).strip() if found else None

    def getPage(self, viewId):
        """Fetch the course page for ``viewId``.

        Returns the undecoded response body, or None if the request fails.
        """
        return self._fetch(self.baseUrl + str(viewId))

    def getContent(self, pageIndex):
        """Fetch index page number ``pageIndex``.

        Returns the body decoded as UTF-8, or None if the request fails.
        """
        body = self._fetch(self.INDEX_URL + str(pageIndex))
        if body is None:
            return None
        return body.decode('utf-8')

    def getTitle(self, page):
        """Return the text of the ``<title>`` element, or None."""
        return self._firstMatch(self.TITLE_RE, page)

    def getBrief(self, page):
        """Return the first paragraph of the course excerpt, or None."""
        return self._firstMatch(self.BRIEF_RE, page)

    def getTime(self, page):
        """Return the content of the ``coursetime`` div, or None."""
        return self._firstMatch(self.TIME_RE, page)

    def getping(self, page):
        """Return the content of the ``join-strong-gw`` em element, or None."""
        return self._firstMatch(self.PING_RE, page)

    def getViewsId(self, page):
        """Return every course id linked from an index page.

        Returns an empty list when ``page`` is None (a failed fetch).
        """
        if page is None:
            return []
        return self.VIEWS_ID_RE.findall(page)

    def _formatCourse(self, page):
        """Build the text written for one course page.

        A field that cannot be extracted is written as an empty value
        instead of aborting the whole run.
        """
        title = self.getTitle(page) or ''
        brief = self.getBrief(page) or ''
        courseTime = self.getTime(page) or ''
        return ('\n' + '课程题目:' + title
                + '\n' + '课程介绍:' + brief
                + '\n' + courseTime)

    def start(self, firstPage=1, lastPage=2, outputPath="imooc.txt"):
        """Crawl the index pages and write each course to ``outputPath``.

        firstPage, lastPage -- inclusive range of index pages to crawl.
        outputPath -- file to (over)write with the extracted text.

        Index pages or course pages that cannot be fetched are skipped.
        An IOError raised while crawling or writing is reported, not
        raised; the completion message is printed either way.
        """
        out = open(outputPath, "w+")
        try:
            for pageIndex in range(firstPage, lastPage + 1):
                indexPage = self.getContent(pageIndex)
                for viewId in self.getViewsId(indexPage):
                    page = self.getPage(viewId)
                    if page is None:
                        continue
                    out.write(self._formatCourse(page))
        except IOError as e:
            # str(e) carries errno/strerror; e.message is usually empty.
            print("写入异常,原因" + str(e))
        finally:
            out.close()
            print("写入任务完成")
# Script entry: build a scraper rooted at the course-page URL prefix and
# run the crawl as soon as the module is executed.
baseUrl = "http://www.mooc.cn/course/"
imooc = IMooc(baseUrl=baseUrl)
imooc.start()