parser.py
# -*- coding: utf-8 -*-
import gzip
import cStringIO
import json
import time
import urllib
import urllib2
from bs4 import BeautifulSoup
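# Overview: getQuestions() pages through Zhihu's question log
# (http://www.zhihu.com/log/questions), getArticle() scrapes each question's
# title, top answer, and vote count, and craw() drives the loop and appends
# the results to zhihu.txt as tab-separated rows.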
def getPage(url):
    # Fetch a page, throttling to one request per second; return None on error.
    try:
        time.sleep(1)
        web = urllib.urlopen(url)
        page = web.read()
        web.close()
        return page
    except IOError:
        return None
def formatStr(s):
    # Collapse newlines and tabs to spaces so each record stays on one line.
    s = s.replace('\n', ' ').replace('\t', ' ')
    return s.strip()
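# e.g. formatStr(u'  line one\n\tline two ') -> u'line one  line two'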
def getArticle(url):
    # Fetch a question page and pull out the title, top answer, and vote count.
    page = getPage(url)
    if page is None:
        return None
    pageSoup = BeautifulSoup(page)
    title = str(pageSoup.title).replace('<title>', '').replace('</title>', '').strip()
    item = pageSoup.find_all('div', {'class': 'zm-item-answer'})
    if not item:
        return None
    answer = item[0].find('div', {'class': 'fixed-summary zm-editable-content clearfix'}).get_text().strip()
    # The trailing space in the class name matches the page markup.
    vote = item[0].find('div', {'class': 'zm-item-vote-info '}).get('data-votecount').strip()
    answer = formatStr(answer)
    ans_len = len(answer)
    if ans_len > 100:
        answer = answer[0:100]  # keep at most the first 100 characters
    title = formatStr(title)
    out = [title, answer, str(ans_len), vote, url]
    return out
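# Usage sketch (assumes the 2014-era Zhihu question markup; <qid> stands for
# any numeric question id):
#   row = getArticle('http://www.zhihu.com/question/<qid>')
#   if row:
#       print '\t'.join(row)  # title, answer snippet, length, votes, url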
def getQuestions(start, offset='20'):
    # POST to the question-log endpoint; return (question ids, last log id).
    #cookies = urllib2.HTTPCookieProcessor()
    #opener = urllib2.build_opener(cookies)
    #urllib2.install_opener(opener)
    header = {"Accept": "*/*",
              "Accept-Encoding": "gbk,utf-8,gzip,deflate,sdch",
              "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
              "Connection": "keep-alive",
              # Content-Length is computed from the request body by urllib2.
              "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
              'Cookie': '*************',
              "Host": "www.zhihu.com",
              "Origin": "http://www.zhihu.com",
              "Referer": "http://www.zhihu.com/log/questions",
              "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36",
              "X-Requested-With": "XMLHttpRequest"
              }
    parms = {'start': start,
             'offset': offset,
             '_xsrf': '*************'}
    url = 'http://www.zhihu.com/log/questions'
    req = urllib2.Request(url, headers=header, data=urllib.urlencode(parms))
    content = urllib2.urlopen(req).read()
    # The response comes back gzip-compressed; decompress it before parsing.
    html = gzip.GzipFile(fileobj=cStringIO.StringIO(content)).read()
    # json.loads replaces the original eval(): same result, without executing
    # the response as Python code. msg[1] holds the rendered HTML fragment.
    html = json.loads(html)['msg'][1]
    pageSoup = BeautifulSoup(html)
    questions = []
    items = pageSoup.find_all('div', {'class': 'zm-item'})
    for item in items:
        # The question id is the last path segment of the question link.
        qid = item.find_all('a', {'target': '_blank'})[0].get('href').rsplit('/', 1)[1]
        questions.append(qid)
    lastId = items[-1].get('id').split('-')[1]
    return questions, lastId
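# Pagination sketch: each call returns one log page of question ids plus the
# id of its last entry, which is fed back as `start` to fetch the next page:
#   ids, lastId = getQuestions('389059437')
#   more, lastId = getQuestions(lastId)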
def craw():
    # Walk up to 10000 log pages, appending one tab-separated row per question.
    wf = open('zhihu.txt', 'a+')
    domain = 'http://www.zhihu.com/question/'
    #lastId = '404659437'
    lastId = '389059437'
    for i in xrange(10000):
        print i, lastId
        ques, lastId = getQuestions(lastId)
        for q in ques:
            try:
                out = getArticle(domain + q)
            except Exception:
                continue
            if out is None:
                continue
            for j in xrange(len(out)):
                each = out[j]
                if j == 1:
                    # The answer snippet is unicode; encode it before writing.
                    each = each.encode('utf-8')
                wf.write(each)
                wf.write('\t')
            wf.write('\n')
    wf.close()
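# Each line of zhihu.txt holds five tab-separated fields: title, answer
# snippet (truncated to 100 characters), answer length, vote count, and
# question url.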
if __name__ == '__main__':
    craw()