# -*- coding:utf-8 -*-
# @Time : 2021/9/29 10:32
# @Author: 应无所住,何生其心
# @File : xxx.py
# @Software : PyCharm
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # set of URLs not yet crawled
        self.old_urls = set()  # set of URLs already crawled

    def has_new_url(self):
        '''
        Check whether there are any uncrawled URLs left
        :return:
        '''
        return self.new_url_size() != 0

    def get_new_url(self):
        '''
        Get one uncrawled URL and move it to the crawled set
        :return:
        '''
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        '''
        Add a new URL to the set of uncrawled URLs
        :param url: a single URL
        :return:
        '''
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        '''
        Add new URLs to the set of uncrawled URLs
        :param urls: an iterable of URLs
        :return:
        '''
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def new_url_size(self):
        '''
        Get the number of uncrawled URLs
        :return:
        '''
        return len(self.new_urls)

    def old_url_size(self):
        '''
        Get the number of crawled URLs
        :return:
        '''
        return len(self.old_urls)


import requests


class HtmlDownloader(object):
    def download(self, url):
        '''
        Download the page at url and return its text, or None on failure
        '''
        if url is None:
            return None
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None


import re
# import urlparse  # Python 2
from urllib import parse
from bs4 import BeautifulSoup


class HtmlParser(object):
    def parser(self, page_url, html_cont):
        '''
        Parse the page content and extract new URLs and data
        :param page_url: URL of the downloaded page
        :param html_cont: downloaded page content
        :return: new URLs and data
        '''
        if page_url is None or html_cont is None:
            return
        # html_cont is already decoded text, so no from_encoding argument is needed
        soup = BeautifulSoup(html_cont, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        '''
        Extract the set of new URLs
        :param page_url: URL of the downloaded page
        :param soup: soup
        :return: set of new URLs
        '''
        new_urls = set()
        # extract the matching <a> tags
        # original code from the book:
        # links = soup.find_all('a', href=re.compile(r'/view/\d+\.htm'))
        # updated 2017-07-03 because the Baidu Baike link format changed
        links = soup.find_all('a', href=re.compile(r'/item/.*'))
        for link in links:
            # get the href attribute
            new_url = link['href']
            # join it into an absolute URL
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        '''
        Extract the useful data
        :param page_url: URL of the downloaded page
        :param soup:
        :return: the extracted data
        '''
        data = {}
        data['url'] = page_url
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        data['title'] = title.get_text()
        summary = soup.find('div', class_='lemma-summary')
        # get_text() returns all text in the tag and its descendants as a Unicode string
        data['summary'] = summary.get_text()
        return data


import codecs


class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = codecs.open('baike.html', 'w', encoding='utf-8')
        fout.write("<html>")
        fout.write("<head><meta charset='utf-8'/></head>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()


# In the book's multi-file layout these classes live in separate modules;
# in this single-file version they are defined above, so the imports are not needed.
# from URLManager import UrlManager
# from HtmlDownloader import HtmlDownloader
# from HtmlParser import HtmlParser
# from DataOutput import DataOutput


class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # add the seed URL
        self.manager.add_new_url(root_url)
        # keep going while there are new URLs and fewer than 100 pages have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # extract URLs and data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # store the extracted data
                self.output.store_data(data)
                print('%s links crawled so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed:', e)
        # write the stored data out in the target format
        self.output.output_html()


if __name__ == '__main__':
    spider_man = SpiderMan()
    spider_man.crawl('http://baike.baidu.com/view/284853.htm')