#!/usr/bin/env python
# encoding: utf-8
"""
@version: v1.0
@author: xag
@license: Apache Licence
@contact: [email protected]
@site: http://www.xingag.top
@software: PyCharm
@file: spider_qiu_shi_bai_ke.py
@time: 2018/9/21 23:16
@description: Use regular expressions to scrape the text-only posts from Qiushibaike (糗事百科)
@link: https://www.qiushibaike.com/text/
"""
import re
import requests

# Base URL to scrape; %s is filled in with the page number
base_url = 'https://www.qiushibaike.com/text/page/%s/'

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Referer': 'https://www.qiushibaike.com/'
}


def spider_page(url):
    """
    Scrape the jokes from a single page.
    :param url: full URL of the page to scrape
    :return: list of dicts, each with an 'author' and a 'content' key
    """
    response = requests.get(url, headers=HEADERS)
    text_raw = response.text

    # Extract the jokes on this page
    # 1. Extract the list of authors
    authors_pre = re.findall(r'<div\sclass="article.*?<h2>(.*?)</h2>', text_raw, re.DOTALL)

    # 1.1 Clean up the author names (the raw matches contain \n)
    authors = []
    for author_pre in authors_pre:
        author = re.sub(r'\n', '', author_pre)
        authors.append(author)

    # 2. Extract the list of joke texts
    contents_pre = re.findall(r'<div\sclass="content">.*?<span>(.*?)</span>', text_raw, re.S)

    # 2.1 Clean up the joke texts (the raw matches contain \n and <br/> tags)
    contents = []
    for content_pre in contents_pre:
        content = re.sub(r'<.*?>|\n', '', content_pre)
        contents.append(content)

    # 3. Pair each author with the corresponding joke text
    jokes = []
    for author, content in zip(authors, contents):
        jokes.append({
            'author': author,
            'content': content
        })

    # 4. Return the jokes scraped from this page
    return jokes
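

# A minimal, hypothetical sketch (not part of the original script): it shows what
# the two regular expressions used in spider_page() capture on a made-up HTML
# fragment that mirrors the page structure those patterns assume. It is never
# called by the script; run it manually if you want to see the output.
def _regex_demo():
    sample = (
        '<div class="article block untagged"><h2>\nsome_author\n</h2></div>'
        '<div class="content"><span>line one<br/>line two</span></div>'
    )
    # Captures the text between the <h2> tags, newlines included
    print(re.findall(r'<div\sclass="article.*?<h2>(.*?)</h2>', sample, re.DOTALL))
    # -> ['\nsome_author\n']
    # Captures the text between the <span> tags, <br/> tags included
    print(re.findall(r'<div\sclass="content">.*?<span>(.*?)</span>', sample, re.S))
    # -> ['line one<br/>line two']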


def spider():
    jokes = []
    for page_num in range(1, 10):
        print('Scraping page %s ...' % page_num)
        # Scrape one page and add its jokes to the overall list
        # (extend, not append, so jokes stays a flat list of dicts)
        jokes.extend(spider_page(base_url % page_num))

    # Print the scraped data
    for joke in jokes:
        print(joke)

    print('Done! All pages scraped.')


if __name__ == '__main__':
    spider()