-
-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
淘宝免cookie策略,可实现阿里全系自主平台爬取
- Loading branch information
卜俊杰
committed
Feb 27, 2020
1 parent
fb207f4
commit 5835247
Showing
16 changed files
with
1,674 additions
and
19 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# 说明 | ||
|
||
## 进度说明 | ||
|
||
- 本程序设计思路对阿里系自主平台(如:淘宝 taobao、天猫 tmall、闲鱼、菜鸟裹裹等)均有效,此处提供 淘宝 taobao 的程序 | ||
|
||
- 已完成 | ||
|
||
1. 整体框架设计 | ||
2. 搜索页面 csv 存储 | ||
3. 多线程 | ||
|
||
- 未完成 | ||
1. 详情页面 csv 存储 | ||
2. 搜索页面 mysql 存储 | ||
3. 详情页面 mysql 存储 | ||
4. 搜索页面与详情页面同时爬取, mysql + redis 存储 | ||
|
||
## 使用方法 | ||
|
||
1. 在 `config.py` 文件中根据需要配置 | ||
2. 运行 `python3 main.py` | ||
|
||
## 思路 | ||
|
||
1. 生产-消费 模式 | ||
2. 各功能单独建文件 | ||
3. 多线程 | ||
4. 数据库: csv \ redis \ mysql | ||
|
||
## 阿里系自主平台(非收购)cookie 自动配置策略 | ||
|
||
1. 第一次无 cookie 请求,返回 cookie | ||
2. 从返回的 cookie 提取 token 并计算 sign(token, timestamp, appKey, data),拼接新的 url | ||
3. 第二次带返回的 cookie 请求 url,得到结果 | ||
|
||
- 注: | ||
1. cookie、token 有效期为 30 天,sign 有效期为 1 小时 | ||
2. 理论上:只要一个小时更换一次时间戳、重新计算一次 sign 即可,不断重复第二次请求 | ||
3. 实践中:一小时更换一次有被反爬虫风险;可用 30 秒隧道代理,每次都重复第一步生成新 cookie(效率极高,时间可忽略),理论上无反爬虫风险 | ||
4. 程序中对第一次请求 url 固定(不影响程序),若以后能从 js 文件中看懂其生成机制,则可改为每次自动生成 | ||
|
||
## taobao 入口 | ||
|
||
http://uland.taobao.com/sem/tbsearch?keyword=XXX | ||
|
||
把最后的 XXX 换成您要搜索的内容即可 | ||
|
||
(用以第一步请求,得到真正的请求地址,程序中已经配置,不用管) | ||
|
||
## Tmall 入口 | ||
|
||
http://www.tmall.com/ | ||
|
||
(用以第一步请求,得到真正的请求地址,程序中已经配置,不用管) | ||
|
||
## mysql 表结构 | ||
|
||
## 关于作者 | ||
|
||
本人从事 `大数据`、`数据分析` 工作,欢迎各位大牛叨扰~ | ||
|
||
- github : [https://github.com/SoliDeoGloria31](https://github.com/SoliDeoGloria31) | ||
|
||
- 码云 Gitee : [https://gitee.com/joseph31](https://gitee.com/joseph31) | ||
|
||
- 微信 : mortaltiger | ||
|
||
<img src="https://gitee.com/joseph31/picture_bed/raw/master/mortaltiger.jpg" width="15%"> | ||
|
||
- 个人公众号: JosephNest(Joseph 的小窝) | ||
经常测试新功能导致服务器不稳定,可能会出故障, 实现`自动推荐系统`、`自动回复功能`、`需求留言功能`、`人工智能集成(图片识别)`、`其他功能定制` | ||
|
||
<img src="https://gitee.com/joseph31/picture_bed/raw/master/JosephNest.jpg" width="15%"> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# Crawler configuration. main.py pulls every public name in via
# ``from config import *``; names starting with an underscore are not exported.
import os as _os

###########################################
# Search settings

# Search keyword.
str_searchContent = 'iPhone Xs'
# Number of results per page.
num_pageSize = 100
# Crawl from page 1 up to this page. 1~100 is recommended; beyond that the
# results match the keyword poorly.
num_page = 2
# Alibaba service number. Fixed value -- do not change!
appKey = '12574478'

###########################################
# Crawl settings

# Number of threads.
threads_num_get_pages = 1  # threads fetching search pages, must be 1
threads_num_get_comments = 3  # threads fetching comment pages; 0 disables them

###########################################
# Storage
switch_save = 0  # local csv storage
# switch_save = 1  # mysql storage
# switch_save = 2  # mysql + redis storage

# redis
redis_host = '127.0.0.1'
redis_port = 6379

# mysql
mysql_host = '127.0.0.1'
mysql_port = 3306
mysql_user = 'root'
# NOTE(review): credentials should not live in source control. The original
# values are kept as defaults for backward compatibility; prefer setting the
# environment variable and rotating the committed secret.
mysql_passwd = _os.environ.get('TAOBAO_MYSQL_PASSWD', '123456')
mysql_db = 'taobao'
mysql_charset = 'utf8'

###########################################
# Proxy settings
# Tunnel proxy server.
_tunnel_host = "tps189.kdlapi.com"
_tunnel_port = "15818"

# Tunnel user name / password (same remark as for mysql_passwd above).
_tid = _os.environ.get('TAOBAO_TUNNEL_TID', "t17888082960619")
_password = _os.environ.get('TAOBAO_TUNNEL_PASSWORD', "gid72p4o")

# The tunnel is a plain HTTP proxy, so BOTH entries use an ``http://`` proxy
# URL: the dict key selects the scheme of the *target* site, the URL scheme
# selects how to talk to the *proxy*. An ``https://`` proxy URL makes recent
# urllib3 versions attempt a TLS handshake with the proxy itself.
_proxy_url = "http://%s:%s@%s:%s/" % (_tid, _password, _tunnel_host, _tunnel_port)
proxies = {
    "http": _proxy_url,
    "https": _proxy_url,
}

###########################################
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
# encoding: utf-8 | ||
|
||
from config import * | ||
import requests | ||
import hashlib | ||
import time | ||
from urllib.parse import quote | ||
import threading | ||
|
||
|
||
class TaoBao:
    """Cookie-free crawler for the Taobao search API (h5 mtop gateway).

    Flow for each result page:
      1. ``first_requests`` hits the API without a cookie; the response sets
         the ``_m_h5_tk`` / ``_m_h5_tk_enc`` cookies, from which the token is
         extracted.
      2. ``second_requests`` signs ``token & timestamp & appKey & data`` with
         MD5, repeats the request with the cookie and stores the returned
         items according to ``switch_save``.

    Note: constructing an instance starts the crawl immediately, because
    ``__init__`` ends by calling ``run()``.
    """

    # Seconds to wait for the server before giving up on a request; without a
    # timeout a stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, str_searchContent, num_pageSize, num_page, appKey, threads_num_get_pages, threads_num_get_comments, switch_save, proxies):
        """Store the configuration and run the crawl.

        Args:
            str_searchContent: search keyword.
            num_pageSize: number of results per page.
            num_page: last page to crawl (pages 1..num_page are fetched).
            appKey: Alibaba service number used in the URL and the signature.
            threads_num_get_pages: number of search-page threads (expected 1).
            threads_num_get_comments: number of comment-page threads; 0
                disables them.
            switch_save: storage backend (0 csv, 1 mysql, 2 mysql + redis).
            proxies: proxy mapping; stored but not used by the requests below.
        """
        self.str_searchContent = str_searchContent
        self.num_pageSize = num_pageSize
        self.num_page = num_page
        self.appKey = appKey
        self.threads_num_get_pages = threads_num_get_pages
        self.threads_num_get_comments = threads_num_get_comments
        self.switch_save = switch_save
        self.proxies = proxies
        self.cookie = ''
        self.token = ''
        self.file_name = ''
        # itemIds collected from search pages, consumed by the comment threads.
        self.L_itemId = []

        self.run()

    def first_requests(self):
        """First request, sent without a cookie, to obtain cookie and token.

        On success ``self.token`` and ``self.cookie`` are updated; on any
        failure the error is printed and the previous values are kept.
        """
        # The URL is fixed (its timestamp/sign are stale on purpose): the
        # response still carries fresh cookies, which is all that is needed.
        base_url = 'https://h5api.m.taobao.com/h5/mtop.alimama.union.sem.landing.pc.items/1.0/?jsv=2.4.0&appKey=12574478&t=1582738149318&sign=fe2cf689bdac8258a1d12507a06bd289&api=mtop.alimama.union.sem.landing.pc.items&v=1.0&AntiCreep=true&dataType=jsonp&type=jsonp&ecode=0&callback=mtopjsonp1&data=%7B%22keyword%22%3A%22%E8%8B%B9%E6%9E%9C%E6%89%8B%E6%9C%BA%22%2C%22ppath%22%3A%22%22%2C%22loc%22%3A%22%22%2C%22minPrice%22%3A%22%22%2C%22maxPrice%22%3A%22%22%2C%22ismall%22%3A%22%22%2C%22ship%22%3A%22%22%2C%22itemAssurance%22%3A%22%22%2C%22exchange7%22%3A%22%22%2C%22custAssurance%22%3A%22%22%2C%22b%22%3A%22%22%2C%22clk1%22%3A%22%22%2C%22pvoff%22%3A%22%22%2C%22pageSize%22%3A%22100%22%2C%22page%22%3A%22%22%2C%22elemtid%22%3A%221%22%2C%22refpid%22%3A%22%22%2C%22pid%22%3A%22430673_1006%22%2C%22featureNames%22%3A%22spGoldMedal%2CdsrDescribe%2CdsrDescribeGap%2CdsrService%2CdsrServiceGap%2CdsrDeliver%2C%20dsrDeliverGap%22%2C%22ac%22%3A%22%22%2C%22wangwangid%22%3A%22%22%2C%22catId%22%3A%22%22%7D'
        try:
            with requests.get(base_url, timeout=self.REQUEST_TIMEOUT) as response:
                get_cookies = requests.utils.dict_from_cookiejar(
                    response.cookies)
                _m_h5_tk = get_cookies['_m_h5_tk']
                _m_h5_tk_enc = get_cookies['_m_h5_tk_enc']
                # The cookie value is "<token>_<suffix>"; the token is the
                # part before the first underscore.
                self.token = _m_h5_tk.split('_')[0]
                self.cookie = '_m_h5_tk={}; _m_h5_tk_enc={}'.format(
                    _m_h5_tk, _m_h5_tk_enc)
        except Exception as e:
            print('first_requests 出错: ', e)

    def sign(self, token, tme, appKey, data):
        """Return the MD5 hex digest of ``token&tme&appKey&data``.

        All four arguments are strings; ``data`` is the raw (not URL-quoted)
        JSON payload.
        """
        st = token+"&"+tme+"&"+appKey+"&"+data
        return hashlib.md5(st.encode(encoding='utf-8')).hexdigest()

    @staticmethod
    def _timestamp_ms():
        """Return the current Unix time in milliseconds as a string.

        Slicing ``str(time.time())`` is not reliable: the float prints with a
        variable number of decimals, so fewer than 13 digits can come out.
        """
        return str(int(time.time() * 1000))

    def _build_search_data(self, page):
        """Return the JSON payload (as a string) for one search page.

        The exact text matters because it is what gets signed, so the payload
        is assembled by hand instead of serialising a dict.
        """
        import json

        # json.dumps escapes quotes/backslashes in the keyword so the payload
        # stays valid JSON; ensure_ascii=False keeps non-ASCII text verbatim.
        searchContent = json.dumps(self.str_searchContent, ensure_ascii=False)
        pageSize = '"{}"'.format(self.num_pageSize)  # results per page
        page_str = '"{}"'.format(page)  # page number
        return ('{"keyword":'+searchContent+',"ppath":"","loc":"","minPrice":"","maxPrice":"","ismall":"","ship":"","itemAssurance":"","exchange7":"","custAssurance":"","b":"","clk1":"","pvoff":"","pageSize":'+pageSize+',"page":'
                + page_str+',"elemtid":"1","refpid":"","pid":"430673_1006","featureNames":"spGoldMedal,dsrDescribe,dsrDescribeGap,dsrService,dsrServiceGap,dsrDeliver, dsrDeliverGap","ac":"","wangwangid":"","catId":""}')

    @staticmethod
    def _stringify_bools(value):
        """Recursively replace booleans by the strings 'true' / 'false'.

        Keeps the stored values identical to what earlier versions produced.
        """
        if isinstance(value, bool):
            return 'true' if value else 'false'
        if isinstance(value, list):
            return [TaoBao._stringify_bools(v) for v in value]
        if isinstance(value, dict):
            return {k: TaoBao._stringify_bools(v) for k, v in value.items()}
        return value

    @staticmethod
    def _parse_main_items(html):
        """Extract the ``mainItems`` list from the JSONP response text.

        Raises:
            ValueError: if the extracted fragment is not valid JSON (for
                example when the response carries no ``mainItems``).
        """
        import json

        res_str = html.split('"mainItems":')[-1].split('},"ret":')[0]
        # The response is untrusted network data: parse it as JSON, never
        # with eval().
        return TaoBao._stringify_bools(json.loads(res_str))

    def second_requests(self, page=None):
        """Second request, sent with the cookie; fetch one page and store it.

        Args:
            page: page number to fetch. Defaults to ``self.num_page`` so that
                calls without an argument behave as before.
        """
        if page is None:
            page = self.num_page

        str_data = self._build_search_data(page)
        # safe='' percent-encodes everything except unreserved characters.
        data = quote(str_data, safe='')

        tme = self._timestamp_ms()

        # The signature covers the raw payload, the URL carries the quoted one.
        sgn = self.sign(self.token, tme, self.appKey, str_data)

        url = 'https://h5api.m.taobao.com/h5/mtop.alimama.union.sem.landing.pc.items/1.0/?jsv=2.4.0&appKey={}&t={}&sign={}&api=mtop.alimama.union.sem.landing.pc.items&v=1.0&AntiCreep=true&dataType=jsonp&type=jsonp&ecode=0&callback=mtopjsonp2&data={}'.format(
            self.appKey, tme, sgn, data)

        headers = {'cookie': self.cookie}  # proxies are not used here
        try:
            with requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT) as res:
                html = res.text

            res_list = self._parse_main_items(html)
            if self.switch_save == 0:
                self.switch_save_0(res_list)
            elif self.switch_save == 1:
                self.switch_save_1(res_list)
            elif self.switch_save == 2:
                self.switch_save_2(res_list)
            else:
                print('config.py 文件中存储部分设置有误!')
        except Exception as e:
            print('second_requests 出错: ', e)

    def switch_save_0(self, res_list):
        """Append the items to the csv file and queue their itemIds."""
        from save_csv import save_csv
        csv_file_name = self.file_name+'.csv'
        # save_csv returns every itemId of the page; they are appended to
        # L_itemId for the comment threads.
        self.L_itemId += save_csv(res_list, csv_file_name)
        print('\n爬取项目数目: ', len(self.L_itemId))

    def switch_save_1(self, res_list):
        """Store the items through ``save_mysql``."""
        from save_mysql import save_mysql
        save_mysql(res_list)

    def switch_save_2(self, res_list):
        """Store the items through ``save_mysql_redis``."""
        from save_mysql_redis import save_mysql_redis
        save_mysql_redis(res_list)

    def get_search_page(self):
        """Thread target: crawl search pages 1..num_page in order."""
        print('搜索页面 线程启动: ', threading.current_thread().name)
        for i in range(1, self.num_page+1):
            # A fresh cookie is fetched for every page; the frequency can be
            # tuned here.
            self.first_requests()
            self.second_requests(i)
            print('完成第 {} 页爬取\n====================\n'.format(i))

    def get_comments_page(self):
        """Thread target: consume queued itemIds until the queue stays empty."""
        print('评论页面 线程启动: ', threading.current_thread().name)
        time.sleep(5)
        # After three consecutive failed attempts (empty queue or an error in
        # get_comments) all data is considered crawled.
        n = 3
        while n != 0:
            try:
                itemId = self.L_itemId.pop(0)
                self.get_comments(itemId)
                n = 3
            except Exception:
                n -= 1
                time.sleep(5)

    def get_comments(self, itemId):
        """Fetch the comment page of one item (not implemented yet)."""
        pass

    def run(self):
        """Build the output file name, then start and join all worker threads."""
        self.file_name = '搜索页面'+'_'+self.str_searchContent+'_' + str(self.num_pageSize)+'_' + str(
            self.num_page)+'_'+time.strftime("%Y-%m-%d_%H_%M_%S", time.localtime())

        threads = []
        # Exactly one thread crawls the search pages, whatever the setting.
        if self.threads_num_get_pages != 1:
            print('请在 config.py 文件中 设置 threads_num_get_pages = 1')
        threads.append(threading.Thread(target=self.get_search_page, args=()))

        # Extra threads crawl the detail (comment) pages.
        for _ in range(self.threads_num_get_comments or 0):
            threads.append(threading.Thread(
                target=self.get_comments_page, args=()))

        # Start every thread, then wait for all of them.
        for t in threads:
            t.start()

        for t in threads:
            t.join()
            print('关闭线程: ', t.name)

        print('主线程结束!', threading.current_thread().name)
|
||
|
||
if __name__ == "__main__":
    # Creating the instance is enough to start the crawl: TaoBao.__init__
    # finishes by calling run(). All values come from config.py.
    TaoBao(
        str_searchContent=str_searchContent,
        num_pageSize=num_pageSize,
        num_page=num_page,
        appKey=appKey,
        threads_num_get_pages=threads_num_get_pages,
        threads_num_get_comments=threads_num_get_comments,
        switch_save=switch_save,
        proxies=proxies,
    )
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import csv | ||
import os | ||
|
||
|
||
def save_csv(res_list, csv_file_name):
    """Append search results to ``./csv/<csv_file_name>``.

    The ``./csv/`` directory and the file (with a header row) are created on
    first use. The file is opened once per call rather than once per row.

    Args:
        res_list: list of item dicts; each must contain every column listed
            in ``header`` below.
        csv_file_name: name of the csv file inside ``./csv/``.

    Returns:
        The ``itemId`` of every row written, in input order.

    Raises:
        KeyError: if an item lacks one of the columns.
    """
    # One list drives both the header row and the per-item field lookup.
    header = ["dsrDeliver", "dsrDeliverGap", "dsrDescribe", "dsrDescribeGap", "dsrService", "dsrServiceGap", "imgUrl", "ismall",
              "itemId", "loc", "price", "promoPrice", "redkeys", "sellCount", "sellerPayPostfee", "spGoldMedal", "title", "wangwangId"]
    img_col = header.index("imgUrl")

    path = './csv/'
    # Create the output directory if it is missing.
    if not os.path.exists(path):
        os.makedirs(path)
        print(path, ' 文件夹创建成功')
    file_name = path+csv_file_name
    # A header row is only needed when the file does not exist yet.
    need_header = not os.path.exists(file_name)

    L_itemId = []
    with open(file_name, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if need_header:
            writer.writerow(header)
        for item in res_list:
            row = [item[key] for key in header]
            # imgUrl is stored with the scheme prepended.
            row[img_col] = 'https:'+item["imgUrl"]
            writer.writerow(row)
            L_itemId.append(item["itemId"])

    return L_itemId
Oops, something went wrong.