Commit

1.1.8 commit

shengchenyang committed Apr 14, 2023
1 parent cfb62ba commit f91d241
Showing 31 changed files with 304 additions and 176 deletions.
17 changes: 17 additions & 0 deletions .flake8
@@ -0,0 +1,17 @@
[flake8]

max-line-length = 120
ignore =
# W503: line break before binary operator
W503,
# E203: whitespace before ':'
E203

exclude =
.git,
docs,

per-file-ignores =
# E501: line too long
ayugespidertools/common/Params.py:E501
tests/test_common/test_common_Utils.py:E501
1 change: 1 addition & 0 deletions Makefile
@@ -43,6 +43,7 @@ release:
poetry publish

test:
flake8 .
coverage run -m pytest
coverage combine
coverage report
15 changes: 7 additions & 8 deletions README.md
@@ -160,17 +160,13 @@ scrapy crawl <spider_name>
## TodoList

- [x] Extended functionality scenarios for `scrapy`
- [ ] ~~Log statistics by combining `scrapy` with `crawlab`~~
- [x] Statistics on `scrapy` script run info and on the collection counts of the tables a project depends on, usable for logging and alerting
- [x] Custom templates: `ayugespidertools startproject <projname>` and `ayugespidertools genspider <spidername>` generate template files suited to this library
- [x] ~~Add fetching configuration from `nacos`~~ -> changed to fetching configuration from `consul`
- [x] Proxy middlewares (dedicated proxy, dynamic tunnel proxy)
- [x] Random request-header `UA` middleware, randomized according to the weights in `fake_useragent`
- [x] Use the following tools in place of `scrapy`'s `Request` for sending requests
    - [ ] ~~`selenium`: performance is not as good as `pyppeteer`~~
    - [x] `pyppeteer`: the `Gerapy-pyppeteer` library already implements this
    - [x] `requests`: not recommended, since the synchronous `requests` library lowers `scrapy`'s efficiency
    - [ ] ~~`splash`: integrate `splash`'s `js` rendering~~
    - [x] `aiohttp`: integrated, replacing `scrapy Request` with `aiohttp`'s coroutine approach
- [x] Adaptation for `Mysql` storage scenarios
    - [x] Automatically create the database, tables, field formats, and field comments needed in the `Mysql` user scenario
@@ -183,7 +179,10 @@ scrapy crawl <spider_name>
- [x] `sql` statement assembly, simple scenarios only for now, to be optimized later. Optimization directions and reference libraries are already noted.
- [x] `mongoDB` statement assembly
- [x] Data formatting, e.g. stripping web-page tags, removing useless whitespace, etc.
- [ ] Methods for restoring anti-crawling fonts
- [x] Methods for restoring anti-crawling fonts
    - [x] Based on the mappings in font files such as `ttf` and `woff`, possibly combined with `css`, etc.
    - [x] When the mapping can be found directly in the font file's `xml`: simply export it with the [fontforge](https://github.com/fontforge/fontforge/releases) tool (see the sketch after this list).
    - [x] When no mapping can be found, `ocr` is generally used (accuracy is not 100%): export each mapped glyph as a `png` via `fontforge`, then recognize it by whatever method works.
- [x] Convert `html` to `markdown`
- [x] `html` data processing: strip tags and invisible characters, convert special characters to their normal display form, and so on
- [x] Add handling methods for common image captchas
@@ -192,7 +191,7 @@ scrapy crawl <spider_name>
- [x] Recognize click-captcha target positions and click order; recognition quality is not great yet, to be optimized
- [x] Example method for restoring images scrambled for obfuscation
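
For the `ttf`/`woff` item above, a minimal sketch of the mapping idea. `fontTools`, the font file name, and the glyph-to-character table are assumptions here, not part of this commit; in practice the table comes from `fontforge` exports or `ocr` as described:

```python
# Minimal sketch, assuming fontTools is installed; "demo_font.ttf" and
# glyph_to_real are hypothetical.
from fontTools.ttLib import TTFont

font = TTFont("demo_font.ttf")
# getBestCmap() maps Unicode code points to glyph names, e.g. {0xE123: "uniE123"}
cmap = font.getBestCmap()

# Glyph name -> real character, built from fontforge exports or OCR results.
glyph_to_real = {"uniE123": "1", "uniE2BF": "2"}


def restore(text: str) -> str:
    """Replace obfuscated code points with their real characters."""
    return "".join(glyph_to_real.get(cmap.get(ord(ch)), ch) for ch in text)
```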

Note:
**Note:**

- No further development of `selenium`-based extensions; other libraries such as `scrapy-playwright` and `Gerapy-pyppeteer` are recommended instead;
- No further development of `splash` integration. If you only use the `splash http api`, it is easy to extend in `scrapy`, in this library, or in your own scripts; for complex features such as the `lua` `api`, an extension library like `scrapy-splash` is still recommended.
1. Since there are many open-source `scrapy` extension projects, and extending them on top of this library works exactly as it does with plain `scrapy`, this library will not re-implement features that other tools already provide and have stabilized; it will, however, keep adding commonly used, general-purpose helpers to improve efficiency.
2. The font anti-crawling part will not come with detailed worked examples: whether `fontforge`, `fontTools`, `ocr`, or other tools are used, that is beyond this library's scope. Some helper methods will be provided, but dependencies on those tool libraries will not be added, so that this library's dependency set does not become bloated. Besides, building a highly usable font mapping yourself is fairly simple; a separate `pypi` package may be created for this part later.
6 changes: 3 additions & 3 deletions ayugespidertools/DownloaderMiddlewares.py
@@ -1,5 +1,4 @@
import asyncio
import urllib.parse

import aiohttp
from scrapy.http import HtmlResponse
@@ -75,9 +74,10 @@ async def _process_request(self, request, spider):

# set proxy

# todo: the domain parameter here is unused for now; decide later whether it is needed
# set cookies domain parameter
parse_result = urllib.parse.urlsplit(request.url)
domain = parse_result.hostname
# parse_result = urllib.parse.urlsplit(request.url)
# domain = parse_result.hostname

# _timeout = self.download_timeout
# if aiohttp_meta.get('timeout') is not None:
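
For reference, the cookie-domain extraction that is now commented out above needs only the standard library; a standalone sketch with a made-up URL:

```python
# Standalone sketch of the commented-out domain extraction; the URL is illustrative.
import urllib.parse

parse_result = urllib.parse.urlsplit("https://sub.example.com/path?page=1")
domain = parse_result.hostname  # -> "sub.example.com"
```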
57 changes: 7 additions & 50 deletions ayugespidertools/ImgOperation.py
@@ -1,7 +1,6 @@
from typing import Optional, Union

import cv2
import requests
from PIL import Image

from ayugespidertools.common.Encryption import EncryptOperation
@@ -17,48 +16,6 @@ class Picture(object):
Some operations on captcha images
"""

@classmethod
def get_captcha(cls, url: str, img_path: str) -> None:
"""
Download the "perfect slider" captcha image and crop apart the combined gap/slider image
Args:
url: link to the slider image of the "perfect slider" captcha
img_path: path to save the image to
Returns:
None
"""
session = requests.Session()
session.headers = {
"authority": "captchas-1251008858.file.myqcloud.com",
"pragma": "no-cache",
"cache-control": "no-cache",
"sec-ch-ua": '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
"sec-ch-ua-mobile": "?0",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"sec-fetch-site": "none",
"sec-fetch-mode": "navigate",
"sec-fetch-user": "?1",
"sec-fetch-dest": "document",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
}
text = session.get(url).content
with open(f"{img_path}/captcha.png", "wb") as f:
f.write(text)

# Create a new blank image
# captcha = Image.new('RGB', (50, 120))
# Instantiate an Image object from the original image
img = Image.open(f"{img_path}/captcha.png")

# Crop the slider-captcha image to separate the background image from the slider
# (left, upper, right, lower)
captcha = img.crop((260, 0, 325, 120 - 4))
captcha = captcha.convert("RGBA")
captcha.save(f"{img_path}/captcha_slide.png")

@classmethod
def convert_index_to_offset(cls, index):
"""
@@ -146,17 +103,17 @@ def reset_pic(cls, slide_data):
true_pic_list: ordered coordinates of the real image
"""
c = 260
d = 120
l = 20
s = 9
a = 61
# d = 120
_l = 20
# s = 9
# a = 61
true_pic_list = []
for curr_data in slide_data:
curr_position_list = []
if curr_data < l:
curr_position_list.extend((int(c / l * curr_data), 0))
if curr_data < _l:
curr_position_list.extend((int(c / _l * curr_data), 0))
else:
curr_position_list.extend((int(c / l * (curr_data % l)), 60))
curr_position_list.extend((int(c / _l * (curr_data % _l)), 60))
true_pic_list.append(curr_position_list)
return true_pic_list

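
A worked example of the `reset_pic` coordinate mapping above: the 260 px-wide strip is treated as 20 columns of 13 px, indices below 20 land on the row at y = 0, and the rest wrap onto the row at y = 60. The import path and classmethod call are assumed from this diff:

```python
# Worked example; the import path and classmethod call are assumed from this commit.
from ayugespidertools.ImgOperation import Picture

print(Picture.reset_pic([3, 25]))
# index 3  -> int(260 / 20 * 3) = 39, row y = 0   -> [39, 0]
# index 25 -> int(260 / 20 * 5) = 65, row y = 60  -> [65, 60]
# output: [[39, 0], [65, 60]]
```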
2 changes: 1 addition & 1 deletion ayugespidertools/MongoClient.py
@@ -1,6 +1,6 @@
from typing import Dict, List, Optional

from gridfs import *
from gridfs import GridFS
from pymongo import MongoClient

__all__ = [
2 changes: 1 addition & 1 deletion ayugespidertools/MysqlClient.py
@@ -1,4 +1,4 @@
from typing import Literal, Optional
from typing import Literal

import pymysql

2 changes: 1 addition & 1 deletion ayugespidertools/Oss.py
@@ -105,7 +105,7 @@ def put_oss(
f"{self.operateDoc}/{input_file_name}.{file_format}",
put_bytes_or_url,
)
except Exception as e:
except Exception:
return False, ""
return True, input_file_name

2 changes: 1 addition & 1 deletion ayugespidertools/RPA.py
@@ -71,4 +71,4 @@ def deal_pyppeteer_suspend(cls, fn: str, line: int):

# If no scrapy stats appear in the latest four log lines, the state is normal; clear the log
elif block_times == 0:
clean_log = subprocess.getstatusoutput(f"> {fn}")
subprocess.getstatusoutput(f"> {fn}")
17 changes: 17 additions & 0 deletions ayugespidertools/common/Encryption.py
@@ -1,5 +1,6 @@
import base64
import hashlib
import re
from typing import Union

import mmh3
@@ -98,3 +99,19 @@ def mm3_hash128_encode(encode_data: str) -> str:
o = mmh3.hash128(encode_data)
hash128_encoded = hex(((o & 0xFFFFFFFFFFFFFFFF) << 64) + (o >> 64))
return hash128_encoded[2:]

@staticmethod
def uni_to_chr(uni: str) -> str:
"""
Convert a string giving a Unicode code point into the actual character; used when building font mappings
Args:
uni: the unicode string to convert,
e.g. 006A; it may be non-standard, so a leading 0x or uni prefix may need to be stripped.
Returns:
1). the converted character
"""
_uni = re.sub(r"^(0x|U\+|uni)", "", uni)
unicode_value = int(_uni, 16)
# Use chr() to convert the integer value to a character
return chr(unicode_value)
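
A quick usage sketch for the new `uni_to_chr` helper, assuming it is exposed as a staticmethod of `EncryptOperation` as the surrounding diff suggests:

```python
from ayugespidertools.common.Encryption import EncryptOperation

EncryptOperation.uni_to_chr("uni006A")  # -> "j"  ("uni" prefix stripped, 0x6A == 106)
EncryptOperation.uni_to_chr("0x4E2D")   # -> "中" ("0x" prefix stripped)
EncryptOperation.uni_to_chr("006A")     # -> "j"  (already a bare hex code point)
```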
4 changes: 2 additions & 2 deletions ayugespidertools/common/Expend.py
@@ -127,11 +127,11 @@ def _get_log_by_spider(self, spider, crawl_time):
text[k.replace("/", "_")] = v

log_info = {
"database": mysql_conf["database"],
"database": mysql_conf.database,
# spider script name
"spider_name": spider.name,
# uid
"uid": f'{mysql_conf["database"]}|{spider.name}',
"uid": f"{mysql_conf.database}|{spider.name}",
# request count
"request_counts": text.get("downloader_request_count", 0),
# response count
10 changes: 4 additions & 6 deletions ayugespidertools/common/MultiPlexing.py
@@ -66,16 +66,14 @@ def get_file_name_by_url(file_url: str) -> str:
@staticmethod
def get_files_from_path(path: str) -> list:
"""
Get all files under the path folder
Get all files under the path folder, outputting paths with path as the root directory
Args:
path: the folder path to inspect
Returns:
file_list: list of files under the path folder
"""
# Get the names of all files in the folder
files = os.listdir(path)
return [file for file in files if not os.path.isdir(path + "\\" + file)]
return [f.path for f in os.scandir(path) if f.is_file()]

@staticmethod
def get_bytes_by_file(file_path: str) -> bytes:
@@ -301,7 +299,7 @@ def judge_str_is_json(cls, judge_str: str) -> bool:

try:
json.loads(judge_str)
except Exception as e:
except Exception:
return False
else:
return True
@@ -338,7 +336,7 @@ def get_req_dict_from_scrapy(req_body_data_str: str) -> dict:
}

@staticmethod
def get_array_dimension(array: list) -> int:
def get_array_dimension(array: Union[frozenset, list, set, tuple]) -> int:
"""
Get the dimension of array
Args:
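
The reworked `get_files_from_path` above swaps `os.listdir` with a hard-coded `\\` join for `os.scandir`, whose `DirEntry.path` joins the folder and entry name portably; a standalone behaviour sketch with a hypothetical directory layout:

```python
# Standalone sketch of the reworked helper; the directory layout is hypothetical.
import os


def get_files_from_path(path: str) -> list:
    # DirEntry.path already contains the folder prefix, and is_file()
    # filters out sub-directories without a separate os.path.isdir call.
    return [f.path for f in os.scandir(path) if f.is_file()]


# e.g. for a "docs" folder containing conf.py and a _build/ sub-directory:
# get_files_from_path("docs") -> ["docs/conf.py"]
```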
25 changes: 18 additions & 7 deletions ayugespidertools/common/MysqlErrorHandle.py
@@ -48,9 +48,12 @@ def _create_table(
if demand_code != "":
tabel_notes = f"{demand_code}_{tabel_notes}"

sql = f"""CREATE TABLE IF NOT EXISTS `{table_name}` (`id` int(32) NOT NULL AUTO_INCREMENT COMMENT 'id',
PRIMARY KEY (`id`)) ENGINE=InnoDB DEFAULT CHARSET={charset}
COLLATE={collate} COMMENT='{tabel_notes}'; """
sql = f"""
CREATE TABLE IF NOT EXISTS `{table_name}`
(`id` int(32) NOT NULL AUTO_INCREMENT COMMENT 'id', PRIMARY KEY (`id`))
ENGINE=InnoDB DEFAULT CHARSET={charset} COLLATE={collate} COMMENT='{tabel_notes}';
"""

try:
# Execute the sql query and fetch the data
data = cursor.execute(sql)
@@ -80,8 +83,10 @@ def _get_column_type(
Returns:
column_type: the column's storage type
"""
sql = f"""select COLUMN_TYPE from information_schema.columns where table_schema = '{database}' and
table_name = '{table}' and COLUMN_NAME= '{column}';"""
sql = f"""
select COLUMN_TYPE from information_schema.columns
where table_schema = '{database}' and table_name = '{table}' and COLUMN_NAME= '{column}';
"""
column_type = None
try:
if _ := cursor.execute(sql):
@@ -262,7 +267,10 @@ def deal_1406_error(
cursor=cursor, database=database, table=table, column=colum
)
change_colum_type = "LONGTEXT" if column_type == "text" else "TEXT"
sql = f"""ALTER TABLE `{table}` CHANGE COLUMN `{colum}` `{colum}` {change_colum_type} NULL DEFAULT NULL COMMENT "{notes}" ;"""
sql = f"""
ALTER TABLE `{table}` CHANGE COLUMN
`{colum}` `{colum}` {change_colum_type} NULL DEFAULT NULL COMMENT "{notes}";
"""
return sql, f"1406: 更新 {colum} 字段类型为 {change_colum_type} 时失败"

def deal_1265_error(
@@ -294,7 +302,10 @@ def deal_1265_error(
cursor=cursor, database=database, table=table, column=colum
)
change_colum_type = "LONGTEXT" if column_type == "text" else "TEXT"
sql = f"""ALTER TABLE `{table}` CHANGE COLUMN `{colum}` `{colum}` {change_colum_type} NULL DEFAULT NULL COMMENT "{notes}" ;"""
sql = f"""
ALTER TABLE `{table}` CHANGE COLUMN
`{colum}` `{colum}` {change_colum_type} NULL DEFAULT NULL COMMENT "{notes}";
"""
return sql, f"1265: 更新 {colum} 字段类型为 {change_colum_type} 时失败"

@abstractmethod
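
For context, the 1406 ("Data too long") and 1265 ("Data truncated") handlers above both widen the offending column. Illustrative only, with made-up table, column, and comment values, this is the kind of statement they assemble:

```python
# Illustrative only: table, column, and comment are made up; the current
# column type is assumed to be "text", so the handler upgrades it to LONGTEXT.
table, colum, notes = "article_info", "content", "article body"
change_colum_type = "LONGTEXT"

sql = f"""
ALTER TABLE `{table}` CHANGE COLUMN
`{colum}` `{colum}` {change_colum_type} NULL DEFAULT NULL COMMENT "{notes}";
"""
# -> widens article_info.content so the failed insert can be retried
```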
3 changes: 3 additions & 0 deletions ayugespidertools/common/Params.py
@@ -2,6 +2,9 @@
import random
from typing import List, TypeVar

import pymongo
import pymysql

from ayugespidertools.Items import MongoDataItem, MysqlDataItem, ScrapyClassicItem

__all__ = [
16 changes: 15 additions & 1 deletion ayugespidertools/common/Utils.py
@@ -70,7 +70,21 @@ def get_kvs_detail_by_consul(

curr_consul_headers = copy.deepcopy(Param.consul_headers)
curr_consul_headers["X-Consul-Token"] = token
r = requests.get(url, headers=curr_consul_headers, verify=False)
try:
r = requests.get(
url,
headers=curr_consul_headers,
verify=False,
timeout=(
Param.requests_req_timeout,
Param.requests_res_timeout,
),
)
except (
requests.exceptions.ConnectionError,
requests.exceptions.ConnectTimeout,
) as e:
raise ValueError("请求 consul 超时,请检查 consul 是否正常运行!") from e
# Check whether the raw data format was requested
if "raw" in url_params:
return r.text
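
The consul lookup above now passes `requests`' `(connect timeout, read timeout)` tuple and re-raises connection failures with a clearer error; a standalone sketch, where the URL, token, and 5/10-second values are placeholders rather than the library's actual `Param` defaults:

```python
# Placeholder URL, token, and timeouts; only the request pattern mirrors the change.
import requests

try:
    r = requests.get(
        "http://127.0.0.1:8500/v1/kv/demo?raw",
        headers={"X-Consul-Token": "your-token"},
        timeout=(5, 10),  # 5 s to connect, 10 s to read the response
    )
except (requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout) as e:
    raise ValueError("consul request failed; check that consul is running") from e
```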
8 changes: 6 additions & 2 deletions ayugespidertools/scraper/middlewares/proxy/private.py
@@ -38,7 +38,11 @@ def from_crawler(cls, crawler):

@retry(stop_max_attempt_number=Param.retry_num)
def get_proxy_ip(self, size, isdict: Optional[bool] = None):
proxy_url = f"http://dps.kdlapi.com/api/getdps?orderid={self.simidaili_conf['orderid']}&num={size}&signature={self.simidaili_conf['signature']}&format=json"
proxy_url = (
"http://dps.kdlapi.com/api/getdps"
f"?orderid={self.simidaili_conf['orderid']}&num={size}"
f"&signature={self.simidaili_conf['signature']}&format=json"
)
if self.important_error:
raise ValueError("ip 获取方式有误,请重构私密代理中间件获取 ip 的模块!")

@@ -69,7 +73,7 @@ def get_proxy_ip(self, size, isdict: Optional[bool] = None):
return iplist
else:
raise ValueError("ip 获取方式有误,请重构私密代理中间件获取 ip 的模块!")
except:
except Exception:
self.important_error = True
traceback.print_exc()
raise ValueError("ip 获取方式有误,请重构私密代理中间件获取ip的模块!")
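
The proxy URL above is now built from implicitly concatenated f-string pieces to stay within the line-length limit; an equivalent construction with `urllib.parse.urlencode` would also work, where the order id and signature below are placeholders:

```python
# Equivalent construction of the kdlapi URL; orderid/signature are placeholders.
from urllib.parse import urlencode

simidaili_conf = {"orderid": "your-order-id", "signature": "your-signature"}
size = 5

query = urlencode(
    {
        "orderid": simidaili_conf["orderid"],
        "num": size,
        "signature": simidaili_conf["signature"],
        "format": "json",
    }
)
proxy_url = f"http://dps.kdlapi.com/api/getdps?{query}"
```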