Commit

1.1.8 commit

shengchenyang committed Apr 14, 2023
1 parent cfb62ba commit f91d241
Showing 31 changed files with 304 additions and 176 deletions.
17 changes: 17 additions & 0 deletions .flake8
@@ -0,0 +1,17 @@
[flake8]

max-line-length = 120
ignore =
# W503: line break before binary operator
W503,
# E203: whitespace before ':'
E203

exclude =
.git,
docs,

per-file-ignores =
# E501: line too long
ayugespidertools/common/Params.py:E501
tests/test_common/test_common_Utils.py:E501
1 change: 1 addition & 0 deletions Makefile
@@ -43,6 +43,7 @@ release:
poetry publish

test:
flake8 .
coverage run -m pytest
coverage combine
coverage report
15 changes: 7 additions & 8 deletions README.md
@@ -160,17 +160,13 @@ scrapy crawl <spider_name>
## TodoList

- [x] Extended functionality scenarios for `scrapy`
- [ ] ~~Log statistics by combining `scrapy` with `crawlab`~~
- [x] Statistics on `scrapy` script run info and on the collection counts of the tables a project depends on, usable for logging and alerting
- [x] Custom templates: `ayugespidertools startproject <projname>` and `ayugespidertools genspider <spidername>` generate template files suited to this library
- [x] ~~Add fetching configuration from `nacos`~~ -> changed to fetching configuration from `consul`
- [x] Proxy middlewares (dedicated proxy, dynamic tunnel proxy)
- [x] Random request-header `UA` middleware, randomized according to the weights in `fake_useragent`
- [x] Use the following tools in place of `scrapy`'s `Request` for sending requests
    - [ ] ~~`selenium`: performance is not as good as `pyppeteer`~~
    - [x] `pyppeteer`: the `Gerapy-pyppeteer` library already implements this
    - [x] `requests`: not recommended, since the synchronous `requests` library lowers `scrapy`'s efficiency
    - [ ] ~~`splash`: integrate `splash`'s `js` rendering~~
    - [x] `aiohttp`: integrated, replacing `scrapy Request` with `aiohttp`'s coroutine approach
- [x] Adaptation for `Mysql` storage scenarios
    - [x] Automatically create the database, tables, field formats, and field comments needed in the `Mysql` user scenario
@@ -183,7 +179,10 @@ scrapy crawl <spider_name>
- [x] `sql` statement assembly, simple scenarios only for now, to be optimized later. Optimization directions and reference libraries are already noted.
- [x] `mongoDB` statement assembly
- [x] Data formatting, e.g. stripping web-page tags, removing useless whitespace, etc.
- [ ] Methods for restoring anti-crawling fonts
- [x] Methods for restoring anti-crawling fonts
    - [x] Based on the mappings in font files such as `ttf` and `woff`, possibly combined with `css`, etc.
    - [x] When the mapping can be found directly in the font file's `xml`: simply export it with the [fontforge](https://github.com/fontforge/fontforge/releases) tool (see the sketch after this list).
    - [x] When no mapping can be found, `ocr` is generally used (accuracy is not 100%): export each mapped glyph as a `png` via `fontforge`, then recognize it by whatever method works.
- [x] Convert `html` to `markdown`
- [x] `html` data processing: strip tags and invisible characters, convert special characters to their normal display form, and so on
- [x] Add handling methods for common image captchas
@@ -192,7 +191,7 @@ scrapy crawl <spider_name>
- [x] Recognize click-captcha target positions and click order; recognition quality is not great yet, to be optimized
- [x] Example method for restoring images scrambled for obfuscation
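
For the `ttf`/`woff` item above, a minimal sketch of the mapping idea. `fontTools`, the font file name, and the glyph-to-character table are assumptions here, not part of this commit; in practice the table comes from `fontforge` exports or `ocr` as described:

```python
# Minimal sketch, assuming fontTools is installed; "demo_font.ttf" and
# glyph_to_real are hypothetical.
from fontTools.ttLib import TTFont

font = TTFont("demo_font.ttf")
# getBestCmap() maps Unicode code points to glyph names, e.g. {0xE123: "uniE123"}
cmap = font.getBestCmap()

# Glyph name -> real character, built from fontforge exports or OCR results.
glyph_to_real = {"uniE123": "1", "uniE2BF": "2"}


def restore(text: str) -> str:
    """Replace obfuscated code points with their real characters."""
    return "".join(glyph_to_real.get(cmap.get(ord(ch)), ch) for ch in text)
```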

Note:
**Note:**

- No further development of `selenium`-based extensions; other libraries such as `scrapy-playwright` and `Gerapy-pyppeteer` are recommended instead;
- No further development of `splash` integration. If you only use the `splash http api`, it is easy to extend in `scrapy`, in this library, or in your own scripts; for complex features such as the `lua` `api`, an extension library like `scrapy-splash` is still recommended.
1. Since there are many open-source `scrapy` extension projects, and extending them on top of this library works exactly as it does with plain `scrapy`, this library will not re-implement features that other tools already provide and have stabilized; it will, however, keep adding commonly used, general-purpose helpers to improve efficiency.
2. The font anti-crawling part will not come with detailed worked examples: whether `fontforge`, `fontTools`, `ocr`, or other tools are used, that is beyond this library's scope. Some helper methods will be provided, but dependencies on those tool libraries will not be added, so that this library's dependency set does not become bloated. Besides, building a highly usable font mapping yourself is fairly simple; a separate `pypi` package may be created for this part later.
6 changes: 3 additions & 3 deletions ayugespidertools/DownloaderMiddlewares.py
@@ -1,5 +1,4 @@
import asyncio
import urllib.parse

import aiohttp
from scrapy.http import HtmlResponse
@@ -75,9 +74,10 @@ async def _process_request(self, request, spider):

# set proxy

# todo: the domain parameter here is unused for now; decide later whether it is needed
# set cookies domain parameter
parse_result = urllib.parse.urlsplit(request.url)
domain = parse_result.hostname
# parse_result = urllib.parse.urlsplit(request.url)
# domain = parse_result.hostname

# _timeout = self.download_timeout
# if aiohttp_meta.get('timeout') is not None:
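
For reference, the cookie-domain extraction that is now commented out above needs only the standard library; a standalone sketch with a made-up URL:

```python
# Standalone sketch of the commented-out domain extraction; the URL is illustrative.
import urllib.parse

parse_result = urllib.parse.urlsplit("https://sub.example.com/path?page=1")
domain = parse_result.hostname  # -> "sub.example.com"
```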
57 changes: 7 additions & 50 deletions ayugespidertools/ImgOperation.py
@@ -1,7 +1,6 @@
from typing import Optional, Union

import cv2
import requests
from PIL import Image

from ayugespidertools.common.Encryption import EncryptOperation
@@ -17,48 +16,6 @@ class Picture(object):
Some operations on captcha images
"""

@classmethod
def get_captcha(cls, url: str, img_path: str) -> None:
"""
Download the "perfect slider" captcha image and crop apart the combined gap/slider image
Args:
url: link to the slider image of the "perfect slider" captcha
img_path: path to save the image to
Returns:
None
"""
session = requests.Session()
session.headers = {
"authority": "captchas-1251008858.file.myqcloud.com",
"pragma": "no-cache",
"cache-control": "no-cache",
"sec-ch-ua": '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
"sec-ch-ua-mobile": "?0",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"sec-fetch-site": "none",
"sec-fetch-mode": "navigate",
"sec-fetch-user": "?1",
"sec-fetch-dest": "document",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
}
text = session.get(url).content
with open(f"{img_path}/captcha.png", "wb") as f:
f.write(text)

# Create a new blank image
# captcha = Image.new('RGB', (50, 120))
# Instantiate an Image object from the original image
img = Image.open(f"{img_path}/captcha.png")

# Crop the slider-captcha image to separate the background image from the slider
# (left, upper, right, lower)
captcha = img.crop((260, 0, 325, 120 - 4))
captcha = captcha.convert("RGBA")
captcha.save(f"{img_path}/captcha_slide.png")

@classmethod
def convert_index_to_offset(cls, index):
"""
@@ -146,17 +103,17 @@ def reset_pic(cls, slide_data):
true_pic_list: ordered coordinates of the real image
"""
c = 260
d = 120
l = 20
s = 9
a = 61
# d = 120
_l = 20
# s = 9
# a = 61
true_pic_list = []
for curr_data in slide_data:
curr_position_list = []
if curr_data < l:
curr_position_list.extend((int(c / l * curr_data), 0))
if curr_data < _l:
curr_position_list.extend((int(c / _l * curr_data), 0))
else:
curr_position_list.extend((int(c / l * (curr_data % l)), 60))
curr_position_list.extend((int(c / _l * (curr_data % _l)), 60))
true_pic_list.append(curr_position_list)
return true_pic_list

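
A worked example of the `reset_pic` coordinate mapping above: the 260 px-wide strip is treated as 20 columns of 13 px, indices below 20 land on the row at y = 0, and the rest wrap onto the row at y = 60. The import path and classmethod call are assumed from this diff:

```python
# Worked example; the import path and classmethod call are assumed from this commit.
from ayugespidertools.ImgOperation import Picture

print(Picture.reset_pic([3, 25]))
# index 3  -> int(260 / 20 * 3) = 39, row y = 0   -> [39, 0]
# index 25 -> int(260 / 20 * 5) = 65, row y = 60  -> [65, 60]
# output: [[39, 0], [65, 60]]
```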
2 changes: 1 addition & 1 deletion ayugespidertools/MongoClient.py
@@ -1,6 +1,6 @@
from typing import Dict, List, Optional

from gridfs import *
from gridfs import GridFS
from pymongo import MongoClient

__all__ = [
2 changes: 1 addition & 1 deletion ayugespidertools/MysqlClient.py
@@ -1,4 +1,4 @@
from typing import Literal, Optional
from typing import Literal

import pymysql

2 changes: 1 addition & 1 deletion ayugespidertools/Oss.py
@@ -105,7 +105,7 @@ def put_oss(
f"{self.operateDoc}/{input_file_name}.{file_format}",
put_bytes_or_url,
)
except Exception as e:
except Exception:
return False, ""
return True, input_file_name

2 changes: 1 addition & 1 deletion ayugespidertools/RPA.py
@@ -71,4 +71,4 @@ def deal_pyppeteer_suspend(cls, fn: str, line: int):

# If no scrapy stats appear in the latest four log lines, the state is normal; clear the log
elif block_times == 0:
clean_log = subprocess.getstatusoutput(f"> {fn}")
subprocess.getstatusoutput(f"> {fn}")
17 changes: 17 additions & 0 deletions ayugespidertools/common/Encryption.py
@@ -1,5 +1,6 @@
import base64
import hashlib
import re
from typing import Union

import mmh3
@@ -98,3 +99,19 @@ def mm3_hash128_encode(encode_data: str) -> str:
o = mmh3.hash128(encode_data)
hash128_encoded = hex(((o & 0xFFFFFFFFFFFFFFFF) << 64) + (o >> 64))
return hash128_encoded[2:]

@staticmethod
def uni_to_chr(uni: str) -> str:
"""
Convert a string giving a Unicode code point into the actual character; used when building font mappings
Args:
uni: the unicode string to convert,
e.g. 006A; it may be non-standard, so a leading 0x or uni prefix may need to be stripped.
Returns:
1). the converted character
"""
_uni = re.sub(r"^(0x|U\+|uni)", "", uni)
unicode_value = int(_uni, 16)
# Use chr() to convert the integer value to a character
return chr(unicode_value)
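
A quick usage sketch for the new `uni_to_chr` helper, assuming it is exposed as a staticmethod of `EncryptOperation` as the surrounding diff suggests:

```python
from ayugespidertools.common.Encryption import EncryptOperation

EncryptOperation.uni_to_chr("uni006A")  # -> "j"  ("uni" prefix stripped, 0x6A == 106)
EncryptOperation.uni_to_chr("0x4E2D")   # -> "中" ("0x" prefix stripped)
EncryptOperation.uni_to_chr("006A")     # -> "j"  (already a bare hex code point)
```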
4 changes: 2 additions & 2 deletions ayugespidertools/common/Expend.py
@@ -127,11 +127,11 @@ def _get_log_by_spider(self, spider, crawl_time):
text[k.replace("/", "_")] = v

log_info = {
"database": mysql_conf["database"],
"database": mysql_conf.database,
# spider script name
"spider_name": spider.name,
# uid
"uid": f'{mysql_conf["database"]}|{spider.name}',
"uid": f"{mysql_conf.database}|{spider.name}",
# request count
"request_counts": text.get("downloader_request_count", 0),
# response count
10 changes: 4 additions & 6 deletions ayugespidertools/common/MultiPlexing.py
@@ -66,16 +66,14 @@ def get_file_name_by_url(file_url: str) -> str:
@staticmethod
def get_files_from_path(path: str) -> list:
"""
Get all files under the path folder
Get all files under the path folder, outputting paths with path as the root directory
Args:
path: the folder path to inspect
Returns:
file_list: list of files under the path folder
"""
# Get the names of all files in the folder
files = os.listdir(path)
return [file for file in files if not os.path.isdir(path + "\\" + file)]
return [f.path for f in os.scandir(path) if f.is_file()]

@staticmethod
def get_bytes_by_file(file_path: str) -> bytes:
@@ -301,7 +299,7 @@ def judge_str_is_json(cls, judge_str: str) -> bool:

try:
json.loads(judge_str)
except Exception as e:
except Exception:
return False
else:
return True
@@ -338,7 +336,7 @@ def get_req_dict_from_scrapy(req_body_data_str: str) -> dict:
}

@staticmethod
def get_array_dimension(array: list) -> int:
def get_array_dimension(array: Union[frozenset, list, set, tuple]) -> int:
"""
Get the dimension of array
Args:
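
The reworked `get_files_from_path` above swaps `os.listdir` with a hard-coded `\\` join for `os.scandir`, whose `DirEntry.path` joins the folder and entry name portably; a standalone behaviour sketch with a hypothetical directory layout:

```python
# Standalone sketch of the reworked helper; the directory layout is hypothetical.
import os


def get_files_from_path(path: str) -> list:
    # DirEntry.path already contains the folder prefix, and is_file()
    # filters out sub-directories without a separate os.path.isdir call.
    return [f.path for f in os.scandir(path) if f.is_file()]


# e.g. for a "docs" folder containing conf.py and a _build/ sub-directory:
# get_files_from_path("docs") -> ["docs/conf.py"]
```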
25 changes: 18 additions & 7 deletions ayugespidertools/common/MysqlErrorHandle.py
@@ -48,9 +48,12 @@ def _create_table(
if demand_code != "":
tabel_notes = f"{demand_code}_{tabel_notes}"

sql = f"""CREATE TABLE IF NOT EXISTS `{table_name}` (`id` int(32) NOT NULL AUTO_INCREMENT COMMENT 'id',
PRIMARY KEY (`id`)) ENGINE=InnoDB DEFAULT CHARSET={charset}
COLLATE={collate} COMMENT='{tabel_notes}'; """
sql = f"""
CREATE TABLE IF NOT EXISTS `{table_name}`
(`id` int(32) NOT NULL AUTO_INCREMENT COMMENT 'id', PRIMARY KEY (`id`))
ENGINE=InnoDB DEFAULT CHARSET={charset} COLLATE={collate} COMMENT='{tabel_notes}';
"""

try:
# Execute the sql query and fetch the data
data = cursor.execute(sql)
@@ -80,8 +83,10 @@ def _get_column_type(
Returns:
column_type: the column's storage type
"""
sql = f"""select COLUMN_TYPE from information_schema.columns where table_schema = '{database}' and
table_name = '{table}' and COLUMN_NAME= '{column}';"""
sql = f"""
select COLUMN_TYPE from information_schema.columns
where table_schema = '{database}' and table_name = '{table}' and COLUMN_NAME= '{column}';
"""
column_type = None
try:
if _ := cursor.execute(sql):
@@ -262,7 +267,10 @@ def deal_1406_error(
cursor=cursor, database=database, table=table, column=colum
)
change_colum_type = "LONGTEXT" if column_type == "text" else "TEXT"
sql = f"""ALTER TABLE `{table}` CHANGE COLUMN `{colum}` `{colum}` {change_colum_type} NULL DEFAULT NULL COMMENT "{notes}" ;"""
sql = f"""
ALTER TABLE `{table}` CHANGE COLUMN
`{colum}` `{colum}` {change_colum_type} NULL DEFAULT NULL COMMENT "{notes}";
"""
return sql, f"1406: 更新 {colum} 字段类型为 {change_colum_type} 时失败"

def deal_1265_error(
@@ -294,7 +302,10 @@ def deal_1265_error(
cursor=cursor, database=database, table=table, column=colum
)
change_colum_type = "LONGTEXT" if column_type == "text" else "TEXT"
sql = f"""ALTER TABLE `{table}` CHANGE COLUMN `{colum}` `{colum}` {change_colum_type} NULL DEFAULT NULL COMMENT "{notes}" ;"""
sql = f"""
ALTER TABLE `{table}` CHANGE COLUMN
`{colum}` `{colum}` {change_colum_type} NULL DEFAULT NULL COMMENT "{notes}";
"""
return sql, f"1265: 更新 {colum} 字段类型为 {change_colum_type} 时失败"

@abstractmethod
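
For context, the 1406 ("Data too long") and 1265 ("Data truncated") handlers above both widen the offending column. Illustrative only, with made-up table, column, and comment values, this is the kind of statement they assemble:

```python
# Illustrative only: table, column, and comment are made up; the current
# column type is assumed to be "text", so the handler upgrades it to LONGTEXT.
table, colum, notes = "article_info", "content", "article body"
change_colum_type = "LONGTEXT"

sql = f"""
ALTER TABLE `{table}` CHANGE COLUMN
`{colum}` `{colum}` {change_colum_type} NULL DEFAULT NULL COMMENT "{notes}";
"""
# -> widens article_info.content so the failed insert can be retried
```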
3 changes: 3 additions & 0 deletions ayugespidertools/common/Params.py
@@ -2,6 +2,9 @@
import random
from typing import List, TypeVar

import pymongo
import pymysql

from ayugespidertools.Items import MongoDataItem, MysqlDataItem, ScrapyClassicItem

__all__ = [
16 changes: 15 additions & 1 deletion ayugespidertools/common/Utils.py
@@ -70,7 +70,21 @@ def get_kvs_detail_by_consul(

curr_consul_headers = copy.deepcopy(Param.consul_headers)
curr_consul_headers["X-Consul-Token"] = token
r = requests.get(url, headers=curr_consul_headers, verify=False)
try:
r = requests.get(
url,
headers=curr_consul_headers,
verify=False,
timeout=(
Param.requests_req_timeout,
Param.requests_res_timeout,
),
)
except (
requests.exceptions.ConnectionError,
requests.exceptions.ConnectTimeout,
) as e:
raise ValueError("请求 consul 超时,请检查 consul 是否正常运行!") from e
# Check whether the raw data format was requested
if "raw" in url_params:
return r.text
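
The consul lookup above now passes `requests`' `(connect timeout, read timeout)` tuple and re-raises connection failures with a clearer error; a standalone sketch, where the URL, token, and 5/10-second values are placeholders rather than the library's actual `Param` defaults:

```python
# Placeholder URL, token, and timeouts; only the request pattern mirrors the change.
import requests

try:
    r = requests.get(
        "http://127.0.0.1:8500/v1/kv/demo?raw",
        headers={"X-Consul-Token": "your-token"},
        timeout=(5, 10),  # 5 s to connect, 10 s to read the response
    )
except (requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout) as e:
    raise ValueError("consul request failed; check that consul is running") from e
```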
8 changes: 6 additions & 2 deletions ayugespidertools/scraper/middlewares/proxy/private.py
@@ -38,7 +38,11 @@ def from_crawler(cls, crawler):

@retry(stop_max_attempt_number=Param.retry_num)
def get_proxy_ip(self, size, isdict: Optional[bool] = None):
proxy_url = f"http://dps.kdlapi.com/api/getdps?orderid={self.simidaili_conf['orderid']}&num={size}&signature={self.simidaili_conf['signature']}&format=json"
proxy_url = (
"http://dps.kdlapi.com/api/getdps"
f"?orderid={self.simidaili_conf['orderid']}&num={size}"
f"&signature={self.simidaili_conf['signature']}&format=json"
)
if self.important_error:
raise ValueError("ip 获取方式有误,请重构私密代理中间件获取 ip 的模块!")

@@ -69,7 +73,7 @@ def get_proxy_ip(self, size, isdict: Optional[bool] = None):
return iplist
else:
raise ValueError("ip 获取方式有误,请重构私密代理中间件获取 ip 的模块!")
except:
except Exception:
self.important_error = True
traceback.print_exc()
raise ValueError("ip 获取方式有误,请重构私密代理中间件获取ip的模块!")
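
The proxy URL above is now built from implicitly concatenated f-string pieces to stay within the line-length limit; an equivalent construction with `urllib.parse.urlencode` would also work, where the order id and signature below are placeholders:

```python
# Equivalent construction of the kdlapi URL; orderid/signature are placeholders.
from urllib.parse import urlencode

simidaili_conf = {"orderid": "your-order-id", "signature": "your-signature"}
size = 5

query = urlencode(
    {
        "orderid": simidaili_conf["orderid"],
        "num": size,
        "signature": simidaili_conf["signature"],
        "format": "json",
    }
)
proxy_url = f"http://dps.kdlapi.com/api/getdps?{query}"
```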