update: Enhance the capability to detect garbled document issues

opendatalab · Jun 19, 2024 · df14c61 · df14c61
1 parent 89d7964
commit df14c61
Show file tree

Hide file tree

Showing 6 changed files with 140 additions and 54 deletions.
diff --git a/magic_pdf/filter/pdf_classify_by_type.py b/magic_pdf/filter/pdf_classify_by_type.py
@@ -305,7 +305,7 @@ def is_narrow_strip(img):
 
 
 def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
-             text_layout_list: list):
+             text_layout_list: list, invalid_chars: bool):
     """
     这里的图片和页面长度单位是pts
     :param total_page:
@@ -322,7 +322,8 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         'by_avg_words': classify_by_avg_words(text_len_list),
         'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
         'by_text_layout': classify_by_text_layout(text_layout_list),
-        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list)
+        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
+        'by_invalid_chars': invalid_chars,
     }
 
     if all(results.values()):
@@ -331,7 +332,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         return False, results
     else:
         logger.warning(
-            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}",
+            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
+            f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
+            f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
+            f" by_invalid_chars: {results['by_invalid_chars']}",
             file=sys.stderr)  # 利用这种情况可以快速找出来哪些pdf比较特殊，针对性修正分类算法
         return False, results
 

diff --git a/magic_pdf/filter/pdf_meta_scan.py b/magic_pdf/filter/pdf_meta_scan.py
@@ -12,27 +12,29 @@
 
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.language import detect_lang
+from magic_pdf.libs.pdf_check import detect_invalid_chars
 
 scan_max_page = 50
 junk_limit_min = 10
 
 
-def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_pts):
+def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
     max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
                                result]
     page_area = int(page_width_pts) * int(page_height_pts)
     max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
     max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
     return max_image_area_per_page
 
+
 def process_image(page, junk_img_bojids=[]):
-    page_result = []# 存每个页面里的多张图四元组信息
+    page_result = []  # 存每个页面里的多张图四元组信息
     items = page.get_images()
     dedup = set()
     for img in items:
         # 这里返回的是图片在page上的实际展示的大小。返回一个数组，每个元素第一部分是
-        img_bojid = img[0]# 在pdf文件中是全局唯一的，如果这个图反复出现在pdf里那么就可能是垃圾信息，例如水印、页眉页脚等
-        if img_bojid in junk_img_bojids:# 如果是垃圾图像，就跳过
+        img_bojid = img[0]  # 在pdf文件中是全局唯一的，如果这个图反复出现在pdf里那么就可能是垃圾信息，例如水印、页眉页脚等
+        if img_bojid in junk_img_bojids:  # 如果是垃圾图像，就跳过
             continue
         recs = page.get_image_rects(img, transform=True)
         if recs:
@@ -47,6 +49,8 @@ def process_image(page, junk_img_bojids=[]):
             dedup.add((x0, y0, x1, y1, img_bojid))
             page_result.append([x0, y0, x1, y1, img_bojid])
     return page_result
+
+
 def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
     """
     返回每个页面里的图片的四元组，每个页面多个图片。
@@ -57,7 +61,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
     img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
     # 找出出现次数超过 len(doc) 半数的 img_bojid
 
-    junk_limit = max(len(doc)*0.5, junk_limit_min)# 对一些页数比较少的进行豁免
+    junk_limit = max(len(doc) * 0.5, junk_limit_min)  # 对一些页数比较少的进行豁免
 
     junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
 
@@ -82,9 +86,10 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
         result.append(page_result)
         for item in result:
             if not any(item):  # 如果任何一页没有图片，说明是个文字版，需要判断是否为特殊文字版
-                if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:# 如果是特殊文字版，就把junklist置空并break
+                if max(imgs_len_list) == min(imgs_len_list) and max(
+                        imgs_len_list) >= junk_limit_min:  # 如果是特殊文字版，就把junklist置空并break
                     junk_img_bojids = []
-                else:# 不是特殊文字版，是个普通文字版，但是存在垃圾图片，不置空junklist
+                else:  # 不是特殊文字版，是个普通文字版，但是存在垃圾图片，不置空junklist
                     pass
                 break_loop = True
                 break
@@ -94,16 +99,16 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
         # 检查前80%的元素是否都相等
         if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
 
-        # # 如果前10页跑完都有图，根据每页图片数量是否相等判断是否需要清除junklist
-        # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
+            # # 如果前10页跑完都有图，根据每页图片数量是否相等判断是否需要清除junklist
+            # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
 
             #前10页都有图，且每页数量一致，需要检测图片大小占页面的比例判断是否需要清除junklist
             max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
             if len(max_image_area_per_page) < 0.8 * special_limit_pages:  # 前10页不全是大图，说明可能是个文字版pdf，把垃圾图片list置空
                 junk_img_bojids = []
-            else:# 前10页都有图，而且80%都是大图，且每页图片数量一致并都很多，说明是扫描版1，不需要清空junklist
+            else:  # 前10页都有图，而且80%都是大图，且每页图片数量一致并都很多，说明是扫描版1，不需要清空junklist
                 pass
-        else:# 每页图片数量不一致，需要清掉junklist全量跑前50页图片
+        else:  # 每页图片数量不一致，需要清掉junklist全量跑前50页图片
             junk_img_bojids = []
 
     #正式进入取前50页图片的信息流程
@@ -136,7 +141,6 @@ def get_pdf_page_size_pts(doc: fitz.Document):
     median_width = page_width_list[len(page_width_list) // 2]
     median_height = page_height_list[len(page_height_list) // 2]
 
-
     return median_width, median_height
 
 
@@ -156,6 +160,7 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
 
     return text_len_lst
 
+
 def get_pdf_text_layout_per_page(doc: fitz.Document):
     """
     根据PDF文档的每一页文本布局，判断该页的文本布局是横向、纵向还是未知。
@@ -233,11 +238,16 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
         # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
     return text_layout_list
 
+
 '''定义一个自定义异常用来抛出单页svg太多的pdf'''
+
+
 class PageSvgsTooManyError(Exception):
     def __init__(self, message="Page SVGs are too many"):
         self.message = message
         super().__init__(self.message)
+
+
 def get_svgs_per_page(doc: fitz.Document):
     svgs_len_list = []
     for page_id, page in enumerate(doc):
@@ -251,6 +261,7 @@ def get_svgs_per_page(doc: fitz.Document):
         # logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
     return svgs_len_list
 
+
 def get_imgs_per_page(doc: fitz.Document):
     imgs_len_list = []
     for page_id, page in enumerate(doc):
@@ -287,6 +298,13 @@ def get_language(doc: fitz.Document):
     return language
 
 
+def check_invalid_chars(pdf_bytes):
+    """
+    Garbled-text check: return the result of detect_invalid_chars(pdf_bytes).
+    """
+    return detect_invalid_chars(pdf_bytes)
+
+
 def pdf_meta_scan(pdf_bytes: bytes):
     """
     :param s3_pdf_path:
@@ -318,7 +336,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
         # logger.info(f"text_layout_per_page: {text_layout_per_page}")
         text_language = get_language(doc)
         # logger.info(f"text_language: {text_language}")
-
+        invalid_chars = check_invalid_chars(pdf_bytes)
+        # logger.info(f"invalid_chars: {invalid_chars}")
 
         # 最后输出一条json
         res = {
@@ -334,6 +353,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
             # "svgs_per_page": svgs_per_page,
             "imgs_per_page": imgs_per_page,  # 增加每页img数量list
             "junk_img_bojids": junk_img_bojids,  # 增加垃圾图片的bojid list
+            "invalid_chars": invalid_chars,
             "metadata": doc.metadata
         }
         # logger.info(json.dumps(res, ensure_ascii=False))
@@ -365,4 +385,4 @@ def main(s3_pdf_path: str, s3_profile: str):
     # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
     # doc = fitz.open("pdf", file_content)
     # text_layout_lst = get_pdf_text_layout_per_page(doc)
-    # print(text_layout_lst)
+    # print(text_layout_lst)
diff --git a/magic_pdf/libs/pdf_check.py b/magic_pdf/libs/pdf_check.py
@@ -0,0 +1,80 @@
+from io import BytesIO
+import re
+import fitz
+import numpy as np
+from loguru import logger
+from pdfminer.high_level import extract_text
+
+# Garbled text shows up in pdfminer's extracted text as "(cid:NNN)" markers.
+CID_PATTERN = re.compile(r'\(cid:\d+\)')
+# A sample with more than this many "(cid:NNN)" markers is treated as garbled.
+CID_COUNT_THRESHOLD = 10
+
+
+def calculate_sample_count(total_page: int, sample_ratio=0.1):
+    """
+    Return the number of pages to sample from a document of total_page pages.
+
+    The raw count is int(total_page * sample_ratio). A raw count below 5 is
+    replaced by min(10, total_page); a raw count above 10 is capped at 10.
+    """
+    select_page_cnt = int(total_page * sample_ratio)
+    if select_page_cnt < 5:
+        select_page_cnt = min(10, total_page)
+    elif select_page_cnt > 10:
+        select_page_cnt = 10
+    return select_page_cnt
+
+
+def extract_pages(src_pdf_bytes: bytes):
+    """
+    Copy a random sample of pages from the given PDF into a new document.
+
+    :param src_pdf_bytes: raw bytes of the source PDF
+    :return: a fitz.Document holding the sampled pages (no pages when the
+        source PDF is empty); the caller should close it when done
+    """
+    # The source document is only needed while copying, so it is closed on exit.
+    with fitz.open("pdf", src_pdf_bytes) as pdf_docs:
+        total_page = len(pdf_docs)
+        if total_page == 0:
+            # No pages at all: return an empty document right away.
+            logger.warning("PDF is empty, return empty document")
+            return fitz.Document()
+        select_page_cnt = calculate_sample_count(total_page)
+
+        # Draw distinct page indices at random (no page is sampled twice).
+        page_num = np.random.choice(total_page, select_page_cnt, replace=False)
+        sample_docs = fitz.Document()
+        try:
+            for index in page_num:
+                sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
+        except Exception as e:
+            # Best effort: keep whatever pages were copied before the failure.
+            logger.exception(e)
+        return sample_docs
+
+
+def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
+    """
+    Check a random page sample of the PDF for garbled text.
+
+    :param src_pdf_bytes: raw bytes of the PDF to check
+    :return: True when the sample looks normal, False when it contains more
+        than CID_COUNT_THRESHOLD "(cid:NNN)" markers (garbled document)
+    """
+    sample_docs = extract_pages(src_pdf_bytes)
+    try:
+        if len(sample_docs) == 0:
+            # PyMuPDF refuses to serialize a document with zero pages, and
+            # there is no text to inspect anyway: report "not garbled".
+            return True
+        sample_pdf_bytes = sample_docs.tobytes()
+    finally:
+        sample_docs.close()
+    text = extract_text(BytesIO(sample_pdf_bytes))
+    cid_count = len(CID_PATTERN.findall(text))
+    text_len = len(text)
+    logger.info(f"cid_count: {cid_count}, text_len: {text_len}")
+    # True means "normal document", False means "garbled document".
+    return cid_count <= CID_COUNT_THRESHOLD
diff --git a/magic_pdf/pipe/AbsPipe.py b/magic_pdf/pipe/AbsPipe.py
@@ -83,6 +83,7 @@ def classify(pdf_bytes: bytes) -> str:
                     pdf_meta["text_len_per_page"],
                     pdf_meta["imgs_per_page"],
                     pdf_meta["text_layout_per_page"],
+                    pdf_meta["invalid_chars"],
                 )
                 if is_text_pdf:
                     return AbsPipe.PIP_TXT

diff --git a/magic_pdf/user_api.py b/magic_pdf/user_api.py
@@ -86,45 +86,46 @@ def parse_pdf(method):
             return None
 
     pdf_info_dict = parse_pdf(parse_pdf_by_txt)
-    text_all = ""
-    for page_dict in pdf_info_dict['pdf_info']:
-        for para_block in page_dict['para_blocks']:
-            if para_block['type'] in ['title', 'text']:
-                for line in para_block['lines']:
-                    for span in line['spans']:
-                        text_all += span['content']
-
-    def calculate_not_common_character_rate(text):
-        garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
-        # 计算乱码字符的数量
-        garbage_count = len(garbage_regex.findall(text))
-        total = len(text)
-        if total == 0:
-            return 0  # 避免除以零的错误
-        return garbage_count / total
-
-    def calculate_not_printable_rate(text):
-        printable_text = ""
-        for c in text:
-            if c.isprintable():
-                printable_text += c
-        printable_total = len(printable_text)
-        total = len(text)
-        if total == 0:
-            return 0  # 避免除以零的错误
-        return (total - printable_total) / total
-
-    not_common_character_rate = calculate_not_common_character_rate(text_all)
-    not_printable_rate = calculate_not_printable_rate(text_all)
-    pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
-    pdf_info_dict["_not_printable_rate"] = not_printable_rate
-    logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
+    # text_all = ""
+    # for page_dict in pdf_info_dict['pdf_info']:
+    #     for para_block in page_dict['para_blocks']:
+    #         if para_block['type'] in ['title', 'text']:
+    #             for line in para_block['lines']:
+    #                 for span in line['spans']:
+    #                     text_all += span['content']
+
+    # def calculate_not_common_character_rate(text):
+    #     garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
+    #     # 计算乱码字符的数量
+    #     garbage_count = len(garbage_regex.findall(text))
+    #     total = len(text)
+    #     if total == 0:
+    #         return 0  # 避免除以零的错误
+    #     return garbage_count / total
+    #
+    # def calculate_not_printable_rate(text):
+    #     printable_text = ""
+    #     for c in text:
+    #         if c.isprintable():
+    #             printable_text += c
+    #     printable_total = len(printable_text)
+    #     total = len(text)
+    #     if total == 0:
+    #         return 0  # 避免除以零的错误
+    #     return (total - printable_total) / total
+    #
+    # not_common_character_rate = calculate_not_common_character_rate(text_all)
+    # not_printable_rate = calculate_not_printable_rate(text_all)
+    # pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
+    # pdf_info_dict["_not_printable_rate"] = not_printable_rate
+    # logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
+    '''新逻辑使用pdfminer识别乱码pdf,准确率高且不会误伤,已在解析流程之前进行处理'''
     # not_common_character_rate对小语种可能会有误伤，not_printable_rate对小语种较为友好
     if (pdf_info_dict is None
-        or pdf_info_dict.get("_need_drop", False)
-        or not_printable_rate > 0.02  # 参考一些正常的pdf，这个值没有超过0.01的，阈值设为0.02
+            or pdf_info_dict.get("_need_drop", False)
+            # or not_printable_rate > 0.02  # 参考一些正常的pdf，这个值没有超过0.01的，阈值设为0.02
     ):
-        logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
+        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         if input_model_is_empty:
             pdf_models = doc_analyze(pdf_bytes, ocr=True)
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)

diff --git a/requirements.txt b/requirements.txt
@@ -14,4 +14,5 @@ wordninja>=2.0.0
 scikit-learn>=1.0.2
 nltk==3.8.1
 s3pathlib>=2.1.1
-paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
+paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
+pdfminer.six>=20231228