From 8998380da56b1dc50dbf1f02e8b211899156f77d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=B5=E5=B0=8F=E8=92=99?= Date: Thu, 20 Jun 2024 11:18:15 +0800 Subject: [PATCH] update check invalid_chars algorithm to improve accuracy --- magic_pdf/libs/pdf_check.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/magic_pdf/libs/pdf_check.py b/magic_pdf/libs/pdf_check.py index 840509df..3f9dc350 100644 --- a/magic_pdf/libs/pdf_check.py +++ b/magic_pdf/libs/pdf_check.py @@ -6,15 +6,11 @@ from pdfminer.high_level import extract_text -def calculate_sample_count(total_page: int, sample_ratio=0.1): +def calculate_sample_count(total_page: int): """ 根据总页数和采样率计算采样页面的数量。 """ - select_page_cnt = int(total_page * sample_ratio) - if select_page_cnt < 5: - select_page_cnt = min(10, total_page) - elif select_page_cnt > 10: - select_page_cnt = 10 + select_page_cnt = min(10, total_page) return select_page_cnt @@ -46,14 +42,21 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool: sample_pdf_bytes = sample_docs.tobytes() sample_pdf_file_like_object = BytesIO(sample_pdf_bytes) text = extract_text(sample_pdf_file_like_object) + text = text.replace("\n", "") # logger.info(text) '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)''' cid_pattern = re.compile(r'\(cid:\d+\)') matches = cid_pattern.findall(text) cid_count = len(matches) + cid_len = sum(len(match) for match in matches) text_len = len(text) - logger.info(f"cid_count: {cid_count}, text_len: {text_len}") - if cid_count > 10: + if text_len == 0: + cid_chars_radio = 0 + else: + cid_chars_radio = cid_count/(cid_count + text_len - cid_len) + logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}") + '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档''' + if cid_chars_radio > 0.05: return False # 乱码文档 else: return True # 正常文档