Commit

update: Enhance the capability to detect garbled document issues

myhloli committed Jun 19, 2024
1 parent 89d7964 commit df14c61
Showing 6 changed files with 140 additions and 54 deletions.
10 changes: 7 additions & 3 deletions magic_pdf/filter/pdf_classify_by_type.py
@@ -305,7 +305,7 @@ def is_narrow_strip(img):


def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
text_layout_list: list):
text_layout_list: list, invalid_chars: bool):
"""
Image and page dimensions here are in pts
:param total_page:
@@ -322,7 +322,8 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
'by_avg_words': classify_by_avg_words(text_len_list),
'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
'by_text_layout': classify_by_text_layout(text_layout_list),
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list)
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
'by_invalid_chars': invalid_chars,
}

if all(results.values()):
@@ -331,7 +332,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
return False, results
else:
logger.warning(
f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}",
f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
f" by_invalid_chars: {results['by_invalid_chars']}",
file=sys.stderr)  # these cases help quickly surface which PDFs are unusual so the classification algorithm can be tuned for them
return False, results

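Not part of the diff: a minimal sketch of how the extended classify() signature might be called, using hypothetical per-page inputs. The new invalid_chars flag comes from the meta scan and is True when the sampled text is not garbled (see magic_pdf/libs/pdf_check.py below).

is_text_pdf, results = classify(
    total_page=12, page_width=612, page_height=792,  # page size in pts (hypothetical)
    img_sz_list=[[] for _ in range(12)],             # image rects per page (none here)
    text_len_list=[1500] * 12,                       # extracted text length per page
    img_num_list=[0] * 12,                           # image count per page
    text_layout_list=['horizontal'] * 12,            # layout of each sampled page
    invalid_chars=True,                              # True = no garbled characters detected
)
# is_text_pdf is True only when every entry in `results` is True, so a garbled
# document (invalid_chars=False) is routed to the OCR pipeline even if every
# other check passes.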
48 changes: 34 additions & 14 deletions magic_pdf/filter/pdf_meta_scan.py
@@ -12,27 +12,29 @@

from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars

scan_max_page = 50
junk_limit_min = 10


def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_pts):
def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
result]
page_area = int(page_width_pts) * int(page_height_pts)
max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
return max_image_area_per_page


def process_image(page, junk_img_bojids=[]):
page_result = []# stores the quadruples of the multiple images on each page
page_result = []  # stores the quadruples of the multiple images on each page
items = page.get_images()
dedup = set()
for img in items:
# This returns the size at which the image is actually displayed on the page. It returns an array; the first part of each element is
img_bojid = img[0]# globally unique within the pdf file; if the same image appears repeatedly it is probably junk, e.g. a watermark or header/footer
if img_bojid in junk_img_bojids:# skip junk images
img_bojid = img[0]  # globally unique within the pdf file; if the same image appears repeatedly it is probably junk, e.g. a watermark or header/footer
if img_bojid in junk_img_bojids:  # skip junk images
continue
recs = page.get_image_rects(img, transform=True)
if recs:
@@ -47,6 +49,8 @@ def process_image(page, junk_img_bojids=[]):
dedup.add((x0, y0, x1, y1, img_bojid))
page_result.append([x0, y0, x1, y1, img_bojid])
return page_result


def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
"""
Return the rect quadruples of the images on each page; a page may contain multiple images.
@@ -57,7 +61,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
# find img_bojids whose occurrence count exceeds half of len(doc)

junk_limit = max(len(doc)*0.5, junk_limit_min)# exempt documents with relatively few pages
junk_limit = max(len(doc) * 0.5, junk_limit_min)  # exempt documents with relatively few pages

junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]

@@ -82,9 +86,10 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
result.append(page_result)
for item in result:
if not any(item):  # if any page has no images, this is a text-based pdf; check whether it is a special text-based pdf
if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:# special text-based pdf: clear the junk list and break
if max(imgs_len_list) == min(imgs_len_list) and max(
imgs_len_list) >= junk_limit_min:  # special text-based pdf: clear the junk list and break
junk_img_bojids = []
else:# not a special text-based pdf, just an ordinary text-based pdf that contains junk images; keep the junk list
else:  # not a special text-based pdf, just an ordinary text-based pdf that contains junk images; keep the junk list
pass
break_loop = True
break
@@ -94,16 +99,16 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
# check whether the first 80% of the elements are all equal
if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:

# # if all of the first 10 pages contain images, decide from whether every page has the same image count whether the junk list needs clearing
# if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
# # if all of the first 10 pages contain images, decide from whether every page has the same image count whether the junk list needs clearing
# if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:

# the first 10 pages all contain images with identical counts; check how much of the page the largest image covers to decide whether the junk list needs clearing
max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
if len(max_image_area_per_page) < 0.8 * special_limit_pages:  # not all of the first 10 pages are dominated by large images, so this may be a text-based pdf; clear the junk image list
junk_img_bojids = []
else:# the first 10 pages all contain images, 80% of them are large, and every page has the same high image count, so this is a scanned pdf (case 1); keep the junk list
else:  # the first 10 pages all contain images, 80% of them are large, and every page has the same high image count, so this is a scanned pdf (case 1); keep the junk list
pass
else:# image counts differ across pages; clear the junk list and rescan images on the first 50 pages in full
else:  # image counts differ across pages; clear the junk list and rescan images on the first 50 pages in full
junk_img_bojids = []

# now formally collect image info for the first 50 pages
@@ -136,7 +141,6 @@ def get_pdf_page_size_pts(doc: fitz.Document):
median_width = page_width_list[len(page_width_list) // 2]
median_height = page_height_list[len(page_height_list) // 2]


return median_width, median_height


@@ -156,6 +160,7 @@ def get_pdf_textlen_per_page(doc: fitz.Document):

return text_len_lst


def get_pdf_text_layout_per_page(doc: fitz.Document):
"""
Based on the text layout of each page of the PDF document, determine whether that page's layout is horizontal, vertical, or unknown.
@@ -233,11 +238,16 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
# logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
return text_layout_list


'''Custom exception used to flag PDFs where a single page contains too many SVGs'''


class PageSvgsTooManyError(Exception):
def __init__(self, message="Page SVGs are too many"):
self.message = message
super().__init__(self.message)


def get_svgs_per_page(doc: fitz.Document):
svgs_len_list = []
for page_id, page in enumerate(doc):
@@ -251,6 +261,7 @@ def get_svgs_per_page(doc: fitz.Document):
# logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
return svgs_len_list


def get_imgs_per_page(doc: fitz.Document):
imgs_len_list = []
for page_id, page in enumerate(doc):
@@ -287,6 +298,13 @@ def get_language(doc: fitz.Document):
return language


def check_invalid_chars(pdf_bytes):
"""
Garbled-character detection
"""
return detect_invalid_chars(pdf_bytes)


def pdf_meta_scan(pdf_bytes: bytes):
"""
:param s3_pdf_path:
Expand Down Expand Up @@ -318,7 +336,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
text_language = get_language(doc)
# logger.info(f"text_language: {text_language}")

invalid_chars = check_invalid_chars(pdf_bytes)
# logger.info(f"invalid_chars: {invalid_chars}")

# finally output a single json record
res = {
@@ -334,6 +353,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
# "svgs_per_page": svgs_per_page,
"imgs_per_page": imgs_per_page, # 增加每页img数量list
"junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list
"invalid_chars": invalid_chars,
"metadata": doc.metadata
}
# logger.info(json.dumps(res, ensure_ascii=False))
@@ -365,4 +385,4 @@ def main(s3_pdf_path: str, s3_profile: str):
# file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
# doc = fitz.open("pdf", file_content)
# text_layout_lst = get_pdf_text_layout_per_page(doc)
# print(text_layout_lst)
# print(text_layout_lst)
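Not part of the commit: a quick sketch of reading the new field from the meta-scan output, assuming an ordinary, non-encrypted PDF so that the full result dict is returned.

with open("example.pdf", "rb") as f:  # hypothetical path
    meta = pdf_meta_scan(f.read())
print(meta.get("invalid_chars"))      # True = text looks normal, False = garbled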
59 changes: 59 additions & 0 deletions magic_pdf/libs/pdf_check.py
@@ -0,0 +1,59 @@
from io import BytesIO
import re
import fitz
import numpy as np
from loguru import logger
from pdfminer.high_level import extract_text


def calculate_sample_count(total_page: int, sample_ratio=0.1):
"""
Compute the number of pages to sample from the total page count and the sampling ratio.
"""
select_page_cnt = int(total_page * sample_ratio)
if select_page_cnt < 5:
select_page_cnt = min(10, total_page)
elif select_page_cnt > 10:
select_page_cnt = 10
return select_page_cnt


def extract_pages(src_pdf_bytes: bytes):
pdf_docs = fitz.open("pdf", src_pdf_bytes)
total_page = len(pdf_docs)
if total_page == 0:
# if the PDF has no pages, return an empty document directly
logger.warning("PDF is empty, return empty document")
return fitz.Document()
select_page_cnt = calculate_sample_count(total_page)

page_num = np.random.choice(total_page, select_page_cnt, replace=False)
sample_docs = fitz.Document()
try:
for index in page_num:
sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
except Exception as e:
logger.exception(e)
return sample_docs


def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
""""
检测PDF中是否包含非法字符
"""
'''需要使用'''
sample_docs = extract_pages(src_pdf_bytes)
sample_pdf_bytes = sample_docs.tobytes()
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
text = extract_text(sample_pdf_file_like_object)
# logger.info(text)
'''text that pdfminer extracts from garbled documents is characterized by (cid:xxx) tokens'''
cid_pattern = re.compile(r'\(cid:\d+\)')
matches = cid_pattern.findall(text)
cid_count = len(matches)
text_len = len(text)
logger.info(f"cid_count: {cid_count}, text_len: {text_len}")
if cid_count > 10:
return False  # garbled document
else:
return True  # normal document
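Not part of the new file: a short usage sketch and a worked example of the sampling rule above, assuming the PDF bytes come from an ordinary local file.

# calculate_sample_count clamps the sample to at most 10 pages:
#   total_page=30  -> int(30*0.1)=3,   3 < 5   -> min(10, 30) = 10 pages
#   total_page=200 -> int(200*0.1)=20, 20 > 10 -> 10 pages
#   total_page=8   -> int(8*0.1)=0,    0 < 5   -> min(10, 8)  = 8 pages
with open("example.pdf", "rb") as f:  # hypothetical path
    ok = detect_invalid_chars(f.read())
print("normal text" if ok else "garbled, route to OCR")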
1 change: 1 addition & 0 deletions magic_pdf/pipe/AbsPipe.py
@@ -83,6 +83,7 @@ def classify(pdf_bytes: bytes) -> str:
pdf_meta["text_len_per_page"],
pdf_meta["imgs_per_page"],
pdf_meta["text_layout_per_page"],
pdf_meta["invalid_chars"],
)
if is_text_pdf:
return AbsPipe.PIP_TXT
73 changes: 37 additions & 36 deletions magic_pdf/user_api.py
@@ -86,45 +86,46 @@ def parse_pdf(method):
return None

pdf_info_dict = parse_pdf(parse_pdf_by_txt)
text_all = ""
for page_dict in pdf_info_dict['pdf_info']:
for para_block in page_dict['para_blocks']:
if para_block['type'] in ['title', 'text']:
for line in para_block['lines']:
for span in line['spans']:
text_all += span['content']

def calculate_not_common_character_rate(text):
garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
# count the garbled characters
garbage_count = len(garbage_regex.findall(text))
total = len(text)
if total == 0:
return 0  # avoid division-by-zero errors
return garbage_count / total

def calculate_not_printable_rate(text):
printable_text = ""
for c in text:
if c.isprintable():
printable_text += c
printable_total = len(printable_text)
total = len(text)
if total == 0:
return 0  # avoid division-by-zero errors
return (total - printable_total) / total

not_common_character_rate = calculate_not_common_character_rate(text_all)
not_printable_rate = calculate_not_printable_rate(text_all)
pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
pdf_info_dict["_not_printable_rate"] = not_printable_rate
logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
# text_all = ""
# for page_dict in pdf_info_dict['pdf_info']:
# for para_block in page_dict['para_blocks']:
# if para_block['type'] in ['title', 'text']:
# for line in para_block['lines']:
# for span in line['spans']:
# text_all += span['content']

# def calculate_not_common_character_rate(text):
# garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
# # count the garbled characters
# garbage_count = len(garbage_regex.findall(text))
# total = len(text)
# if total == 0:
# return 0  # avoid division-by-zero errors
# return garbage_count / total
#
# def calculate_not_printable_rate(text):
# printable_text = ""
# for c in text:
# if c.isprintable():
# printable_text += c
# printable_total = len(printable_text)
# total = len(text)
# if total == 0:
# return 0  # avoid division-by-zero errors
# return (total - printable_total) / total
#
# not_common_character_rate = calculate_not_common_character_rate(text_all)
# not_printable_rate = calculate_not_printable_rate(text_all)
# pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
# pdf_info_dict["_not_printable_rate"] = not_printable_rate
# logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
'''The new logic uses pdfminer to identify garbled PDFs; it is highly accurate, avoids false positives, and is already applied before the parsing stage'''
# not_common_character_rate can misfire on low-resource languages, while not_printable_rate is friendlier to them
if (pdf_info_dict is None
or pdf_info_dict.get("_need_drop", False)
or not_printable_rate > 0.02  # in a sample of normal pdfs this value never exceeded 0.01, so the threshold is set to 0.02
or pdf_info_dict.get("_need_drop", False)
# or not_printable_rate > 0.02  # in a sample of normal pdfs this value never exceeded 0.01, so the threshold is set to 0.02
):
logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
if input_model_is_empty:
pdf_models = doc_analyze(pdf_bytes, ocr=True)
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
3 changes: 2 additions & 1 deletion requirements.txt
@@ -14,4 +14,5 @@ wordninja>=2.0.0
scikit-learn>=1.0.2
nltk==3.8.1
s3pathlib>=2.1.1
paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl
pdfminer.six>=20231228
