From 5de013e6d591d0da9e8691613b66310264168af1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=B5=E5=B0=8F=E8=92=99?= Date: Wed, 19 Jun 2024 12:54:54 +0800 Subject: [PATCH] fix:use line_lang instead of content_lang to concatenate para --- magic_pdf/dict2md/ocr_mkcontent.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py index ac6501c5..952d6037 100644 --- a/magic_pdf/dict2md/ocr_mkcontent.py +++ b/magic_pdf/dict2md/ocr_mkcontent.py @@ -144,10 +144,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): def merge_para_with_text(para_block): para_text = '' for line in para_block['lines']: + line_text = "" + line_lang = "" + for span in line['spans']: + span_type = span['type'] + if span_type == ContentType.Text: + line_text += span['content'].strip() + if line_text != "": + line_lang = detect_lang(line_text) for span in line['spans']: span_type = span['type'] content = '' - language = '' if span_type == ContentType.Text: content = span['content'] language = detect_lang(content) @@ -161,7 +168,7 @@ def merge_para_with_text(para_block): content = f"\n$$\n{span['content']}\n$$\n" if content != '': - if 'zh' in language: + if 'zh' in line_lang: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断 para_text += content # 中文语境下,content间不需要空格分隔 else: para_text += content + ' ' # 英文语境下 content间需要空格分隔