Skip to content

Commit

Permalink
实现多模态markdown拼装
Browse files Browse the repository at this point in the history
  • Loading branch information
[email protected] committed Mar 27, 2024
1 parent fffee0a commit 433684c
Showing 1 changed file with 34 additions and 3 deletions.
37 changes: 34 additions & 3 deletions magic_pdf/pipeline_txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,19 @@
文本型pdf转化为统一清洗格式
"""


# TODO 移动到spark/目录下

from loguru import logger
from magic_pdf.dict2md.mkcontent import mk_universal_format
from magic_pdf.dict2md.mkcontent import mk_mm_markdown, mk_universal_format
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.spark.base import exception_handler, get_data_source


def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:

"""
变成统一的标准格式
"""
if debug_mode:
pass
else: # 如果debug没开,则检测是否有needdrop字段
Expand All @@ -35,3 +37,32 @@ def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
except Exception as e:
jso = exception_handler(jso, e)
return jso


def txt_pdf_to_mm_markdown_format(jso: dict, debug_mode=False) -> dict:
    """Convert a text-type PDF record into the multimodal markdown format.

    The compressed ``jso["pdf_intermediate_dict"]`` is decompressed with
    ``JsonCompressor.decompress_json``, passed through
    ``mk_universal_format`` and then ``mk_mm_markdown``; the result is stored
    under ``jso["content_list"]``. Afterwards the ``doc_layout_result``,
    ``pdf_intermediate_dict`` and ``pdf_meta`` entries are overwritten with
    empty strings.

    Args:
        jso: The record to convert; it is modified in place.
        debug_mode: When truthy, the ``need_drop`` check is skipped and the
            record is always processed.

    Returns:
        The updated record. If ``need_drop`` is set (and debug mode is off)
        the record is returned early with ``jso["dropped"] = True``. If any
        exception is raised during conversion, the value returned by
        ``exception_handler(jso, e)`` is returned instead.
    """
    # Outside debug mode, records flagged with need_drop are not converted.
    if not debug_mode and jso.get("need_drop", False):
        dropped_book = join_path(get_data_source(jso), jso["file_id"])
        logger.info(f"book_name is:{dropped_book} need drop")
        jso["dropped"] = True
        return jso

    try:
        # Decompress the intermediate dict before building the output formats.
        intermediate = JsonCompressor.decompress_json(jso["pdf_intermediate_dict"])
        universal = mk_universal_format(intermediate)
        jso["content_list"] = mk_mm_markdown(universal)
        # NOTE(review): the logged length is that of the universal-format
        # result, not of the markdown stored in content_list -- confirm this
        # is the intended figure.
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(universal)}"
        )
        # Clear the fields that are no longer needed downstream.
        for stale_key in ("doc_layout_result", "pdf_intermediate_dict", "pdf_meta"):
            jso[stale_key] = ""
    except Exception as err:
        jso = exception_handler(jso, err)
    return jso

0 comments on commit 433684c

Please sign in to comment.