From 1e73b9fca0fefdd34cebe1d934414ca924d76a08 Mon Sep 17 00:00:00 2001 From: myhloli Date: Sun, 7 Jul 2024 22:06:02 +0800 Subject: [PATCH] fix: fasttext not support numpy>=2.0.0 --- demo/demo.py | 37 +++++++++++++++++++++---------------- magic_pdf/libs/language.py | 6 +++--- requirements.txt | 1 + 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/demo/demo.py b/demo/demo.py index 9d05ba4d..7a69e7c8 100644 --- a/demo/demo.py +++ b/demo/demo.py @@ -1,22 +1,27 @@ import os import json +from loguru import logger + from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter -current_script_dir = os.path.dirname(os.path.abspath(__file__)) -demo_name = "demo1" -pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf") -model_path = os.path.join(current_script_dir, f"{demo_name}.json") -pdf_bytes = open(pdf_path, "rb").read() -model_json = json.loads(open(model_path, "r", encoding="utf-8").read()) -jso_useful_key = {"_pdf_type": "", "model_list": model_json} -local_image_dir = os.path.join(current_script_dir, 'images') -image_dir = str(os.path.basename(local_image_dir)) -image_writer = DiskReaderWriter(local_image_dir) -pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) -pipe.pipe_classify() -pipe.pipe_parse() -md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") -with open(f"{demo_name}.md", "w", encoding="utf-8") as f: - f.write(md_content) +try: + current_script_dir = os.path.dirname(os.path.abspath(__file__)) + demo_name = "demo1" + pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf") + model_path = os.path.join(current_script_dir, f"{demo_name}.json") + pdf_bytes = open(pdf_path, "rb").read() + model_json = json.loads(open(model_path, "r", encoding="utf-8").read()) + jso_useful_key = {"_pdf_type": "", "model_list": model_json} + local_image_dir = os.path.join(current_script_dir, 'images') + image_dir = str(os.path.basename(local_image_dir)) + image_writer = DiskReaderWriter(local_image_dir) + pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) + pipe.pipe_classify() + pipe.pipe_parse() + md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") + with open(f"{demo_name}.md", "w", encoding="utf-8") as f: + f.write(md_content) +except Exception as e: + logger.exception(e) \ No newline at end of file diff --git a/magic_pdf/libs/language.py b/magic_pdf/libs/language.py index 29cdc9ea..bddb5475 100644 --- a/magic_pdf/libs/language.py +++ b/magic_pdf/libs/language.py @@ -1,15 +1,15 @@ import unicodedata -from fast_langdetect import detect_langs +from fast_langdetect import detect_language def detect_lang(text: str) -> str: if len(text) == 0: return "" try: - lang_upper = detect_langs(text) + lang_upper = detect_language(text) except: html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]]) - lang_upper = detect_langs(html_no_ctrl_chars) + lang_upper = detect_language(html_no_ctrl_chars) try: lang = lang_upper.lower() except: diff --git a/requirements.txt b/requirements.txt index cbd71e82..40a78af0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ fast-langdetect>=0.1.1 wordninja>=2.0.0 scikit-learn>=1.0.2 pdfminer.six>=20231228 +numpy<2.0.0 #2.0版本与fasttext不兼容 # requirements.txt 须保证只引入必需的外部依赖,如有新依赖添加请联系项目管理员 \ No newline at end of file