fix: fasttext does not support numpy>=2.0.0

opendatalab · Jul 7, 2024 · 1e73b9f · 1e73b9f
1 parent f14e50e
commit 1e73b9f
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 19 deletions.
diff --git a/demo/demo.py b/demo/demo.py
@@ -1,22 +1,27 @@
 import os
 import json
 
+from loguru import logger
+
 from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 
-current_script_dir = os.path.dirname(os.path.abspath(__file__))
-demo_name = "demo1"
-pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
-model_path = os.path.join(current_script_dir, f"{demo_name}.json")
-pdf_bytes = open(pdf_path, "rb").read()
-model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
-jso_useful_key = {"_pdf_type": "", "model_list": model_json}
-local_image_dir = os.path.join(current_script_dir, 'images')
-image_dir = str(os.path.basename(local_image_dir))
-image_writer = DiskReaderWriter(local_image_dir)
-pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
-pipe.pipe_classify()
-pipe.pipe_parse()
-md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
-with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
-    f.write(md_content)
+try:
+    current_script_dir = os.path.dirname(os.path.abspath(__file__))
+    demo_name = "demo1"
+    pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
+    model_path = os.path.join(current_script_dir, f"{demo_name}.json")
+    pdf_bytes = open(pdf_path, "rb").read()
+    model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
+    jso_useful_key = {"_pdf_type": "", "model_list": model_json}
+    local_image_dir = os.path.join(current_script_dir, 'images')
+    image_dir = str(os.path.basename(local_image_dir))
+    image_writer = DiskReaderWriter(local_image_dir)
+    pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
+    pipe.pipe_classify()
+    pipe.pipe_parse()
+    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
+    with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
+        f.write(md_content)
+except Exception as e:
+    logger.exception(e)
diff --git a/magic_pdf/libs/language.py b/magic_pdf/libs/language.py
@@ -1,15 +1,15 @@
 import unicodedata
-from fast_langdetect import detect_langs
+from fast_langdetect import detect_language
 
 
 def detect_lang(text: str) -> str:
     if len(text) == 0:
         return ""
     try:
-        lang_upper = detect_langs(text)
+        lang_upper = detect_language(text)
     except:
         html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
-        lang_upper = detect_langs(html_no_ctrl_chars)
+        lang_upper = detect_language(html_no_ctrl_chars)
     try:
         lang = lang_upper.lower()
     except:

diff --git a/requirements.txt b/requirements.txt
@@ -8,4 +8,5 @@ fast-langdetect>=0.1.1
 wordninja>=2.0.0
 scikit-learn>=1.0.2
 pdfminer.six>=20231228
+numpy<2.0.0 # numpy 2.0 is incompatible with fasttext
 # requirements.txt must only pull in essential external dependencies; contact the project maintainers before adding a new one