Skip to content

Commit

Permalink
fix: fasttext not support numpy>=2.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
myhloli committed Jul 7, 2024
1 parent f14e50e commit 1e73b9f
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 19 deletions.
37 changes: 21 additions & 16 deletions demo/demo.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,27 @@
import os
import json

from loguru import logger

from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

# Demo script: convert "<demo_name>.pdf" into Markdown with UNIPipe, using the
# model output stored next to it in "<demo_name>.json".
current_script_dir = os.path.dirname(os.path.abspath(__file__))
demo_name = "demo1"
pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
model_path = os.path.join(current_script_dir, f"{demo_name}.json")

# Read both inputs through context managers so the file handles are closed
# deterministically instead of being left to the garbage collector.
with open(pdf_path, "rb") as pdf_file:
    pdf_bytes = pdf_file.read()
with open(model_path, "r", encoding="utf-8") as model_file:
    model_json = json.load(model_file)

# NOTE(review): "_pdf_type" is passed as an empty string; presumably
# pipe_classify() determines the actual type -- confirm against UNIPipe.
jso_useful_key = {"_pdf_type": "", "model_list": model_json}

# Images are written to "<script dir>/images"; only the directory's base name
# ("images") is handed to the Markdown step.
local_image_dir = os.path.join(current_script_dir, 'images')
image_dir = str(os.path.basename(local_image_dir))
image_writer = DiskReaderWriter(local_image_dir)

pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
pipe.pipe_classify()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")

# The output path is relative, so the Markdown file is created in the current
# working directory rather than next to this script.
with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
    f.write(md_content)
# Demo script: convert "<demo_name>.pdf" into Markdown with UNIPipe, using the
# model output stored next to it in "<demo_name>.json". Any failure is logged
# with its traceback instead of propagating out of the script.
try:
    current_script_dir = os.path.dirname(os.path.abspath(__file__))
    demo_name = "demo1"
    pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
    model_path = os.path.join(current_script_dir, f"{demo_name}.json")

    # Read both inputs through context managers so the file handles are
    # closed deterministically instead of being left to the garbage collector.
    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    with open(model_path, "r", encoding="utf-8") as model_file:
        model_json = json.load(model_file)

    # NOTE(review): "_pdf_type" is passed as an empty string; presumably
    # pipe_classify() determines the actual type -- confirm against UNIPipe.
    jso_useful_key = {"_pdf_type": "", "model_list": model_json}

    # Images are written to "<script dir>/images"; only the directory's base
    # name ("images") is handed to the Markdown step.
    local_image_dir = os.path.join(current_script_dir, 'images')
    image_dir = str(os.path.basename(local_image_dir))
    image_writer = DiskReaderWriter(local_image_dir)

    pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
    pipe.pipe_classify()
    pipe.pipe_parse()
    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")

    # The output path is relative, so the Markdown file is created in the
    # current working directory rather than next to this script.
    with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
        f.write(md_content)
except Exception as e:
    # Top-level boundary of the demo: record the full traceback via loguru.
    logger.exception(e)
6 changes: 3 additions & 3 deletions magic_pdf/libs/language.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import unicodedata
from fast_langdetect import detect_langs
from fast_langdetect import detect_language


def detect_lang(text: str) -> str:
if len(text) == 0:
return ""
try:
lang_upper = detect_langs(text)
lang_upper = detect_language(text)
except:
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
lang_upper = detect_langs(html_no_ctrl_chars)
lang_upper = detect_language(html_no_ctrl_chars)
try:
lang = lang_upper.lower()
except:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ fast-langdetect>=0.1.1
wordninja>=2.0.0
scikit-learn>=1.0.2
pdfminer.six>=20231228
numpy<2.0.0 #2.0版本与fasttext不兼容
# requirements.txt 须保证只引入必需的外部依赖,如有新依赖添加请联系项目管理员

0 comments on commit 1e73b9f

Please sign in to comment.