diff --git a/requirements.txt b/requirements.txt
index a6dc5241..7988bf1e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,15 @@ wordninja>=2.0.0
 scikit-learn>=1.0.2
 nltk==3.8.1
 s3pathlib>=2.1.1
-pdfminer.six>=20231228
\ No newline at end of file
+pdfminer.six>=20231228
+Levenshtein
+rapidfuzz
+openxlab  # install opendatalab
+pandas
+numpy
+matplotlib
+seaborn
+scipy
+tqdm
+htmltabletomd
+pypandoc
diff --git a/tools/benchmark.py b/tools/benchmark.py
index f8f01b53..daefee26 100644
--- a/tools/benchmark.py
+++ b/tools/benchmark.py
@@ -5,18 +5,46 @@ pdf_dev_path = "/home/quyuan/data"
 pdf_res_path = "/home/quyuan/code/Magic-PDF/Magic-PDF/Magic-PDF/ci/magic-pdf"
 
 def test_cli():
-    cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, pdf_dev_path)
+    """Run the magic-pdf CLI on the benchmark PDFs and gather its markdown.
+
+    Each ``*.pdf`` under ``<pdf_dev_path>/output`` is passed to
+    ``magic_pdf/cli/magicpdf.py pdf-command``. Afterwards, for every ``*.md``
+    file name found under ``<pdf_dev_path>/ci``, the matching result
+    ``<pdf_res_path>/<stem>/auto/<name>`` is copied to
+    ``<pdf_dev_path>/ci/<prefix>/<name>``, where ``<prefix>`` is the
+    second-to-last ``_``-separated field of the name. Names without a result
+    directory or without such a field are skipped.
+    """
+    magicpdf_path = os.path.join(pdf_dev_path, "output")
+    os.makedirs(magicpdf_path, exist_ok=True)
+    cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, magicpdf_path)
     os.system(cmd)
-    if not os.path.exists(os.path.join(pdf_dev_path, "output")):
-        os.makedirs(os.path.join(pdf_dev_path, "output"))
-    for annotaion_name in os.listdir(os.path.join(pdf_dev_path, "output")):
-        if annotaion_name.endswith('.pdf'):
-            for pdf_res_path in os.listdir(pdf_res_path):
-                if ".md" in os.path.join(pdf_res_path, annotaion_name, "auto"):
-                    prefix = annotaion_name.split('_')[-2]
-                    if not os.path.exists(os.join(pdf_dev_path, prefix)):
-                        os.makedirs(os.path.join(pdf_dev_path, prefix))
-                    shutil.copy(os.path.join(pdf_res_path, annotaion_name, "auto", annotaion_name + ".md"), os.join(pdf_dev_path, prefix, annotaion_name + ".md"))
+
+    ci_path = os.path.join(pdf_dev_path, "ci")
+    # os.walk yields (dirpath, dirnames, filenames) tuples; snapshot the names
+    # first so files copied into ci_path below are not walked again.
+    md_names = sorted({
+        name
+        for _dirpath, _dirnames, names in os.walk(ci_path)
+        for name in names
+        if name.endswith('.md')
+    })
+    # Do not rebind pdf_res_path here: assigning it would make it a local and
+    # the os.listdir() call below would raise UnboundLocalError.
+    res_names = set(os.listdir(pdf_res_path))
+    for annotaion_name in md_names:
+        # Slice the suffix off; str.strip(".md") removes any leading/trailing
+        # '.', 'm' or 'd' characters, not the extension.
+        stem = annotaion_name[:-len('.md')]
+        fields = annotaion_name.split('_')
+        if stem not in res_names or len(fields) < 2:
+            continue
+        prefix_dir = os.path.join(ci_path, fields[-2])
+        os.makedirs(prefix_dir, exist_ok=True)
+        shutil.copy(
+            os.path.join(pdf_res_path, stem, "auto", annotaion_name),
+            os.path.join(prefix_dir, annotaion_name),
+        )
 
 
 def calculate_score():