diff --git a/tools/benchmark.py b/tools/benchmark.py
deleted file mode 100644
index 5efebe98..00000000
--- a/tools/benchmark.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import zipfile
-import os
-import shutil
-import json
-import markdown_calculate
-code_path = os.environ.get('GITHUB_WORKSPACE')
-# dataset storage path
-pdf_dev_path = "/share/quyuan/mineru/data/"
-# final magicpdf results
-pdf_res_path = "/share/quyuan/mineru/data/mineru"
-file_types = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
-def test_cli():
-    # magicpdf model output
-    magicpdf_path = os.path.join(pdf_dev_path, "output")
-    rm_cmd = "rm -rf %s" % (pdf_res_path)
-    os.system(rm_cmd)
-    os.makedirs(pdf_res_path)
-    cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, magicpdf_path)
-    os.system(cmd)
-    for root, dirs, files in os.walk(pdf_res_path):
-        for magic_file in files:
-            for file_type in file_types:
-                target_dir = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf")
-                if magic_file.endswith(".md") and magic_file.startswith(file_type):
-                    source_file = os.path.join(root, magic_file)
-                    target_file = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf", magic_file)
-                    if not os.path.exists(target_dir):
-                        os.makedirs(target_dir)
-                    shutil.copy(source_file, target_file)
-
-def calculate_score():
-    data_path = os.path.join(pdf_dev_path, "ci")
-    cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name annotations --download_dir %s" % (code_path, data_path)
-    os.system(cmd)
-    cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
-    os.system(cmd)
-    score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
-    score.calculate_similarity_total("magicpdf", file_types, data_path)
-    res = score.summary_scores()
-    return res
-
-
-def extrat_zip(zip_file_path, extract_to_path):
-    if zipfile.is_zipfile(zip_file_path):
-        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
-            zip_ref.extractall(extract_to_path)
-        print(f'Files extracted to {extract_to_path}')
-    else:
-        print(f'{zip_file_path} is not a zip file')
-
-
-def ci_ben():
-    fr = open(os.path.join(pdf_dev_path, "ci", "result.json"), "r")
-    lines = fr.readlines()
-    last_line = lines[-1].strip()
-    last_score = json.loads(last_line)
-    print ("last_score:", last_score)
-    last_simscore = last_score["average_sim_score"]
-    last_editdistance = last_score["average_edit_distance"]
-    last_bleu = last_score["average_bleu_score"]
-    extrat_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path))
-    test_cli()
-    now_score = calculate_score()
-    print ("now_score:", now_score)
-    now_simscore = now_score["average_sim_score"]
-    now_editdistance = now_score["average_edit_distance"]
-    now_bleu = now_score["average_bleu_score"]
-    assert last_simscore <= now_simscore
-    assert last_editdistance <= now_editdistance
-    assert last_bleu <= now_bleu
-
-
-if __name__ == "__main__":
-    ci_ben()
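
For context, the deleted ci_ben() gate reads the last JSON line of result.json as the baseline and fails the run if any averaged metric drops. A minimal standalone sketch of that comparison, assuming the same result.json layout and metric keys; the helper name compare_with_baseline is hypothetical:

    import json

    def compare_with_baseline(result_json_path, now_score):
        # The baseline is the last JSON object appended to result.json by a previous run.
        with open(result_json_path, "r", encoding="utf-8") as fr:
            baseline = json.loads(fr.readlines()[-1].strip())
        # Mirror the three assertions in ci_ben(): each averaged metric must be at least the baseline value.
        for key in ("average_sim_score", "average_edit_distance", "average_bleu_score"):
            assert baseline[key] <= now_score[key], f"metric regressed: {key}"
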
diff --git a/tools/clean_photo.py b/tools/clean_photo.py
deleted file mode 100644
index e8504595..00000000
--- a/tools/clean_photo.py
+++ /dev/null
@@ -1,112 +0,0 @@
-import pypandoc
-import re
-import htmltabletomd
-import os
-import argparse
-import zipfile
-
-parser = argparse.ArgumentParser(description="get tool type")
-parser.add_argument(
-    "--tool_name",
-    type=str,
-    required=True,
-    help="input tool name",
-)
-parser.add_argument(
-    "--download_dir",
-    type=str,
-    required=True,
-    help="input download dir",
-)
-args = parser.parse_args()
-
-def clean_markdown_images(content):
-    pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)
-    cleaned_content = pattern.sub('', content)
-    return cleaned_content
-
-def clean_ocrmath_photo(content):
-    pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)
-    cleaned_content = pattern.sub('', content)
-    return cleaned_content
-
-def convert_html_table_to_md(html_table):
-    lines = html_table.strip().split('\n')
-    md_table = ''
-    if lines and '<table>' in lines[0]:
-        in_thead = True
-        for line in lines:
-            if '<th>' in line:
-                cells = re.findall(r'<th>(.*?)</th>', line)
-                md_table += '| ' + ' | '.join(cells) + ' |\n'
-                in_thead = False
-            elif '<td>' in line and not in_thead:
-                cells = re.findall(r'<td>(.*?)</td>', line)
-                md_table += '| ' + ' | '.join(cells) + ' |\n'
-        md_table = md_table.rstrip() + '\n'
-    return md_table
-
-def convert_latext_to_md(content):
-    tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)
-    placeholders = []
-    for table in tables:
-        placeholder = f"<table_{len(placeholders)}>"
-        replace_str = f"\\begin{{tabular}}{table}\\end{{tabular}}"
-        content = content.replace(replace_str, placeholder)
-        try:
-            pypandoc.convert_text(replace_str, format="latex", to="md", outputfile="output.md", encoding="utf-8")
-        except:
-            markdown_string = replace_str
-        else:
-            markdown_string = open('output.md', 'r', encoding='utf-8').read()
-        placeholders.append((placeholder, markdown_string))
-    new_content = content
-    for placeholder, md_table in placeholders:
-        new_content = new_content.replace(placeholder, md_table)
-    # write back to file
-    return new_content
-
-
-def convert_htmltale_to_md(content):
-    tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)
-    placeholders = []
-    for table in tables:
-        placeholder = f"<table_{len(placeholders)}>"
-        content = content.replace(f"<table>{table}</table>", placeholder)
-        try:
-            convert_table = htmltabletomd.convert_table(table)
-        except:
-            convert_table = table
-        placeholders.append((placeholder, convert_table))
-    new_content = content
-    for placeholder, md_table in placeholders:
-        new_content = new_content.replace(placeholder, md_table)
-    # write back to file
-    return new_content
-
-def clean_data(prod_type, download_dir):
-    file_type = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
-    for filetype in file_type:
-        tgt_dir = os.path.join(download_dir, filetype, prod_type, "cleaned")
-        if not os.path.exists(tgt_dir):
-            os.makedirs(tgt_dir)
-        source_dir = os.path.join(download_dir, filetype, prod_type)
-        filenames = os.listdir(source_dir)
-        for filename in filenames:
-            if filename.endswith('.md'):
-                input_file = os.path.join(source_dir, filename)
-                output_file = os.path.join(tgt_dir, "cleaned_" + filename)
-                with open(input_file, 'r', encoding='utf-8') as fr:
-                    content = fr.read()
-                    new_content = convert_htmltale_to_md(content)
-                    new_content = clean_markdown_images(new_content)
-                    new_content = clean_ocrmath_photo(new_content)
-                    new_content = convert_latext_to_md(new_content)
-                    with open(output_file, 'w', encoding='utf-8') as fw:
-                        fw.write(new_content)
-
-
-if __name__ == '__main__':
-    tool_type = args.tool_name
-    download_dir = args.download_dir
-    clean_data(tool_type, download_dir)
diff --git a/tools/markdown_calculate.py b/tools/markdown_calculate.py
deleted file mode 100644
index 08c1d337..00000000
--- a/tools/markdown_calculate.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import os
-from Levenshtein import distance
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
-from nltk.tokenize import word_tokenize
-import json
-import re
-import scoring
-import argparse
-import nltk
-nltk.download('punkt')
-# initialize lists to store edit distances and BLEU scores
-class Scoring:
-    def __init__(self, result_path):
-        self.edit_distances = []
-        self.bleu_scores = []
-        self.sim_scores = []
-        self.filenames = []
-        self.score_dict = {}
-        self.anntion_cnt = 0
-        self.fw = open(result_path, "w+")
-    def simple_bleu_score(self, candidate, reference):
-        candidate_tokens = word_tokenize(candidate)
-        reference_tokens = word_tokenize(reference)
-        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1)
-
-
-    def preprocess_string(self, s):
-        sub_enter = re.sub(r'\n+', '\n', s)
-        return re.sub(r' ', ' ', sub_enter)
-
-    def calculate_similarity(self, annotion, actual, tool_type):
-        class_dict = {}
-        edit_distances = []
-        bleu_scores = []
-        sim_scores = list()
-        total_file = 0
-        for filename in os.listdir(annotion):
-            if filename.endswith('.md') and not filename.startswith('.'):  # ignore hidden files
-                total_file = total_file + 1
-                # read the reference file from the annotation directory
-                with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:
-                    content_a = file_a.read()
-                self.anntion_cnt = self.anntion_cnt + 1
-                filepath_b = os.path.join(actual, filename)
-                if os.path.exists(filepath_b):
-                    with open(filepath_b, 'r', encoding='utf-8') as file_b:
-                        content_b = file_b.read()
-                        self.filenames.append(filename)
-                        # compute edit distance
-                        edit_dist = distance(self.preprocess_string(content_b), self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
-                        self.edit_distances.append(edit_dist)
-                        edit_distances.append(edit_dist)
-                        # compute BLEU score
-                        bleu_score = self.simple_bleu_score(content_b, content_a)
-                        bleu_scores.append(bleu_score)
-                        self.bleu_scores.append(bleu_score)
-                        # compute marker similarity score
-                        score = scoring.score_text(content_b, content_a)
-                        sim_scores.append(score)
-                        self.sim_scores.append(score)
-                        class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
-                        self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
-                else:
-                    print(f"File {filename} not found in actual directory.")
-        # compute per-class averages
-        class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
-        class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
-        class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
-        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
-        ratio = len(class_dict)/total_file
-        self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
-        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
-        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
-        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
-
-        print (f"{tool_type} extract ratio: {ratio}")
-        print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
-        print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
-        print (f"{tool_type} Average Sim Score: {class_average_sim_score}")
-        return self.score_dict
-
-    def summary_scores(self):
-        # compute overall averages
-        over_all_dict = dict()
-        average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0
-        average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0
-        average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
-        over_all_dict["average_edit_distance"] = average_edit_distance
-        over_all_dict["average_bleu_score"] = average_bleu_score
-        over_all_dict["average_sim_score"] = average_sim_score
-        self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
-        return over_all_dict
-
-    def calculate_similarity_total(self, tool_type, file_types, download_dir):
-        for file_type in file_types:
-            annotion = os.path.join(download_dir, file_type, "annotations", "cleaned")
-            actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
-            self.calculate_similarity(annotion, actual, file_type)
-
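
The Scoring class above is driven by benchmark.calculate_score(); a minimal driver sketch, assuming cleaned markdown already exists under <download_dir>/<file_type>/{annotations,magicpdf}/cleaned. The file_types list is trimmed and the paths are placeholders:

    import markdown_calculate

    file_types = ["academic_literature", "research_report"]  # subset for illustration
    score = markdown_calculate.Scoring("/path/to/ci/result.json")  # per-file and per-class scores get written here
    score.calculate_similarity_total("magicpdf", file_types, "/path/to/ci")
    print(score.summary_scores())  # {"average_edit_distance": ..., "average_bleu_score": ..., "average_sim_score": ...}
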
diff --git a/tools/scoring.py b/tools/scoring.py
deleted file mode 100644
index 64c74923..00000000
--- a/tools/scoring.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import math
-
-from rapidfuzz import fuzz
-import re
-import regex
-from statistics import mean
-
-CHUNK_MIN_CHARS = 25
-
-def chunk_text(text, chunk_len=500):
-    chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)]
-    chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
-    return chunks
-
-
-def overlap_score(hypothesis_chunks, reference_chunks):
-    if len(reference_chunks) > 0:
-        length_modifier = len(hypothesis_chunks) / len(reference_chunks)
-    else:
-        length_modifier = 0
-    search_distance = max(len(reference_chunks) // 5, 10)
-    chunk_scores = []
-    for i, hyp_chunk in enumerate(hypothesis_chunks):
-        max_score = 0
-        total_len = 0
-        i_offset = int(i * length_modifier)
-        chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
-        for j in chunk_range:
-            ref_chunk = reference_chunks[j]
-            score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
-            if score > max_score:
-                max_score = score
-                total_len = len(ref_chunk)
-        chunk_scores.append(max_score)
-    return chunk_scores
-
-
-def score_text(hypothesis, reference):
-    # Returns a 0-1 alignment score
-    hypothesis_chunks = chunk_text(hypothesis)
-    reference_chunks = chunk_text(reference)
-    chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
-    if len(chunk_scores) > 0:
-        mean_score = mean(chunk_scores)
-        return mean_score
-    else:
-        return 0
-    #return mean(chunk_scores)
\ No newline at end of file
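
score_text() splits both strings into 500-character chunks and fuzzy-matches each hypothesis chunk against nearby reference chunks with rapidfuzz, so it rewards locally aligned text rather than exact equality. A quick usage sketch with made-up strings:

    import scoring

    hypothesis = "The quick brown fox jumps over the lazy dog. " * 30
    reference = "The quick brown fox jumped over a lazy dog. " * 30
    print(scoring.score_text(hypothesis, reference))  # float in [0, 1]; higher means closer alignment
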