From 5f6079faa4c3dc3fabb3e4e4301fe363c11cb0ac Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Tue, 5 Dec 2023 11:20:22 -0800
Subject: [PATCH] Add github action, debug settings

---
 .github/workflows/tests.yml        | 37 +++++++++++++++
 marker/cleaners/equations.py       | 19 +++++---
 marker/convert.py                  |  4 ++
 marker/debug/data.py               | 76 ++++++++++++++++++++++++++++++
 marker/settings.py                 |  5 +-
 scripts/verify_benchmark_scores.py | 20 ++++++++
 6 files changed, 154 insertions(+), 7 deletions(-)
 create mode 100644 .github/workflows/tests.yml
 create mode 100644 marker/debug/data.py
 create mode 100644 scripts/verify_benchmark_scores.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 00000000..1fc49429
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,37 @@
+name: Integration test with benchmark
+
+on: [push]
+
+env:
+  TESSDATA_PREFIX: "/usr/share/tesseract-ocr/5/tessdata"
+  TORCH_DEVICE: "cpu"
+  OCR_ENGINE: "tesseract" # So we don't have to install ghostscript, which takes a long time
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+      - name: Install system dependencies
+        run: cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y
+      - name: Install tesseract 5
+        run: bash scripts/install/tesseract_5_install.sh
+      - name: Install python dependencies
+        run: |
+          pip install poetry
+          poetry install
+      - name: Download benchmark data
+        run: |
+          wget "https://drive.google.com/uc?export=download&id=1ktVDYPEeyHlKLaF56FnHjI5VjVnYa1xL" -O benchmark_data.zip
+          unzip benchmark_data.zip
+      - name: Run benchmark test
+        run: |
+          poetry run python benchmark.py benchmark_data/pdfs benchmark_data/references report.json
+          poetry run python scripts/verify_benchmark_scores.py report.json
+
+
+
diff --git a/marker/cleaners/equations.py b/marker/cleaners/equations.py
index 9790bd79..e2cf325d 100644
--- 
a/marker/cleaners/equations.py +++ b/marker/cleaners/equations.py @@ -1,19 +1,18 @@ import io -from concurrent.futures import ThreadPoolExecutor from copy import deepcopy from functools import partial from typing import List import torch from nougat import NougatModel -from nougat.postprocessing import close_envs, markdown_compatible +from nougat.postprocessing import markdown_compatible from nougat.utils.checkpoint import get_checkpoint import re from PIL import Image, ImageDraw -import fitz as pymupdf from nougat.utils.dataset import ImageDataset -from marker.bbox import should_merge_blocks, merge_boxes, multiple_boxes_intersect +from marker.bbox import should_merge_blocks, merge_boxes +from marker.debug.data import dump_nougat_debug_data from marker.settings import settings from marker.schema import Page, Span, Line, Block, BlockType from nougat.utils.device import move_to_device @@ -209,6 +208,7 @@ def get_bboxes_for_region(page, region): def replace_blocks_with_nougat_predictions(page_blocks: Page, merged_boxes, reformat_regions, predictions, pnum, nougat_model): new_blocks = [] + converted_spans = [] current_region = 0 idx = 0 success_count = 0 @@ -233,6 +233,7 @@ def replace_blocks_with_nougat_predictions(page_blocks: Page, merged_boxes, refo idx = reformat_regions[current_region][-1] + 1 if not all(conditions): fail_count += 1 + converted_spans.append(None) for i in reformat_regions[current_region]: new_blocks.append(page_blocks.blocks[i]) else: @@ -250,13 +251,14 @@ def replace_blocks_with_nougat_predictions(page_blocks: Page, merged_boxes, refo ], bbox=merged_boxes[current_region] ) + converted_spans.append(deepcopy(block_line.spans[0])) new_blocks.append(Block( lines=[block_line], bbox=merged_boxes[current_region], pnum=pnum )) current_region += 1 - return new_blocks, success_count, fail_count + return new_blocks, success_count, fail_count, converted_spans def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]], nougat_model, 
batch_size=settings.NOUGAT_BATCH_SIZE): @@ -290,10 +292,11 @@ def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType] # Replace blocks with predictions page_start = 0 + converted_spans = [] for page_idx, reformat_regions_page in enumerate(reformat_regions): page_predictions = predictions[page_start:page_start + len(reformat_regions_page)] page_boxes = merged_boxes[page_start:page_start + len(reformat_regions_page)] - new_page_blocks, success_count, fail_count = replace_blocks_with_nougat_predictions( + new_page_blocks, success_count, fail_count, converted_span = replace_blocks_with_nougat_predictions( blocks[page_idx], page_boxes, reformat_regions_page, @@ -301,9 +304,13 @@ def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType] page_idx, nougat_model ) + converted_spans.extend(converted_span) blocks[page_idx].blocks = new_page_blocks page_start += len(reformat_regions_page) successful_ocr += success_count unsuccessful_ocr += fail_count + # If debug mode is on, dump out conversions for comparison + dump_nougat_debug_data(doc, images, converted_spans) + return blocks, {"successful_ocr": successful_ocr, "unsuccessful_ocr": unsuccessful_ocr, "equations": eq_count} \ No newline at end of file diff --git a/marker/convert.py b/marker/convert.py index ddf72dce..c56a1d5f 100644 --- a/marker/convert.py +++ b/marker/convert.py @@ -1,6 +1,7 @@ import fitz as pymupdf from marker.cleaners.table import merge_table_blocks, create_new_tables +from marker.debug.data import dump_bbox_debug_data from marker.extract_text import get_text_blocks from marker.cleaners.headers import filter_header_footer, filter_common_titles from marker.cleaners.equations import replace_equations @@ -117,6 +118,9 @@ def convert_single_pdf( annotate_spans(blocks, block_types) + # Dump debug data if flags are set + dump_bbox_debug_data(doc, blocks) + blocks = order_blocks( doc, blocks, diff --git a/marker/debug/data.py b/marker/debug/data.py new file mode 
100644
index 00000000..a1c1e437
--- /dev/null
+++ b/marker/debug/data.py
@@ -0,0 +1,76 @@
+import base64
+import json
+import os
+import zlib
+from typing import List
+
+from marker.schema import Page
+from marker.settings import settings
+from PIL import Image
+import io
+
+
+def dump_nougat_debug_data(doc, images, converted_spans):
+    if not settings.DEBUG or not settings.DEBUG_DATA_FOLDER:
+        return
+
+    # We attempted one conversion per image
+    assert len(converted_spans) == len(images)
+
+    data_lines = []
+    for idx, (image, converted_span) in enumerate(zip(images, converted_spans)):
+        if converted_span is None:
+            continue
+        # Image is a BytesIO object
+        pil_image = Image.open(image)
+        img_bytes = io.BytesIO()
+        pil_image.save(img_bytes, format="WEBP", lossless=True)
+        b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
+        data_lines.append({
+            "image": b64_image,
+            "text": converted_span.text,
+            "bbox": converted_span.bbox
+        })
+
+    # Remove extension from doc name
+    doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]
+
+    debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
+    with open(debug_file, "w+") as f:
+        json.dump(data_lines, f, indent=4)
+
+
+def dump_bbox_debug_data(doc, blocks: List[Page]):
+    if not settings.DEBUG or not settings.DEBUG_DATA_FOLDER:
+        return
+
+    # Remove extension from doc name
+    doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]
+
+    debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
+    debug_data = []
+    for idx, page_blocks in enumerate(blocks):
+        page = doc[idx]
+
+        pix = page.get_pixmap(dpi=settings.NOUGAT_DPI, annots=False, clip=page_blocks.bbox)
+        png = pix.pil_tobytes(format="PNG")
+        png_image = Image.open(io.BytesIO(png))
+        width, height = png_image.size
+        max_dimension = 6000
+        if width > max_dimension or height > max_dimension:
+            scaling_factor = min(max_dimension / width, max_dimension / height)
+            png_image = png_image.resize((int(width * scaling_factor), int(height * scaling_factor)), Image.LANCZOS)
+
+        img_bytes = io.BytesIO()
+        png_image.save(img_bytes, format="WEBP", lossless=True, quality=100)
+        b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
+
+        page_data = page_blocks.model_dump()
+        page_data["image"] = b64_image
+        debug_data.append(page_data)
+
+    with open(debug_file, "w+") as f:
+        json.dump(debug_data, f, indent=4)
+
+
+
diff --git a/marker/settings.py b/marker/settings.py
index cc9e438a..8701f5d9 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -13,7 +13,6 @@ class Settings(BaseSettings):
     TORCH_DEVICE: str = "cpu"
     INFERENCE_RAM: int = 40 # How much VRAM each GPU has (in GB).
     VRAM_PER_TASK: float = 2.5 # How much VRAM to allocate per task (in GB). Peak marker VRAM usage is around 3GB, but avg across workers is lower.
-    DEBUG: bool = False # Enable debug logging
     DEFAULT_LANG: str = "English" # Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES
 
     SUPPORTED_FILETYPES: Dict = {
@@ -85,6 +84,10 @@ class Settings(BaseSettings):
     RAY_DASHBOARD_HOST: str = "127.0.0.1"
     RAY_CORES_PER_WORKER: int = 1 # How many cpu cores to allocate per worker
 
+    # Debug
+    DEBUG: bool = False # Enable debug logging
+    DEBUG_DATA_FOLDER: Optional[str] = None
+
     @computed_field
     @property
     def CUDA(self) -> bool:
diff --git a/scripts/verify_benchmark_scores.py b/scripts/verify_benchmark_scores.py
new file mode 100644
index 00000000..7cd679da
--- /dev/null
+++ b/scripts/verify_benchmark_scores.py
@@ -0,0 +1,20 @@
+import json
+import argparse
+
+
+def verify_scores(file_path):
+    with open(file_path, 'r') as file:
+        data = json.load(file)
+
+    multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
+    switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
+
+    if multicolcnn_score <= 0.4 or switch_trans_score <= 0.4:
+        raise ValueError("One or more scores are below the required threshold of 0.4")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Verify benchmark scores")
+    parser.add_argument("file_path", type=str, help="Path to the json file")
+    args = parser.parse_args()
+    verify_scores(args.file_path)