From 5f6079faa4c3dc3fabb3e4e4301fe363c11cb0ac Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Tue, 5 Dec 2023 11:20:22 -0800
Subject: [PATCH] Add github action, debug settings

---
 .github/workflows/tests.yml        | 37 +++++++++++++++
 marker/cleaners/equations.py       | 19 +++++---
 marker/convert.py                  |  4 ++
 marker/debug/data.py               | 76 ++++++++++++++++++++++++++++++
 marker/settings.py                 |  5 +-
 scripts/verify_benchmark_scores.py | 20 ++++++++
 6 files changed, 154 insertions(+), 7 deletions(-)
 create mode 100644 .github/workflows/tests.yml
 create mode 100644 marker/debug/data.py
 create mode 100644 scripts/verify_benchmark_scores.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 00000000..1fc49429
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,37 @@
+name: Integration test with benchmark
+
+on: [push]
+
+env:
+  TESSDATA_PREFIX: "/usr/share/tesseract-ocr/5/tessdata"
+  TORCH_DEVICE: "cpu"
+  OCR_ENGINE: "tesseract" # So we don't have to install ghostscript, which takes a long time
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.12" # Quoted so YAML keeps it a string instead of parsing a float
+      - name: Install system dependencies
+        run: cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y
+      - name: Install tesseract 5
+        run: bash scripts/install/tesseract_5_install.sh
+      - name: Install python dependencies
+        run: |
+          pip install poetry
+          poetry install
+      - name: Download benchmark data # URL must stay quoted: a bare & would background wget and drop the id
+        run: |
+          wget "https://drive.google.com/uc?export=download&id=1ktVDYPEeyHlKLaF56FnHjI5VjVnYa1xL" -O benchmark_data.zip
+          unzip benchmark_data.zip
+      - name: Run benchmark test
+        run: |
+          poetry run python benchmark.py benchmark_data/pdfs benchmark_data/references report.json
+          poetry run python scripts/verify_benchmark_scores.py report.json
+        
+          
+
diff --git a/marker/cleaners/equations.py b/marker/cleaners/equations.py
index 9790bd79..e2cf325d 100644
--- a/marker/cleaners/equations.py
+++ b/marker/cleaners/equations.py
@@ -1,19 +1,18 @@
 import io
-from concurrent.futures import ThreadPoolExecutor
 from copy import deepcopy
 from functools import partial
 from typing import List
 
 import torch
 from nougat import NougatModel
-from nougat.postprocessing import close_envs, markdown_compatible
+from nougat.postprocessing import markdown_compatible
 from nougat.utils.checkpoint import get_checkpoint
 import re
 from PIL import Image, ImageDraw
-import fitz as pymupdf
 from nougat.utils.dataset import ImageDataset
 
-from marker.bbox import should_merge_blocks, merge_boxes, multiple_boxes_intersect
+from marker.bbox import should_merge_blocks, merge_boxes
+from marker.debug.data import dump_nougat_debug_data
 from marker.settings import settings
 from marker.schema import Page, Span, Line, Block, BlockType
 from nougat.utils.device import move_to_device
@@ -209,6 +208,7 @@ def get_bboxes_for_region(page, region):
 
 def replace_blocks_with_nougat_predictions(page_blocks: Page, merged_boxes, reformat_regions, predictions, pnum, nougat_model):
     new_blocks = []
+    converted_spans = []
     current_region = 0
     idx = 0
     success_count = 0
@@ -233,6 +233,7 @@ def replace_blocks_with_nougat_predictions(page_blocks: Page, merged_boxes, refo
         idx = reformat_regions[current_region][-1] + 1
         if not all(conditions):
             fail_count += 1
+            converted_spans.append(None)
             for i in reformat_regions[current_region]:
                 new_blocks.append(page_blocks.blocks[i])
         else:
@@ -250,13 +251,14 @@ def replace_blocks_with_nougat_predictions(page_blocks: Page, merged_boxes, refo
                 ],
                 bbox=merged_boxes[current_region]
             )
+            converted_spans.append(deepcopy(block_line.spans[0]))
             new_blocks.append(Block(
                 lines=[block_line],
                 bbox=merged_boxes[current_region],
                 pnum=pnum
             ))
         current_region += 1
-    return new_blocks, success_count, fail_count
+    return new_blocks, success_count, fail_count, converted_spans
 
 
 def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]], nougat_model, batch_size=settings.NOUGAT_BATCH_SIZE):
@@ -290,10 +292,11 @@ def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]
 
     # Replace blocks with predictions
     page_start = 0
+    converted_spans = []
     for page_idx, reformat_regions_page in enumerate(reformat_regions):
         page_predictions = predictions[page_start:page_start + len(reformat_regions_page)]
         page_boxes = merged_boxes[page_start:page_start + len(reformat_regions_page)]
-        new_page_blocks, success_count, fail_count = replace_blocks_with_nougat_predictions(
+        new_page_blocks, success_count, fail_count, converted_span = replace_blocks_with_nougat_predictions(
             blocks[page_idx],
             page_boxes,
             reformat_regions_page,
@@ -301,9 +304,13 @@ def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]
             page_idx,
             nougat_model
         )
+        converted_spans.extend(converted_span)
         blocks[page_idx].blocks = new_page_blocks
         page_start += len(reformat_regions_page)
         successful_ocr += success_count
         unsuccessful_ocr += fail_count
 
+    # If debug mode is on, dump out conversions for comparison
+    dump_nougat_debug_data(doc, images, converted_spans)
+
     return blocks, {"successful_ocr": successful_ocr, "unsuccessful_ocr": unsuccessful_ocr, "equations": eq_count}
\ No newline at end of file
diff --git a/marker/convert.py b/marker/convert.py
index ddf72dce..c56a1d5f 100644
--- a/marker/convert.py
+++ b/marker/convert.py
@@ -1,6 +1,7 @@
 import fitz as pymupdf
 
 from marker.cleaners.table import merge_table_blocks, create_new_tables
+from marker.debug.data import dump_bbox_debug_data
 from marker.extract_text import get_text_blocks
 from marker.cleaners.headers import filter_header_footer, filter_common_titles
 from marker.cleaners.equations import replace_equations
@@ -117,6 +118,9 @@ def convert_single_pdf(
 
     annotate_spans(blocks, block_types)
 
+    # Dump debug data if flags are set
+    dump_bbox_debug_data(doc, blocks)
+
     blocks = order_blocks(
         doc,
         blocks,
diff --git a/marker/debug/data.py b/marker/debug/data.py
new file mode 100644
index 00000000..a1c1e437
--- /dev/null
+++ b/marker/debug/data.py
@@ -0,0 +1,76 @@
+import base64
+import json
+import os
+import zlib
+from typing import List
+
+from marker.schema import Page
+from marker.settings import settings
+from PIL import Image
+import io
+
+
+def dump_nougat_debug_data(doc, images, converted_spans):
+    """If debug dumping is enabled, write converted equation images and text to <doc>_equations.json."""
+    if not settings.DEBUG or not settings.DEBUG_DATA_FOLDER:
+        return
+
+    # We attempted one conversion per image (None entries are skipped below)
+    assert len(converted_spans) == len(images)
+
+    data_lines = []
+    for image, converted_span in zip(images, converted_spans):
+        if converted_span is None:
+            continue
+        # Image is a BytesIO object; re-encode it as lossless WEBP for the dump
+        img_bytes = io.BytesIO()
+        Image.open(image).save(img_bytes, format="WEBP", lossless=True)
+        data_lines.append({
+            "image": base64.b64encode(img_bytes.getvalue()).decode("utf-8"),
+            "text": converted_span.text,
+            "bbox": converted_span.bbox
+        })
+
+    # Create the folder on first use; open() alone fails if it does not exist yet
+    os.makedirs(settings.DEBUG_DATA_FOLDER, exist_ok=True)
+    # Remove extension from doc name
+    doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]
+    debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
+    with open(debug_file, "w") as f:
+        json.dump(data_lines, f, indent=4)
+
+
+def dump_bbox_debug_data(doc, blocks: List[Page]):
+    """If debug dumping is enabled, write each page's block data plus a page render to <doc>_bbox.json."""
+    if not settings.DEBUG or not settings.DEBUG_DATA_FOLDER:
+        return
+
+    # Create the folder on first use; open() alone fails if it does not exist yet
+    os.makedirs(settings.DEBUG_DATA_FOLDER, exist_ok=True)
+    # Remove extension from doc name
+    doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]
+    debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
+
+    max_dimension = 6000  # Cap on either side of the rendered page image, in pixels
+    debug_data = []
+    for idx, page_blocks in enumerate(blocks):
+        pix = doc[idx].get_pixmap(dpi=settings.NOUGAT_DPI, annots=False, clip=page_blocks.bbox)
+        png_image = Image.open(io.BytesIO(pix.pil_tobytes(format="PNG")))
+        width, height = png_image.size
+        if width > max_dimension or height > max_dimension:
+            scaling_factor = min(max_dimension / width, max_dimension / height)
+            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter under its current name
+            png_image = png_image.resize((int(width * scaling_factor), int(height * scaling_factor)), Image.LANCZOS)
+
+        img_bytes = io.BytesIO()
+        png_image.save(img_bytes, format="WEBP", lossless=True, quality=100)
+
+        page_data = page_blocks.model_dump()
+        page_data["image"] = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
+        debug_data.append(page_data)
+
+    with open(debug_file, "w") as f:
+        json.dump(debug_data, f, indent=4)
+
+
+
diff --git a/marker/settings.py b/marker/settings.py
index cc9e438a..8701f5d9 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -13,7 +13,6 @@ class Settings(BaseSettings):
     TORCH_DEVICE: str = "cpu"
     INFERENCE_RAM: int = 40 # How much VRAM each GPU has (in GB).
     VRAM_PER_TASK: float = 2.5 # How much VRAM to allocate per task (in GB).  Peak marker VRAM usage is around 3GB, but avg across workers is lower.
-    DEBUG: bool = False # Enable debug logging
     DEFAULT_LANG: str = "English" # Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES
 
     SUPPORTED_FILETYPES: Dict = {
@@ -85,6 +84,10 @@ class Settings(BaseSettings):
     RAY_DASHBOARD_HOST: str = "127.0.0.1"
     RAY_CORES_PER_WORKER: int = 1 # How many cpu cores to allocate per worker
 
+    # Debug
+    DEBUG: bool = False # Enable debug logging
+    DEBUG_DATA_FOLDER: Optional[str] = None
+
     @computed_field
     @property
     def CUDA(self) -> bool:
diff --git a/scripts/verify_benchmark_scores.py b/scripts/verify_benchmark_scores.py
new file mode 100644
index 00000000..7cd679da
--- /dev/null
+++ b/scripts/verify_benchmark_scores.py
@@ -0,0 +1,20 @@
+import json
+import argparse
+
+
+def verify_scores(file_path):
+    """Raise ValueError unless both benchmark PDFs in the report at file_path score above 0.4."""
+    with open(file_path, 'r') as report_file:
+        report = json.load(report_file)
+
+    file_results = report["marker"]["files"]
+    scores = [file_results[name]["score"] for name in ("multicolcnn.pdf", "switch_trans.pdf")]
+    if any(score <= 0.4 for score in scores):
+        raise ValueError("One or more scores are below the required threshold of 0.4")
+
+
+if __name__ == "__main__":
+    # CLI entry point: the benchmark report path is the only positional argument.
+    cli = argparse.ArgumentParser(description="Verify benchmark scores")
+    cli.add_argument("file_path", type=str, help="Path to the json file")
+    verify_scores(cli.parse_args().file_path)