Merge pull request #37 from VikParuchuri/dev

Bug fix: Work with rotated pdfs
VikParuchuri · Dec 13, 2023 · 43abdf4 · 43abdf4
2 parents f7734fb + 844833f
commit 43abdf4
Show file tree

Hide file tree

Showing 6 changed files with 77 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -40,6 +40,10 @@ The above results are with marker and nougat setup so they each take ~3GB of VRA
 
 See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.
 
+# Community
+
+[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.
+
 # Limitations
 
 PDF is a tricky format, so marker will not always work perfectly.  Here are some known limitations that are on the roadmap to address:

diff --git a/marker/bbox.py b/marker/bbox.py
@@ -1,3 +1,5 @@
+import fitz as pymupdf
+
 def should_merge_blocks(box1, box2, tol=5):
     # Within tol y px, and to the right within tol px
     merge = [
@@ -58,4 +60,22 @@ def unnormalize_box(bbox, width, height):
         height * (bbox[1] / 1000),
         width * (bbox[2] / 1000),
         height * (bbox[3] / 1000),
-    ]
+    ]
+
+
+def correct_rotation(bbox, page):
+    # bbox format is (x0, y0, x1, y1)
+    rotation = page.rotation
+    if rotation == 0:
+        return bbox
+
+    tl = pymupdf.Point(bbox[0], bbox[1]) * page.rotation_matrix
+    br = pymupdf.Point(bbox[2], bbox[3]) * page.rotation_matrix
+    if rotation == 90:
+        bbox = [br[0], tl[1], tl[0], br[1]]
+    elif rotation == 180:
+        bbox = [br[0], br[1], tl[0], tl[1]]
+    elif rotation == 270:
+        bbox = [tl[0], br[1], br[0], tl[1]]
+
+    return bbox
diff --git a/marker/debug/data.py b/marker/debug/data.py
@@ -14,6 +14,9 @@ def dump_nougat_debug_data(doc, images, converted_spans):
     if not settings.DEBUG_DATA_FOLDER:
         return
 
+    if len(images) == 0:
+        return
+
     # We attempted one conversion per image
     assert len(converted_spans) == len(images)
 
@@ -37,7 +40,7 @@ def dump_nougat_debug_data(doc, images, converted_spans):
 
     debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
     with open(debug_file, "w+") as f:
-        json.dump(data_lines, f, indent=4)
+        json.dump(data_lines, f)
 
 
 def dump_bbox_debug_data(doc, blocks: List[Page]):
@@ -70,7 +73,7 @@ def dump_bbox_debug_data(doc, blocks: List[Page]):
         debug_data.append(page_data)
 
     with open(debug_file, "w+") as f:
-        json.dump(debug_data, f, indent=4)
+        json.dump(debug_data, f)
 
 
 
diff --git a/marker/extract_text.py b/marker/extract_text.py
@@ -3,6 +3,7 @@
 
 from spellchecker import SpellChecker
 
+from marker.bbox import correct_rotation
 from marker.ocr.page import ocr_entire_page
 from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
 from marker.settings import settings
@@ -12,8 +13,27 @@
 os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
 
 
+def sort_rotated_text(page_blocks, tolerance=1.25):
+    vertical_groups = {}
+    for block in page_blocks:
+        group_key = round(block.bbox[1] / tolerance) * tolerance
+        if group_key not in vertical_groups:
+            vertical_groups[group_key] = []
+        vertical_groups[group_key].append(block)
+
+    # Sort each group horizontally and flatten the groups into a single list
+    sorted_page_blocks = []
+    for _, group in sorted(vertical_groups.items()):
+        sorted_group = sorted(group, key=lambda x: x.bbox[0])
+        sorted_page_blocks.extend(sorted_group)
+
+    return sorted_page_blocks
+
+
 def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], int]:
     page = doc[pnum]
+    rotation = page.rotation
+
     if ocr:
         blocks = ocr_entire_page(page, tess_lang, spellchecker)
     else:
@@ -30,7 +50,7 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona
                 bbox = s["bbox"]
                 span_obj = Span(
                     text=block_text,
-                    bbox=bbox,
+                    bbox=correct_rotation(bbox, page),
                     span_id=f"{pnum}_{span_id}",
                     font=f"{s['font']}_{font_flags_decomposer(s['flags'])}", # Add font flags to end of font
                     color=s["color"],
@@ -41,19 +61,23 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona
                 span_id += 1
             line_obj = Line(
                 spans=spans,
-                bbox=l["bbox"]
+                bbox=correct_rotation(l["bbox"], page),
             )
             # Only select valid lines, with positive bboxes
             if line_obj.area > 0:
                 block_lines.append(line_obj)
         block_obj = Block(
             lines=block_lines,
-            bbox=block["bbox"],
+            bbox=correct_rotation(block["bbox"], page),
             pnum=pnum
         )
         # Only select blocks with at least one line
         if len(block_lines) > 0:
             page_blocks.append(block_obj)
+
+    # If the page was rotated, sort the text again
+    if rotation > 0:
+        page_blocks = sort_rotated_text(page_blocks)
     return page_blocks
 
 
@@ -80,8 +104,9 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no
         not disable_ocr
     ]
     if all(conditions) or settings.OCR_ALL_PAGES:
+        page = doc[pnum]
         blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker, ocr=True)
-        page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
+        page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
         ocr_pages = 1
         if len(blocks) == 0:
             ocr_failed = 1

diff --git a/marker/schema.py b/marker/schema.py
@@ -1,5 +1,5 @@
 from collections import Counter
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 from pydantic import BaseModel, field_validator
 import ftfy
@@ -20,7 +20,6 @@ def find_span_type(span, page_blocks):
 class BboxElement(BaseModel):
     bbox: List[float]
 
-
     @field_validator('bbox')
     @classmethod
     def check_4_elements(cls, v: List[float]) -> List[float]:
@@ -134,6 +133,7 @@ class Page(BboxElement):
     blocks: List[Block]
     pnum: int
     column_count: Optional[int] = None
+    rotation: Optional[int] = None # Rotation degrees of the page
 
     def get_nonblank_lines(self):
         lines = self.get_all_lines()

diff --git a/marker/settings.py b/marker/settings.py
@@ -54,8 +54,22 @@ class Settings(BaseSettings):
     # Nougat model
     NOUGAT_MODEL_MAX: int = 512 # Max inference length for nougat
     NOUGAT_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for nougat
-    NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
-                                  "\par\par\par", "## Chapter", "Fig.", "particle", "[REPEATS]", "[TRUNCATED]", "### ", "effective field strength", "\Phi_{\rm eff}"]
+    NOUGAT_HALLUCINATION_WORDS: List[str] = [
+        "[MISSING_PAGE_POST]",
+        "## References\n",
+        "**Figure Captions**\n",
+        "Footnote",
+        "\par\par\par",
+        "## Chapter",
+        "Fig.",
+        "particle",
+        "[REPEATS]",
+        "[TRUNCATED]",
+        "### ",
+        "effective field strength",
+        "\Phi_{\rm eff}",
+        "\mathbf{\mathbf"
+    ]
     NOUGAT_DPI: int = 96 # DPI to render images at, matches default settings for nougat
     NOUGAT_MODEL_NAME: str = "0.1.0-small" # Name of the model to use
     NOUGAT_BATCH_SIZE: int = 6 if TORCH_DEVICE == "cuda" else 1 # Batch size for nougat, don't batch on cpu