Fix OCR scaling

VikParuchuri · Oct 15, 2024 · 15f5f2d
1 parent 04d308e
commit 15f5f2d
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 9 deletions.
diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
@@ -1,4 +1,5 @@
 import tempfile
+from copy import deepcopy
 from itertools import repeat
 from typing import List, Optional, Dict
 
@@ -12,6 +13,7 @@
 from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
 from marker.ocr.lang import langs_to_ids
 from marker.pdf.images import render_image
+from marker.schema.bbox import rescale_bbox
 from marker.schema.page import Page
 from marker.schema.block import Block, Line, Span
 from marker.settings import settings
@@ -74,7 +76,7 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
 
     surya_langs = [langs] * len(page_idxs)
     detection_results = [p.text_lines.bboxes for p in selected_pages]
-    polygons = [[b.polygon for b in bboxes] for bboxes in detection_results]
+    polygons = deepcopy([[b.polygon for b in bboxes] for bboxes in detection_results])
 
     # Scale polygons to get correct image slices
     for poly in polygons:
@@ -85,12 +87,12 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
     results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier))
 
     new_pages = []
-    for (page_idx, result, old_page) in zip(page_idxs, results, selected_pages):
+    for idx, (page_idx, result, old_page) in enumerate(zip(page_idxs, results, selected_pages)):
         text_lines = old_page.text_lines
         ocr_results = result.text_lines
         blocks = []
         for i, line in enumerate(ocr_results):
-            scaled_bbox = [b / box_scale for b in line.bbox]
+            scaled_bbox = rescale_bbox([0, 0, images[idx].size[0], images[idx].size[1]], old_page.text_lines.image_bbox, line.bbox)
             block = Block(
                 bbox=scaled_bbox,
                 pnum=page_idx,
@@ -108,11 +110,10 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
                 )]
             )
             blocks.append(block)
-        scaled_image_bbox = [b / box_scale for b in result.image_bbox]
         page = Page(
             blocks=blocks,
             pnum=page_idx,
-            bbox=scaled_image_bbox,
+            bbox=old_page.text_lines.image_bbox,
             rotation=0,
             text_lines=text_lines,
             ocr_method="surya"

diff --git a/marker/tables/table.py b/marker/tables/table.py
@@ -35,7 +35,7 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
         highres_img = render_image(doc[pnum], dpi=settings.SURYA_TABLE_DPI)
 
         page_table_imgs = []
-        lowres_bbox = []
+        page_bboxes = []
 
         # Merge tables that are next to each other
         bbox = merge_tables(bbox)
@@ -47,10 +47,10 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
         for bb in bbox:
             highres_bb = rescale_bbox(page.layout.image_bbox, [0, 0, highres_img.size[0], highres_img.size[1]], bb)
             page_table_imgs.append(highres_img.crop(highres_bb))
-            lowres_bbox.append(highres_bb)
+            page_bboxes.append(highres_bb)
 
         table_imgs.extend(page_table_imgs)
-        table_bboxes.extend(lowres_bbox)
+        table_bboxes.extend(page_bboxes)
 
     table_idxs = [i for i, c in enumerate(table_counts) if c > 0]
     sel_text_lines = get_page_text_lines(
@@ -81,6 +81,7 @@ def format_tables(pages: List[Page], doc: PdfDocument, fname: str, detection_mod
     cells, needs_ocr = get_cells(table_imgs, table_boxes, img_sizes, table_text_lines, det_models, detect_boxes=settings.OCR_ALL_PAGES)
     tqdm.disable = False
 
+    # This will redo OCR if OCR is forced, since we need to redetect bounding boxes, etc.
     table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models)
     cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, img_sizes)]
     table_md = [formatter("markdown", cell)[0] for cell in cells]

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.2.17"
+version = "0.3.0"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <[email protected]>"]
 readme = "README.md"