From 15f5f2df96630bc1c3f22b209824e381ac0bf2f8 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 15 Oct 2024 15:29:44 -0400 Subject: [PATCH] Fix OCR scaling --- marker/ocr/recognition.py | 11 ++++++----- marker/tables/table.py | 7 ++++--- pyproject.toml | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py index f5ceee50..0ebfe060 100644 --- a/marker/ocr/recognition.py +++ b/marker/ocr/recognition.py @@ -1,4 +1,5 @@ import tempfile +from copy import deepcopy from itertools import repeat from typing import List, Optional, Dict @@ -12,6 +13,7 @@ from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr from marker.ocr.lang import langs_to_ids from marker.pdf.images import render_image +from marker.schema.bbox import rescale_bbox from marker.schema.page import Page from marker.schema.block import Block, Line, Span from marker.settings import settings @@ -74,7 +76,7 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P surya_langs = [langs] * len(page_idxs) detection_results = [p.text_lines.bboxes for p in selected_pages] - polygons = [[b.polygon for b in bboxes] for bboxes in detection_results] + polygons = deepcopy([[b.polygon for b in bboxes] for bboxes in detection_results]) # Scale polygons to get correct image slices for poly in polygons: @@ -85,12 +87,12 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier)) new_pages = [] - for (page_idx, result, old_page) in zip(page_idxs, results, selected_pages): + for idx, (page_idx, result, old_page) in enumerate(zip(page_idxs, results, selected_pages)): text_lines = old_page.text_lines ocr_results = result.text_lines blocks = [] for i, line in enumerate(ocr_results): - scaled_bbox = [b / box_scale for b in line.bbox] + scaled_bbox = rescale_bbox([0, 0, images[idx].size[0], images[idx].size[1]], old_page.text_lines.image_bbox, line.bbox) block = Block( bbox=scaled_bbox, pnum=page_idx, @@ -108,11 +110,10 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P )] ) blocks.append(block) - scaled_image_bbox = [b / box_scale for b in result.image_bbox] page = Page( blocks=blocks, pnum=page_idx, - bbox=scaled_image_bbox, + bbox=old_page.text_lines.image_bbox, rotation=0, text_lines=text_lines, ocr_method="surya" diff --git a/marker/tables/table.py b/marker/tables/table.py index 172ef7e7..f9b1ebcb 100644 --- a/marker/tables/table.py +++ b/marker/tables/table.py @@ -35,7 +35,7 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname): highres_img = render_image(doc[pnum], dpi=settings.SURYA_TABLE_DPI) page_table_imgs = [] - lowres_bbox = [] + page_bboxes = [] # Merge tables that are next to each other bbox = merge_tables(bbox) @@ -47,10 +47,10 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname): for bb in bbox: highres_bb = rescale_bbox(page.layout.image_bbox, [0, 0, highres_img.size[0], highres_img.size[1]], bb) page_table_imgs.append(highres_img.crop(highres_bb)) - lowres_bbox.append(highres_bb) + page_bboxes.append(highres_bb) table_imgs.extend(page_table_imgs) - table_bboxes.extend(lowres_bbox) + table_bboxes.extend(page_bboxes) table_idxs = [i for i, c in enumerate(table_counts) if c > 0] sel_text_lines = get_page_text_lines( @@ -81,6 +81,7 @@ def format_tables(pages: List[Page], doc: PdfDocument, fname: str, detection_mod cells, needs_ocr = get_cells(table_imgs, table_boxes, img_sizes, table_text_lines, det_models, detect_boxes=settings.OCR_ALL_PAGES) tqdm.disable = False + # This will redo OCR if OCR is forced, since we need to redetect bounding boxes, etc. table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models) cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, img_sizes)] table_md = [formatter("markdown", cell)[0] for cell in cells] diff --git a/pyproject.toml b/pyproject.toml index 8e2189b7..85c95816 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "0.2.17" +version = "0.3.0" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md"