Skip to content

Commit

Permalink
Fix OCR scaling
Browse files (browse the repository at this point in the history)
  • Loading branch information
VikParuchuri committed Oct 15, 2024
1 parent 04d308e commit 15f5f2d
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 9 deletions.
11 changes: 6 additions & 5 deletions marker/ocr/recognition.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import tempfile
from copy import deepcopy
from itertools import repeat
from typing import List, Optional, Dict

Expand All @@ -12,6 +13,7 @@
from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
from marker.ocr.lang import langs_to_ids
from marker.pdf.images import render_image
from marker.schema.bbox import rescale_bbox
from marker.schema.page import Page
from marker.schema.block import Block, Line, Span
from marker.settings import settings
Expand Down Expand Up @@ -74,7 +76,7 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P

surya_langs = [langs] * len(page_idxs)
detection_results = [p.text_lines.bboxes for p in selected_pages]
polygons = [[b.polygon for b in bboxes] for bboxes in detection_results]
polygons = deepcopy([[b.polygon for b in bboxes] for bboxes in detection_results])

# Scale polygons to get correct image slices
for poly in polygons:
Expand All @@ -85,12 +87,12 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier))

new_pages = []
for (page_idx, result, old_page) in zip(page_idxs, results, selected_pages):
for idx, (page_idx, result, old_page) in enumerate(zip(page_idxs, results, selected_pages)):
text_lines = old_page.text_lines
ocr_results = result.text_lines
blocks = []
for i, line in enumerate(ocr_results):
scaled_bbox = [b / box_scale for b in line.bbox]
scaled_bbox = rescale_bbox([0, 0, images[idx].size[0], images[idx].size[1]], old_page.text_lines.image_bbox, line.bbox)
block = Block(
bbox=scaled_bbox,
pnum=page_idx,
Expand All @@ -108,11 +110,10 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
)]
)
blocks.append(block)
scaled_image_bbox = [b / box_scale for b in result.image_bbox]
page = Page(
blocks=blocks,
pnum=page_idx,
bbox=scaled_image_bbox,
bbox=old_page.text_lines.image_bbox,
rotation=0,
text_lines=text_lines,
ocr_method="surya"
Expand Down
7 changes: 4 additions & 3 deletions marker/tables/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
highres_img = render_image(doc[pnum], dpi=settings.SURYA_TABLE_DPI)

page_table_imgs = []
lowres_bbox = []
page_bboxes = []

# Merge tables that are next to each other
bbox = merge_tables(bbox)
Expand All @@ -47,10 +47,10 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
for bb in bbox:
highres_bb = rescale_bbox(page.layout.image_bbox, [0, 0, highres_img.size[0], highres_img.size[1]], bb)
page_table_imgs.append(highres_img.crop(highres_bb))
lowres_bbox.append(highres_bb)
page_bboxes.append(highres_bb)

table_imgs.extend(page_table_imgs)
table_bboxes.extend(lowres_bbox)
table_bboxes.extend(page_bboxes)

table_idxs = [i for i, c in enumerate(table_counts) if c > 0]
sel_text_lines = get_page_text_lines(
Expand Down Expand Up @@ -81,6 +81,7 @@ def format_tables(pages: List[Page], doc: PdfDocument, fname: str, detection_mod
cells, needs_ocr = get_cells(table_imgs, table_boxes, img_sizes, table_text_lines, det_models, detect_boxes=settings.OCR_ALL_PAGES)
tqdm.disable = False

# This will redo OCR if OCR is forced, since we need to redetect bounding boxes, etc.
table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models)
cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, img_sizes)]
table_md = [formatter("markdown", cell)[0] for cell in cells]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "0.2.17"
version = "0.3.0"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit 15f5f2d

Please sign in to comment.