Skip to content

Commit

Permalink
Merge pull request #37 from VikParuchuri/dev
Browse files Browse the repository at this point in the history
Bug fix: Work with rotated pdfs
  • Loading branch information
VikParuchuri authored Dec 13, 2023
2 parents f7734fb + 844833f commit 43abdf4
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 11 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ The above results are with marker and nougat setup so they each take ~3GB of VRA

See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.

# Community

[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.

# Limitations

PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
Expand Down
22 changes: 21 additions & 1 deletion marker/bbox.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import fitz as pymupdf

def should_merge_blocks(box1, box2, tol=5):
# Within tol y px, and to the right within tol px
merge = [
Expand Down Expand Up @@ -58,4 +60,22 @@ def unnormalize_box(bbox, width, height):
height * (bbox[1] / 1000),
width * (bbox[2] / 1000),
height * (bbox[3] / 1000),
]
]


def correct_rotation(bbox, page):
    """Re-express a bbox in the frame given by the page's rotation matrix.

    Args:
        bbox: Box in the form (x0, y0, x1, y1).
        page: Page object exposing ``rotation`` (degrees) and
            ``rotation_matrix``.

    Returns:
        The input ``bbox`` object unchanged when the rotation is 0 (or is not
        one of 90/180/270); otherwise a new 4-element list built from the
        transformed corners.
    """
    angle = page.rotation
    if angle == 0:
        return bbox

    # Push the two defining corners through the page's rotation matrix.
    matrix = page.rotation_matrix
    top_left = pymupdf.Point(bbox[0], bbox[1]) * matrix
    bottom_right = pymupdf.Point(bbox[2], bbox[3]) * matrix

    # After rotating, the original corners are no longer the top-left and
    # bottom-right of the box, so the coordinates are re-paired per angle.
    if angle == 90:
        return [bottom_right[0], top_left[1], top_left[0], bottom_right[1]]
    if angle == 180:
        return [bottom_right[0], bottom_right[1], top_left[0], top_left[1]]
    if angle == 270:
        return [top_left[0], bottom_right[1], bottom_right[0], top_left[1]]

    # Any other angle: leave the box as it was passed in.
    return bbox
7 changes: 5 additions & 2 deletions marker/debug/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ def dump_nougat_debug_data(doc, images, converted_spans):
if not settings.DEBUG_DATA_FOLDER:
return

if len(images) == 0:
return

# We attempted one conversion per image
assert len(converted_spans) == len(images)

Expand All @@ -37,7 +40,7 @@ def dump_nougat_debug_data(doc, images, converted_spans):

debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
with open(debug_file, "w+") as f:
json.dump(data_lines, f, indent=4)
json.dump(data_lines, f)


def dump_bbox_debug_data(doc, blocks: List[Page]):
Expand Down Expand Up @@ -70,7 +73,7 @@ def dump_bbox_debug_data(doc, blocks: List[Page]):
debug_data.append(page_data)

with open(debug_file, "w+") as f:
json.dump(debug_data, f, indent=4)
json.dump(debug_data, f)



33 changes: 29 additions & 4 deletions marker/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from spellchecker import SpellChecker

from marker.bbox import correct_rotation
from marker.ocr.page import ocr_entire_page
from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
from marker.settings import settings
Expand All @@ -12,8 +13,27 @@
os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX


def sort_rotated_text(page_blocks, tolerance=1.25):
    """Order blocks top-to-bottom, then left-to-right within each row.

    Blocks are bucketed into rows by snapping ``bbox[1]`` to the nearest
    multiple of ``tolerance``; rows are emitted in ascending order of that
    snapped value, and blocks inside a row are ordered by ``bbox[0]``.

    Args:
        page_blocks: Iterable of objects with a ``bbox`` sequence attribute.
        tolerance: Row height used when snapping the y coordinate.

    Returns:
        A new flat list containing the same block objects in sorted order.
    """
    rows = {}
    for blk in page_blocks:
        row_key = round(blk.bbox[1] / tolerance) * tolerance
        rows.setdefault(row_key, []).append(blk)

    # Walk rows by ascending snapped y, sorting each row by its left edge.
    return [
        blk
        for row_key in sorted(rows)
        for blk in sorted(rows[row_key], key=lambda item: item.bbox[0])
    ]


def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], int]:
page = doc[pnum]
rotation = page.rotation

if ocr:
blocks = ocr_entire_page(page, tess_lang, spellchecker)
else:
Expand All @@ -30,7 +50,7 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona
bbox = s["bbox"]
span_obj = Span(
text=block_text,
bbox=bbox,
bbox=correct_rotation(bbox, page),
span_id=f"{pnum}_{span_id}",
font=f"{s['font']}_{font_flags_decomposer(s['flags'])}", # Add font flags to end of font
color=s["color"],
Expand All @@ -41,19 +61,23 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona
span_id += 1
line_obj = Line(
spans=spans,
bbox=l["bbox"]
bbox=correct_rotation(l["bbox"], page),
)
# Only select valid lines, with positive bboxes
if line_obj.area > 0:
block_lines.append(line_obj)
block_obj = Block(
lines=block_lines,
bbox=block["bbox"],
bbox=correct_rotation(block["bbox"], page),
pnum=pnum
)
# Only select blocks with at least one line
if len(block_lines) > 0:
page_blocks.append(block_obj)

# If the page was rotated, sort the text again
if rotation > 0:
page_blocks = sort_rotated_text(page_blocks)
return page_blocks


Expand All @@ -80,8 +104,9 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no
not disable_ocr
]
if all(conditions) or settings.OCR_ALL_PAGES:
page = doc[pnum]
blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker, ocr=True)
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
ocr_pages = 1
if len(blocks) == 0:
ocr_failed = 1
Expand Down
4 changes: 2 additions & 2 deletions marker/schema.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections import Counter
from typing import List, Optional
from typing import List, Optional, Tuple

from pydantic import BaseModel, field_validator
import ftfy
Expand All @@ -20,7 +20,6 @@ def find_span_type(span, page_blocks):
class BboxElement(BaseModel):
bbox: List[float]


@field_validator('bbox')
@classmethod
def check_4_elements(cls, v: List[float]) -> List[float]:
Expand Down Expand Up @@ -134,6 +133,7 @@ class Page(BboxElement):
blocks: List[Block]
pnum: int
column_count: Optional[int] = None
rotation: Optional[int] = None # Rotation degrees of the page

def get_nonblank_lines(self):
lines = self.get_all_lines()
Expand Down
18 changes: 16 additions & 2 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,22 @@ class Settings(BaseSettings):
# Nougat model
NOUGAT_MODEL_MAX: int = 512 # Max inference length for nougat
NOUGAT_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for nougat
NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote",
"\par\par\par", "## Chapter", "Fig.", "particle", "[REPEATS]", "[TRUNCATED]", "### ", "effective field strength", "\Phi_{\rm eff}"]
# Strings flagged as nougat hallucination markers (how they are matched is
# outside this view).  LaTeX entries are raw strings so their backslashes stay
# literal: in a normal string the "\r" in "\rm" is a carriage return, and
# "\p", "\P" and "\m" are invalid escape sequences that newer Pythons warn on.
NOUGAT_HALLUCINATION_WORDS: List[str] = [
    "[MISSING_PAGE_POST]",
    "## References\n",  # "\n" here is an intentional newline
    "**Figure Captions**\n",  # "\n" here is an intentional newline
    "Footnote",
    r"\par\par\par",
    "## Chapter",
    "Fig.",
    "particle",
    "[REPEATS]",
    "[TRUNCATED]",
    "### ",
    "effective field strength",
    r"\Phi_{\rm eff}",
    r"\mathbf{\mathbf",
]
NOUGAT_DPI: int = 96 # DPI to render images at, matches default settings for nougat
NOUGAT_MODEL_NAME: str = "0.1.0-small" # Name of the model to use
NOUGAT_BATCH_SIZE: int = 6 if TORCH_DEVICE == "cuda" else 1 # Batch size for nougat, don't batch on cpu
Expand Down

0 comments on commit 43abdf4

Please sign in to comment.