Skip to content

Commit

Permalink
Merge pull request #383 from VikParuchuri/vik_v2
Browse files Browse the repository at this point in the history
Fix broken text
  • Loading branch information
VikParuchuri authored Nov 21, 2024
2 parents 243ae0b + a0f0ea6 commit 8d5459f
Show file tree
Hide file tree
Showing 16 changed files with 435 additions and 309 deletions.
6 changes: 6 additions & 0 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import os

from marker.processors.line_numbers import LineNumbersProcessor

os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning

from marker.processors.code import CodeProcessor
Expand All @@ -25,6 +28,7 @@
from marker.schema.blocks import Block
from marker.schema.registry import register_block_class
from marker.processors.debug import DebugProcessor
from marker.processors.ignoretext import IgnoreTextProcessor


class PdfConverter(BaseConverter):
Expand Down Expand Up @@ -56,6 +60,8 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No
TextProcessor,
CodeProcessor,
DocumentTOCProcessor,
IgnoreTextProcessor,
LineNumbersProcessor,
DebugProcessor,
]

Expand Down
4 changes: 1 addition & 3 deletions marker/processors/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@ class CodeProcessor(BaseProcessor):

def __call__(self, document: Document):
for page in document.pages:
for block in page.children:
if block.block_type not in self.block_types:
continue
for block in page.contained_blocks(document, self.block_types):
self.format_block(document, block)

def format_block(self, document: Document, block: Code):
Expand Down
80 changes: 51 additions & 29 deletions marker/processors/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,40 @@ def __call__(self, document: Document):
print(f"Dumped layout debug images to {self.debug_data_folder}")

if self.debug_pdf_images:
self.draw_layout_debug_images(document, pdf_mode=True)
self.draw_pdf_debug_images(document)
print(f"Dumped PDF debug images to {self.debug_data_folder}")

if self.debug_json:
self.dump_block_debug_data(document)
print(f"Dumped block debug data to {self.debug_data_folder}")

def draw_pdf_debug_images(self, document: Document):
    """
    Save one annotated copy of each page's high-resolution image.

    For every page, the bounding boxes of its Line children are drawn in
    blue and those of its Span children in green, the layout boxes are
    added via ``render_layout_boxes``, and the result is written to
    ``pdf_page_<idx>.png`` inside ``self.debug_folder``.
    """
    for page_idx, page in enumerate(document.pages):
        # Work on a copy so the page's own image is left untouched.
        canvas = page.highres_image.copy()

        line_boxes = []
        span_boxes = []
        for child in page.children:
            kind = child.block_type
            if kind == BlockTypes.Line:
                target = line_boxes
            elif kind == BlockTypes.Span:
                target = span_boxes
            else:
                # Layout-level blocks are drawn by render_layout_boxes below.
                continue

            # Map the polygon from page coordinates to image pixels.
            scaled = child.polygon.rescale(page.polygon.size, canvas.size)
            target.append(scaled.bbox)

        self.render_on_image(line_boxes, canvas, color="blue", draw_bbox=True, label_font_size=24)
        self.render_on_image(span_boxes, canvas, color="green", draw_bbox=True, label_font_size=24)

        canvas = self.render_layout_boxes(page, canvas)

        out_path = os.path.join(self.debug_folder, f"pdf_page_{page_idx}.png")
        canvas.save(out_path)


def draw_layout_debug_images(self, document: Document, pdf_mode=False):
for idx, page in enumerate(document.pages):
img_size = page.highres_image.size
png_image = Image.new("RGB", img_size, color="white")
if pdf_mode:
png_image = page.highres_image.copy()

line_bboxes = []
line_text = []
Expand All @@ -83,37 +104,38 @@ def draw_layout_debug_images(self, document: Document, pdf_mode=False):
line_bboxes.append(bbox)
line_text.append(child.raw_text(document))

if pdf_mode:
line_text = None

self.render_on_image(line_bboxes, png_image, labels=line_text, color="black", draw_bbox=False, label_font_size=24)

layout_bboxes = []
layout_labels = []
for child in page.children:
if child.block_type in [BlockTypes.Line, BlockTypes.Span]:
continue
png_image = self.render_layout_boxes(page, png_image)

bbox = child.polygon.rescale(page.polygon.size, img_size).bbox
layout_bboxes.append(bbox)
layout_labels.append(str(child.block_type))

self.render_on_image(layout_bboxes, png_image, labels=layout_labels, color="red", label_font_size=24)

order_labels = [str(i) for i in range(len(layout_bboxes))]
self.render_on_image(
layout_bboxes,
png_image,
labels=order_labels,
color="green",
draw_bbox=False,
label_offset=5
)

filecomp = "pdf" if pdf_mode else "layout"
debug_file = os.path.join(self.debug_folder, f"{filecomp}_page_{idx}.png")
debug_file = os.path.join(self.debug_folder, f"layout_page_{idx}.png")
png_image.save(debug_file)


def render_layout_boxes(self, page, png_image):
    """
    Draw the page's layout-level blocks onto ``png_image`` and return it.

    Every child of ``page`` that is not a Line or Span is rescaled from
    page coordinates to the image size, then rendered twice: once in red
    labelled with its block type, and once in green (label only, no box)
    with its index among the layout blocks in ``page.children`` order.
    """
    skipped_types = [BlockTypes.Line, BlockTypes.Span]
    layout_children = [
        child for child in page.children
        if child.block_type not in skipped_types
    ]

    boxes = [
        child.polygon.rescale(page.polygon.size, png_image.size).bbox
        for child in layout_children
    ]
    type_labels = [str(child.block_type) for child in layout_children]

    self.render_on_image(boxes, png_image, labels=type_labels, color="red", label_font_size=24)

    # Second pass: overlay each block's position index without redrawing the box.
    index_labels = list(map(str, range(len(boxes))))
    self.render_on_image(
        boxes,
        png_image,
        labels=index_labels,
        color="green",
        draw_bbox=False,
        label_offset=5,
    )
    return png_image

def dump_block_debug_data(self, document: Document):
debug_file = os.path.join(self.debug_folder, f"blocks.json")
debug_data = []
Expand Down
5 changes: 1 addition & 4 deletions marker/processors/document_toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,7 @@ class DocumentTOCProcessor(BaseProcessor):
def __call__(self, document: Document):
toc = []
for page in document.pages:
for block in page.children:
if block.block_type not in self.block_types:
continue

for block in page.contained_blocks(document, self.block_types):
toc.append({
"title": block.raw_text(document).strip(),
"heading_level": block.heading_level,
Expand Down
4 changes: 1 addition & 3 deletions marker/processors/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,7 @@ def __call__(self, document: Document):
equation_data = []

for page in document.pages:
for block in page.children:
if block.block_type not in self.block_types:
continue
for block in page.contained_blocks(document, self.block_types):
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size)
image = page.lowres_image.crop(image_poly.bbox).convert("RGB")
raw_text = block.raw_text(document)
Expand Down
36 changes: 23 additions & 13 deletions marker/processors/ignoretext.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import re
from collections import Counter

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document

from rapidfuzz import fuzz


class IgnoreTextProcessor(BaseProcessor):
"""
Expand All @@ -15,7 +18,9 @@ class IgnoreTextProcessor(BaseProcessor):
Default is 0.25.
"""
block_types = (BlockTypes.Text,)
common_element_threshold = .6
common_element_threshold = .25
max_blocks = 1
text_match_threshold = 90

def __call__(self, document: Document):
first_blocks = []
Expand All @@ -24,10 +29,7 @@ def __call__(self, document: Document):
initial_block = None
block = None
last_block = None
for block in page.children:
if block.block_type not in self.block_types:
continue

for block in page.contained_blocks(document, self.block_types):
if initial_block is None:
initial_block = block

Expand All @@ -42,14 +44,22 @@ def __call__(self, document: Document):
self.filter_common_elements(document, first_blocks)
self.filter_common_elements(document, last_blocks)

def filter_common_elements(self, document, lines):
@staticmethod
def clean_text(text):
return re.sub(r"\s+", "", text)

def filter_common_elements(self, document, blocks):
# We can't filter if we don't have enough pages to find common elements
if len(lines) < 3:
return []
if len(blocks) < 3:
return

text = [b.raw_text(document) for b in lines]
text = [self.clean_text(b.raw_text(document)) for b in blocks]
counter = Counter(text)
common = [k for k, v in counter.items() if v > len(lines) * self.common_element_threshold]
for b in lines:
if b.raw_text(document) in common:
b.is_header_footer = True
common = [k for k, v in counter.items() if v > len(blocks) * self.common_element_threshold]
if len(common) == 0:
return

for b in blocks:
if fuzz.ratio(self.clean_text(b.raw_text(document)), common[0]) > self.text_match_threshold:
for span in b.contained_blocks(document, [BlockTypes.Span]):
span.ignore_for_output = True
80 changes: 80 additions & 0 deletions marker/processors/line_numbers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document


class LineNumbersProcessor(BaseProcessor):
    """
    Mark line-number text so that it is ignored in the output.

    Two passes run over every Text / TextInlineMath block of each page:

    1. ``ignore_line_starts_ends`` flags the first (or last) span of each
       line when enough lines of a block start (or end) with a number.
    2. ``ignore_line_number_blocks`` flags every span of a block that is
       mostly numeric tokens and is taller than it is wide.

    Flagging means setting ``ignore_for_output = True`` on the span.
    """
    block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
    # Fraction of lines (or tokens) that must be numeric before anything is
    # flagged. This is a ratio, so it is a float, not an int.
    strip_numbers_threshold: float = .6
    # Blocks with fewer lines than this are skipped by ignore_line_starts_ends.
    min_lines_in_block: int = 4
    # Characters a line must contain besides the number span for that span
    # to count as a line number.
    min_line_length: int = 10

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        """Run both line-number passes over ``document``."""
        self.ignore_line_starts_ends(document)
        self.ignore_line_number_blocks(document)

    def ignore_line_number_blocks(self, document: Document):
        """Flag all spans of blocks that look like a vertical column of numbers."""
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                tokens = block.raw_text(document).strip().split()
                # Too few tokens to judge; this also keeps the division below safe.
                if len(tokens) < 4:
                    continue

                numeric_ratio = sum(token.isdigit() for token in tokens) / len(tokens)
                # Ensure block is taller than it is wide, like vertical page numbers
                is_tall = block.polygon.height > block.polygon.width
                if numeric_ratio > self.strip_numbers_threshold and is_tall:
                    for span in block.contained_blocks(document, [BlockTypes.Span]):
                        span.ignore_for_output = True

    def ignore_line_starts_ends(self, document: Document):
        """Flag leading / trailing numeric spans when most lines of a block have one."""
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                if block.structure is None:
                    continue

                all_lines = block.structure_blocks(document)
                # Require at least one line even if min_lines_in_block is
                # configured as 0, so the ratio below never divides by zero.
                if len(all_lines) < max(self.min_lines_in_block, 1):
                    continue

                starts_with_number = []
                ends_with_number = []
                for line in all_lines:
                    spans = line.structure_blocks(document)
                    # A single-span line has no separate number span to strip.
                    if len(spans) < 2:
                        starts_with_number.append(False)
                        ends_with_number.append(False)
                        continue

                    raw_text = line.raw_text(document)
                    starts_with_number.append(self._is_line_number_span(spans[0], raw_text))
                    ends_with_number.append(self._is_line_number_span(spans[-1], raw_text))

                self._ignore_flagged_spans(page, all_lines, starts_with_number, 0)
                self._ignore_flagged_spans(page, all_lines, ends_with_number, -1)

    def _is_line_number_span(self, span, raw_text):
        """Return True if ``span`` is all digits and the rest of the line is long enough."""
        span_text = span.text.strip()
        return span_text.isdigit() and len(raw_text) - len(span_text) > self.min_line_length

    def _ignore_flagged_spans(self, page, lines, flags, span_index):
        """
        Flag the span at ``span_index`` of each flagged line.

        Nothing is flagged unless the share of flagged lines exceeds
        ``strip_numbers_threshold``.
        """
        if sum(flags) / len(flags) <= self.strip_numbers_threshold:
            return

        for flagged, line in zip(flags, lines):
            if flagged:
                span = page.get_block(line.structure[span_index])
                span.ignore_for_output = True

5 changes: 1 addition & 4 deletions marker/processors/sectionheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,7 @@ class SectionHeaderProcessor(BaseProcessor):
def __call__(self, document: Document):
line_heights: Dict[int, List[float]] = {}
for page in document.pages:
for block in page.children:
if block.block_type not in self.block_types:
continue

for block in page.contained_blocks(document, self.block_types):
line_heights[block.block_id] = []
if block.structure is not None:
line_heights[block.block_id] = [document.get_block(l).polygon.height for l in block.structure if l.block_type == BlockTypes.Line]
Expand Down
5 changes: 1 addition & 4 deletions marker/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,7 @@ def __call__(self, document: Document):

table_data = []
for page in document.pages:
for block in page.children:
if block.block_type not in self.block_types:
continue

for block in page.contained_blocks(document, self.block_types):
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
image = page.highres_image.crop(image_poly.bbox).convert("RGB")

Expand Down
Loading

0 comments on commit 8d5459f

Please sign in to comment.