Skip to content

Commit

Permalink
Push footnotes down, add pdf debug output
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Oct 17, 2024
1 parent dd537a2 commit 2602804
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ wandb
*.dat
report.json
benchmark_data
debug
debug_data

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
8 changes: 4 additions & 4 deletions marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,16 +110,16 @@ def convert_single_pdf(
# Add block types in
annotate_block_types(pages)

# Dump debug data if flags are set
draw_page_debug_images(fname, pages)
dump_bbox_debug_data(fname, pages)

# Find reading order for blocks
# Sort blocks by reading order
surya_order(doc, pages, order_model, batch_multiplier=batch_multiplier)
sort_blocks_in_reading_order(pages)
flush_cuda_memory()

# Dump debug data if flags are set
draw_page_debug_images(fname, pages)
dump_bbox_debug_data(fname, pages)

# Fix code blocks
code_block_count = identify_code_blocks(pages)
out_meta["block_stats"]["code"] = code_block_count
Expand Down
52 changes: 47 additions & 5 deletions marker/debug/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,7 @@
from PIL import Image


def draw_page_debug_images(fname, pages: List[Page]):
if not settings.DEBUG:
return

def draw_layout_page_debug_images(fname, pages: List[Page]):
# Remove extension from doc name
doc_base = os.path.basename(fname).rsplit(".", 1)[0]

Expand All @@ -31,6 +28,7 @@ def draw_page_debug_images(fname, pages: List[Page]):
line_text.append(line.prelim_text)

render_on_image(line_bboxes, png_image, labels=line_text, color="black", draw_bbox=False)
pdf_image = png_image.copy()

line_bboxes = [line.bbox for line in page.text_lines.bboxes]
render_on_image(line_bboxes, png_image, color="blue")
Expand All @@ -40,9 +38,53 @@ def draw_page_debug_images(fname, pages: List[Page]):

render_on_image(layout_boxes, png_image, labels=layout_labels, color="red")

debug_file = os.path.join(debug_folder, f"page_{idx}.png")
order_labels = [str(i) for i in range(len(page.layout.bboxes))]
render_on_image(layout_boxes, png_image, labels=order_labels, color="green", draw_bbox=False, label_offset=5)

debug_file = os.path.join(debug_folder, f"layout_page_{idx}.png")
png_image.save(debug_file)

# PDF Image

block_bboxes = [rescale_bbox(page.bbox, page.text_lines.image_bbox, block.bbox) for block in page.blocks]
block_labels = [block.block_type for block in page.blocks]
render_on_image(block_bboxes, pdf_image, labels=block_labels, color="red")

block_order = [str(i) for i in range(len(page.blocks))]
render_on_image(block_bboxes, pdf_image, labels=block_order, color="green", draw_bbox=False, label_offset=5)

debug_file = os.path.join(debug_folder, f"pdf_page_{idx}.png")
pdf_image.save(debug_file)


def draw_pdf_page_debug_images(fname, pages: List[Page]):
    """Prepare per-page debug data for the text lines extracted from the PDF.

    Creates the document's debug folder under ``settings.DEBUG_DATA_FOLDER``
    and, for every page, builds a blank white canvas sized to the page's
    text-line image bbox and collects each line's rescaled bbox and
    preliminary text.

    NOTE(review): nothing is drawn onto the canvas and nothing is saved here;
    the collected boxes and labels are discarded at the end of each
    iteration. This looks unfinished - confirm the intended output before
    relying on this function.

    Args:
        fname: Path or name of the source document; only its base name
            without the final extension is used for the folder name.
        pages: Pages whose blocks and lines are inspected.
    """
    # Folder name is the file's base name with the last ".ext" removed.
    doc_stem = os.path.basename(fname).rsplit(".", 1)[0]
    out_dir = os.path.join(settings.DEBUG_DATA_FOLDER, doc_stem)
    os.makedirs(out_dir, exist_ok=True)

    for page in pages:
        # Round the image extents up so the canvas never clips a partial pixel.
        canvas_width = int(math.ceil(page.text_lines.image_bbox[2]))
        canvas_height = int(math.ceil(page.text_lines.image_bbox[3]))
        canvas = Image.new("RGB", (canvas_width, canvas_height), color="white")

        # One (bbox, text) pair per line, in block order then line order.
        line_entries = [
            (rescale_bbox(page.bbox, page.text_lines.image_bbox, line.bbox), line.prelim_text)
            for block in page.blocks
            for line in block.lines
        ]
        line_bboxes = [entry[0] for entry in line_entries]
        line_text = [entry[1] for entry in line_entries]




def draw_page_debug_images(fname, pages: List[Page]):
    """Write all page-level debug images when debugging is enabled.

    Does nothing unless ``settings.DEBUG`` is truthy. Otherwise runs the
    layout debug renderer followed by the PDF debug renderer on the same
    document name and pages.

    Args:
        fname: Path or name of the source document, forwarded unchanged.
        pages: Pages to render, forwarded unchanged.
    """
    if settings.DEBUG:
        # Layout images first, then the PDF-derived images.
        for renderer in (draw_layout_page_debug_images, draw_pdf_page_debug_images):
            renderer(fname, pages)



def dump_bbox_debug_data(fname, pages: List[Page]):
if not settings.DEBUG:
Expand Down
62 changes: 62 additions & 0 deletions marker/debug/render.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import requests
from PIL import ImageDraw, ImageFont, Image

from marker.settings import settings
import os


def get_font_path() -> str:
    """Return the local path of the debug render font, downloading it if absent.

    The font location is ``settings.DEBUG_RENDER_FONT``. When no file exists
    there, the font is fetched from
    ``settings.RECOGNITION_FONT_DL_BASE/<font file name>`` and stored at that
    location.

    The download is streamed into a temporary sibling file and only moved into
    place once it has completed. A failed or interrupted download therefore
    never leaves an empty or truncated file that later calls would mistake for
    a valid cached font.

    Returns:
        The path stored in ``settings.DEBUG_RENDER_FONT``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        OSError: If the font directory or file cannot be written.
    """
    font_path = settings.DEBUG_RENDER_FONT

    if not os.path.exists(font_path):
        font_dir = os.path.dirname(font_path)
        # A bare file name has no directory part, and os.makedirs("") raises.
        if font_dir:
            os.makedirs(font_dir, exist_ok=True)

        font_dl_path = f"{settings.RECOGNITION_FONT_DL_BASE}/{os.path.basename(font_path)}"
        tmp_path = f"{font_path}.part"
        try:
            with requests.get(font_dl_path, stream=True) as r:
                # Check the status before creating any file on disk.
                r.raise_for_status()
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            # Atomic rename: the final path only ever holds a complete file.
            os.replace(tmp_path, font_path)
        finally:
            # After a successful replace the temp file is gone; on any failure
            # remove the partial download.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    return font_path


def get_text_size(text, font):
    """Measure how far ``text`` extends when drawn with ``font`` at the origin.

    The text bounding box is computed on a throwaway zero-sized palette image
    with the text anchored at ``(0, 0)``. The last two values of that box
    (its right and bottom edges) are returned as the width and height.

    Args:
        text: The string to measure.
        font: The font object passed through to ``ImageDraw.textbbox``.

    Returns:
        A ``(width, height)`` tuple.
    """
    scratch = ImageDraw.Draw(Image.new(mode="P", size=(0, 0)))
    bounds = scratch.textbbox((0, 0), text=text, font=font)
    # bounds is (left, top, right, bottom); only the far corner is reported.
    return bounds[2], bounds[3]


def render_on_image(bboxes, image, labels=None, label_offset=1, label_font_size=10, color: str | list='red', draw_bbox=True):
    """Draw bounding boxes and/or text labels onto ``image`` in place.

    Args:
        bboxes: Iterable of 4-element boxes; each coordinate is truncated to
            ``int`` before drawing.
        image: Image to draw on. It is modified in place and also returned.
        labels: Optional sequence of label strings, indexed in step with
            ``bboxes``. When ``None`` no text is drawn and no font is loaded.
        label_offset: Pixel offset of the label from the box's first corner
            (``bbox[0]``, ``bbox[1]``), applied on both axes.
        label_font_size: Font size used for the labels.
        color: Either a single colour for everything, or a list with one
            colour per box.
        draw_bbox: When false, only the labels are drawn.

    Returns:
        The same ``image`` object that was passed in.
    """
    draw = ImageDraw.Draw(image)

    # The font is only needed for labels. Resolving it may trigger a download,
    # so skip that work entirely for box-only rendering.
    label_font = None
    if labels is not None:
        label_font = ImageFont.truetype(get_font_path(), label_font_size)

    per_box_color = isinstance(color, list)

    for i, bbox in enumerate(bboxes):
        bbox = [int(p) for p in bbox]
        if draw_bbox:
            draw.rectangle(bbox, outline=color[i] if per_box_color else color, width=1)

        if labels is None:
            continue

        label = labels[i]
        text_position = (
            bbox[0] + label_offset,
            bbox[1] + label_offset
        )
        text_size = get_text_size(label, label_font)
        # Nothing visible to draw (e.g. empty label); the box itself, if
        # requested, has already been drawn above.
        if text_size[0] <= 0 or text_size[1] <= 0:
            continue

        # White backdrop so the label stays readable over page content.
        box_position = (
            text_position[0],
            text_position[1],
            text_position[0] + text_size[0],
            text_position[1] + text_size[1]
        )
        draw.rectangle(box_position, fill="white")
        draw.text(
            text_position,
            label,
            fill=color[i] if per_box_color else color,
            font=label_font
        )

    return image
6 changes: 6 additions & 0 deletions marker/layout/order.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,10 @@ def sort_blocks_in_reading_order(pages: List[Page]):
block_group = sort_block_group(block_groups[position])
new_blocks.extend(block_group)

# Ensure we properly put footers at the end of the page
footer_blocks = [b for b in new_blocks if b.block_type in ["Footnote", "Page-footer"]]
header_blocks = [b for b in new_blocks if b.block_type in ["Page-header"]]
regular_blocks = [b for b in new_blocks if b.block_type not in ["Footnote", "Page-footer", "Page-header"]]

new_blocks = header_blocks + regular_blocks + footer_blocks
page.blocks = new_blocks
2 changes: 1 addition & 1 deletion marker/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def save_markdown(out_folder, fname, full_text, images, out_metadata):
with open(markdown_filepath, "w+", encoding='utf-8') as f:
f.write(full_text)
with open(out_meta_filepath, "w+") as f:
f.write(json.dumps(out_metadata, indent=4))
f.write(json.dumps(out_metadata, indent=4, ensure_ascii=False))

for filename, image in images.items():
image_filepath = os.path.join(subfolder_path, filename)
Expand Down
4 changes: 2 additions & 2 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def TORCH_DEVICE_MODEL(self) -> str:

# Layout model
SURYA_LAYOUT_DPI: int = 96
BAD_SPAN_TYPES: List[str] = ["Page-footer", "Page-header", "Picture"] # You can add "Caption" and "Footnote" here to get rid of those elements
BAD_SPAN_TYPES: List[str] = ["Page-footer", "Page-header"] # You can add "Caption" and "Footnote" here to get rid of those elements
LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout3"
BBOX_INTERSECTION_THRESH: float = 0.7 # How much the layout and pdf bboxes need to overlap to be the same
TABLE_INTERSECTION_THRESH: float = 0.7
Expand All @@ -89,7 +89,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
PAGE_SEPARATOR: str = "\n\n" + "-" * 48 + "\n\n"

# Debug
DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug")
DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
DEBUG: bool = False
FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
DEBUG_RENDER_FONT: str = os.path.join(FONT_DIR, "GoNotoCurrent-Regular.ttf")
Expand Down

0 comments on commit 2602804

Please sign in to comment.