Skip to content

Commit

Permalink
Merge pull request #372 from VikParuchuri/vik_v2
Browse files Browse the repository at this point in the history
Vik v2
  • Loading branch information
VikParuchuri authored Nov 19, 2024
2 parents bd18169 + a72a508 commit 7b817ff
Show file tree
Hide file tree
Showing 18 changed files with 205 additions and 146 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ benchmark_data
debug_data
temp.md
temp
conversion_results

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
1 change: 1 addition & 0 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class Settings(BaseSettings):
EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them
PAGINATE_OUTPUT: bool = False # Paginate output markdown
BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
FLATTEN_PDF: bool = True # Pull form field values into the PDF before converting to markdown

@computed_field
Expand Down
38 changes: 23 additions & 15 deletions marker/v2/builders/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@

from marker.settings import settings
from marker.v2.builders import BaseBuilder
from marker.v2.providers.pdf import PageLines, PageSpans, PdfProvider
from marker.v2.providers import ProviderOutput, ProviderPageLines
from marker.v2.providers.pdf import PdfProvider
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document
from marker.v2.schema.groups.page import PageGroup
from marker.v2.schema.polygon import PolygonBox
from marker.v2.schema.registry import get_block_class
from marker.v2.schema.text.line import Line


class LayoutBuilder(BaseBuilder):
batch_size = None
layout_coverage_min_lines = 1
layout_coverage_threshold = .5

def __init__(self, layout_model, config=None):
self.layout_model = layout_model
Expand All @@ -25,7 +27,7 @@ def __init__(self, layout_model, config=None):
def __call__(self, document: Document, provider: PdfProvider):
    """Run layout detection on the document and merge provider lines into it.

    The layout results are added to each page as structure blocks, then the
    provider's per-page lines are merged into those blocks.
    """
    pages = document.pages
    self.add_blocks_to_pages(pages, self.surya_layout(pages))
    self.merge_blocks(pages, provider.page_lines)

def get_batch_size(self):
if self.batch_size is not None:
Expand Down Expand Up @@ -54,28 +56,34 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou
layout_block.polygon = layout_block.polygon.rescale(layout_page_size, provider_page_size)
page.add_structure(layout_block)

def merge_blocks(self, document_pages: List[PageGroup], provider_page_lines: ProviderPageLines):
    """Merge provider lines into each page, or flag the page for OCR.

    A page whose layout is sufficiently covered by provider lines (see
    ``check_layout_coverage``) gets those lines merged with the "pdftext"
    extraction method. Any other page is only marked with the "surya"
    extraction method and nothing is merged here.
    """
    for page in document_pages:
        # Pages the provider returned nothing for are treated as having no lines.
        lines_for_page = provider_page_lines.get(page.page_id, [])
        if self.check_layout_coverage(page, lines_for_page):
            page.merge_blocks(lines_for_page, text_extraction_method="pdftext")
        else:
            page.text_extraction_method = "surya"

def check_layout_coverage(
    self,
    document_page: PageGroup,
    provider_lines: List[ProviderOutput],
):
    """Decide whether the provider lines cover enough of the page layout.

    Every layout block on the page except figures, pictures and tables is
    counted. A block is "covered" when at least
    ``self.layout_coverage_min_lines`` provider lines intersect its polygon.

    Returns:
        True when the fraction of covered blocks is at least
        ``self.layout_coverage_threshold``. A page with no countable blocks
        has a ratio of 0.
    """
    covered_blocks = 0
    total_blocks = 0
    for layout_block_id in document_page.structure:
        layout_block = document_page.get_block(layout_block_id)
        if layout_block.block_type in [BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table]:
            continue

        total_blocks += 1
        intersecting_lines = sum(
            1
            for provider_line in provider_lines
            if layout_block.polygon.intersection_area(provider_line.line.polygon) > 0
        )

        # `>=`, not `>`: the setting is a minimum, so a block with exactly
        # that many intersecting lines must count as covered.
        if intersecting_lines >= self.layout_coverage_min_lines:
            covered_blocks += 1

    # max(..., 1) avoids dividing by zero when no block was counted.
    coverage_ratio = covered_blocks / max(total_blocks, 1)
    return coverage_ratio >= self.layout_coverage_threshold
65 changes: 30 additions & 35 deletions marker/v2/builders/ocr.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from typing import Dict, List, Tuple

from surya.ocr import run_ocr

from marker.settings import settings
from marker.v2.builders import BaseBuilder
from marker.v2.providers import ProviderOutput, ProviderPageLines
from marker.v2.providers.pdf import PdfProvider
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document
Expand All @@ -12,10 +11,6 @@
from marker.v2.schema.text.line import Line
from marker.v2.schema.text.span import Span

PageLines = Dict[int, List[Line]]
LineSpans = Dict[int, List[Span]]
PageSpans = Dict[int, LineSpans]


class OcrBuilder(BaseBuilder):
recognition_batch_size = None
Expand All @@ -28,8 +23,8 @@ def __init__(self, detection_model, recognition_model, config=None):
self.recognition_model = recognition_model

def __call__(self, document: Document, provider: PdfProvider):
    """Run OCR extraction and merge the resulting lines into the document."""
    ocr_lines = self.ocr_extraction(document, provider)
    self.merge_blocks(document, ocr_lines)

def get_recognition_batch_size(self):
if self.recognition_batch_size is not None:
Expand All @@ -47,7 +42,7 @@ def get_detection_batch_size(self):
return 4
return 4

def ocr_extraction(self, document: Document, provider: PdfProvider) -> Tuple[PageLines, PageSpans]:
def ocr_extraction(self, document: Document, provider: PdfProvider) -> ProviderPageLines:
page_list = [page for page in document.pages if page.text_extraction_method == "surya"]
recognition_results = run_ocr(
images=[page.lowres_image for page in page_list],
Expand All @@ -61,43 +56,43 @@ def ocr_extraction(self, document: Document, provider: PdfProvider) -> Tuple[Pag
)

page_lines = {}
page_spans = {}

SpanClass: Span = get_block_class(BlockTypes.Span)
LineClass: Line = get_block_class(BlockTypes.Line)

for page_id, recognition_result in zip((page.page_id for page in page_list), recognition_results):
page_spans.setdefault(page_id, {})
page_lines.setdefault(page_id, [])

page_size = provider.get_page_bbox(page_id).size
line_spans = page_spans[page_id]

for ocr_line_idx, ocr_line in enumerate(recognition_result.text_lines):
image_polygon = PolygonBox.from_bbox(recognition_result.image_bbox)
polygon = PolygonBox.from_bbox(ocr_line.bbox).rescale(image_polygon.size, page_size)

page_lines[page_id].append(LineClass(
polygon=polygon,
page_id=page_id,
))

line_spans.setdefault(ocr_line_idx, [])
line_spans[ocr_line_idx].append(SpanClass(
text=ocr_line.text,
formats=['plain'],
page_id=page_id,
polygon=polygon,
minimum_position=0,
maximum_position=0,
font='',
font_weight=0,
font_size=0,
))

return page_lines, page_spans

def merge_blocks(self, document: Document, page_lines: PageLines, page_spans: PageSpans):
line = LineClass(
polygon=polygon,
page_id=page_id,
)
spans = [
SpanClass(
text=ocr_line.text + "\n",
formats=['plain'],
page_id=page_id,
polygon=polygon,
minimum_position=0,
maximum_position=0,
font='',
font_weight=0,
font_size=0,
)
]

page_lines[page_id].append(ProviderOutput(line=line, spans=spans))

return page_lines

def merge_blocks(self, document: Document, page_lines: ProviderPageLines):
    """Merge OCR lines into every page whose extraction method is "surya".

    ``page_lines`` is indexed by ``page_id`` and must contain an entry for
    each such page.
    """
    # Select the target pages up front, before any page is modified.
    surya_pages = [
        candidate for candidate in document.pages
        if candidate.text_extraction_method == "surya"
    ]
    for surya_page in surya_pages:
        surya_page.merge_blocks(
            page_lines[surya_page.page_id],
            text_extraction_method="surya",
        )
53 changes: 30 additions & 23 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json

from marker.settings import settings
from marker.v2.processors.document_toc import DocumentTOCProcessor
from marker.v2.providers.pdf import PdfProvider
import os

Expand Down Expand Up @@ -63,6 +65,7 @@ def __call__(self, filepath: str):
EquationProcessor(self.texify_model, self.config),
TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model, self.config),
SectionHeaderProcessor(self.config),
DocumentTOCProcessor(self.config),
DebugProcessor(self.config),
]

Expand All @@ -73,49 +76,53 @@ def __call__(self, filepath: str):


@click.command()
@click.argument("fpath", type=str)
@click.option("--output_dir", type=click.Path(exists=False), required=False, default=settings.OUTPUT_DIR)
@click.option("--debug", is_flag=True)
@click.option("--output_format", type=click.Choice(["markdown", "json"]), default="markdown")
@click.option("--pages", type=str, default=None)
@click.option("--force_ocr", is_flag=True)
def main(fpath: str, output_dir: str, debug: bool, output_format: str, pages: str, force_ocr: bool):
    """Convert the PDF at ``fpath`` and write the results to disk.

    Output goes into ``<output_dir>/<pdf name without extension>/``:
    the rendered document (``.md`` or ``.json``), a ``_meta.json`` file and,
    for markdown, every extracted image saved as PNG.

    Args:
        fpath: Path of the PDF to convert.
        output_dir: Parent folder for the per-document output folder.
        debug: Enable the debug image/JSON settings, written to the output folder.
        output_format: Either "markdown" or "json".
        pages: Optional comma-separated page numbers, passed on as ``page_range``.
        force_ocr: Set ``force_ocr`` in the converter config.
    """
    if pages is not None:
        pages = [int(page) for page in pages.split(",")]

    fname_base = os.path.splitext(os.path.basename(fpath))[0]
    output_dir = os.path.join(output_dir, fname_base)
    os.makedirs(output_dir, exist_ok=True)

    config = {
        "page_range": pages,
    }
    if debug:
        config["debug_pdf_images"] = True
        config["debug_layout_images"] = True
        config["debug_json"] = True
        config["debug_data_folder"] = output_dir
    if force_ocr:
        config["force_ocr"] = True

    converter = PdfConverter(config=config, output_format=output_format)
    rendered = converter(fpath)

    # Files are written as UTF-8 explicitly: the default encoding of open()
    # depends on the platform locale and can fail on non-ASCII document text.
    if output_format == "markdown":
        with open(os.path.join(output_dir, f"{fname_base}.md"), "w+", encoding="utf-8") as f:
            f.write(rendered.markdown)

        with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+", encoding="utf-8") as f:
            f.write(json.dumps(rendered.metadata, indent=2))

        for img_name, img in rendered.images.items():
            img.save(os.path.join(output_dir, img_name), "PNG")
    elif output_format == "json":
        with open(os.path.join(output_dir, f"{fname_base}.json"), "w+", encoding="utf-8") as f:
            f.write(rendered.model_dump_json(indent=2))

        with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+", encoding="utf-8") as f:
            f.write(json.dumps(rendered.metadata, indent=2))

    print(f"Output written to {output_dir}")


if __name__ == "__main__":
main()
22 changes: 22 additions & 0 deletions marker/v2/processors/document_toc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from marker.v2.processors import BaseProcessor
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document


class DocumentTOCProcessor(BaseProcessor):
    """Builds the document's table of contents from its section headers."""
    block_types = (BlockTypes.SectionHeader, )

    def __call__(self, document: Document):
        """Collect one entry per matching block and store the list on the document.

        Only the direct children of each page are inspected. Entries keep
        page order, then child order, and the result is assigned to
        ``document.table_of_contents``.
        """
        document.table_of_contents = [
            {
                "title": header.raw_text(document).strip(),
                "heading_level": header.heading_level,
                "page_id": page.page_id,
                "polygon": header.polygon.polygon
            }
            for page in document.pages
            for header in page.children
            if header.block_type in self.block_types
        ]
9 changes: 8 additions & 1 deletion marker/v2/providers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from typing import List, Optional
from typing import List, Optional, Dict

from pydantic import BaseModel

from marker.v2.schema.text import Span
from marker.v2.schema.text.line import Line
from marker.v2.util import assign_config


class ProviderOutput(BaseModel):
    """A single extracted line bundled with the spans that belong to it."""
    line: Line  # the line block
    spans: List[Span]  # spans that belong to `line`

# Provider output for a whole document: int key (the builders index it by
# page_id) mapped to the list of ProviderOutput entries for that page.
ProviderPageLines = Dict[int, List[ProviderOutput]]

class BaseProvider:
def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None):
assign_config(self, config)
Expand Down
Loading

0 comments on commit 7b817ff

Please sign in to comment.