Skip to content

Commit

Permalink
Merge branch 'v2' into vik_v2
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri authored Nov 21, 2024
2 parents 44e0322 + 243ae0b commit 1d98e6e
Show file tree
Hide file tree
Showing 17 changed files with 254 additions and 21 deletions.
6 changes: 4 additions & 2 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
os.environ["PDFTEXT_CPU_WORKERS"] = "1" # Avoid multiprocessing inside pdftext

import argparse
import torch.multiprocessing as mp
Expand Down Expand Up @@ -67,7 +66,7 @@ def process_single_pdf(args):
@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
@click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
@click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert")
@click.option("--workers", type=int, default=5, help="Number of worker processes to use.")
@click.option("--workers", type=int, default=3, help="Number of worker processes to use.")
def main(in_folder: str, **kwargs):
in_folder = os.path.abspath(in_folder)
files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
Expand All @@ -84,6 +83,9 @@ def main(in_folder: str, **kwargs):
if kwargs["max_files"]:
files_to_convert = files_to_convert[:kwargs["max_files"]]

# Disable nested multiprocessing
kwargs["disable_multiprocessing"] = True

total_processes = min(len(files_to_convert), kwargs["workers"])

try:
Expand Down
6 changes: 4 additions & 2 deletions convert_single.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
import os

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS

import time

import click

from marker.config.parser import ConfigParser
from marker.config.printer import CustomClickPrinter
from marker.converters.pdf import PdfConverter
from marker.logger import configure_logging
from marker.models import create_model_dict
from marker.output import save_output
from marker.config.parser import ConfigParser

configure_logging()


@click.command(help="Convert a single PDF to markdown.")
@click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.")
@click.argument("fpath", type=str)
@ConfigParser.common_options
def main(fpath: str, **kwargs):
Expand Down
12 changes: 12 additions & 0 deletions marker/builders/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,18 @@


class DocumentBuilder(BaseBuilder):
"""
Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder.
Attributes:
lowres_image_dpi (int):
DPI setting for low-resolution page images used for Layout and Line Detection.
Default is 96.
highres_image_dpi (int):
DPI setting for high-resolution page images used for OCR.
Default is 192.
"""
lowres_image_dpi: int = 96
highres_image_dpi: int = 192

Expand Down
16 changes: 16 additions & 0 deletions marker/builders/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,22 @@


class LayoutBuilder(BaseBuilder):
"""
A builder for performing layout detection on PDF pages and merging the results into the document.
Attributes:
batch_size (int):
The batch size to use for the layout model.
Default is None, which will use the default batch size for the model.
layout_coverage_min_lines (int):
The minimum number of PdfProvider lines that must be covered by the layout model
to consider the lines from the PdfProvider valid. Default is 1.
layout_coverage_threshold (float):
The minimum coverage ratio required for the layout model to consider
the lines from the PdfProvider valid. Default is 0.3.
"""
batch_size = None
layout_coverage_min_lines = 1
layout_coverage_threshold = .3
Expand Down
17 changes: 16 additions & 1 deletion marker/builders/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,21 @@


class OcrBuilder(BaseBuilder):
"""
A builder for performing OCR on PDF pages and merging the results into the document.
Attributes:
detection_batch_size (int):
The batch size to use for the detection model.
Default is None, which will use the default batch size for the model.
recognition_batch_size (int):
The batch size to use for the recognition model.
Default is None, which will use the default batch size for the model.
languages (List[str]):
A list of languages to use for OCR. Default is None.
"""
recognition_batch_size: int | None = None
detection_batch_size: int | None = None
languages: List[str] | None = None
Expand Down Expand Up @@ -51,7 +66,7 @@ def ocr_extraction(self, document: Document, provider: PdfProvider) -> ProviderP
page_list = [page for page in document.pages if page.text_extraction_method == "surya"]
recognition_results = run_ocr(
images=[page.lowres_image for page in page_list],
langs=[None] * len(page_list),
langs=[self.languages] * len(page_list),
det_model=self.detection_model,
det_processor=self.detection_model.processor,
rec_model=self.recognition_model,
Expand Down
12 changes: 12 additions & 0 deletions marker/builders/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@


class StructureBuilder(BaseBuilder):
"""
A builder for grouping blocks together based on their structure.
Attributes:
gap_threshold (float):
The minimum gap between blocks to consider them part of the same group.
Default is 0.05.
list_gap_threshold (float):
The minimum gap between list items to consider them part of the same group.
Default is 0.1.
"""
gap_threshold: int = .05
list_gap_threshold: int = .1

Expand Down
6 changes: 5 additions & 1 deletion marker/config/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def common_options(fn):
fn = click.option("--config_json", type=str, default=None,
help="Path to JSON file with additional configuration.")(fn)
fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
fn = click.option('-l', is_flag=True, help="List available builders, processors and converters")(fn)
return fn

def generate_config_dict(self) -> Dict[str, any]:
Expand All @@ -57,6 +59,9 @@ def generate_config_dict(self) -> Dict[str, any]:
if v:
with open(v, "r") as f:
config.update(json.load(f))
case "disable_multiprocessing":
if v:
config["pdftext_workers"] = 1
return config

def get_renderer(self):
Expand Down Expand Up @@ -94,4 +99,3 @@ def get_output_folder(self, filepath: str):
def get_base_filename(self, filepath: str):
basename = os.path.basename(filepath)
return os.path.splitext(basename)[0]

54 changes: 54 additions & 0 deletions marker/config/printer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import importlib
import inspect
import pkgutil

import click

from marker.builders import BaseBuilder
from marker.converters import BaseConverter
from marker.processors import BaseProcessor


def find_subclasses(base_class):
    """
    Collect the subclasses of ``base_class`` found under its defining package.

    The module named by ``base_class.__module__`` is imported. If that module
    is a package (it exposes ``__path__``), every module discovered beneath it
    is imported and scanned for classes that derive from ``base_class``.

    Returns a dict mapping class name to class object. The base class itself
    is never included. The dict is empty when the defining module is not a
    package, and modules that raise ImportError are skipped.
    """
    found = {}
    root_name = base_class.__module__
    root = importlib.import_module(root_name)

    # A plain module has no submodules to walk.
    if not hasattr(root, '__path__'):
        return found

    prefix = root_name + "."
    for module_info in pkgutil.walk_packages(root.__path__, prefix):
        qualified_name = module_info[1]
        try:
            submodule = importlib.import_module(qualified_name)
            members = inspect.getmembers(submodule, inspect.isclass)
            for cls_name, cls in members:
                if cls is base_class:
                    continue
                if issubclass(cls, base_class):
                    # Later modules overwrite earlier entries with the same name.
                    found[cls_name] = cls
        except ImportError:
            # Best effort: a module that cannot be imported is ignored.
            continue
    return found


class CustomClickPrinter(click.Command):
    """
    A click command that appends a tip to the help text and supports a
    'config --help' invocation listing Marker's configurable classes.
    """

    def get_help(self, ctx):
        """
        Return the regular help text followed by a tip about 'config --help'.

        The text is returned instead of echoed here: click's help machinery
        prints whatever this method returns, so echoing and returning None
        would emit a stray blank line and hand None to any caller of
        ``ctx.get_help()``.
        """
        additional_help = (
            "\n\nTip: Use 'config --help' to display all the attributes of the Builders, Processors, and Converters in Marker."
        )
        return super().get_help(ctx) + additional_help

    def parse_args(self, ctx, args):
        """
        Handle 'config --help' before normal argument parsing.

        When both 'config' and '--help' appear in ``args``, the docstring of
        every Builder, Processor and Converter subclass that documents an
        "Attributes:" section is printed and the context exits. Otherwise
        parsing is delegated to click and its result is returned.
        """
        if 'config' in args and '--help' in args:
            click.echo("Here is a list of all the Builders, Processors, and Converters in Marker along with their attributes:")
            base_classes = [BaseBuilder, BaseProcessor, BaseConverter]
            for base in base_classes:
                click.echo(f"{base.__name__.removeprefix('Base')}s:\n")

                subclasses = find_subclasses(base)
                for class_name, class_type in subclasses.items():
                    doc = class_type.__doc__
                    # Only classes that document their attributes are listed.
                    if doc and "Attributes:" in doc:
                        click.echo(f" {class_name}: {doc}")
            ctx.exit()
        # Propagate click's return value (the remaining args) to the caller.
        return super().parse_args(ctx, args)
10 changes: 10 additions & 0 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,16 @@


class PdfConverter(BaseConverter):
"""
A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats.
Attributes:
override_map (Dict[BlockTypes, Type[Block]]):
A mapping to override the default block classes for specific block types.
The keys are `BlockTypes` enum values, representing the types of blocks,
and the values are corresponding `Block` class implementations to use
instead of the defaults.
"""
override_map: Dict[BlockTypes, Type[Block]] = defaultdict()

def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | None = None, renderer: str | None = None, config=None):
Expand Down
4 changes: 3 additions & 1 deletion marker/processors/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@


class CodeProcessor(BaseProcessor):
"""
A processor for formatting code blocks.
"""
block_types = (BlockTypes.Code, )
y_top_threshold = 2 # pixels

def __call__(self, document: Document):
for page in document.pages:
Expand Down
30 changes: 29 additions & 1 deletion marker/processors/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,41 @@
import requests
from PIL import Image, ImageDraw, ImageFont

from marker.settings import settings
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.settings import settings


class DebugProcessor(BaseProcessor):
"""
A processor for debugging the document.
Attributes:
debug_data_folder (str):
The folder to dump debug data to.
Default is "debug_data".
debug_layout_images (bool):
Whether to dump layout debug images.
Default is False.
debug_pdf_images (bool):
Whether to dump PDF debug images.
Default is False.
debug_json (bool):
Whether to dump block debug data.
Default is False.
render_font (str):
The path to the font to use for rendering debug images.
Default is "GoNotoCurrent-Regular.ttf" in the FONT_DIR folder.
font_dl_path (str):
The path to download the font from.
Default is "https://github.com/satbyy/go-noto-universal/releases/download/v7.0".
"""
block_types = tuple()
debug_data_folder: str = "debug_data"
debug_layout_images: bool = False
Expand Down
5 changes: 4 additions & 1 deletion marker/processors/document_toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@


class DocumentTOCProcessor(BaseProcessor):
"""
A processor for generating a table of contents for the document.
"""
block_types = (BlockTypes.SectionHeader, )

def __call__(self, document: Document):
Expand All @@ -16,4 +19,4 @@ def __call__(self, document: Document):
"page_id": page.page_id,
"polygon": block.polygon.polygon
})
document.table_of_contents = toc
document.table_of_contents = toc
18 changes: 17 additions & 1 deletion marker/processors/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,29 @@
from texify.model.model import GenerateVisionEncoderDecoderModel
from tqdm import tqdm

from marker.settings import settings
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.settings import settings


class EquationProcessor(BaseProcessor):
"""
A processor for recognizing equations in the document.
Attributes:
model_max_length (int):
The maximum number of tokens to allow for the Texify model.
Default is 384.
batch_size (int):
The batch size to use for the Texify model.
Default is None, which will use the default batch size for the model.
token_buffer (int):
The number of tokens to buffer above max for the Texify model.
Default is 256.
"""
block_types = (BlockTypes.Equation, )
model_max_length = 384
batch_size = None
Expand Down
8 changes: 8 additions & 0 deletions marker/processors/ignoretext.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@


class IgnoreTextProcessor(BaseProcessor):
"""
A processor for ignoring text blocks that are common elements in the document.
Attributes:
common_element_threshold (float):
The minimum fraction of pages that a block must appear in to be considered a common element.
Default is 0.25.
"""
block_types = (BlockTypes.Text,)
common_element_threshold = .25
max_blocks = 1
Expand Down
Loading

0 comments on commit 1d98e6e

Please sign in to comment.