diff --git a/convert.py b/convert.py index bf99e260..ef9d3cf0 100755 --- a/convert.py +++ b/convert.py @@ -2,7 +2,6 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya -os.environ["PDFTEXT_CPU_WORKERS"] = "1" # Avoid multiprocessing inside pdftext import argparse import torch.multiprocessing as mp @@ -67,7 +66,7 @@ def process_single_pdf(args): @click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert") @click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel") @click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert") -@click.option("--workers", type=int, default=5, help="Number of worker processes to use.") +@click.option("--workers", type=int, default=3, help="Number of worker processes to use.") def main(in_folder: str, **kwargs): in_folder = os.path.abspath(in_folder) files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)] @@ -84,6 +83,9 @@ def main(in_folder: str, **kwargs): if kwargs["max_files"]: files_to_convert = files_to_convert[:kwargs["max_files"]] + # Disable nested multiprocessing + kwargs["disable_multiprocessing"] = True + total_processes = min(len(files_to_convert), kwargs["workers"]) try: diff --git a/convert_single.py b/convert_single.py index 15875454..4ef90b4a 100755 --- a/convert_single.py +++ b/convert_single.py @@ -1,20 +1,22 @@ import os + os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS import time import click +from marker.config.parser import ConfigParser +from marker.config.printer import CustomClickPrinter from marker.converters.pdf import PdfConverter from marker.logger import configure_logging from marker.models import create_model_dict from marker.output import save_output -from marker.config.parser import 
ConfigParser configure_logging() -@click.command(help="Convert a single PDF to markdown.") +@click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.") @click.argument("fpath", type=str) @ConfigParser.common_options def main(fpath: str, **kwargs): diff --git a/marker/builders/document.py b/marker/builders/document.py index 59d59bf8..d9729beb 100644 --- a/marker/builders/document.py +++ b/marker/builders/document.py @@ -10,6 +10,18 @@ class DocumentBuilder(BaseBuilder): + """ + Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder. + + Attributes: + lowres_image_dpi (int): + DPI setting for low-resolution page images used for Layout and Line Detection. + Default is 96. + + highres_image_dpi (int): + DPI setting for high-resolution page images used for OCR. + Default is 192. + """ lowres_image_dpi: int = 96 highres_image_dpi: int = 192 diff --git a/marker/builders/layout.py b/marker/builders/layout.py index cf1500a6..03c5bb74 100644 --- a/marker/builders/layout.py +++ b/marker/builders/layout.py @@ -16,6 +16,22 @@ class LayoutBuilder(BaseBuilder): + """ + A builder for performing layout detection on PDF pages and merging the results into the document. + + Attributes: + batch_size (int): + The batch size to use for the layout model. + Default is None, which will use the default batch size for the model. + + layout_coverage_min_lines (int): + The minimum number of PdfProvider lines that must be covered by the layout model + to consider the lines from the PdfProvider valid. Default is 1. + + layout_coverage_threshold (float): + The minimum coverage ratio required for the layout model to consider + the lines from the PdfProvider valid. Default is 0.3. 
+ """ batch_size = None layout_coverage_min_lines = 1 layout_coverage_threshold = .3 diff --git a/marker/builders/ocr.py b/marker/builders/ocr.py index b4d53a7c..e169da23 100644 --- a/marker/builders/ocr.py +++ b/marker/builders/ocr.py @@ -17,6 +17,21 @@ class OcrBuilder(BaseBuilder): + """ + A builder for performing OCR on PDF pages and merging the results into the document. + + Attributes: + detection_batch_size (int): + The batch size to use for the detection model. + Default is None, which will use the default batch size for the model. + + recognition_batch_size (int): + The batch size to use for the recognition model. + Default is None, which will use the default batch size for the model. + + languages (List[str]): + A list of languages to use for OCR. Default is None. + """ recognition_batch_size: int | None = None detection_batch_size: int | None = None languages: List[str] | None = None @@ -51,7 +66,7 @@ def ocr_extraction(self, document: Document, provider: PdfProvider) -> ProviderP page_list = [page for page in document.pages if page.text_extraction_method == "surya"] recognition_results = run_ocr( images=[page.lowres_image for page in page_list], - langs=[None] * len(page_list), + langs=[self.languages] * len(page_list), det_model=self.detection_model, det_processor=self.detection_model.processor, rec_model=self.recognition_model, diff --git a/marker/builders/structure.py b/marker/builders/structure.py index c6dfb290..a371e6ed 100644 --- a/marker/builders/structure.py +++ b/marker/builders/structure.py @@ -7,6 +7,18 @@ class StructureBuilder(BaseBuilder): + """ + A builder for grouping blocks together based on their structure. + + Attributes: + gap_threshold (float): + The minimum gap between blocks to consider them part of the same group. + Default is 0.05. + + list_gap_threshold (float): + The minimum gap between list items to consider them part of the same group. + Default is 0.1. 
+ """ gap_threshold: int = .05 list_gap_threshold: int = .1 diff --git a/marker/config/parser.py b/marker/config/parser.py index 62893904..bc0016fe 100644 --- a/marker/config/parser.py +++ b/marker/config/parser.py @@ -31,6 +31,8 @@ def common_options(fn): fn = click.option("--config_json", type=str, default=None, help="Path to JSON file with additional configuration.")(fn) fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn) + fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn) + fn = click.option('-l', is_flag=True, help="List available builders, processors and converters")(fn) return fn def generate_config_dict(self) -> Dict[str, any]: @@ -57,6 +59,9 @@ def generate_config_dict(self) -> Dict[str, any]: if v: with open(v, "r") as f: config.update(json.load(f)) + case "disable_multiprocessing": + if v: + config["pdftext_workers"] = 1 return config def get_renderer(self): @@ -94,4 +99,3 @@ def get_output_folder(self, filepath: str): def get_base_filename(self, filepath: str): basename = os.path.basename(filepath) return os.path.splitext(basename)[0] - diff --git a/marker/config/printer.py b/marker/config/printer.py new file mode 100644 index 00000000..20eac045 --- /dev/null +++ b/marker/config/printer.py @@ -0,0 +1,54 @@ +import importlib +import inspect +import pkgutil + +import click + +from marker.builders import BaseBuilder +from marker.converters import BaseConverter +from marker.processors import BaseProcessor + + +def find_subclasses(base_class): + """ + Dynamically find all subclasses of a base class in the module where the base class is defined + and its submodules. 
+ """ + subclasses = {} + module_name = base_class.__module__ + package = importlib.import_module(module_name) + if hasattr(package, '__path__'): + for _, module_name, _ in pkgutil.walk_packages(package.__path__, module_name + "."): + try: + module = importlib.import_module(module_name) + for name, obj in inspect.getmembers(module, inspect.isclass): + if issubclass(obj, base_class) and obj is not base_class: + subclasses[name] = obj + except ImportError: + pass + return subclasses + + +class CustomClickPrinter(click.Command): + def get_help(self, ctx): + additional_help = ( + "\n\nTip: Use 'config --help' to display all the attributes of the Builders, Processors, and Converters in Marker." + ) + help_text = super().get_help(ctx) + help_text = help_text + additional_help + click.echo(help_text) + + def parse_args(self, ctx, args): + if 'config' in args and '--help' in args: + click.echo("Here is a list of all the Builders, Processors, and Converters in Marker along with their attributes:") + base_classes = [BaseBuilder, BaseProcessor, BaseConverter] + for base in base_classes: + click.echo(f"{base.__name__.removeprefix('Base')}s:\n") + + subclasses = find_subclasses(base) + for class_name, class_type in subclasses.items(): + doc = class_type.__doc__ + if doc and "Attributes:" in doc: + click.echo(f" {class_name}: {doc}") + ctx.exit() + super().parse_args(ctx, args) diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index 17a8e27d..24c88e26 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -28,6 +28,16 @@ class PdfConverter(BaseConverter): + """ + A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats. + + Attributes: + override_map (Dict[BlockTypes, Type[Block]]): + A mapping to override the default block classes for specific block types. 
+ The keys are `BlockTypes` enum values, representing the types of blocks, + and the values are corresponding `Block` class implementations to use + instead of the defaults. + """ override_map: Dict[BlockTypes, Type[Block]] = defaultdict() def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | None = None, renderer: str | None = None, config=None): diff --git a/marker/processors/code.py b/marker/processors/code.py index a4f8cf04..9cd3e925 100644 --- a/marker/processors/code.py +++ b/marker/processors/code.py @@ -5,8 +5,10 @@ class CodeProcessor(BaseProcessor): + """ + A processor for formatting code blocks. + """ block_types = (BlockTypes.Code, ) - y_top_threshold = 2 # pixels def __call__(self, document: Document): for page in document.pages: diff --git a/marker/processors/debug.py b/marker/processors/debug.py index d6f4d3fe..3cf6cbfc 100644 --- a/marker/processors/debug.py +++ b/marker/processors/debug.py @@ -4,13 +4,41 @@ import requests from PIL import Image, ImageDraw, ImageFont -from marker.settings import settings from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document +from marker.settings import settings class DebugProcessor(BaseProcessor): + """ + A processor for debugging the document. + + Attributes: + debug_data_folder (str): + The folder to dump debug data to. + Default is "debug_data". + + debug_layout_images (bool): + Whether to dump layout debug images. + Default is False. + + debug_pdf_images (bool): + Whether to dump PDF debug images. + Default is False. + + debug_json (bool): + Whether to dump block debug data. + Default is False. + + render_font (str): + The path to the font to use for rendering debug images. + Default is "GoNotoCurrent-Regular.ttf" in the FONT_DIR folder. + + font_dl_path (str): + The path to download the font from. + Default is "https://github.com/satbyy/go-noto-universal/releases/download/v7.0". 
+ """ block_types = tuple() debug_data_folder: str = "debug_data" debug_layout_images: bool = False diff --git a/marker/processors/document_toc.py b/marker/processors/document_toc.py index c03e2fd3..8ddbcbb7 100644 --- a/marker/processors/document_toc.py +++ b/marker/processors/document_toc.py @@ -4,6 +4,9 @@ class DocumentTOCProcessor(BaseProcessor): + """ + A processor for generating a table of contents for the document. + """ block_types = (BlockTypes.SectionHeader, ) def __call__(self, document: Document): @@ -19,4 +22,4 @@ def __call__(self, document: Document): "page_id": page.page_id, "polygon": block.polygon.polygon }) - document.table_of_contents = toc \ No newline at end of file + document.table_of_contents = toc diff --git a/marker/processors/equation.py b/marker/processors/equation.py index c09f89f8..3a4dd405 100644 --- a/marker/processors/equation.py +++ b/marker/processors/equation.py @@ -4,13 +4,29 @@ from texify.model.model import GenerateVisionEncoderDecoderModel from tqdm import tqdm -from marker.settings import settings from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document +from marker.settings import settings class EquationProcessor(BaseProcessor): + """ + A processor for recognizing equations in the document. + + Attributes: + model_max_length (int): + The maximum number of tokens to allow for the Texify model. + Default is 384. + + batch_size (int): + The batch size to use for the Texify model. + Default is None, which will use the default batch size for the model. + + token_buffer (int): + The number of tokens to buffer above max for the Texify model. + Default is 256. 
+ """ block_types = (BlockTypes.Equation, ) model_max_length = 384 batch_size = None diff --git a/marker/processors/ignoretext.py b/marker/processors/ignoretext.py index 3e85d04e..eefd4e96 100644 --- a/marker/processors/ignoretext.py +++ b/marker/processors/ignoretext.py @@ -6,9 +6,16 @@ class IgnoreTextProcessor(BaseProcessor): + """ + A processor for ignoring text blocks that are common elements in the document. + + Attributes: + common_element_threshold (float): + The minimum fraction of pages that a block must appear in to be considered a common element. + Default is 0.6. + """ block_types = (BlockTypes.Text,) common_element_threshold = .6 - max_blocks = 1 def __call__(self, document: Document): first_blocks = [] diff --git a/marker/processors/sectionheader.py b/marker/processors/sectionheader.py index 4fe2f7ea..f4d6ff3e 100644 --- a/marker/processors/sectionheader.py +++ b/marker/processors/sectionheader.py @@ -1,18 +1,39 @@ -from marker.processors import BaseProcessor -from marker.schema import BlockTypes -from marker.schema.document import Document - +import warnings from typing import Dict, List + import numpy as np from sklearn.cluster import KMeans from sklearn.exceptions import ConvergenceWarning +from marker.processors import BaseProcessor +from marker.schema import BlockTypes +from marker.schema.document import Document + # Ignore sklearn warning about not converging -import warnings warnings.filterwarnings("ignore", category=ConvergenceWarning) class SectionHeaderProcessor(BaseProcessor): + """ + A processor for recognizing section headers in the document. + + Attributes: + level_count (int): + The number of levels to use for headings. + Default is 4. + + merge_threshold (float): + The minimum gap between headings to consider them part of the same group. + Default is 0.25. + + default_level (int): + The default heading level to use if no heading level is detected. + Default is 2. 
+ + height_tolerance (float): + The minimum height of a heading to consider it a heading. + Default is 0.99. + """ block_types = (BlockTypes.SectionHeader, ) level_count = 4 merge_threshold = .25 diff --git a/marker/processors/table.py b/marker/processors/table.py index 37402961..3cff8afc 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -6,18 +6,38 @@ from tabled.assignment import assign_rows_columns from tabled.inference.recognition import get_cells, recognize_tables -from marker.settings import settings from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document +from marker.settings import settings class TableProcessor(BaseProcessor): + """ + A processor for recognizing tables in the document. + + Attributes: + detect_boxes (bool): + Whether to detect boxes for the table recognition model. + Default is False. + + detector_batch_size (int): + The batch size to use for the table detection model. + Default is None, which will use the default batch size for the model. + + table_rec_batch_size (int): + The batch size to use for the table recognition model. + Default is None, which will use the default batch size for the model. + + recognition_batch_size (int): + The batch size to use for the OCR text recognition model run on table contents. + Default is None, which will use the default batch size for the model. 
+ """ block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form) detect_boxes = False detector_batch_size = None table_rec_batch_size = None - ocr_batch_size = None + recognition_batch_size = None def __init__( self, @@ -77,7 +97,7 @@ def __call__(self, document: Document): needs_ocr, [self.table_rec_model, self.table_rec_model.processor, self.recognition_model, self.recognition_model.processor], table_rec_batch_size=self.get_table_rec_batch_size(), - ocr_batch_size=self.get_ocr_batch_size() + ocr_batch_size=self.get_recognition_batch_size() ) for table_d, table_res in zip(table_data, tables): @@ -101,9 +121,9 @@ def get_table_rec_batch_size(self): return 64 return 8 - def get_ocr_batch_size(self): - if self.ocr_batch_size is not None: - return self.ocr_batch_size + def get_recognition_batch_size(self): + if self.recognition_batch_size is not None: + return self.recognition_batch_size elif settings.TORCH_DEVICE_MODEL == "mps": return 32 elif settings.TORCH_DEVICE_MODEL == "cuda": diff --git a/marker/processors/text.py b/marker/processors/text.py index 09e3d806..b3ff0e20 100644 --- a/marker/processors/text.py +++ b/marker/processors/text.py @@ -10,6 +10,14 @@ class TextProcessor(BaseProcessor): + """ + A processor for merging text across pages and columns. + + Attributes: + column_gap_ratio (float): + The minimum ratio of the page width to the column gap to consider a column break. + Default is 0.02. + """ block_types = (BlockTypes.Text, BlockTypes.TextInlineMath) column_gap_ratio = 0.02 # column gaps are atleast 2% of the page width