Merge pull request #382 from VikParuchuri/dev-mose/marker-v2

Add Docstrings for Processors, Builders and Converters and `-l` to list them from the `convert.py` CLI + Misc Fixes
VikParuchuri · Nov 21, 2024 · 243ae0b · 243ae0b
2 parents c61f195 + 72ba6ab
commit 243ae0b
Show file tree

Hide file tree

Showing 17 changed files with 254 additions and 22 deletions.
diff --git a/convert.py b/convert.py
@@ -2,7 +2,6 @@
 
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
 os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
-os.environ["PDFTEXT_CPU_WORKERS"] = "1" # Avoid multiprocessing inside pdftext
 
 import argparse
 import torch.multiprocessing as mp
@@ -67,7 +66,7 @@ def process_single_pdf(args):
 @click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
 @click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
 @click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert")
-@click.option("--workers", type=int, default=5, help="Number of worker processes to use.")
+@click.option("--workers", type=int, default=3, help="Number of worker processes to use.")
 def main(in_folder: str, **kwargs):
     in_folder = os.path.abspath(in_folder)
     files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
@@ -84,6 +83,9 @@ def main(in_folder: str, **kwargs):
     if kwargs["max_files"]:
         files_to_convert = files_to_convert[:kwargs["max_files"]]
 
+    # Disable nested multiprocessing 
+    kwargs["disable_multiprocessing"] = True
+
     total_processes = min(len(files_to_convert), kwargs["workers"])
 
     try:

diff --git a/convert_single.py b/convert_single.py
@@ -1,20 +1,22 @@
 import os
+
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
 
 import time
 
 import click
 
+from marker.config.parser import ConfigParser
+from marker.config.printer import CustomClickPrinter
 from marker.converters.pdf import PdfConverter
 from marker.logger import configure_logging
 from marker.models import create_model_dict
 from marker.output import save_output
-from marker.config.parser import ConfigParser
 
 configure_logging()
 
 
-@click.command(help="Convert a single PDF to markdown.")
+@click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.")
 @click.argument("fpath", type=str)
 @ConfigParser.common_options
 def main(fpath: str, **kwargs):

diff --git a/marker/builders/document.py b/marker/builders/document.py
@@ -10,6 +10,18 @@
 
 
 class DocumentBuilder(BaseBuilder):
+    """
+    Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder.
+
+    Attributes:
+        lowres_image_dpi (int): 
+            DPI setting for low-resolution page images used for Layout and Line Detection.
+            Default is 96.
+
+        highres_image_dpi (int): 
+            DPI setting for high-resolution page images used for OCR.
+            Default is 192.
+    """
     lowres_image_dpi: int = 96
     highres_image_dpi: int = 192
 

diff --git a/marker/builders/layout.py b/marker/builders/layout.py
@@ -16,6 +16,22 @@
 
 
 class LayoutBuilder(BaseBuilder):
+    """
+    A builder for performing layout detection on PDF pages and merging the results into the document.
+
+    Attributes:
+        batch_size (int):
+            The batch size to use for the layout model.
+            Default is None, which will use the default batch size for the model.
+
+        layout_coverage_min_lines (int):
+            The minimum number of PdfProvider lines that must be covered by the layout model
+            to consider the lines from the PdfProvider valid. Default is 1.
+
+        layout_coverage_threshold (float):
+            The minimum coverage ratio required for the layout model to consider
+            the lines from the PdfProvider valid. Default is 0.3.
+    """
     batch_size = None
     layout_coverage_min_lines = 1
     layout_coverage_threshold = .3

diff --git a/marker/builders/ocr.py b/marker/builders/ocr.py
@@ -17,6 +17,21 @@
 
 
 class OcrBuilder(BaseBuilder):
+    """
+    A builder for performing OCR on PDF pages and merging the results into the document.
+
+    Attributes:
+        detection_batch_size (int):
+            The batch size to use for the detection model.
+            Default is None, which will use the default batch size for the model.
+
+        recognition_batch_size (int):
+            The batch size to use for the recognition model.
+            Default is None, which will use the default batch size for the model.
+
+        languages (List[str]):
+            A list of languages to use for OCR. Default is None.
+    """
     recognition_batch_size: int | None = None
     detection_batch_size: int | None = None
     languages: List[str] | None = None
@@ -51,7 +66,7 @@ def ocr_extraction(self, document: Document, provider: PdfProvider) -> ProviderP
         page_list = [page for page in document.pages if page.text_extraction_method == "surya"]
         recognition_results = run_ocr(
             images=[page.lowres_image for page in page_list],
-            langs=[None] * len(page_list),
+            langs=[self.languages] * len(page_list),
             det_model=self.detection_model,
             det_processor=self.detection_model.processor,
             rec_model=self.recognition_model,

diff --git a/marker/builders/structure.py b/marker/builders/structure.py
@@ -7,6 +7,18 @@
 
 
 class StructureBuilder(BaseBuilder):
+    """
+    A builder for grouping blocks together based on their structure.
+
+    Attributes:
+        gap_threshold (float):
+            The minimum gap between blocks to consider them part of the same group.
+            Default is 0.05.
+
+        list_gap_threshold (float):
+            The minimum gap between list items to consider them part of the same group.
+            Default is 0.1.
+    """
     gap_threshold: int = .05
     list_gap_threshold: int = .1
 

diff --git a/marker/config/parser.py b/marker/config/parser.py
@@ -31,6 +31,8 @@ def common_options(fn):
         fn = click.option("--config_json", type=str, default=None,
                           help="Path to JSON file with additional configuration.")(fn)
         fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
+        fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
+        fn = click.option('-l', is_flag=True, help="List available builders, processors and converters")(fn)
         return fn
 
     def generate_config_dict(self) -> Dict[str, any]:
@@ -57,6 +59,9 @@ def generate_config_dict(self) -> Dict[str, any]:
                     if v:
                         with open(v, "r") as f:
                             config.update(json.load(f))
+                case "disable_multiprocessing":
+                    if v:
+                        config["pdftext_workers"] = 1
         return config
 
     def get_renderer(self):
@@ -94,4 +99,3 @@ def get_output_folder(self, filepath: str):
     def get_base_filename(self, filepath: str):
         basename = os.path.basename(filepath)
         return os.path.splitext(basename)[0]
-
diff --git a/marker/config/printer.py b/marker/config/printer.py
@@ -0,0 +1,54 @@
+import importlib
+import inspect
+import pkgutil
+
+import click
+
+from marker.builders import BaseBuilder
+from marker.converters import BaseConverter
+from marker.processors import BaseProcessor
+
+
+def find_subclasses(base_class):
+    """
+    Collect every subclass of `base_class` found in the submodules of the package that defines it.
+    Returns a dict of class name -> class; it stays empty when that module is not a package.
+    """
+    found = {}
+    root_name = base_class.__module__
+    root = importlib.import_module(root_name)
+    if not hasattr(root, '__path__'):
+        return found
+    for _, submodule_name, _ in pkgutil.walk_packages(root.__path__, root_name + "."):
+        try:
+            for name, candidate in inspect.getmembers(importlib.import_module(submodule_name), inspect.isclass):
+                if candidate is not base_class and issubclass(candidate, base_class):
+                    found[name] = candidate
+        except ImportError:
+            continue  # best effort: submodules that cannot be imported are skipped
+    return found
+
+
+class CustomClickPrinter(click.Command):
+    """Click command that appends a config tip to its help text and can list Marker's documented classes."""
+
+    def get_help(self, ctx):
+        """Return the standard help text for `ctx` followed by the configuration tip."""
+        additional_help = (
+            "\n\nTip: Use 'config --help' to display all the attributes of the Builders, Processors, and Converters in Marker."
+        )
+        # Return rather than echo: click echoes get_help()'s result itself, so a None return printed a stray blank line.
+        return super().get_help(ctx) + additional_help
+
+    def parse_args(self, ctx, args):
+        """Echo the docstrings of all Builders/Processors/Converters and exit when `config` and `--help` are both in args."""
+        if 'config' in args and '--help' in args:
+            click.echo("Here is a list of all the Builders, Processors, and Converters in Marker along with their attributes:")
+            for base in (BaseBuilder, BaseProcessor, BaseConverter):
+                click.echo(f"{base.__name__.removeprefix('Base')}s:\n")
+                for class_name, class_type in find_subclasses(base).items():
+                    doc = class_type.__doc__
+                    if doc and "Attributes:" in doc:  # skip classes whose docstring has no Attributes section
+                        click.echo(f"  {class_name}: {doc}")
+            ctx.exit()
+        return super().parse_args(ctx, args)  # propagate click's return value instead of dropping it
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
@@ -28,6 +28,16 @@
 
 
 class PdfConverter(BaseConverter):
+    """
+    A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats.
+
+    Attributes:
+        override_map (Dict[BlockTypes, Type[Block]]): 
+            A mapping to override the default block classes for specific block types. 
+            The keys are `BlockTypes` enum values, representing the types of blocks, 
+            and the values are corresponding `Block` class implementations to use 
+            instead of the defaults.
+    """
     override_map: Dict[BlockTypes, Type[Block]] = defaultdict()
 
     def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | None = None, renderer: str | None = None, config=None):

diff --git a/marker/processors/code.py b/marker/processors/code.py
@@ -5,8 +5,10 @@
 
 
 class CodeProcessor(BaseProcessor):
+    """
+    A processor for formatting code blocks.
+    """
     block_types = (BlockTypes.Code, )
-    y_top_threshold = 2 # pixels
 
     def __call__(self, document: Document):
         for page in document.pages:

diff --git a/marker/processors/debug.py b/marker/processors/debug.py
@@ -4,13 +4,41 @@
 import requests
 from PIL import Image, ImageDraw, ImageFont
 
-from marker.settings import settings
 from marker.processors import BaseProcessor
 from marker.schema import BlockTypes
 from marker.schema.document import Document
+from marker.settings import settings
 
 
 class DebugProcessor(BaseProcessor):
+    """
+    A processor for debugging the document.
+
+    Attributes:
+        debug_data_folder (str):
+            The folder to dump debug data to.
+            Default is "debug_data".
+
+        debug_layout_images (bool):
+            Whether to dump layout debug images.
+            Default is False.
+
+        debug_pdf_images (bool):
+            Whether to dump PDF debug images.
+            Default is False.
+
+        debug_json (bool):
+            Whether to dump block debug data.
+            Default is False.
+
+        render_font (str):
+            The path to the font to use for rendering debug images.
+            Default is "GoNotoCurrent-Regular.ttf" in the FONT_DIR folder.
+
+        font_dl_path (str):
+            The path to download the font from.
+            Default is "https://github.com/satbyy/go-noto-universal/releases/download/v7.0".
+    """
     block_types = tuple()
     debug_data_folder: str = "debug_data"
     debug_layout_images: bool = False

diff --git a/marker/processors/document_toc.py b/marker/processors/document_toc.py
@@ -4,6 +4,9 @@
 
 
 class DocumentTOCProcessor(BaseProcessor):
+    """
+    A processor for generating a table of contents for the document.
+    """
     block_types = (BlockTypes.SectionHeader, )
 
     def __call__(self, document: Document):
@@ -19,4 +22,4 @@ def __call__(self, document: Document):
                     "page_id": page.page_id,
                     "polygon": block.polygon.polygon
                 })
-        document.table_of_contents = toc
+        document.table_of_contents = toc
diff --git a/marker/processors/equation.py b/marker/processors/equation.py
@@ -4,13 +4,29 @@
 from texify.model.model import GenerateVisionEncoderDecoderModel
 from tqdm import tqdm
 
-from marker.settings import settings
 from marker.processors import BaseProcessor
 from marker.schema import BlockTypes
 from marker.schema.document import Document
+from marker.settings import settings
 
 
 class EquationProcessor(BaseProcessor):
+    """
+    A processor for recognizing equations in the document.
+
+    Attributes:
+        model_max_length (int):
+            The maximum number of tokens to allow for the Texify model.
+            Default is 384.
+
+        batch_size (int):
+            The batch size to use for the Texify model.
+            Default is None, which will use the default batch size for the model.
+
+        token_buffer (int):
+            The number of tokens to buffer above max for the Texify model.
+            Default is 256.
+    """
     block_types = (BlockTypes.Equation, )
     model_max_length = 384
     batch_size = None

diff --git a/marker/processors/ignoretext.py b/marker/processors/ignoretext.py
@@ -6,9 +6,16 @@
 
 
 class IgnoreTextProcessor(BaseProcessor):
+    """
+    A processor for ignoring text blocks that are common elements in the document.
+
+    Attributes:
+        common_element_threshold (float):
+            The minimum fraction of pages that a block must appear in to be considered a common element.
+            Default is 0.6.
+    """
     block_types = (BlockTypes.Text,)
     common_element_threshold = .6
-    max_blocks = 1
 
     def __call__(self, document: Document):
         first_blocks = []