Skip to content

Commit

Permalink
disable multiprocessing via config
Browse files (browse the repository at this point in the history)
  • Loading branch information
iammosespaulr committed Nov 21, 2024
1 parent c61f195 commit 7442256
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 3 deletions.
6 changes: 4 additions & 2 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
os.environ["PDFTEXT_CPU_WORKERS"] = "1" # Avoid multiprocessing inside pdftext

import argparse
import torch.multiprocessing as mp
Expand Down Expand Up @@ -67,7 +66,7 @@ def process_single_pdf(args):
@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
@click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
@click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert")
@click.option("--workers", type=int, default=5, help="Number of worker processes to use.")
@click.option("--workers", type=int, default=3, help="Number of worker processes to use.")
def main(in_folder: str, **kwargs):
in_folder = os.path.abspath(in_folder)
files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
Expand All @@ -84,6 +83,9 @@ def main(in_folder: str, **kwargs):
if kwargs["max_files"]:
files_to_convert = files_to_convert[:kwargs["max_files"]]

# Disable nested multiprocessing
kwargs["disable_multiprocessing"] = True

total_processes = min(len(files_to_convert), kwargs["workers"])

try:
Expand Down
4 changes: 3 additions & 1 deletion marker/config/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ def generate_config_dict(self) -> Dict[str, any]:
if v:
with open(v, "r") as f:
config.update(json.load(f))
case "disable_multiprocessing":
if v:
config["pdftext_workers"] = 1
return config

def get_renderer(self):
Expand Down Expand Up @@ -94,4 +97,3 @@ def get_output_folder(self, filepath: str):
def get_base_filename(self, filepath: str):
basename = os.path.basename(filepath)
return os.path.splitext(basename)[0]

0 comments on commit 7442256

Please sign in to comment.