opendatalab · jorgeolothar · Sep 3, 2024 · Sep 5, 2024 · Sep 5, 2024 · Sep 11, 2024
diff --git a/README.md b/README.md
@@ -258,6 +258,7 @@ Parameter explanations:
 - `--vis`: Whether to visualize the results; if yes, detection results including bounding boxes and categories will be visualized.
 - `--render`: Whether to render the recognized results, including LaTeX code for formulas and plain text, which will be rendered and placed in the detection boxes. Note: This process is very time-consuming, and also requires prior installation of `xelatex` and `imagemagic`.
 - `--batch-size`: Batch size for dataloader. Larger batch sizes are recommended, but smaller sizes require less GPU memory. Default is 128.
+- `--num-workers`: Number of workers used to preload data. 0 means the main process does the data loading; otherwise, the workers speed up I/O at the cost of additional memory. Default is 32.
 
 > This project is dedicated to using models for high-quality content extraction from documents on diversity. It does not involve reassembling the extracted content into new documents, such as converting PDFs to Markdown. For those needs, please refer to our other GitHub project: [MinerU](https://github.com/opendatalab/MinerU)
 

diff --git a/modules/latex2png.py b/modules/latex2png.py
@@ -175,7 +175,7 @@ def zhtext2pil(zh_string, word_size=18):
     word_num = len(zh_string)
     img = Image.new('RGB', ((word_size-3)*word_num, word_size), 'white')
     draw = ImageDraw.Draw(img)
-    fontText = ImageFont.truetype("simhei.ttf", 15, encoding="utf-8")
+    fontText = ImageFont.truetype("assets/fonts/simhei.ttf", 15, encoding="utf-8")
     draw.text((0, 1), zh_string, (0, 0, 0), font=fontText)
     # draw.rectangle([0, 0, img.size[0]-1, img.size[1]-1], fill=None, outline=(255,0,0), width=1)
     return img

diff --git a/pdf_extract.py b/pdf_extract.py
@@ -10,6 +10,7 @@
 import torch
 import numpy as np
 import gc
+import sys
 
 from paddleocr import draw_ocr
 from PIL import Image, ImageDraw, ImageFont
@@ -74,14 +75,15 @@ def __getitem__(self, idx):
         return image
 
 
-if __name__ == '__main__':
+def main(cli_args):
     parser = argparse.ArgumentParser()
     parser.add_argument('--pdf', type=str)
     parser.add_argument('--output', type=str, default="output")
     parser.add_argument('--batch-size', type=int, default=128)
+    parser.add_argument('--num-workers', type=int, default=32)
     parser.add_argument('--vis', action='store_true')
     parser.add_argument('--render', action='store_true')
-    args = parser.parse_args()
+    args = parser.parse_args(cli_args)
     print(args)
 
     tz = pytz.timezone('Asia/Shanghai')