Improve force ocr, enable parallel factor below 1

VikParuchuri · Dec 18, 2023 · 1d1ec20 · 1d1ec20
1 parent 844833f
commit 1d1ec20
Show file tree

Hide file tree

Showing 10 changed files with 101 additions and 14 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -17,7 +17,9 @@ jobs:
         with:
           python-version: 3.11
       - name: Install system dependencies
-        run: cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y
+        run: |
+          sudo apt-get update
+          cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y
       - name: Show tessdata folders
         run: ls /usr/share/tesseract-ocr/
       - name: Install python dependencies

diff --git a/README.md b/README.md
@@ -51,7 +51,7 @@ PDF is a tricky format, so marker will not always work perfectly.  Here are some
 - Marker will convert fewer equations to latex than nougat.  This is because it has to first detect equations, then convert them without hallucination.
 - Whitespace and indentations are not always respected.
 - Not all lines/spans will be joined properly.
-- Only languages similar to English (Spanish, French, German, Russian, etc) are supported.  Languages with different character sets (Chinese, Japanese, Korean, etc) are not.
+- Languages similar to English (Spanish, French, German, Russian, etc) have the best support. There is provisional support for Chinese, Japanese, Korean, and Hindi, but it may not work as well.
 - This works best on digital PDFs that won't require a lot of OCR.  It's optimized for speed, and limited OCR is used to fix errors.
 
 # Installation
@@ -88,6 +88,7 @@ First, clone the repo:
 - Install python requirements
   - `poetry install`
   - `poetry shell` to activate your poetry venv
+- On ARM Macs (M1+), make sure to set the `TORCH_DEVICE` setting to `mps` (more details below) for a speedup
 
 # Usage
 

diff --git a/marker/convert.py b/marker/convert.py
@@ -92,7 +92,7 @@ def convert_single_pdf(
         tess_lang,
         spell_lang,
         max_pages=max_pages,
-        parallel=parallel_factor * settings.OCR_PARALLEL_WORKERS
+        parallel=int(parallel_factor * settings.OCR_PARALLEL_WORKERS)
     )
 
     out_meta["toc"] = toc
@@ -109,7 +109,7 @@ def convert_single_pdf(
         doc,
         blocks,
         layoutlm_model,
-        batch_size=settings.LAYOUT_BATCH_SIZE * parallel_factor
+        batch_size=int(settings.LAYOUT_BATCH_SIZE * parallel_factor)
     )
 
     # Find headers and footers
@@ -125,7 +125,7 @@ def convert_single_pdf(
         doc,
         blocks,
         order_model,
-        batch_size=settings.ORDERER_BATCH_SIZE * parallel_factor
+        batch_size=int(settings.ORDERER_BATCH_SIZE * parallel_factor)
     )
 
     # Fix code blocks
@@ -148,7 +148,7 @@ def convert_single_pdf(
         blocks,
         block_types,
         nougat_model,
-        batch_size=settings.NOUGAT_BATCH_SIZE * parallel_factor
+        batch_size=int(settings.NOUGAT_BATCH_SIZE * parallel_factor)
     )
     out_meta["block_stats"]["equations"] = eq_stats
 

diff --git a/marker/debug/data.py b/marker/debug/data.py
@@ -11,7 +11,7 @@
 
 
 def dump_nougat_debug_data(doc, images, converted_spans):
-    if not settings.DEBUG_DATA_FOLDER:
+    if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL == 0:
         return
 
     if len(images) == 0:
@@ -44,7 +44,7 @@ def dump_nougat_debug_data(doc, images, converted_spans):
 
 
 def dump_bbox_debug_data(doc, blocks: List[Page]):
-    if not settings.DEBUG_DATA_FOLDER:
+    if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
         return
 
     # Remove extension from doc name

diff --git a/marker/ocr/page.py b/marker/ocr/page.py
@@ -53,7 +53,8 @@ def ocr_entire_page_ocrmp(page, lang: str, spellchecker: Optional[SpellChecker]
         outbytes,
         language=lang,
         output_type="pdf",
-        redo_ocr=True,
+        redo_ocr=None if settings.OCR_ALL_PAGES else True,
+        force_ocr=True if settings.OCR_ALL_PAGES else None,
         progress_bar=False,
         optimize=False,
         fast_web_view=1e6,

diff --git a/marker/settings.py b/marker/settings.py
@@ -37,6 +37,10 @@ class Settings(BaseSettings):
         "French": "fra",
         "German": "deu",
         "Russian": "rus",
+        "Chinese": "chi_sim",
+        "Japanese": "jpn",
+        "Korean": "kor",
+        "Hindi": "hin",
     }
     TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
     SPELLCHECK_LANGUAGES: Dict = {
@@ -46,6 +50,10 @@ class Settings(BaseSettings):
         "French": "fr",
         "German": "de",
         "Russian": "ru",
+        "Chinese": None,
+        "Japanese": None,
+        "Korean": None,
+        "Hindi": None,
     }
     OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted
     OCR_PARALLEL_WORKERS: int = 2 # How many CPU workers to use for OCR
@@ -101,6 +109,7 @@ class Settings(BaseSettings):
     # Debug
     DEBUG: bool = False # Enable debug logging
     DEBUG_DATA_FOLDER: Optional[str] = None
+    DEBUG_LEVEL: int = 0 # 0 to 2, 2 means log everything
 
     @computed_field
     @property

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,9 +1,12 @@
 [tool.poetry]
 name = "marker"
 version = "0.1.0"
-description = ""
-authors = ["Vik Paruchuri <[email protected]>"]
+description = "Convert PDF to markdown with high speed and accuracy."
+authors = ["Vik Paruchuri <[email protected]>"]
 readme = "README.md"
+license = "GPL-3.0-or-later"
+repository = "https://github.com/VikParuchuri/marker"
+keywords = ["pdf", "markdown", "ocr", "nlp"]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.13"
@@ -29,6 +32,7 @@ ftfy = "^6.1.1"
 nltk = "^3.8.1"
 ocrmypdf = "^15.4.0"
 bitsandbytes = "^0.41.2.post2"
+grpcio = "^1.60.0"
 
 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"

diff --git a/scripts/install/apt-requirements.txt b/scripts/install/apt-requirements.txt
@@ -7,4 +7,8 @@ tesseract-ocr-deu
 tesseract-ocr-por
 tesseract-ocr-spa
 tesseract-ocr-rus
-tesseract-ocr-fra
+tesseract-ocr-fra
+tesseract-ocr-chi-sim
+tesseract-ocr-jpn
+tesseract-ocr-kor
+tesseract-ocr-hin
diff --git a/scripts/markdown_to_pdf.sh b/scripts/markdown_to_pdf.sh
@@ -7,4 +7,4 @@ if [ $# -ne 2 ]; then
     exit 1
 fi
 
-pandoc $1 $2 --pdf-engine=xelatex --include-in-header=header.tex
+pandoc $1 -o $2 --pdf-engine=xelatex --include-in-header=header.tex