Skip to content

Commit

Permalink
Improve force ocr, enable parallel factor below 1
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Dec 18, 2023
1 parent 844833f commit 0d8d683
Show file tree
Hide file tree
Showing 9 changed files with 88 additions and 12 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ First, clone the repo:
- Install python requirements
- `poetry install`
- `poetry shell` to activate your poetry venv
- On ARM Macs (M1 and later), make sure to set the `TORCH_DEVICE` setting to `mps` (more details below) for a speedup

# Usage

Expand Down
8 changes: 4 additions & 4 deletions marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def convert_single_pdf(
tess_lang,
spell_lang,
max_pages=max_pages,
parallel=parallel_factor * settings.OCR_PARALLEL_WORKERS
parallel=int(parallel_factor * settings.OCR_PARALLEL_WORKERS)
)

out_meta["toc"] = toc
Expand All @@ -109,7 +109,7 @@ def convert_single_pdf(
doc,
blocks,
layoutlm_model,
batch_size=settings.LAYOUT_BATCH_SIZE * parallel_factor
batch_size=int(settings.LAYOUT_BATCH_SIZE * parallel_factor)
)

# Find headers and footers
Expand All @@ -125,7 +125,7 @@ def convert_single_pdf(
doc,
blocks,
order_model,
batch_size=settings.ORDERER_BATCH_SIZE * parallel_factor
batch_size=int(settings.ORDERER_BATCH_SIZE * parallel_factor)
)

# Fix code blocks
Expand All @@ -148,7 +148,7 @@ def convert_single_pdf(
blocks,
block_types,
nougat_model,
batch_size=settings.NOUGAT_BATCH_SIZE * parallel_factor
batch_size=int(settings.NOUGAT_BATCH_SIZE * parallel_factor)
)
out_meta["block_stats"]["equations"] = eq_stats

Expand Down
4 changes: 2 additions & 2 deletions marker/debug/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


def dump_nougat_debug_data(doc, images, converted_spans):
if not settings.DEBUG_DATA_FOLDER:
if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL == 0:
return

if len(images) == 0:
Expand Down Expand Up @@ -44,7 +44,7 @@ def dump_nougat_debug_data(doc, images, converted_spans):


def dump_bbox_debug_data(doc, blocks: List[Page]):
if not settings.DEBUG_DATA_FOLDER:
if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
return

# Remove extension from doc name
Expand Down
3 changes: 2 additions & 1 deletion marker/ocr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ def ocr_entire_page_ocrmp(page, lang: str, spellchecker: Optional[SpellChecker]
outbytes,
language=lang,
output_type="pdf",
redo_ocr=True,
redo_ocr=None if settings.OCR_ALL_PAGES else True,
force_ocr=True if settings.OCR_ALL_PAGES else None,
progress_bar=False,
optimize=False,
fast_web_view=1e6,
Expand Down
3 changes: 3 additions & 0 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class Settings(BaseSettings):
"French": "fra",
"German": "deu",
"Russian": "rus",
"Chinese": "chi_sim",
}
TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
SPELLCHECK_LANGUAGES: Dict = {
Expand All @@ -46,6 +47,7 @@ class Settings(BaseSettings):
"French": "fr",
"German": "de",
"Russian": "ru",
"Chinese": None
}
OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted
OCR_PARALLEL_WORKERS: int = 2 # How many CPU workers to use for OCR
Expand Down Expand Up @@ -101,6 +103,7 @@ class Settings(BaseSettings):
# Debug
DEBUG: bool = False # Enable debug logging
DEBUG_DATA_FOLDER: Optional[str] = None
DEBUG_LEVEL: int = 0 # 0 to 2, 2 means log everything

@computed_field
@property
Expand Down
68 changes: 67 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 6 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
[tool.poetry]
name = "marker"
version = "0.1.0"
description = ""
authors = ["Vik Paruchuri <[email protected]>"]
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
license = "GPL-3.0-or-later"
repository = "https://github.com/VikParuchuri/marker"
keywords = ["pdf", "markdown", "ocr", "nlp"]

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
Expand All @@ -29,6 +32,7 @@ ftfy = "^6.1.1"
nltk = "^3.8.1"
ocrmypdf = "^15.4.0"
bitsandbytes = "^0.41.2.post2"
grpcio = "^1.60.0"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
Expand Down
3 changes: 2 additions & 1 deletion scripts/install/apt-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ tesseract-ocr-deu
tesseract-ocr-por
tesseract-ocr-spa
tesseract-ocr-rus
tesseract-ocr-fra
tesseract-ocr-fra
tesseract-ocr-chi-sim
2 changes: 1 addition & 1 deletion scripts/markdown_to_pdf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ if [ $# -ne 2 ]; then
exit 1
fi

pandoc $1 $2 --pdf-engine=xelatex --include-in-header=header.tex
pandoc $1 -o $2 --pdf-engine=xelatex --include-in-header=header.tex

0 comments on commit 0d8d683

Please sign in to comment.