From 0d8d6832b232954425f8071a7b0838a26fe9c0a7 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Sun, 17 Dec 2023 22:03:47 -0800 Subject: [PATCH] Improve force ocr, enable parallel factor below 1 --- README.md | 1 + marker/convert.py | 8 ++-- marker/debug/data.py | 4 +- marker/ocr/page.py | 3 +- marker/settings.py | 3 ++ poetry.lock | 68 +++++++++++++++++++++++++++- pyproject.toml | 8 +++- scripts/install/apt-requirements.txt | 3 +- scripts/markdown_to_pdf.sh | 2 +- 9 files changed, 88 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index cbb91905..651ca6fe 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,7 @@ First, clone the repo: - Install python requirements - `poetry install` - `poetry shell` to activate your poetry venv +- On ARM macs (M1+), make sure to set the `TORCH_DEVICE` setting to `mps` (more details below) for a speedup # Usage diff --git a/marker/convert.py b/marker/convert.py index c56a1d5f..79431102 100644 --- a/marker/convert.py +++ b/marker/convert.py @@ -92,7 +92,7 @@ def convert_single_pdf( tess_lang, spell_lang, max_pages=max_pages, - parallel=parallel_factor * settings.OCR_PARALLEL_WORKERS + parallel=int(parallel_factor * settings.OCR_PARALLEL_WORKERS) ) out_meta["toc"] = toc @@ -109,7 +109,7 @@ def convert_single_pdf( doc, blocks, layoutlm_model, - batch_size=settings.LAYOUT_BATCH_SIZE * parallel_factor + batch_size=int(settings.LAYOUT_BATCH_SIZE * parallel_factor) ) # Find headers and footers @@ -125,7 +125,7 @@ def convert_single_pdf( doc, blocks, order_model, - batch_size=settings.ORDERER_BATCH_SIZE * parallel_factor + batch_size=int(settings.ORDERER_BATCH_SIZE * parallel_factor) ) # Fix code blocks @@ -148,7 +148,7 @@ def convert_single_pdf( blocks, block_types, nougat_model, - batch_size=settings.NOUGAT_BATCH_SIZE * parallel_factor + batch_size=int(settings.NOUGAT_BATCH_SIZE * parallel_factor) ) out_meta["block_stats"]["equations"] = eq_stats diff --git a/marker/debug/data.py b/marker/debug/data.py index 8d6ac537..97418d74 100644 --- a/marker/debug/data.py +++ b/marker/debug/data.py @@ -11,7 +11,7 @@ def dump_nougat_debug_data(doc, images, converted_spans): - if not settings.DEBUG_DATA_FOLDER: + if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL == 0: return if len(images) == 0: @@ -44,7 +44,7 @@ def dump_nougat_debug_data(doc, images, converted_spans): def dump_bbox_debug_data(doc, blocks: List[Page]): - if not settings.DEBUG_DATA_FOLDER: + if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2: return # Remove extension from doc name diff --git a/marker/ocr/page.py b/marker/ocr/page.py index 8cc7c546..140b4823 100644 --- a/marker/ocr/page.py +++ b/marker/ocr/page.py @@ -53,7 +53,8 @@ def ocr_entire_page_ocrmp(page, lang: str, spellchecker: Optional[SpellChecker] outbytes, language=lang, output_type="pdf", - redo_ocr=True, + redo_ocr=None if settings.OCR_ALL_PAGES else True, + force_ocr=True if settings.OCR_ALL_PAGES else None, progress_bar=False, optimize=False, fast_web_view=1e6, diff --git a/marker/settings.py b/marker/settings.py index 1af597f8..d1912711 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -37,6 +37,7 @@ class Settings(BaseSettings): "French": "fra", "German": "deu", "Russian": "rus", + "Chinese": "chi_sim", } TESSERACT_TIMEOUT: int = 20 # When to give up on OCR SPELLCHECK_LANGUAGES: Dict = { @@ -46,6 +47,7 @@ class Settings(BaseSettings): "French": "fr", "German": "de", "Russian": "ru", + "Chinese": None } OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted OCR_PARALLEL_WORKERS: int = 2 # How many CPU workers to use for OCR @@ -101,6 +103,7 @@ class Settings(BaseSettings): # Debug DEBUG: bool = False # Enable debug logging DEBUG_DATA_FOLDER: Optional[str] = None + DEBUG_LEVEL: int = 0 # 0 to 2, 2 means log everything @computed_field @property diff --git a/poetry.lock b/poetry.lock index 145f793c..3349f3eb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -968,6 +968,72 @@ files = [ [package.dependencies] wcwidth = ">=0.2.12,<0.3.0" +[[package]] +name = "grpcio" +version = "1.60.0" +description = "HTTP/2-based RPC framework" +optional = false +python-versions = ">=3.7" +files = [ + {file = "grpcio-1.60.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:d020cfa595d1f8f5c6b343530cd3ca16ae5aefdd1e832b777f9f0eb105f5b139"}, + {file = "grpcio-1.60.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:b98f43fcdb16172dec5f4b49f2fece4b16a99fd284d81c6bbac1b3b69fcbe0ff"}, + {file = "grpcio-1.60.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:20e7a4f7ded59097c84059d28230907cd97130fa74f4a8bfd1d8e5ba18c81491"}, + {file = "grpcio-1.60.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:452ca5b4afed30e7274445dd9b441a35ece656ec1600b77fff8c216fdf07df43"}, + {file = "grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43e636dc2ce9ece583b3e2ca41df5c983f4302eabc6d5f9cd04f0562ee8ec1ae"}, + {file = "grpcio-1.60.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e306b97966369b889985a562ede9d99180def39ad42c8014628dd3cc343f508"}, + {file = "grpcio-1.60.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f897c3b127532e6befdcf961c415c97f320d45614daf84deba0a54e64ea2457b"}, + {file = "grpcio-1.60.0-cp310-cp310-win32.whl", hash = "sha256:b87efe4a380887425bb15f220079aa8336276398dc33fce38c64d278164f963d"}, + {file = "grpcio-1.60.0-cp310-cp310-win_amd64.whl", hash = "sha256:a9c7b71211f066908e518a2ef7a5e211670761651039f0d6a80d8d40054047df"}, + {file = "grpcio-1.60.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:fb464479934778d7cc5baf463d959d361954d6533ad34c3a4f1d267e86ee25fd"}, + {file = "grpcio-1.60.0-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:4b44d7e39964e808b071714666a812049765b26b3ea48c4434a3b317bac82f14"}, + {file = "grpcio-1.60.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:90bdd76b3f04bdb21de5398b8a7c629676c81dfac290f5f19883857e9371d28c"}, + {file = "grpcio-1.60.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91229d7203f1ef0ab420c9b53fe2ca5c1fbeb34f69b3bc1b5089466237a4a134"}, + {file = "grpcio-1.60.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b36a2c6d4920ba88fa98075fdd58ff94ebeb8acc1215ae07d01a418af4c0253"}, + {file = "grpcio-1.60.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:297eef542156d6b15174a1231c2493ea9ea54af8d016b8ca7d5d9cc65cfcc444"}, + {file = "grpcio-1.60.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:87c9224acba0ad8bacddf427a1c2772e17ce50b3042a789547af27099c5f751d"}, + {file = "grpcio-1.60.0-cp311-cp311-win32.whl", hash = "sha256:95ae3e8e2c1b9bf671817f86f155c5da7d49a2289c5cf27a319458c3e025c320"}, + {file = "grpcio-1.60.0-cp311-cp311-win_amd64.whl", hash = "sha256:467a7d31554892eed2aa6c2d47ded1079fc40ea0b9601d9f79204afa8902274b"}, + {file = "grpcio-1.60.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:a7152fa6e597c20cb97923407cf0934e14224af42c2b8d915f48bc3ad2d9ac18"}, + {file = "grpcio-1.60.0-cp312-cp312-macosx_10_10_universal2.whl", hash = "sha256:7db16dd4ea1b05ada504f08d0dca1cd9b926bed3770f50e715d087c6f00ad748"}, + {file = "grpcio-1.60.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:b0571a5aef36ba9177e262dc88a9240c866d903a62799e44fd4aae3f9a2ec17e"}, + {file = "grpcio-1.60.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fd9584bf1bccdfff1512719316efa77be235469e1e3295dce64538c4773840b"}, + {file = "grpcio-1.60.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6a478581b1a1a8fdf3318ecb5f4d0cda41cacdffe2b527c23707c9c1b8fdb55"}, + {file = "grpcio-1.60.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:77c8a317f0fd5a0a2be8ed5cbe5341537d5c00bb79b3bb27ba7c5378ba77dbca"}, + {file = "grpcio-1.60.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1c30bb23a41df95109db130a6cc1b974844300ae2e5d68dd4947aacba5985aa5"}, + {file = "grpcio-1.60.0-cp312-cp312-win32.whl", hash = "sha256:2aef56e85901c2397bd557c5ba514f84de1f0ae5dd132f5d5fed042858115951"}, + {file = "grpcio-1.60.0-cp312-cp312-win_amd64.whl", hash = "sha256:e381fe0c2aa6c03b056ad8f52f8efca7be29fb4d9ae2f8873520843b6039612a"}, + {file = "grpcio-1.60.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:92f88ca1b956eb8427a11bb8b4a0c0b2b03377235fc5102cb05e533b8693a415"}, + {file = "grpcio-1.60.0-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:e278eafb406f7e1b1b637c2cf51d3ad45883bb5bd1ca56bc05e4fc135dfdaa65"}, + {file = "grpcio-1.60.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:a48edde788b99214613e440fce495bbe2b1e142a7f214cce9e0832146c41e324"}, + {file = "grpcio-1.60.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de2ad69c9a094bf37c1102b5744c9aec6cf74d2b635558b779085d0263166454"}, + {file = "grpcio-1.60.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:073f959c6f570797272f4ee9464a9997eaf1e98c27cb680225b82b53390d61e6"}, + {file = "grpcio-1.60.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c826f93050c73e7769806f92e601e0efdb83ec8d7c76ddf45d514fee54e8e619"}, + {file = "grpcio-1.60.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:9e30be89a75ee66aec7f9e60086fadb37ff8c0ba49a022887c28c134341f7179"}, + {file = "grpcio-1.60.0-cp37-cp37m-win_amd64.whl", hash = "sha256:b0fb2d4801546598ac5cd18e3ec79c1a9af8b8f2a86283c55a5337c5aeca4b1b"}, + {file = "grpcio-1.60.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:9073513ec380434eb8d21970e1ab3161041de121f4018bbed3146839451a6d8e"}, + {file = "grpcio-1.60.0-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:74d7d9fa97809c5b892449b28a65ec2bfa458a4735ddad46074f9f7d9550ad13"}, + {file = "grpcio-1.60.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:1434ca77d6fed4ea312901122dc8da6c4389738bf5788f43efb19a838ac03ead"}, + {file = "grpcio-1.60.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e61e76020e0c332a98290323ecfec721c9544f5b739fab925b6e8cbe1944cf19"}, + {file = "grpcio-1.60.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675997222f2e2f22928fbba640824aebd43791116034f62006e19730715166c0"}, + {file = "grpcio-1.60.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5208a57eae445ae84a219dfd8b56e04313445d146873117b5fa75f3245bc1390"}, + {file = "grpcio-1.60.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:428d699c8553c27e98f4d29fdc0f0edc50e9a8a7590bfd294d2edb0da7be3629"}, + {file = "grpcio-1.60.0-cp38-cp38-win32.whl", hash = "sha256:83f2292ae292ed5a47cdcb9821039ca8e88902923198f2193f13959360c01860"}, + {file = "grpcio-1.60.0-cp38-cp38-win_amd64.whl", hash = "sha256:705a68a973c4c76db5d369ed573fec3367d7d196673fa86614b33d8c8e9ebb08"}, + {file = "grpcio-1.60.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:c193109ca4070cdcaa6eff00fdb5a56233dc7610216d58fb81638f89f02e4968"}, + {file = "grpcio-1.60.0-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:676e4a44e740deaba0f4d95ba1d8c5c89a2fcc43d02c39f69450b1fa19d39590"}, + {file = "grpcio-1.60.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:5ff21e000ff2f658430bde5288cb1ac440ff15c0d7d18b5fb222f941b46cb0d2"}, + {file = "grpcio-1.60.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c86343cf9ff7b2514dd229bdd88ebba760bd8973dac192ae687ff75e39ebfab"}, + {file = "grpcio-1.60.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fd3b3968ffe7643144580f260f04d39d869fcc2cddb745deef078b09fd2b328"}, + {file = "grpcio-1.60.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:30943b9530fe3620e3b195c03130396cd0ee3a0d10a66c1bee715d1819001eaf"}, + {file = "grpcio-1.60.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b10241250cb77657ab315270b064a6c7f1add58af94befa20687e7c8d8603ae6"}, + {file = "grpcio-1.60.0-cp39-cp39-win32.whl", hash = "sha256:79a050889eb8d57a93ed21d9585bb63fca881666fc709f5d9f7f9372f5e7fd03"}, + {file = "grpcio-1.60.0-cp39-cp39-win_amd64.whl", hash = "sha256:8a97a681e82bc11a42d4372fe57898d270a2707f36c45c6676e49ce0d5c41353"}, + {file = "grpcio-1.60.0.tar.gz", hash = "sha256:2199165a1affb666aa24adf0c97436686d0a61bc5fc113c037701fb7c7fceb96"}, +] + +[package.extras] +protobuf = ["grpcio-tools (>=1.60.0)"] + [[package]] name = "huggingface-hub" version = "0.19.4" @@ -5746,4 +5812,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "a5c103deeebe3f7f31384cf69884d893b123c89f30fa9122a2bd4067c1675843" +content-hash = "7d8f07f7b4ab7e802386b76d1add6ade5560636df131e8a7123436817638ad7c" diff --git a/pyproject.toml b/pyproject.toml index 8ca9daf9..ecf5481e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,12 @@ [tool.poetry] name = "marker" version = "0.1.0" -description = "" -authors = ["Vik Paruchuri "] +description = "Convert PDF to markdown with high speed and accuracy." +authors = ["Vik Paruchuri "] readme = "README.md" +license = "GPL-3.0-or-later" +repository = "https://github.com/VikParuchuri/marker" +keywords = ["pdf", "markdown", "ocr", "nlp"] [tool.poetry.dependencies] python = ">=3.9,<3.13" @@ -29,6 +32,7 @@ ftfy = "^6.1.1" nltk = "^3.8.1" ocrmypdf = "^15.4.0" bitsandbytes = "^0.41.2.post2" +grpcio = "^1.60.0" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" diff --git a/scripts/install/apt-requirements.txt b/scripts/install/apt-requirements.txt index 258059f5..f98c7c18 100644 --- a/scripts/install/apt-requirements.txt +++ b/scripts/install/apt-requirements.txt @@ -7,4 +7,5 @@ tesseract-ocr-deu tesseract-ocr-por tesseract-ocr-spa tesseract-ocr-rus -tesseract-ocr-fra \ No newline at end of file +tesseract-ocr-fra +tesseract-ocr-chi-sim \ No newline at end of file diff --git a/scripts/markdown_to_pdf.sh b/scripts/markdown_to_pdf.sh index 806aafca..710f1d29 100644 --- a/scripts/markdown_to_pdf.sh +++ b/scripts/markdown_to_pdf.sh @@ -7,4 +7,4 @@ if [ $# -ne 2 ]; then exit 1 fi -pandoc $1 $2 --pdf-engine=xelatex --include-in-header=header.tex \ No newline at end of file +pandoc $1 -o $2 --pdf-engine=xelatex --include-in-header=header.tex \ No newline at end of file