diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 00000000..7e1723ca --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,29 @@ +name: Python package +on: + push: + tags: + - "v*.*.*" +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: 3.11 + - name: Install python dependencies + run: | + pip install poetry + poetry install + poetry remove torch + poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu + - name: Build package + run: | + poetry build + - name: Publish package + env: + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + run: | + poetry config pypi-token.pypi "$PYPI_TOKEN" + poetry publish diff --git a/benchmark.py b/benchmark.py index 54341616..59c5de59 100644 --- a/benchmark.py +++ b/benchmark.py @@ -8,9 +8,6 @@ from marker.convert import convert_single_pdf from marker.logger import configure_logging from marker.models import load_all_models -from marker.ordering import load_ordering_model -from marker.segmentation import load_layout_model -from marker.cleaners.equations import load_nougat_model from marker.benchmark.scoring import score_text from marker.extract_text import naive_get_text import json @@ -18,7 +15,6 @@ import subprocess import shutil import fitz as pymupdf -from marker.settings import settings from tabulate import tabulate configure_logging() @@ -34,7 +30,7 @@ def nougat_prediction(pdf_filename, batch_size=1): return data -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a refernece folder with the correct markdown.") parser.add_argument("in_folder", help="Input PDF files") parser.add_argument("reference_folder", help="Reference folder with reference markdown files") @@ -126,3 +122,7 @@ def nougat_prediction(pdf_filename, batch_size=1): print("Scores by file") print(tabulate(score_table, headers=["Method", *score_headers])) + +if __name__ == "__main__": + main() + diff --git a/chunk_convert.py b/chunk_convert.py new file mode 100755 index 00000000..9dd45197 --- /dev/null +++ b/chunk_convert.py @@ -0,0 +1,19 @@ +import argparse +import subprocess + + +def main(): + parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.") + parser.add_argument("in_folder", help="Input folder with pdfs.") + parser.add_argument("out_folder", help="Output folder") + args = parser.parse_args() + + # Construct the command + cmd = f"./chunk_convert.sh {args.in_folder} {args.out_folder}" + + # Execute the shell script + subprocess.run(cmd, shell=True, check=True) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/chunk_convert.sh b/chunk_convert.sh old mode 100644 new mode 100755 diff --git a/convert.py b/convert.py old mode 100644 new mode 100755 index eb4499b6..a29d363a --- a/convert.py +++ b/convert.py @@ -45,7 +45,7 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option print(traceback.format_exc()) -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.") parser.add_argument("in_folder", help="Input folder with pdfs.") parser.add_argument("out_folder", help="Output folder") @@ -121,4 +121,8 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option progress_bar.update(1) # Shutdown ray to free resources - ray.shutdown() \ No newline at end of file + ray.shutdown() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/convert_single.py b/convert_single.py old mode 100644 new mode 100755 index efe11713..dabfd933 --- a/convert_single.py +++ b/convert_single.py @@ -3,13 +3,12 @@ from marker.convert import convert_single_pdf from marker.logger import configure_logging from marker.models import load_all_models -from marker.settings import settings import json configure_logging() -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser() parser.add_argument("filename", help="PDF file to parse") parser.add_argument("output", help="Output file name") @@ -26,4 +25,8 @@ out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json" with open(out_meta_filename, "w+") as f: - f.write(json.dumps(out_meta, indent=4)) \ No newline at end of file + f.write(json.dumps(out_meta, indent=4)) + + +if __name__ == "__main__": + main() diff --git a/marker/convert.py b/marker/convert.py index 79431102..fa6317de 100644 --- a/marker/convert.py +++ b/marker/convert.py @@ -13,7 +13,6 @@ from marker.markdown import merge_spans, merge_lines, get_full_text from marker.schema import Page, BlockType from typing import List, Dict, Tuple, Optional -from copy import deepcopy import re import magic from marker.settings import settings diff --git a/pyproject.toml b/pyproject.toml index ecf5481e..89df89b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,22 @@ [tool.poetry] -name = "marker" -version = "0.1.0" +name = "marker-pdf" +version = "0.1.1" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md" license = "GPL-3.0-or-later" repository = "https://github.com/VikParuchuri/marker" keywords = ["pdf", "markdown", "ocr", "nlp"] +packages = [ + {include = "marker"} +] +include = [ + "convert.py", + "convert_single.py", + "chunk_convert.sh", + "benchmark.py", + "chunk_convert.py", +] [tool.poetry.dependencies] python = ">=3.9,<3.13" @@ -37,6 +47,12 @@ grpcio = "^1.60.0" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" +[tool.poetry.scripts] +marker = "convert:main" +marker_single = "convert_single:main" +marker_benchmark = "benchmark:main" +marker_chunk_convert = "chunk_convert:main" + [build-system] requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" +build-backend = "poetry.core.masonry.api" \ No newline at end of file