diff --git a/.gitignore b/.gitignore index 6f036496..7c44ec6b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ test_data training wandb *.dat +benchmark_data +report.json # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/poetry.lock b/poetry.lock index 401fd057..f5735900 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2007,13 +2007,13 @@ testing = ["docopt", "pytest"] [[package]] name = "pdftext" -version = "0.3.8" +version = "0.3.10" description = "Extract structured text from pdfs quickly" optional = false python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9" files = [ - {file = "pdftext-0.3.8-py3-none-any.whl", hash = "sha256:d11aeaf792b96ea878139ad7cd64a92d61cc5e01fec4f3b85ca6da1043d98cbe"}, - {file = "pdftext-0.3.8.tar.gz", hash = "sha256:1fbf53f0dc636b6863ccbbb6aed693c0e435b531a55a58e3d23bd125a2e0c616"}, + {file = "pdftext-0.3.10-py3-none-any.whl", hash = "sha256:99bd900d0d0692df06719c07ce10a859750ade3eb7f10c543f637118417497f9"}, + {file = "pdftext-0.3.10.tar.gz", hash = "sha256:90de726e818fb5683a0616cabb1a75a32a7224e873c3058006c93da6e440c66c"}, ] [package.dependencies] @@ -3379,13 +3379,13 @@ tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] [[package]] name = "surya-ocr" -version = "0.4.9" +version = "0.4.8" description = "OCR, layout, reading order, and line detection in 90+ languages" optional = false python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9" files = [ - {file = "surya_ocr-0.4.9-py3-none-any.whl", hash = "sha256:1634b14d39e2a07baf6a3ccbdefe4d970ba88b97ec310276a8a9b1780da1f3c5"}, - {file = "surya_ocr-0.4.9.tar.gz", hash = "sha256:c01e2eba24180045f1ed3fde00bf0074036e43acdd27eb2aa52b36d592bca0ee"}, + {file = "surya_ocr-0.4.8-py3-none-any.whl", hash = "sha256:6753bf295581f44b3e3452de563a3730a6c91500ea09090927154a1edfe57364"}, + {file = "surya_ocr-0.4.8.tar.gz", hash = "sha256:01e97db0d43941637ff0ddededa46491f7b0b937dba5c7fbba4ee75177991465"}, ] [package.dependencies] @@ -3975,4 +3975,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13,!=3.9.7" -content-hash = "ea6029f8fe580673f23861dadcab04af7dc5ed8085c587bde453dc26c978c21b" +content-hash = "651ca6f147ab76aaad9508d3719bc5d51d28a56982295877a769df8d5491fc53" diff --git a/pyproject.toml b/pyproject.toml index 62928a49..8b824a42 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,10 +33,10 @@ tabulate = "^0.9.0" ftfy = "^6.1.1" texify = "^0.1.9" rapidfuzz = "^3.8.1" -surya-ocr = "^0.4.9" +surya-ocr = "0.4.8" filetype = "^1.2.0" regex = "^2024.4.28" -pdftext = "0.3.8" +pdftext = "^0.3.10" grpcio = "^1.63.0" [tool.poetry.group.dev.dependencies]