From 409253439b1242016e2b28cda51358d8129766dd Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Tue, 5 Sep 2023 08:48:59 +0100 Subject: [PATCH 1/6] fitz/fitz.i: simplify required include path. We now use full path when including MuPDF headers. --- fitz/fitz.i | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fitz/fitz.i b/fitz/fitz.i index 3cad88b31..1341b7b29 100644 --- a/fitz/fitz.i +++ b/fitz/fitz.i @@ -113,8 +113,8 @@ EnsureOwnership(self)%} #define JM_BinFromChar(x) PyBytes_FromString(x) #define JM_BinFromCharSize(x, y) PyBytes_FromStringAndSize(x, (Py_ssize_t) y) -#include -#include +#include +#include #include // freetype includes >> -------------------------------------------------- #include From f2ac51231fe43a5e213f3ee6d320b8aada3c174e Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Tue, 5 Sep 2023 08:49:55 +0100 Subject: [PATCH 2/6] pipcl.py: moved sysconfig.get_path('platlib') code into new fn install_dir(). Also minor improvement to listing of wheel contents. --- pipcl.py | 56 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/pipcl.py b/pipcl.py index d08c11d13..af85482d6 100644 --- a/pipcl.py +++ b/pipcl.py @@ -626,7 +626,7 @@ def add_str(content, to_): _log( f'Have created wheel size={st.st_size}: {path}') with zipfile.ZipFile(path, compression=self.wheel_compression) as z: _log(f'Contents are:') - for zi in z.infolist(): + for zi in sorted(z.infolist(), key=lambda z: z.filename): _log(f' {zi.file_size: 10d} {zi.filename}') return os.path.basename(path) @@ -767,28 +767,10 @@ def install(self, record_path=None, root=None, verbose=False): items = list() if self.fn_build: items = self._call_fn_build( dict()) - - if root: - if windows(): - # If we are in a venv, `sysconfig.get_path('platlib')` - # can be absolute, e.g. - # `C:\\...\\venv-pypackage-3.11.1-64\\Lib\\site-packages`, so - # it's not clear how to append it to `root`. So we just use - # `root`. 
- root2 = root - else: - # E.g. if `root` is `install' and `sysconfig.get_path('platlib')` - # is `/usr/local/lib/python3.9/site-packages`, we set `root2` to - # `install/usr/local/lib/python3.9/site-packages`. - # - r = sysconfig.get_path('platlib') - if verbose: - _log( f'{r=}') - r = r.lstrip( os.sep) - root2 = os.path.join( root, r) - else: - root2 = r - # todo: for pure-python we should use sysconfig.get_path('purelib') ? + + root2 = install_dir(root) + if verbose: + _log( f'{root2=}') _log( f'Installing into: {root2!r}') dist_info_dir = self._dist_info_dir() @@ -2090,7 +2072,33 @@ def _so_suffix(): # things like `numpy/core/_simd.cpython-311-darwin.so`. # return sysconfig.get_config_var('EXT_SUFFIX') - + + +def install_dir(root=None): + ''' + Returns install directory used by `install()`. + + This will be `sysconfig.get_path('platlib')`, modified by `root` if not + None. + ''' + # todo: for pure-python we should use sysconfig.get_path('purelib') ? + root2 = sysconfig.get_path('platlib') + if root: + if windows(): + # If we are in a venv, `sysconfig.get_path('platlib')` + # can be absolute, e.g. + # `C:\\...\\venv-pypackage-3.11.1-64\\Lib\\site-packages`, so it's + # not clear how to append it to `root`. So we just use `root`. + return root + else: + # E.g. if `root` is `install' and `sysconfig.get_path('platlib')` + # is `/usr/local/lib/python3.9/site-packages`, we set `root2` to + # `install/usr/local/lib/python3.9/site-packages`. + # + return os.path.join( root, root2.lstrip( os.sep)) + else: + return root2 + class _Record: ''' From 843efb52f40017974b07d25070993d032faa1223 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Tue, 5 Sep 2023 08:52:54 +0100 Subject: [PATCH 3/6] scripts/gh_release.py: fix pyodide build error. Building tesseract fails because of a known bug in Pyodide, so for now we disable tesseract when building for Pyodide. 
--- scripts/gh_release.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/scripts/gh_release.py b/scripts/gh_release.py index ac10f9ca3..7da40b4a6 100755 --- a/scripts/gh_release.py +++ b/scripts/gh_release.py @@ -304,7 +304,7 @@ def set_cibuild_test(): # wheel. # # Also, `auditwheel addtag` says `No tags to be added` and terminates - # with non-zero. + # with non-zero. See: https://github.com/pypa/auditwheel/issues/439. # env_set('CIBW_REPAIR_WHEEL_COMMAND_LINUX', '') env_set('CIBW_REPAIR_WHEEL_COMMAND_MACOS', '') @@ -359,7 +359,17 @@ def build_pyodide_wheel(): # Build PyMuPDF as a single wheel without a separate PyMuPDFb # wheel. env_extra['PYMUPDF_SETUP_IMPLEMENTATIONS'] = 'a' - + + # 2023-08-30: We set PYMUPDF_SETUP_MUPDF_TESSERACT=0 because + # otherwise mupdf thirdparty/tesseract/src/ccstruct/dppoint.cpp fails to + # build because `#include "errcode.h"` finds a header inside emsdk. This is + # pyodide bug https://github.com/pyodide/pyodide/issues/3839. It's fixed in + # https://github.com/pyodide/pyodide/pull/3866 but the fix has not reached + # pypi.org's pyodide-build package. E.g. currently in tag 0.23.4, but + # current devuan pyodide-build is pyodide_build-0.23.4. + # + env_extra['PYMUPDF_SETUP_MUPDF_TESSERACT'] = '0' + command = pyodide_setup() command += ' && pyodide build --exports pyinit' run(command, env_extra=env_extra) From 1b1cdafb7574bc8236c16f3f1ad869509a85914b Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Tue, 5 Sep 2023 08:53:11 +0100 Subject: [PATCH 4/6] setup.py: improvements to allow build/install with system MuPDF. Support new environment variables for when building with system MuPDF. Moved classic/rebased extension code into separate functions and moved common setting of build flags into new fn. 
--- setup.py | 333 ++++++++++++++++++++++++++++++------------------ src/__init__.py | 2 +- src/utils.py | 2 +- 3 files changed, 208 insertions(+), 129 deletions(-) diff --git a/setup.py b/setup.py index c65df8d91..099651fee 100755 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ ''' Overview: - Build script for PyMuPDF. + Build script for PyMuPDF, supporting PEP-517 and simple command-line usage. We hard-code the URL of the MuPDF .tar.gz file that we require. This generally points to a particular source release on mupdf.com. @@ -16,12 +16,33 @@ Building PyMuPDF: If we are not in an sdist we first download the mupdf .tar.gz file. - Then we extract and build MuPDF locally, before setuptools builds - PyMuPDF. So PyMuPDF will always be built with the exact MuPDF + Then we extract and build MuPDF locally, before building PyMuPDF + itself. So PyMuPDF will always be built with the exact MuPDF release that we require. + Environmental variables: + If building with system MuPDF (PYMUPDF_SETUP_MUPDF_BUILD is empty string): + + CFLAGS + CXXFLAGS + LDFLAGS + Added to c, c++, and link commands. + + PYMUPDF_INCLUDES + Colon-separated extra include paths. + + PYMUPDF_MUPDF_INCLUDE + System include directory that contains `mupdf/`. This directory is expected + to have layout from: + + cd mupdf && make install-shared-c|c++|python + + PYMUPDF_MUPDF_LIB + Directory containing MuPDF libraries, (libmupdf.so, + libmupdfcpp.so). + PYMUPDF_SETUP_IMPLEMENTATIONS Must be one of 'a', 'b', 'ab'. If unset we use 'ab'. If contains 'a' we build original implementation. @@ -945,12 +966,158 @@ def _fs_update(text, path): def _build_extensions( mupdf_local, mupdf_build_dir, build_type): ''' - Builds Python extension module `_fitz` and `_extra`. + Builds Python extension modules `_fitz` and `_extra`. Returns (path_so_leaf_a, path_so_leaf_b), the leafnames of the generated shared libraries within mupdf_build_dir. 
''' + path_so_leaf_a = None + path_so_leaf_b = None + + if 'a' in _implementations(): + path_so_leaf_a = _build_extension_classic( mupdf_local, mupdf_build_dir, build_type) + + if 'b' in _implementations(): + path_so_leaf_b = _build_extension_rebased( mupdf_local, mupdf_build_dir, build_type) + + return path_so_leaf_a, path_so_leaf_b + + +def _build_extension_classic( mupdf_local, mupdf_build_dir, build_type): + ''' + Builds Python extension module `_fitz` for classic implementation. + + Returns leafname of the generated shared libraries within mupdf_build_dir. + ''' + (compiler_extra, linker_extra, includes, defines, optimise, debug, libpaths, libs, libraries) \ + = _extension_flags( mupdf_local, mupdf_build_dir, build_type) + + # Update helper-git-versions.i. + f = io.StringIO() + f.write('%pythoncode %{\n') + def repr_escape(text): + text = repr(text) + text = text.replace('{', '{{') + text = text.replace('}', '}}') + text = text.replace('%', '{chr(37)})') # Avoid confusing swig. + return 'f' + text + def write_git(name, directory): + sha, comment, diff, branch = get_git_id(directory) + f.write(f'{name}_git_sha = \'{sha}\'\n') + f.write(f'{name}_git_comment = {repr_escape(comment)}\n') + f.write(f'{name}_git_diff = {repr_escape(diff)}\n') + f.write(f'{name}_git_branch = {repr_escape(branch)}\n') + f.write('\n') + write_git('pymupdf', '.') + if mupdf_local: + write_git('mupdf', mupdf_local) + f.write('%}\n') + _fs_update( f.getvalue(), 'fitz/helper-git-versions.i') + + if windows: + compiler_extra_c = '' + else: + compiler_extra_c = ( + ' -Wno-incompatible-pointer-types' + ' -Wno-pointer-sign' + ' -Wno-sign-compare' + ) + prerequisites_swig = glob.glob( f'{g_root}/fitz/*.i') + if os.environ.get( 'PYMUPDF_SETUP_REBUILD_GIT_DETAILS') == '0': + # Remove helper-git-versions.i from prerequisites_swig so + # it doesn't force rebuild on its own. [Cannot easily use + # prerequisites_swig.remove() because / vs \ on Windows.] 
+ # + for i, p in enumerate( prerequisites_swig): + if p.endswith( 'helper-git-versions.i'): + del prerequisites_swig[i] + break + else: + assert 0, f'Cannot find *helper-git-versions.i in prerequisites_swig: {prerequisites_swig}' + + path_so_leaf_a = pipcl.build_extension( + name = 'fitz', + path_i = f'{g_root}/fitz/fitz.i', + outdir = f'{g_root}/fitz', + includes = includes, + defines = defines, + libpaths = libpaths, + libs = libs, + compiler_extra = compiler_extra + compiler_extra_c, + linker_extra = linker_extra, + optimise = optimise, + debug = debug, + cpp = False, + prerequisites_swig = prerequisites_swig, + prerequisites_compile = f'{mupdf_local}/include', + prerequisites_link = libraries, + ) + + return path_so_leaf_a + + +def _build_extension_rebased( mupdf_local, mupdf_build_dir, build_type): + ''' + Builds Python extension module `_extra` for rebased implementation. + + Returns leafname of the generated shared libraries within mupdf_build_dir. + ''' + (compiler_extra, linker_extra, includes, defines, optimise, debug, libpaths, libs, libraries) \ + = _extension_flags( mupdf_local, mupdf_build_dir, build_type) + + if mupdf_local: + includes = ( + f'{mupdf_local}/platform/c++/include', + f'{mupdf_local}/include', + ) + + # Build rebased extension module. + log('Building PyMuPDF rebased.') + compile_extra_cpp = '' + if darwin: + # Avoids `error: cannot pass object of non-POD type + # 'std::nullptr_t' through variadic function; call will abort at + # runtime` when compiling `mupdf::pdf_dict_getl(..., nullptr)`. + compile_extra_cpp += ' -Wno-non-pod-varargs' + # Avoid errors caused by mupdf's C++ bindings' exception classes + # not having `nothrow` to match the base exception class. 
+ compile_extra_cpp += ' -std=c++14' + if windows: + wp = pipcl.wdev.WindowsPython() + libs = f'mupdfcpp{wp.cpu.windows_suffix}.lib' + else: + libs = ('mupdf', 'mupdfcpp') + libraries = [ + f'{mupdf_build_dir}/libmupdf.so' + f'{mupdf_build_dir}/libmupdfcpp.so' + ] + + path_so_leaf_b = pipcl.build_extension( + name = 'extra', + path_i = f'{g_root}/src/extra.i', + outdir = f'{g_root}/src', + includes = includes, + defines = defines, + libpaths = libpaths, + libs = libs, + compiler_extra = compiler_extra + compile_extra_cpp, + linker_extra = linker_extra, + optimise = optimise, + debug = debug, + prerequisites_swig = None, + prerequisites_compile = f'{mupdf_local}/include', + prerequisites_link = libraries, + ) + + return path_so_leaf_b + + +def _extension_flags( mupdf_local, mupdf_build_dir, build_type): + ''' + Returns various flags to pass to pipcl.build_extension(). + ''' compiler_extra = '' + linker_extra = '' if build_type == 'memento': compiler_extra += ' -DMEMENTO' if mupdf_build_dir: @@ -959,6 +1126,7 @@ def _build_extensions( mupdf_local, mupdf_build_dir, build_type): mupdf_build_dir_flags = '' optimise = 'release' in mupdf_build_dir_flags debug = 'debug' in mupdf_build_dir_flags + r_extra = '' if windows: defines = ('FZ_DLL_CLIENT',) wp = pipcl.wdev.WindowsPython() @@ -975,144 +1143,55 @@ def _build_extensions( mupdf_local, mupdf_build_dir, build_type): libs = f'mupdfcpp{wp.cpu.windows_suffix}.lib' libraries = f'{mupdf_local}\\platform\\{infix}\\{wp.cpu.windows_subdir}{build_type_infix}\\{libs}' compiler_extra = '' - linker_extra = '' else: - defines = None - libpaths = (mupdf_build_dir,) libs = ['mupdf'] - libraries = f'{mupdf_build_dir}/{libs[0]}' + defines = None compiler_extra += ( ' -Wall' ' -Wno-deprecated-declarations' ' -Wno-unused-const-variable' ) - if openbsd: - compiler_extra += ' -Wno-deprecated-declarations' - linker_extra = '' - - path_so_leaf_a = None - path_so_leaf_b = None - - if 'a' in _implementations(): - # Build PyMuPDF original 
implementation. - log('Building PyMuPDF classic.') if mupdf_local: - includes = ( - f'{mupdf_local}/include', - f'{mupdf_local}/include/mupdf', - f'{mupdf_local}/thirdparty/freetype/include', - ) - else: - includes = None - - # Update helper-git-versions.i. - f = io.StringIO() - f.write('%pythoncode %{\n') - def repr_escape(text): - text = repr(text) - text = text.replace('{', '{{') - text = text.replace('}', '}}') - text = text.replace('%', '{chr(37)})') # Avoid confusing swig. - return 'f' + text - def write_git(name, directory): - sha, comment, diff, branch = get_git_id(directory) - f.write(f'{name}_git_sha = \'{sha}\'\n') - f.write(f'{name}_git_comment = {repr_escape(comment)}\n') - f.write(f'{name}_git_diff = {repr_escape(diff)}\n') - f.write(f'{name}_git_branch = {repr_escape(branch)}\n') - f.write('\n') - write_git('pymupdf', '.') - if mupdf_local: - write_git('mupdf', mupdf_local) - f.write('%}\n') - _fs_update( f.getvalue(), 'fitz/helper-git-versions.i') - - if windows: - compiler_extra_c = '' + libpaths = (mupdf_build_dir,) + libraries = f'{mupdf_build_dir}/{libs[0]}' + if openbsd: + compiler_extra += ' -Wno-deprecated-declarations' else: - compiler_extra_c = ( - ' -Wno-incompatible-pointer-types' - ' -Wno-pointer-sign' - ' -Wno-sign-compare' - ) - prerequisites_swig = glob.glob( f'{g_root}/fitz/*.i') - if os.environ.get( 'PYMUPDF_SETUP_REBUILD_GIT_DETAILS') == '0': - # Remove helper-git-versions.i from prerequisites_swig so - # it doesn't force rebuild on its own. [Cannot easily use - # prerequisites_swig.remove() because / vs \ on Windows.] 
- # - for i, p in enumerate( prerequisites_swig): - if p.endswith( 'helper-git-versions.i'): - del prerequisites_swig[i] - break - else: - assert 0, f'Cannot find *helper-git-versions.i in prerequisites_swig: {prerequisites_swig}' - - path_so_leaf_a = pipcl.build_extension( - name = 'fitz', - path_i = f'{g_root}/fitz/fitz.i', - outdir = f'{g_root}/fitz', - includes = includes, - defines = defines, - libpaths = libpaths, - libs = libs, - compiler_extra = compiler_extra + compiler_extra_c, - linker_extra = linker_extra, - optimise = optimise, - debug = debug, - cpp = False, - prerequisites_swig = prerequisites_swig, - prerequisites_compile = f'{mupdf_local}/include', - prerequisites_link = libraries, - ) - + libpaths = os.environ.get('PYMUPDF_MUPDF_LIB') + libraries = None + if libpaths: + libpaths = libpaths.split(':') + if mupdf_local: includes = ( - f'{mupdf_local}/platform/c++/include', f'{mupdf_local}/include', + f'{mupdf_local}/include/mupdf', + f'{mupdf_local}/thirdparty/freetype/include', ) else: - includes = None - if 'b' in _implementations(): - # Build rebased extension module. - log('Building PyMuPDF rebased.') - compile_extra_cpp = '' - if darwin: - # Avoids `error: cannot pass object of non-POD type - # 'std::nullptr_t' through variadic function; call will abort at - # runtime` when compiling `mupdf::pdf_dict_getl(..., nullptr)`. - compile_extra_cpp += ' -Wno-non-pod-varargs' - # Avoid errors caused by mupdf's C++ bindings' exception classes - # not having `nothrow` to match the base exception class. - compile_extra_cpp += ' -std=c++14' - if windows: - wp = pipcl.wdev.WindowsPython() - libs = f'mupdfcpp{wp.cpu.windows_suffix}.lib' - else: - libs = ('mupdf', 'mupdfcpp') - libraries = [ - f'{mupdf_build_dir}/libmupdf.so' - f'{mupdf_build_dir}/libmupdfcpp.so' + # Use system MuPDF. 
+ includes = list() + pi = os.environ.get('PYMUPDF_INCLUDES') + if pi: + includes += pi.split(':') + pmi = os.environ.get('PYMUPDF_MUPDF_INCLUDE') + if pmi: + includes += [ + f'{pmi}', + f'{pmi}/mupdf/thirdparty/freetype', ] - path_so_leaf_b = pipcl.build_extension( - name = 'extra', - path_i = f'{g_root}/src/extra.i', - outdir = f'{g_root}/src', - includes = includes, - defines = defines, - libpaths = libpaths, - libs = libs, - compiler_extra = compiler_extra + compile_extra_cpp, - linker_extra = linker_extra, - optimise = optimise, - debug = debug, - prerequisites_swig = None, - prerequisites_compile = f'{mupdf_local}/include', - prerequisites_link = libraries, - ) - - return path_so_leaf_a, path_so_leaf_b - + ldflags = os.environ.get('LDFLAGS') + if ldflags: + linker_extra += f' {ldflags}' + cflags = os.environ.get('CFLAGS') + if cflags: + compiler_extra += f' {cflags}' + cxxflags = os.environ.get('CXXFLAGS') + if cxxflags: + compiler_extra += f' {cxxflags}' + + return compiler_extra, linker_extra, includes, defines, optimise, debug, libpaths, libs, libraries, + def sdist(): ret = list() diff --git a/src/__init__.py b/src/__init__.py index ef536d413..26874f277 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -115,7 +115,7 @@ def get_env_bool( name, default): mupdf = mupdf_cppyy.cppyy.gbl.mupdf else: # Use MuPDF Python SWIG bindings. - from . import mupdf + import mupdf mupdf.reinit_singlethreaded() mupdf_version_tuple = (mupdf.FZ_VERSION_MAJOR, mupdf.FZ_VERSION_MINOR, mupdf.FZ_VERSION_PATCH) diff --git a/src/utils.py b/src/utils.py index 9f50096f6..59aba1126 100644 --- a/src/utils.py +++ b/src/utils.py @@ -12,7 +12,7 @@ import typing from . import fitz -from . 
import mupdf +import mupdf g_exceptions_verbose = fitz.g_exceptions_verbose g_exceptions_verbose = False From a3e3335ee67909a59f2fceb163dcbed9e7b587d3 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Wed, 6 Sep 2023 23:03:53 +0100 Subject: [PATCH 5/6] src/: allow mupdf module to be in separate location. If `from . import mupdf` fails, we now also try `import mupdf`. This allows things to work if mupdf module is installed separately, e.g. as part of system installation of MuPDF. --- src/__init__.py | 10 ++++++++-- src/utils.py | 5 ++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index 26874f277..b9ba32d2b 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -114,8 +114,14 @@ def get_env_bool( name, default): ).load_module() mupdf = mupdf_cppyy.cppyy.gbl.mupdf else: - # Use MuPDF Python SWIG bindings. - import mupdf + # Use MuPDF Python SWIG bindings. We allow import from either our own + # directory for conventional wheel installs, or from separate place in case + # we are using a separately-installed system installation of mupdf. + # + try: + from . import mupdf + except Exception as e: + import mupdf mupdf.reinit_singlethreaded() mupdf_version_tuple = (mupdf.FZ_VERSION_MAJOR, mupdf.FZ_VERSION_MINOR, mupdf.FZ_VERSION_PATCH) diff --git a/src/utils.py b/src/utils.py index 59aba1126..63399cb64 100644 --- a/src/utils.py +++ b/src/utils.py @@ -12,7 +12,10 @@ import typing from . import fitz -import mupdf +try: + from . import mupdf +except Exception as e: + import mupdf g_exceptions_verbose = fitz.g_exceptions_verbose g_exceptions_verbose = False From 63af0edbe2f47efa4c932427065f10c246d632e8 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Fri, 8 Sep 2023 13:31:00 +0100 Subject: [PATCH 6/6] src/__init__.py tests/test_tesseract.py: fix tesseract on rebased. Also extended test_tesseract() to assert that page.get_textpage_ocr() succeeds if TESSDATA_PREFIX is set in environment. 
--- src/__init__.py | 4 ++-- src/utils.py | 1 + tests/test_tesseract.py | 37 ++++++++++++++++++++++++------------- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index b9ba32d2b..7d8fcdfdc 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -9587,14 +9587,14 @@ def pdfocr_save(self, filename, compress=1, language=None, tessdata=None): ''' if not TESSDATA_PREFIX and not tessdata: raise RuntimeError('No OCR support: TESSDATA_PREFIX not set') - opts = mupdf.PdfocrOptions() + opts = mupdf.FzPdfocrOptions() opts.compress = compress; if language: opts.language_set2( language) if tessdata: opts.datadir_set2( tessdata) pix = self.this - if filename: + if isinstance(filename, str): mupdf.fz_save_pixmap_as_pdfocr( pix, filename, 0, opts) else: out = JM_new_output_fileptr( filename) diff --git a/src/utils.py b/src/utils.py index 63399cb64..051b6fa79 100644 --- a/src/utils.py +++ b/src/utils.py @@ -10,6 +10,7 @@ import math import os import typing +import weakref from . import fitz try: diff --git a/tests/test_tesseract.py b/tests/test_tesseract.py index cd213c6e1..2d7a69a17 100644 --- a/tests/test_tesseract.py +++ b/tests/test_tesseract.py @@ -3,21 +3,32 @@ def test_tesseract(): ''' - This checks that MuPDF has been built with tesseract support. We don't - (yet) attempt to supply a valid `tessdata` directory. + This checks that MuPDF has been built with tesseract support. + + By default we don't supply a valid `tessdata` directory, and just assert + that attempting to use Tesseract raises the expected error (which checks + that MuPDF is built with Tesseract support). + + But if TESSDATA_PREFIX is set in the environment, we assert that + FzPage.get_textpage_ocr() succeeds. 
''' - if hasattr(fitz, 'mupdf'): - print(f'Not running test_tesseract() on rebased because tesseract not yet supported.') - return path = os.path.abspath( f'{__file__}/../resources/2.pdf') doc = fitz.open( path) page = doc[5] - e_expected = 'OCR initialisation failed' - try: - tp = page.get_textpage_ocr(full=True, tessdata='/foo/bar') - except Exception as e: - ee = str(e) - print(f'Received expected exception: {e}') - assert ee == e_expected, f'Unexpected exception: {ee!r}' + e_expected = ( + 'OCR initialisation failed', + 'code=2: OCR initialisation failed', + ) + tessdata_prefix = os.environ.get('TESSDATA_PREFIX') + if tessdata_prefix: + tp = page.get_textpage_ocr(full=True) + print(f'test_tesseract(): page.get_textpage_ocr() succeeded') else: - assert 0, f'Expected exception {e_expected!r}' + try: + tp = page.get_textpage_ocr(full=True, tessdata='/foo/bar') + except Exception as e: + ee = str(e) + print(f'Received expected exception: {e}') + assert ee in e_expected, f'Unexpected exception: {ee!r}' + else: + assert 0, f'Expected exception {e_expected!r}'