diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 67bc4fbe..99c0d16e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -10,20 +10,14 @@ jobs: Linting: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - name: Set PY variable - run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV - - uses: actions/cache@v2 + - uses: actions/checkout@v3 with: - path: ~/.cache/pre-commit - key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }} - - name: Install pre-commit - run: | - pip install pre-commit - pre-commit install - - name: Run pre-commit - run: SKIP=no-commit-to-branch pre-commit run --all-files + # required to grab the history of the PR + fetch-depth: 0 + - uses: actions/setup-python@v3 + - uses: pre-commit/action@v3.0.0 + with: + extra_args: --color=always --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }} Pytest: runs-on: ubuntu-latest @@ -45,6 +39,7 @@ jobs: - name: Install dependencies run: | pip install -e '.[dev]' + pip install poetry build - name: Test with Pytest on Python ${{ matrix.python-version }} run: python -m pytest --cov edspdf --cov-report xml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 45fbaaef..3c6220a2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: # ruff - repo: https://github.com/charliermarsh/ruff-pre-commit # Ruff version. 
- rev: 'v0.0.245' + rev: 'v0.0.287' hooks: - id: ruff args: ['--config', 'pyproject.toml'] diff --git a/changelog.md b/changelog.md index 200ce78a..f3eb8c6f 100644 --- a/changelog.md +++ b/changelog.md @@ -6,6 +6,7 @@ - Add multi-modal transformers (`huggingface-embedding`) with windowing options - Add `render_page` option to `pdfminer` extractor, for multi-modal PDF features +- Packaging utils (`pipeline.package(...)`) to make a pip installable package from a pipeline ### Changed diff --git a/docs/pipeline.md b/docs/pipeline.md index 23b0042c..0268282e 100644 --- a/docs/pipeline.md +++ b/docs/pipeline.md @@ -11,6 +11,8 @@ Processing PDFs usually involves many steps such as extracting lines, running OC can use any technology in static components, we do not provide tools to train components built with other deep learning frameworks. +## Creating a pipeline + A pipe is a processing block (like a function) that applies a transformation on its input and returns a modified object. At the moment, four types of pipes are implemented in the library: @@ -57,7 +59,33 @@ model(pdf_bytes) model.pipe([pdf_bytes, ...]) ``` -## Hybrid models +### Hybrid models EDS-PDF was designed to facilitate the training and inference of hybrid models that arbitrarily chain static components or trained deep learning components. Static components are callable objects that take a PDFDoc object as input, perform arbitrary transformations over the input, and return the modified object. [Trainable pipes][edspdf.trainable_pipe.TrainablePipe], on the other hand, allow for deep learning operations to be performed on the [PDFDoc][edspdf.structures.PDFDoc] object and must be trained to be used. + +## Saving and loading a pipeline + +Pipelines can be saved and loaded using the `save` and `load` methods. The saved pipeline is not a pickled object but a folder containing the config file, the weights and extra resources for each pipeline. 
This allows for easy inspection and modification of the pipeline, and avoids the execution of arbitrary code when loading a pipeline. + +```python +model.save("path/to/your/model") +model = edspdf.load("path/to/your/model") +``` + +To share the pipeline and turn it into a pip installable package, you can use the `package` method, which will use or create a pyproject.toml file, fill it accordingly, and create a wheel file. At the moment, we only support the poetry package manager. + +```python +model.package( + name="path/to/your/package", + version="0.0.1", + root_dir="path/to/project/root", # optional, to retrieve an existing pyproject.toml file + # if you don't have a pyproject.toml, you can provide the metadata here instead + metadata=dict( + authors="Firstname Lastname ", + description="A short description of your package", + ), +) +``` + +This will create a wheel file in the root_dir/dist folder, which you can share and install with pip diff --git a/edspdf/pipeline.py b/edspdf/pipeline.py index 6a3446da..14d55af6 100644 --- a/edspdf/pipeline.py +++ b/edspdf/pipeline.py @@ -15,6 +15,7 @@ Dict, Iterable, List, + Mapping, Optional, Sequence, Set, @@ -28,6 +29,7 @@ from confit.utils.collections import join_path, split_path from confit.utils.xjson import Reference from tqdm import tqdm +from typing_extensions import Literal import edspdf @@ -40,6 +42,7 @@ decompress_dict, multi_tee, ) +from .utils.package import ModuleName, package EMPTY_LIST = FrozenList() @@ -944,6 +947,35 @@ def select_pipes( yield self self._disabled = disabled_before + def package( + self, + name: ModuleName, + root_dir: Union[str, Path] = ".", + artifacts_name: ModuleName = "artifacts", + check_dependencies: bool = False, + project_type: Optional[Literal["poetry", "setuptools"]] = None, + version: str = "0.1.0", + metadata: Optional[Dict[str, Any]] = {}, + distributions: Optional[Sequence[Literal["wheel", "sdist"]]] = ["wheel"], + config_settings: Optional[Mapping[str, Union[str, 
Sequence[str]]]] = None, + isolation: bool = True, + skip_build_dependency_check: bool = False, + ): + return package( + pipeline=self, + name=name, + root_dir=root_dir, + artifacts_name=artifacts_name, + check_dependencies=check_dependencies, + project_type=project_type, + version=version, + metadata=metadata, + distributions=distributions, + config_settings=config_settings, + isolation=isolation, + skip_build_dependency_check=skip_build_dependency_check, + ) + def load( config: Union[Path, str, Config], diff --git a/edspdf/utils/package.py b/edspdf/utils/package.py new file mode 100644 index 00000000..8f415009 --- /dev/null +++ b/edspdf/utils/package.py @@ -0,0 +1,443 @@ +import io +import os +import re +import shutil +import subprocess +import sys +from contextlib import contextmanager +from pathlib import Path +from types import FunctionType +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Literal, + Mapping, + Optional, + Sequence, + Tuple, + Type, + Union, +) + +import build +import dill +import toml +from build.__main__ import build_package, build_package_via_sdist +from confit import Cli +from dill._dill import save_function as dill_save_function +from dill._dill import save_type as dill_save_type +from importlib_metadata import PackageNotFoundError +from importlib_metadata import version as get_version +from loguru import logger + +import edspdf +from edspdf import Pipeline + +py_version = f"{sys.version_info.major}.{sys.version_info.minor}" + + +def get_package(obj_type: Type): + # Retrieve the __package__ attribute of the module of a type, if possible. 
+ # And returns the package version as well + try: + module_name = obj_type.__module__ + if module_name == "__main__": + raise Exception(f"Could not find package of type {obj_type}") + module = __import__(module_name, fromlist=["__package__"]) + package = module.__package__ + try: + version = get_version(package) + except PackageNotFoundError: + return None + return package, version + except (ImportError, AttributeError): + raise Exception(f"Cound not find package of type {obj_type}") + + +def save_type(pickler, obj, *args, **kwargs): + package_name = get_package(obj) + if package_name is not None: + pickler.packages.add(package_name) + dill_save_type(pickler, obj, *args, **kwargs) + + +def save_function(pickler, obj, *args, **kwargs): + package_name = get_package(obj) + if package_name is not None: + pickler.packages.add(package_name) + return dill_save_function(pickler, obj, *args, **kwargs) + + +class PackagingPickler(dill.Pickler): + dispatch = dill.Pickler.dispatch.copy() + + dispatch[FunctionType] = save_function + dispatch[type] = save_type + + def __init__(self, *args, **kwargs): + self.file = io.BytesIO() + super().__init__(self.file, *args, **kwargs) + self.packages = set() + + +def get_deep_dependencies(obj): + pickler = PackagingPickler() + pickler.dump(obj) + return sorted(pickler.packages) + + +app = Cli(pretty_exceptions_show_locals=False, pretty_exceptions_enable=False) + + +def snake_case(s): + # From https://www.w3resource.com/python-exercises/string/python-data-type-string-exercise-97.php # noqa E501 + return "_".join( + re.sub( + "([A-Z][a-z]+)", r" \1", re.sub("([A-Z]+)", r" \1", s.replace("-", " ")) + ).split() + ).lower() + + +class ModuleName(str): + def __new__(cls, *args, **kwargs): + raise NotImplementedError("ModuleName is only meant for typing.") + + @classmethod + def __get_validators__(self): + yield self.validate + + @classmethod + def validate(cls, value, config=None): + if not isinstance(value, str): + raise TypeError("string 
required") + + if not re.match( + r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", value, flags=re.IGNORECASE + ): + raise ValueError("invalid identifier") + return value + + +if TYPE_CHECKING: + ModuleName = str # noqa F811 + +POETRY_SNIPPET = """\ +from poetry.core.masonry.builders.sdist import SdistBuilder +from poetry.factory import Factory +from poetry.core.masonry.utils.module import ModuleOrPackageNotFound +import sys +# Initialize the Poetry object for the current project +poetry = Factory().create_poetry("__root_dir__") + +# Initialize the builder +try: + builder = SdistBuilder(poetry, None, None) +except ModuleOrPackageNotFound: + if not poetry.package.packages: + print([]) + sys.exit(0) + +print([ + {k: v for k, v in { + "include": include._include, + "from": include.source, + "formats": include.formats, + }.items()} + for include in builder._module.includes +]) + +# Get the list of files to include +files = builder.find_files_to_add() + +# Print the list of files +for file in files: + print(file.path) +""" + +INIT_PY = """\ +import edspdf +from pathlib import Path + +def load(device: "torch.device" = "cpu") -> edspdf.Pipeline: + artifacts_path = Path(__file__).parent / "{artifacts_path}" + model = edspdf.load(artifacts_path, device=device) + return model +""" + + +# def parse_authors_as_dicts(authors): +# authors = [authors] if isinstance(authors, str) else authors +# return [ +# dict(zip(("name", "email"), re.match(r"(.*) <(.*)>", author).groups())) +# if isinstance(author, str) +# else author +# for author in authors +# ] + + +def parse_authors_as_strings(authors): + authors = [authors] if isinstance(authors, str) else authors + return [ + author if isinstance(author, str) else f"{author['name']} <{author['email']}>" + for author in authors + ] + + +class PoetryPackager: + def __init__( + self, + pyproject: Optional[Dict[str, Any]], + pipeline: Union[Path, Pipeline], + version: str, + name: ModuleName, + root_dir: Path = ".", + build_dir: Path = "build", 
+ out_dir: Path = "dist", + artifacts_name: ModuleName = "artifacts", + dependencies: Optional[Sequence[Tuple[str, str]]] = None, + metadata: Optional[Dict[str, Any]] = {}, + ): + self.poetry_bin_path = ( + subprocess.run(["which", "poetry"], stdout=subprocess.PIPE) + .stdout.decode() + .strip() + ) + self.version = version + self.pyproject = pyproject + self.root_dir = root_dir.resolve() + self.build_dir = build_dir + self.out_dir = self.root_dir / out_dir + self.artifacts_name = artifacts_name + self.name = name + self.pipeline = pipeline + self.dependencies = dependencies + + with self.ensure_pyproject(metadata): + logger.info(f"root_dir: {self.root_dir}") + logger.info(f"build_dir: {self.build_dir}") + logger.info(f"artifacts_name: {self.artifacts_name}") + logger.info(f"name: {self.name}") + + python_executable = ( + Path(self.poetry_bin_path).read_text().split("\n")[0][2:] + ) + result = subprocess.run( + [ + *python_executable.split(), + "-c", + POETRY_SNIPPET.replace("__root_dir__", str(self.root_dir)), + ], + stdout=subprocess.PIPE, + cwd=self.root_dir, + ) + if result.returncode != 0: + raise Exception() + out = result.stdout.decode().strip().split("\n") + self.poetry_packages = eval(out[0]) + self.file_paths = [self.root_dir / file_path for file_path in out[1:]] + + @contextmanager + def ensure_pyproject(self, metadata): + """Generates a Poetry based pyproject.toml""" + metadata = dict(metadata) + new_pyproject = self.pyproject is None + if "authors" in metadata: + metadata["authors"] = parse_authors_as_strings(metadata["authors"]) + try: + if new_pyproject: + self.pyproject = { + "build-system": { + "requires": ["poetry-core>=1.0.0"], + "build-backend": "poetry.core.masonry.api", + }, + "tool": { + "poetry": { + **metadata, + "name": self.name, + "version": self.version, + "dependencies": { + "python": f">={py_version},<4.0", + **{ + dep_name: f"^{dep_version}" + for dep_name, dep_version in self.dependencies + }, + }, + }, + }, + } + (self.root_dir / 
"pyproject.toml").write_text( + toml.dumps(self.pyproject) + ) + else: + for key, value in metadata.items(): + pyproject_value = self.pyproject["tool"]["poetry"].get(key) + if pyproject_value != metadata[key]: + raise ValueError( + f"Field {key} in pyproject.toml doesn't match the one " + f"passed as argument, you should remove it from the " + f"metadata parameter. Avoid using metadata if you already " + f"have a pyproject.toml file.\n" + f"pyproject.toml:\n {pyproject_value}\n" + f"metadata:\n {value}" + ) + yield + except Exception: + if new_pyproject: + os.remove(self.root_dir / "pyproject.toml") + raise + + def list_files_to_add(self): + # Extract python from the shebang in the poetry executable + return self.file_paths + + def build( + self, + distributions: Sequence[str] = (), + config_settings: Optional[build.ConfigSettingsType] = None, + isolation: bool = True, + skip_dependency_check: bool = False, + ): + logger.info(f"Building package {self.name}") + + if distributions: + build_call = build_package + else: + build_call = build_package_via_sdist + distributions = ["wheel"] + build_call( + srcdir=self.build_dir, + outdir=self.out_dir, + distributions=distributions, + config_settings=config_settings, + isolation=isolation, + skip_dependency_check=skip_dependency_check, + ) + + def update_pyproject(self): + # Replacing project name + old_name = self.pyproject["tool"]["poetry"]["name"] + self.pyproject["tool"]["poetry"]["name"] = self.name + logger.info( + f"Replaced project name {old_name!r} with {self.name!r} in poetry based " + f"project" + ) + + old_version = self.pyproject["tool"]["poetry"]["version"] + self.pyproject["tool"]["poetry"]["version"] = self.version + logger.info( + f"Replaced project version {old_version!r} with {self.version!r} in poetry " + f"based project" + ) + + # Adding artifacts to include in pyproject.toml + snake_name = snake_case(self.name.lower()) + included = self.pyproject["tool"]["poetry"].setdefault("include", []) + 
included.append(f"{snake_name}/{self.artifacts_name}/**") + + packages = list(self.poetry_packages) + packages.append({"include": snake_name}) + self.pyproject["tool"]["poetry"]["packages"] = packages + + def make_src_dir(self): + snake_name = snake_case(self.name.lower()) + package_dir = self.build_dir / snake_name + build_artifacts_dir = package_dir / self.artifacts_name + for file_path in self.list_files_to_add(): + new_file_path = self.build_dir / Path(file_path).relative_to(self.root_dir) + if isinstance(self.pipeline, Path) and self.pipeline in file_path.parents: + raise Exception( + f"Pipeline ({self.artifacts_name}) is already " + "included in the package's data, you should " + "remove it from the pyproject.toml metadata." + ) + os.makedirs(new_file_path.parent, exist_ok=True) + logger.info(f"COPY {file_path} TO {new_file_path}") + shutil.copy(file_path, new_file_path) + + self.update_pyproject() + + # Write pyproject.toml + (self.build_dir / "pyproject.toml").write_text(toml.dumps(self.pyproject)) + + if isinstance(self.pipeline, Path): + # self.pipeline = edspdf.load(self.pipeline) + shutil.copytree( + self.pipeline, + build_artifacts_dir, + ) + else: + self.pipeline.save(build_artifacts_dir) + os.makedirs(package_dir, exist_ok=True) + (package_dir / "__init__.py").write_text( + INIT_PY.format( + artifacts_path=os.path.relpath(build_artifacts_dir, package_dir) + ) + ) + + +@app.command(name="package") +def package( + pipeline: Union[Path, Pipeline], + name: ModuleName, + root_dir: Path = ".", + artifacts_name: ModuleName = "artifacts", + check_dependencies: bool = False, + project_type: Optional[Literal["poetry", "setuptools"]] = None, + version: str = "0.1.0", + metadata: Optional[Dict[str, Any]] = {}, + distributions: Optional[Sequence[Literal["wheel", "sdist"]]] = ["wheel"], + config_settings: Optional[Mapping[str, Union[str, Sequence[str]]]] = None, + isolation: bool = True, + skip_build_dependency_check: bool = False, +): + # root_dir = 
Path(".").resolve() + pyproject_path = root_dir / "pyproject.toml" + + if not pyproject_path.exists(): + check_dependencies = True + + dependencies = None + if check_dependencies: + if isinstance(pipeline, Path): + pipeline = edspdf.load(pipeline) + dependencies = get_deep_dependencies(pipeline) + for dep in dependencies: + print("DEPENDENCY", dep[0].ljust(30), dep[1]) + + root_dir = root_dir.resolve() + build_dir = root_dir / "build" / name + shutil.rmtree(build_dir, ignore_errors=True) + os.makedirs(build_dir) + + pyproject = None + if pyproject_path.exists(): + pyproject = toml.loads((root_dir / "pyproject.toml").read_text()) + + if "tool" in pyproject and "poetry" in pyproject["tool"]: + project_type = "poetry" + + if project_type == "poetry": + packager = PoetryPackager( + pyproject=pyproject, + pipeline=pipeline, + name=name, + version=version, + root_dir=root_dir, + build_dir=build_dir, + artifacts_name=artifacts_name, + dependencies=dependencies, + metadata=metadata, + ) + elif project_type is None: + raise Exception("Could not infer project type") + else: + raise Exception("Only poetry based projects are supported for now") + + packager.make_src_dir() + packager.build( + distributions=distributions, + config_settings=config_settings, + isolation=isolation, + skip_dependency_check=skip_build_dependency_check, + ) diff --git a/pyproject.toml b/pyproject.toml index 2d7d8073..9690d106 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,9 @@ dependencies = [ "rich-logger>=0.3.0,<1.0.0", "safetensors~=0.3.1", "anyascii>=0.3.2", - "attrs~=23.1" + "attrs~=23.1", + "build>=0.10.0", + "loguru", ] [project.optional-dependencies] @@ -125,6 +127,7 @@ exclude_also = [ "@overload", "pragma: no cover", "raise .*Error", + "raise .*Exception", "if __name__ == .__main__.:", "if (self[.])?name in exclude:", "if TYPE_CHECKING:", @@ -163,6 +166,7 @@ fixable = ["E", "F", "W", "I"] [tool.ruff.isort] known-first-party = ["edspdf"] +known-third-party = ["build"] 
[build-system] diff --git a/tests/conftest.py b/tests/conftest.py index 01e29b12..289e7831 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,6 +8,7 @@ from pytest import fixture from utils import nested_approx +from edspdf import Pipeline from edspdf.utils.collections import ld_to_dl pytest.nested_approx = nested_approx @@ -104,3 +105,24 @@ def dummy_dataset(tmpdir_factory, pdf): ) ds.save_to_disk(dataset_path) return dataset_path + + +@pytest.fixture(scope="session") +def frozen_pipeline(): + model = Pipeline() + model.add_pipe("pdfminer-extractor", name="extractor") + model.add_pipe( + "trainable-classifier", + name="classifier", + config=dict( + embedding={ + "@factory": "box-layout-embedding", + "n_positions": 32, + "size": "48", + }, + labels=["first", "second"], + ), + ) + model.add_pipe("simple-aggregator") + model.post_init([]) + return model diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index 80a88199..5d40bf0a 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -19,27 +19,6 @@ class CustomClass: pass -@pytest.fixture(scope="session") -def frozen_pipeline(): - model = Pipeline() - model.add_pipe("pdfminer-extractor", name="extractor") - model.add_pipe( - "trainable-classifier", - name="classifier", - config=dict( - embedding={ - "@factory": "box-layout-embedding", - "n_positions": 32, - "size": "48", - }, - labels=["first", "second"], - ), - ) - model.add_pipe("simple-aggregator") - model.post_init([]) - return model - - @pytest.fixture() def pipeline(): model = Pipeline() diff --git a/tests/utils/test_package.py b/tests/utils/test_package.py new file mode 100644 index 00000000..3eaac021 --- /dev/null +++ b/tests/utils/test_package.py @@ -0,0 +1,87 @@ +import pytest + +from edspdf.utils.package import package + + +def test_blank_package(frozen_pipeline, tmp_path): + + # Missing metadata makes poetry fail due to missing author / description + with pytest.raises(Exception): + package( + 
pipeline=frozen_pipeline, + root_dir=tmp_path, + name="test-model", + metadata={}, + project_type="poetry", + ) + + frozen_pipeline.package( + root_dir=tmp_path, + name="test-model", + metadata={ + "description": "A test model", + "authors": "Test Author ", + }, + project_type="poetry", + distributions=["wheel"], + ) + assert (tmp_path / "dist").is_dir() + assert (tmp_path / "build").is_dir() + assert (tmp_path / "dist" / "test_model-0.1.0-py3-none-any.whl").is_file() + assert not (tmp_path / "dist" / "test_model-0.1.0.tar.gz").is_file() + assert (tmp_path / "build" / "test-model").is_dir() + + +def test_package_with_files(frozen_pipeline, tmp_path): + frozen_pipeline.save(tmp_path / "model") + + ((tmp_path / "test_model_trainer").mkdir(parents=True)) + (tmp_path / "test_model_trainer" / "__init__.py").write_text( + """\ +print("Hello World!") +""" + ) + (tmp_path / "pyproject.toml").write_text( + """\ +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "test-model-trainer" +version = "0.0.0" +description = "A test model" +authors = ["Test Author "] + +[tool.poetry.dependencies] +python = "^3.7" +torch = "^1.4.0" +""" + ) + + with pytest.raises(ValueError): + package( + pipeline=frozen_pipeline, + root_dir=tmp_path, + version="0.1.0", + name="test-model", + metadata={ + "description": "Wrong description", + "authors": "Test Author ", + }, + ) + + package( + pipeline=tmp_path / "model", + root_dir=tmp_path, + version="0.1.0", + name="test-model", + metadata={ + "description": "A test model", + "authors": "Test Author ", + }, + ) + assert (tmp_path / "dist").is_dir() + assert (tmp_path / "dist" / "test_model-0.1.0.tar.gz").is_file() + assert (tmp_path / "dist" / "test_model-0.1.0-py3-none-any.whl").is_file() + assert (tmp_path / "pyproject.toml").is_file()