
Packaging utils #18

Merged · 3 commits · Sep 7, 2023
21 changes: 8 additions & 13 deletions .github/workflows/tests.yml
```diff
@@ -10,20 +10,14 @@ jobs:
   Linting:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
-      - name: Set PY variable
-        run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV
-      - uses: actions/cache@v2
-        with:
-          path: ~/.cache/pre-commit
-          key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
-      - name: Install pre-commit
-        run: |
-          pip install pre-commit
-          pre-commit install
-      - name: Run pre-commit
-        run: SKIP=no-commit-to-branch pre-commit run --all-files
+      - uses: actions/checkout@v3
+        with:
+          # required to grab the history of the PR
+          fetch-depth: 0
+      - uses: actions/setup-python@v3
+      - uses: pre-commit/[email protected]
+        with:
+          extra_args: --color=always --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }}

   Pytest:
     runs-on: ubuntu-latest
@@ -45,6 +39,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install -e '.[dev]'
+          pip install poetry build

       - name: Test with Pytest on Python ${{ matrix.python-version }}
         run: python -m pytest --cov edspdf --cov-report xml
```
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
```diff
@@ -18,7 +18,7 @@ repos:
       # ruff
       - repo: https://github.com/charliermarsh/ruff-pre-commit
         # Ruff version.
-        rev: 'v0.0.245'
+        rev: 'v0.0.287'
         hooks:
           - id: ruff
             args: ['--config', 'pyproject.toml']
```
1 change: 1 addition & 0 deletions changelog.md
```diff
@@ -7,6 +7,7 @@
 - Add multi-modal transformers (`huggingface-embedding`) with windowing options
 - Add `render_page` option to `pdfminer` extractor, for multi-modal PDF features
 - Add inference utilities (`accelerators`), with simple mono process support and multi gpu / cpu support
+- Packaging utils (`pipeline.package(...)`) to make a pip installable package from a pipeline

 ### Changed
```
30 changes: 29 additions & 1 deletion docs/pipeline.md
@@ -11,6 +11,8 @@ Processing PDFs usually involves many steps such as extracting lines, running OCR
can use any technology in static components, we do not provide tools to train
components built with other deep learning frameworks.

## Creating a pipeline

A pipe is a processing block (like a function) that applies a transformation on its input and returns a modified object.

At the moment, four types of pipes are implemented in the library:
@@ -59,7 +61,33 @@ model.pipe([pdf_bytes, ...])

For more information on how to use the pipeline, refer to the [Inference](/inference) page.

### Hybrid models

EDS-PDF was designed to facilitate the training and inference of hybrid models that
arbitrarily chain static components or trained deep learning components. Static components are callable objects that take a PDFDoc object as input, perform arbitrary transformations over the input, and return the modified object. [Trainable pipes][edspdf.trainable_pipe.TrainablePipe], on the other hand, allow for deep learning operations to be performed on the [PDFDoc][edspdf.structures.PDFDoc] object and must be trained to be used.
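A static component, in other words, is just a plain callable. As a minimal sketch — using a hypothetical stand-in dataclass rather than the real [PDFDoc][edspdf.structures.PDFDoc], whose fields are richer:

```python
from dataclasses import dataclass, field
from typing import List


# Hypothetical stand-in for edspdf.structures.PDFDoc, reduced to what the example needs
@dataclass
class PDFDoc:
    lines: List[str] = field(default_factory=list)


def remove_empty_lines(doc: PDFDoc) -> PDFDoc:
    # A static pipe: a callable that transforms the doc and returns it
    doc.lines = [line for line in doc.lines if line.strip()]
    return doc


doc = remove_empty_lines(PDFDoc(lines=["Header", "", "  ", "Body"]))
print(doc.lines)  # -> ['Header', 'Body']
```

Because such a component carries no trainable weights, it can wrap any technology (regexes, OCR engines, heuristics) as long as it respects the call signature.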

## Saving and loading a pipeline

Pipelines can be saved and loaded using the `save` and `load` methods. The saved pipeline is not a pickled object but a folder containing the config file, the weights and extra resources for each pipe. This allows for easy inspection and modification of the pipeline, and avoids the execution of arbitrary code when loading a pipeline.

```python
model.save("path/to/your/model")
model = edspdf.load("path/to/your/model")
```

To share the pipeline and turn it into a pip-installable package, you can use the `package` method, which will use or create a `pyproject.toml` file, fill it accordingly, and build a wheel file. At the moment, only the Poetry package manager is supported.

```python
model.package(
name="your-package-name", # leave None to reuse name in pyproject.toml
version="0.0.1",
root_dir="path/to/project/root", # optional, to retrieve an existing pyproject.toml file
# if you don't have a pyproject.toml, you can provide the metadata here instead
metadata=dict(
authors="Firstname Lastname <[email protected]>",
description="A short description of your package",
),
)
```

This will create a wheel file in the `root_dir/dist` folder, which you can share and install with pip.
48 changes: 44 additions & 4 deletions edspdf/pipeline.py
```diff
@@ -13,6 +13,7 @@
     Dict,
     Iterable,
     List,
+    Mapping,
     Optional,
     Sequence,
     Set,
```
```diff
@@ -25,6 +26,7 @@
 from confit.errors import ConfitValidationError, patch_errors
 from confit.utils.collections import join_path, split_path
 from confit.utils.xjson import Reference
+from typing_extensions import Literal

 import edspdf
```
```diff
@@ -700,6 +702,7 @@ def load_state_from_disk(
         path: Union[str, Path],
         *,
         exclude: Set[str] = None,
+        device: Optional[Union[str, "torch.device"]] = "cpu",  # noqa F821
     ) -> "Pipeline":
         """
         Load the pipeline from a directory. Components will be updated in-place.
```
```diff
@@ -730,7 +733,9 @@ def deserialize_tensors(path: Path):
                 # are expected to be shared
                 pipe = trainable_components[pipe_names[0]]
                 tensor_dict = {}
-                for keys, tensor in safetensors.torch.load_file(file_name).items():
+                for keys, tensor in safetensors.torch.load_file(
+                    file_name, device=device
+                ).items():
                     split_keys = [split_path(key) for key in keys.split("+")]
                     key = next(key for key in split_keys if key[0] == pipe_names[0])
                     tensor_dict[join_path(key[1:])] = tensor
```
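The loop in this hunk distributes tensors that are shared between several pipes: each stored key concatenates, with `+`, every path under which the tensor appears, and the loader keeps the alias belonging to the current pipe. A simplified sketch of that remapping, with toy `split_path`/`join_path` helpers standing in for confit's (which handle richer paths):

```python
def split_path(path):
    # Toy stand-in for confit.utils.collections.split_path
    return tuple(path.split("."))


def join_path(parts):
    # Toy stand-in for confit.utils.collections.join_path
    return ".".join(parts)


def remap_for_pipe(state, pipe_name):
    # Keys shared by several pipes are stored once, joined with "+";
    # keep the alias that belongs to `pipe_name` and strip its prefix.
    out = {}
    for keys, tensor in state.items():
        split_keys = [split_path(key) for key in keys.split("+")]
        key = next(key for key in split_keys if key[0] == pipe_name)
        out[join_path(key[1:])] = tensor
    return out


state = {"embedding.weight+classifier.embedding.weight": "tensor-0"}
print(remap_for_pipe(state, "classifier"))  # -> {'embedding.weight': 'tensor-0'}
```

This keeps the on-disk file free of duplicate tensors while letting each pipe load its parameters under its own local names.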
```diff
@@ -769,11 +774,12 @@ def load(
         path: Union[str, Path],
         *,
         exclude: Optional[Set[str]] = None,
+        device: Optional[Union[str, "torch.device"]] = "cpu",  # noqa F821
     ):
         path = Path(path) if isinstance(path, str) else path
         config = Config.from_disk(path / "config.cfg")
         self = Pipeline.from_config(config)
-        self.load_state_from_disk(path, exclude=exclude)
+        self.load_state_from_disk(path, exclude=exclude, device=device)
         return self
```
# override config property getter to remove "factory" key from components
```diff
@@ -853,13 +859,47 @@ def __exit__(ctx_self, type, value, traceback):
             self._disabled = disable
         return context()

+    def package(
+        self,
+        name: Optional[str] = None,
+        root_dir: Union[str, Path] = ".",
+        artifacts_name: str = "artifacts",
+        check_dependencies: bool = False,
+        project_type: Optional[Literal["poetry", "setuptools"]] = None,
+        version: str = "0.1.0",
+        metadata: Optional[Dict[str, Any]] = {},
+        distributions: Optional[Sequence[Literal["wheel", "sdist"]]] = ["wheel"],
+        config_settings: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
+        isolation: bool = True,
+        skip_build_dependency_check: bool = False,
+    ):
+        from .utils.package import package
+
+        return package(
+            pipeline=self,
+            name=name,
+            root_dir=root_dir,
+            artifacts_name=artifacts_name,
+            check_dependencies=check_dependencies,
+            project_type=project_type,
+            version=version,
+            metadata=metadata,
+            distributions=distributions,
+            config_settings=config_settings,
+            isolation=isolation,
+            skip_build_dependency_check=skip_build_dependency_check,
+        )
+

-def load(config: Union[Path, str, Config]) -> Pipeline:
+def load(
+    config: Union[Path, str, Config],
+    device: Optional[Union[str, "torch.device"]] = "cpu",  # noqa F821
+) -> Pipeline:
     error = "The load function expects a Config or a path to a config file"
     if isinstance(config, (Path, str)):
         path = Path(config)
         if path.is_dir():
-            return Pipeline.load(path)
+            return Pipeline.load(path, device=device)
         elif path.is_file():
             config = Config.from_disk(path)
         else:
```