diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 108b6be4..2acfdfdf 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -31,6 +31,13 @@ jobs: python-version: ${{ matrix.python-version }} architecture: x64 + - name: Cache HuggingFace Models + uses: actions/cache@v2 + id: cache-huggingface + with: + path: ~/.cache/huggingface/ + key: ${{ matrix.python-version }}-huggingface + - name: Install hatch run: pip install hatch diff --git a/.gitignore b/.gitignore index bc92f9af..60358cc5 100644 --- a/.gitignore +++ b/.gitignore @@ -63,7 +63,7 @@ report.xml *.pickle *.joblib *.pdf -data/ +/data/ # MkDocs output docs/reference diff --git a/changelog.md b/changelog.md index 3583237e..e6a5d2ca 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,33 @@ # Changelog + + +## v0.9.0 + +### Added + +- New unified `edspdf.data` api (pdf files, pandas, parquet) and LazyCollection object + to efficiently read / write data from / to different formats & sources. This API is + has been heavily inspired by the `edsnlp.data` API. +- New unified processing API to select the execution backend via `data.set_processing(...)` + to replace the old `accelerators` API (which is now deprecated, but still available). +- `huggingface-embedding` now supports quantization and other `AutoModel.from_pretrained` kwargs +- It is now possible to add convert a label to multiple labels in the `simple-aggregator` component : + +```ini +# To build the "text" field, we will aggregate "title", "body" and "table" lines, +# and output "title" lines in a separate field as well. +label_map = { + "text" : [ "title", "body", "table" ], + "title": "title", + } +``` + +### Fixed + +- `huggingface-embedding` now resize bbox features for large PDFs, instead of making the model crash +- `huggingface-embedding` and `sub-box-cnn-pooler` now handle empty PDFs correctly + ## v0.8.1 ### Fixed diff --git a/docs/assets/images/multiprocessing.png b/docs/assets/images/multiprocessing.png new file mode 100644 index 00000000..f6d54762 Binary files /dev/null and b/docs/assets/images/multiprocessing.png differ diff --git a/docs/assets/images/multiprocessing.svg b/docs/assets/images/multiprocessing.svg deleted file mode 100644 index 594b0d04..00000000 --- a/docs/assets/images/multiprocessing.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -
-[deleted draw.io SVG (text content only): multiprocessing architecture diagram. Inputs feed CPU Workers 1-3, which run the non deep-learning ops (extractors, aggregators, feature preprocessing, feature collating, forward output postprocessing); batches (batch_id, cpu_id, gpu_id, stage, collate / forward outputs) are exchanged with GPU Workers 1-2, which run the deep-learning ops (forward); results are collected as Outputs. Replaced by docs/assets/images/multiprocessing.png.]
diff --git a/docs/index.md b/docs/index.md index 52b46d0d..735b6a32 100644 --- a/docs/index.md +++ b/docs/index.md @@ -99,12 +99,10 @@ See the [rule-based recipe](recipes/rule-based.md) for a step-by-step explanatio If you use EDS-PDF, please cite us as below. ```bibtex -@software{edspdf, - author = {Dura, Basile and Wajsburt, Perceval and Calliger, Alice and GĂ©rardin, Christel and Bey, Romain}, - doi = {10.5281/zenodo.6902977}, - license = {BSD-3-Clause}, - title = {{EDS-PDF: Smart text extraction from PDF documents}}, - url = {https://github.com/aphp/edspdf} +@article{gerardin_wajsburt_pdf, + title={Bridging Clinical PDFs and Downstream Natural Language Processing: An Efficient Neural Approach to Layout Segmentation}, + author={G{\'e}rardin, Christel Ducroz and Wajsburt, Perceval and Dura, Basile and Calliger, Alice and Mouchet, Alexandre and Tannier, Xavier and Bey, Romain}, + journal={Available at SSRN 4587624} } ``` diff --git a/docs/inference.md b/docs/inference.md index d849ba86..9f7c325b 100644 --- a/docs/inference.md +++ b/docs/inference.md @@ -1,61 +1,124 @@ # Inference -Once you have obtained a pipeline, either by composing rule-based components, training a model or loading a model from the disk, you can use it to make predictions on documents. This is referred to as inference. +Once you have obtained a pipeline, either by composing rule-based components, training a model or loading a model from the disk, you can use it to make predictions on documents. This is referred to as inference. This page answers the following questions : + +> How do we leverage computational resources run a model on many documents? + +> How do we connect to various data sources to retrieve documents? + ## Inference on a single document -In EDS-PDF, computing the prediction on a single document is done by calling the pipeline on the document. The input can be either: +In EDS-model, computing the prediction on a single document is done by calling the pipeline on the document. The input can be either: -- a sequence of bytes -- or a [PDFDoc][edspdf.structures.PDFDoc] object +- a bytes string +- or a [PDFDoc](https://spacy.io/api/doc) object -```python +```{ .python .no-check } from pathlib import Path -pipeline = ... -content = Path("path/to/.pdf").read_bytes() -doc = pipeline(content) +model = ... +pdf_bytes = b"..." +doc = model(pdf_bytes) ``` -If you're lucky enough to have a GPU, you can use it to speed up inference by moving the model to the GPU before calling the pipeline. To leverage multiple GPUs, refer to the [multiprocessing accelerator][edspdf.accelerators.multiprocessing.MultiprocessingAccelerator] description below. +If you're lucky enough to have a GPU, you can use it to speed up inference by moving the model to the GPU before calling the pipeline. -```python -pipeline.to("cuda") # same semantics as pytorch -doc = pipeline(content) +```{ .python .no-check } +model.to("cuda") # same semantics as pytorch +doc = model(pdf_bytes) ``` -## Inference on multiple documents +To leverage multiple GPUs when processing multiple documents, refer to the [multiprocessing backend][edspdf.processing.multiprocessing.execute_multiprocessing_backend] description below. + +## Inference on multiple documents {: #edspdf.lazy_collection.LazyCollection } + +When processing multiple documents, we can optimize the inference by parallelizing the computation on a single core, multiple cores and GPUs or even multiple machines. 
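+
+As a quick preview of the lazy API detailed in the next sections, here is a minimal sketch. The
+loaded `model`, its path, and the in-memory list of `PDFDoc` objects are placeholders used only for
+illustration:
+
+```{ .python .no-check }
+import edspdf
+
+model = edspdf.load("path/to/model")  # placeholder path
+docs = [...]  # assumed: an in-memory list of PDFDoc objects
+
+# Nothing is processed yet: each call below only schedules work
+data = edspdf.data.from_iterable(docs)
+data = data.map_pipeline(model)  # schedule every pipe of the model
+data = data.set_processing(num_cpu_workers=4)  # decide how the work will be executed
+
+# Iterating (or calling `.execute()`, `.to_pandas()`, `.write_parquet(...)`)
+# triggers the actual computation
+results = list(data)
+```
+
+The following sections describe each of these steps in more detail.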
+ +### Lazy collection + +These optimizations are enabled by performing *lazy inference* : the operations (e.g., reading a document, converting it to a PDFDoc, running the different pipes of a model or writing the result somewhere) are not executed immediately but are instead scheduled in a [LazyCollection][edspdf.lazy_collection.LazyCollection] object. It can then be executed by calling the `execute` method, iterating over it or calling a writing method (e.g., `to_pandas`). In fact, data connectors like `edspdf.data.read_files` return a lazy collection, as well as the `model.pipe` method. + +A lazy collection contains : + +- a `reader`: the source of the data (e.g., a file, a database, a list of strings, etc.) +- the list of operations to perform under a `pipeline` attribute containing the name if any, function / pipe, keyword arguments and context for each operation +- an optional `writer`: the destination of the data (e.g., a file, a database, a list of strings, etc.) +- the execution `config`, containing the backend to use and its configuration such as the number of workers, the batch size, etc. + +All methods (`.map`, `.map_pipeline`, `.set_processing`) of the lazy collection are chainable, meaning that they return a new object (no in-place modification). + +For instance, the following code will load a model, read a folder of JSON files, apply the model to each document and write the result in a Parquet folder, using 4 CPUs and 2 GPUs. -When processing multiple documents, it is usually more efficient to use the `pipeline.pipe(...)` method, especially when using deep learning components, since this allow matrix multiplications to be batched together. Depending on your computational resources and requirements, EDS-PDF comes with various "accelerators" to speed up inference (see the [Accelerators](#accelerators) section for more details). By default, the `.pipe()` method uses the [`simple` accelerator][edspdf.accelerators.simple.SimpleAccelerator] but you can switch to a different one by passing the `accelerator` argument. +```{ .python .no-check } +import edspdf -```python -pipeline = ... -docs = pipeline.pipe( - [content1, content2, ...], - batch_size=16, # optional, default to the one defined in the pipeline - accelerator=my_accelerator, +# Load or create a model, for instance following the "Recipes" +model = edspdf.load("path/to/model") + +# Read some data (this is lazy, no data will be read until the end of of this snippet) +data = edspdf.data.read_files( + "/Users/perceval/Development/edspdf/tests/resources/", + # dict to doc converter function + converter=lambda x: PDFDoc(id=x["id"], content=x["content"]), +) + +# Apply each pipe of the model to our documents +data = data.map_pipeline(model) +# or equivalently : data = model.pipe(data) + +# Configure the execution +data = data.set_processing( + # 4 CPUs to parallelize rule-based pipes, IO and preprocessing + num_cpu_workers=4, + # 2 GPUs to accelerate deep-learning pipes + num_gpu_workers=2, +) + +# Write the result, this will execute the lazy collection +data.write_parquet( + "path/to/output_folder", + # doc to dict converter function + converter=lambda doc: { + "id": doc.id, + "text": ( + doc.aggregated_texts["body"].text + if "body" in doc.aggregated_texts + else "" + ), + }, ) ``` -The `pipe` method supports the following arguments : +### Applying operations to a lazy collection + +To apply an operation to a lazy collection, you can use the `.map` method. 
It takes a callable as input and an optional dictionary of keyword arguments. The function will be applied to each element of the collection. + +To apply a model, you can use the `.map_pipeline` method. It takes a model as input and will add every pipe of the model to the scheduled operations. + +In both cases, the operations will not be executed immediately but will be scheduled to be executed when iterating of the collection, or calling the `.execute`, `.to_*` or `.write_*` methods. + +### Execution of a lazy collection {: #edspdf.lazy_collection.LazyCollection.set_processing } + +You can configure how the operations performed in the lazy collection is executed by calling its `set_processing(...)` method. The following options are available : -::: edspdf.pipeline.Pipeline.pipe +::: edspdf.lazy_collection.LazyCollection.set_processing options: heading_level: 3 - only_parameters: true + only_parameters: "no-header" -## Accelerators +## Backends -### Simple accelerator {: #edspdf.accelerators.simple.SimpleAccelerator } +### Simple backend {: #edspdf.processing.simple.execute_simple_backend } -::: edspdf.accelerators.simple.SimpleAccelerator +::: edspdf.processing.simple.execute_simple_backend options: heading_level: 3 - only_class_level: true + show_source: false -### Multiprocessing accelerator {: #edspdf.accelerators.multiprocessing.MultiprocessingAccelerator } +### Multiprocessing backend {: #edspdf.processing.multiprocessing.execute_multiprocessing_backend } -::: edspdf.accelerators.multiprocessing.MultiprocessingAccelerator +::: edspdf.processing.multiprocessing.execute_multiprocessing_backend options: heading_level: 3 - only_class_level: true + show_source: false diff --git a/docs/scripts/plugin.py b/docs/scripts/plugin.py index 22e7c1bd..b020a225 100644 --- a/docs/scripts/plugin.py +++ b/docs/scripts/plugin.py @@ -7,6 +7,8 @@ import mkdocs.structure.files import mkdocs.structure.nav import mkdocs.structure.pages +import regex +from mkdocs_autorefs.plugin import AutorefsPlugin try: from importlib.metadata import entry_points @@ -128,9 +130,13 @@ def on_page_read_source(page, config): return None -HREF_REGEX = r'href=(?:"([^"]*)"|\'([^\']*)|[ ]*([^ =>]*)(?![a-z]+=))' +HREF_REGEX = ( + r"(?<=<\s*(?:a[^>]*href|img[^>]*src)=)" + r'(?:"([^"]*)"|\'([^\']*)|[ ]*([^ =>]*)(?![a-z]+=))' +) + + # Maybe find something less specific ? 
-PIPE_REGEX = r"(?<=[^a-zA-Z0-9._-])eds[.][a-zA-Z0-9._-]*(?=[^a-zA-Z0-9._-])" @mkdocs.plugins.event_priority(-1000) @@ -155,100 +161,57 @@ def on_post_page( """ - autorefs = config["plugins"]["autorefs"] - edspdf_factories_entry_points = { - ep.name: ep.value for ep in entry_points()["edspdf_factories"] + autorefs: AutorefsPlugin = config["plugins"]["autorefs"] + factories_entry_points = { + ep.name: autorefs.get_item_url(ep.value.replace(":", ".")) + for ep in entry_points()["edspdf_factories"] + } + factories_entry_points = { + k: "/" + v if not v.startswith("/") else v + for k, v in factories_entry_points.items() } + factories_entry_points.update( + { + "mupdf-extractor": "https://aphp.github.io/edspdf-mupdf/latest/", + "poppler-extractor": "https://aphp.github.io/edspdf-poppler/latest/", + } + ) - def get_component_url(name): - ep = edspdf_factories_entry_points.get(name) - if ep is None: - return None - try: - url = autorefs.get_item_url(ep.replace(":", ".")) - except KeyError: - pass - else: - return url - return None + PIPE_REGEX_BASE = "|".join(regex.escape(name) for name in factories_entry_points) + PIPE_REGEX = f"""(?x) + (?<=")({PIPE_REGEX_BASE})(?=") + |(?<=")({PIPE_REGEX_BASE})(?=") + |(?<=')({PIPE_REGEX_BASE})(?=') + |(?<=)({PIPE_REGEX_BASE})(?=) + """ - def get_relative_link(url): + def replace_component(match): + name = match.group() + preceding = output[match.start(0) - 50 : match.start(0)] + if ( + "DEFAULT:" + not in preceding + # and output[: match.start(0)].count("") + # > output[match.end(0) :].count("") + ): + try: + ep_url = factories_entry_points[name] + except KeyError: + pass + else: + if ep_url.split("#")[0].strip("/") != page.file.url.strip("/"): + return "{name}".format(href=ep_url, name=name) + return name + + def replace_link(match): + relative_url = url = match.group(1) or match.group(2) or match.group(3) page_url = os.path.join("/", page.file.url) if url.startswith("/"): - url = os.path.relpath(url, page_url) - return url - - def replace_component_span(span): - content = span.text - if content is None: - return - link_url = get_component_url(content.strip("\"'")) - if link_url is None: - return - a = etree.Element("a", href="/" + link_url) - a.text = content - span.text = "" - span.append(a) - - def replace_component_names(root): - # Iterate through all span elements - spans = list(root.iter("span", "code")) - for i, span in enumerate(spans): - prev = span.getprevious() - if span.getparent().tag == "a": - continue - # To avoid replacing default component name in parameter tables - if prev is None or prev.text != "DEFAULT:": - replace_component_span(span) - # if span.text == "add_pipe": - # next_span = span.getnext() - # if next_span is None: - # continue - # next_span = next_span.getnext() - # if next_span is None or next_span.tag != "span": - # continue - # replace_component_span(next_span) - # continue - # tokens = ["@", "factory", "="] - # while True: - # if len(tokens) == 0: - # break - # if span.text != tokens[0]: - # break - # tokens = tokens[1:] - # span = span.getnext() - # while span is not None and ( - # span.text is None or not span.text.strip() - # ): - # span = span.getnext() - # if len(tokens) == 0: - # replace_component_span(span) - - # Convert the modified tree back to a string - return root - - def replace_absolute_links(root): - # Iterate through all a elements - for a in root.iter("a"): - href = a.get("href") - if href is None or href.startswith("http"): - continue - a.set("href", get_relative_link(href)) - for img in root.iter("img"): - 
href = img.get("src") - if href is None or href.startswith("http"): - continue - img.set("src", get_relative_link(href)) - - # Convert the modified tree back to a string - return root + relative_url = os.path.relpath(url, page_url) + return f'"{relative_url}"' # Replace absolute paths with path relative to the rendered page - from lxml.html import etree - - root = etree.HTML(output) - root = replace_component_names(root) - root = replace_absolute_links(root) - doctype = root.getroottree().docinfo.doctype - res = etree.tostring(root, encoding="unicode", method="html", doctype=doctype) - return res + output = regex.sub(PIPE_REGEX, replace_component, output) + output = regex.sub(HREF_REGEX, replace_link, output) + + return output diff --git a/docs/trainable-pipes.md b/docs/trainable-pipes.md index b9dc02c8..ae6f37b0 100644 --- a/docs/trainable-pipes.md +++ b/docs/trainable-pipes.md @@ -97,12 +97,12 @@ class MyComponent(TrainablePipe): "my-feature": ...(doc), } - def collate(self, batch, device: torch.device) -> Dict: + def collate(self, batch) -> Dict: # Collate the features of the "embedding" subcomponent # and the features of this component as well return { - "embedding": self.embedding.collate(batch["embedding"], device), - "my-feature": torch.as_tensor(batch["my-feature"], device=device), + "embedding": self.embedding.collate(batch["embedding"]), + "my-feature": torch.as_tensor(batch["my-feature"]), } def forward(self, batch: Dict, supervision=False) -> Dict: diff --git a/edspdf/__init__.py b/edspdf/__init__.py index 9d5ed750..1ab0d6f1 100644 --- a/edspdf/__init__.py +++ b/edspdf/__init__.py @@ -3,7 +3,8 @@ from .pipeline import Pipeline, load from .registry import registry from .structures import Box, Page, PDFDoc, Text, TextBox, TextProperties +from . import data from . import utils # isort:skip -__version__ = "0.8.1" +__version__ = "0.9.0" diff --git a/edspdf/accelerators/base.py b/edspdf/accelerators/base.py index 07bc01b2..2360377d 100644 --- a/edspdf/accelerators/base.py +++ b/edspdf/accelerators/base.py @@ -1,97 +1,2 @@ -from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Union - -from ..structures import PDFDoc - - -class FromDictFieldsToDoc: - def __init__(self, content_field, id_field=None): - self.content_field = content_field - self.id_field = id_field - - def __call__(self, item): - if isinstance(item, dict): - return PDFDoc( - content=item[self.content_field], - id=item[self.id_field] if self.id_field else None, - ) - return item - - -class ToDoc: - @classmethod - def __get_validators__(cls): - yield cls.validate - - @classmethod - def validate(cls, value, config=None): - if isinstance(value, str): - value = {"content_field": value} - if isinstance(value, dict): - value = FromDictFieldsToDoc(**value) - if callable(value): - return value - raise TypeError( - f"Invalid entry {value} ({type(value)}) for ToDoc, " - f"expected string, a dict or a callable." 
- ) - - -FROM_DOC_TO_DICT_FIELDS_TEMPLATE = """ -def fn(doc): - return {X} -""" - - -class FromDocToDictFields: - def __init__(self, mapping): - self.mapping = mapping - dict_fields = ", ".join(f"{repr(k)}: doc.{v}" for k, v in mapping.items()) - local_vars = {} - exec(FROM_DOC_TO_DICT_FIELDS_TEMPLATE.replace("X", dict_fields), local_vars) - self.fn = local_vars["fn"] - - def __reduce__(self): - return FromDocToDictFields, (self.mapping,) - - def __call__(self, doc): - return self.fn(doc) - - -class FromDoc: - """ - A FromDoc converter (from a PDFDoc to an arbitrary type) can be either: - - - a dict mapping field names to doc attributes - - a callable that takes a PDFDoc and returns an arbitrary type - """ - - @classmethod - def __get_validators__(cls): - yield cls.validate - - @classmethod - def validate(cls, value, config=None): - if isinstance(value, dict): - value = FromDocToDictFields(value) - if callable(value): - return value - raise TypeError( - f"Invalid entry {value} ({type(value)}) for ToDoc, " - f"expected dict or callable" - ) - - class Accelerator: - def __call__( - self, - inputs: Iterable[Any], - model: Any, - to_doc: ToDoc = FromDictFieldsToDoc("content"), - from_doc: FromDoc = lambda doc: doc, - ): - raise NotImplementedError() - - -if TYPE_CHECKING: - ToDoc = Union[str, Dict[str, Any], Callable[[Any], PDFDoc]] # noqa: F811 - FromDoc = Union[Dict[str, Any], Callable[[PDFDoc], Any]] # noqa: F811 + pass diff --git a/edspdf/accelerators/multiprocessing.py b/edspdf/accelerators/multiprocessing.py index 41ef2f80..b11d556e 100644 --- a/edspdf/accelerators/multiprocessing.py +++ b/edspdf/accelerators/multiprocessing.py @@ -1,338 +1,15 @@ -import gc -import signal -from multiprocessing.connection import wait -from random import shuffle -from typing import Any, Iterable, List, Optional, Union +from typing import List, Optional, Union import torch -import torch.multiprocessing as mp -from .. 
import TrainablePipe from ..registry import registry -from ..utils.collections import batchify -from .base import Accelerator, FromDictFieldsToDoc, FromDoc, ToDoc - -DEBUG = False - -debug = ( - (lambda *args, flush=False, **kwargs: print(*args, **kwargs, flush=True)) - if DEBUG - else lambda *args, **kwargs: None -) - - -class Exchanger: - def __init__( - self, - num_stages, - num_gpu_workers, - num_cpu_workers, - gpu_worker_devices, - ): - # queue for cpu input tasks - self.gpu_worker_devices = gpu_worker_devices - # We add prioritized queue at the end for STOP signals - self.cpu_inputs_queues = [ - [mp.SimpleQueue()] + [mp.SimpleQueue() for _ in range(num_stages + 1)] - # The input queue is not shared between processes, since calling `wait` - # on a queue reader from multiple processes may lead to a deadlock - for _ in range(num_cpu_workers) - ] - self.gpu_inputs_queues = [ - [mp.SimpleQueue() for _ in range(num_stages + 1)] - for _ in range(num_gpu_workers) - ] - self.outputs_queue = mp.Queue() - - def get_cpu_tasks(self, idx): - while True: - queue_readers = wait( - [queue._reader for queue in self.cpu_inputs_queues[idx]] - ) - stage, queue = next( - (stage, q) - for stage, q in reversed(list(enumerate(self.cpu_inputs_queues[idx]))) - if q._reader in queue_readers - ) - try: - item = queue.get() - except BaseException: - continue - if item is None: - return - yield stage, item - - def put_cpu(self, item, stage, idx): - return self.cpu_inputs_queues[idx][stage].put(item) - - def get_gpu_tasks(self, idx): - while True: - queue_readers = wait( - [queue._reader for queue in self.gpu_inputs_queues[idx]] - ) - stage, queue = next( - (stage, q) - for stage, q in reversed(list(enumerate(self.gpu_inputs_queues[idx]))) - if q._reader in queue_readers - ) - try: - item = queue.get() - except BaseException: # pragma: no cover - continue - if item is None: - return - yield stage, item - - def put_gpu(self, item, stage, idx): - return self.gpu_inputs_queues[idx][stage].put(item) - - def put_results(self, items): - self.outputs_queue.put(items) - - def iter_results(self): - for out in iter(self.outputs_queue.get, None): - yield out - - -class CPUWorker(mp.Process): - def __init__( - self, - cpu_idx: int, - exchanger: Exchanger, - gpu_pipe_names: List[str], - model: Any, - device: Union[str, torch.device], - ): - super(CPUWorker, self).__init__() - - self.cpu_idx = cpu_idx - self.exchanger = exchanger - self.gpu_pipe_names = gpu_pipe_names - self.model = model - self.device = device - - def _run(self): - # Cannot pass torch tensor during init i think ? 
otherwise i get - # ValueError: bad value(s) in fds_to_keep - mp._prctl_pr_set_pdeathsig(signal.SIGINT) - - model = self.model.to(self.device) - stages = [{"cpu_components": [], "gpu_component": None}] - for name, component in model.pipeline: - if name in self.gpu_pipe_names: - stages[-1]["gpu_component"] = component - stages.append({"cpu_components": [], "gpu_component": None}) - else: - stages[-1]["cpu_components"].append(component) - - next_batch_id = 0 - active_batches = {} - debug( - f"CPU worker {self.cpu_idx} is ready", - next(model.parameters()).device, - flush=True, - ) - - had_error = False - with torch.no_grad(): - for stage, task in self.exchanger.get_cpu_tasks(self.cpu_idx): - if had_error: - continue # pragma: no cover - try: - if stage == 0: - gpu_idx = None - batch_id = next_batch_id - debug("preprocess start for", batch_id) - next_batch_id += 1 - docs = task - else: - gpu_idx, batch_id, result = task - debug("postprocess start for", batch_id) - docs = active_batches.pop(batch_id) - gpu_pipe = stages[stage - 1]["gpu_component"] - docs = gpu_pipe.postprocess(docs, result) # type: ignore - - for component in stages[stage]["cpu_components"]: - if hasattr(component, "batch_process"): - docs = component.batch_process(docs) - else: - docs = [component(doc) for doc in docs] - - gpu_pipe = stages[stage]["gpu_component"] - if gpu_pipe is not None: - preprocessed = gpu_pipe.make_batch(docs) # type: ignore - active_batches[batch_id] = docs - if gpu_idx is None: - gpu_idx = batch_id % len(self.exchanger.gpu_worker_devices) - collated = gpu_pipe.collate( # type: ignore - preprocessed, - device=self.exchanger.gpu_worker_devices[gpu_idx], - ) - self.exchanger.put_gpu( - item=(self.cpu_idx, batch_id, collated), - idx=gpu_idx, - stage=stage, - ) - batch_id += 1 - debug("preprocess end for", batch_id) - else: - self.exchanger.put_results((docs, self.cpu_idx, gpu_idx)) - debug("postprocess end for", batch_id) - except BaseException as e: - had_error = True - import traceback - - print(traceback.format_exc(), flush=True) - self.exchanger.put_results((e, self.cpu_idx, None)) - # We need to drain the queues of GPUWorker fed inputs (pre-moved to GPU) - # to ensure no tensor allocated on producer processes (CPUWorker via - # collate) are left in consumer processes - debug("Start draining CPU worker", self.cpu_idx) - [None for _ in self.exchanger.get_cpu_tasks(self.cpu_idx)] - debug(f"CPU worker {self.cpu_idx} is about to stop") - - def run(self): - self._run() - self.model = None - gc.collect() - torch.cuda.empty_cache() - - -class GPUWorker(mp.Process): - def __init__( - self, - gpu_idx, - exchanger: Exchanger, - gpu_pipe_names: List[str], - model: Any, - device: Union[str, torch.device], - ): - super().__init__() - - self.device = device - self.gpu_idx = gpu_idx - self.exchanger = exchanger - - self.gpu_pipe_names = gpu_pipe_names - self.model = model - self.device = device - - def _run(self): - debug("GPU worker", self.gpu_idx, "started") - mp._prctl_pr_set_pdeathsig(signal.SIGINT) - had_error = False - - model = self.model.to(self.device) - stage_components = [model.get_pipe(name) for name in self.gpu_pipe_names] - del model - with torch.no_grad(): - for stage, task in self.exchanger.get_gpu_tasks(self.gpu_idx): - if had_error: - continue # pragma: no cover - try: - cpu_idx, batch_id, batch = task - debug("forward start for", batch_id) - component = stage_components[stage] - res = component.module_forward(batch) - del batch, task - # TODO set non_blocking=True here - res = { - key: 
val.to("cpu") if not isinstance(val, int) else val - for key, val in res.items() - } - self.exchanger.put_cpu( - item=(self.gpu_idx, batch_id, res), - stage=stage + 1, - idx=cpu_idx, - ) - debug("forward end for", batch_id) - except BaseException as e: - had_error = True - self.exchanger.put_results((e, None, self.gpu_idx)) - import traceback - - print(traceback.format_exc(), flush=True) - task = batch = res = None # noqa - # We need to drain the queues of CPUWorker fed inputs (pre-moved to GPU) - # to ensure no tensor allocated on producer processes (CPUWorker via - # collate) are left in consumer processes - debug("Start draining GPU worker", self.gpu_idx) - [None for _ in self.exchanger.get_gpu_tasks(self.gpu_idx)] - debug(f"GPU worker {self.gpu_idx} is about to stop") - - def run(self): - self._run() - self.model = None - gc.collect() - torch.cuda.empty_cache() - - -DEFAULT_MAX_CPU_WORKERS = 4 +from .base import Accelerator @registry.accelerator.register("multiprocessing") class MultiprocessingAccelerator(Accelerator): """ - If you have multiple CPU cores, and optionally multiple GPUs, we provide a - `multiprocessing` accelerator that allows to run the inference on multiple - processes. - - This accelerator dispatches the batches between multiple workers - (data-parallelism), and distribute the computation of a given batch on one or two - workers (model-parallelism). This is done by creating two types of workers: - - - a `CPUWorker` which handles the non deep-learning components and the - preprocessing, collating and postprocessing of deep-learning components - - a `GPUWorker` which handles the forward call of the deep-learning components - - The advantage of dedicating a worker to the deep-learning components is that it - allows to prepare multiple batches in parallel in multiple `CPUWorker`, and ensure - that the `GPUWorker` never wait for a batch to be ready. - - The overall architecture described in the following figure, for 3 CPU workers and 2 - GPU workers. - -
-    [figure: overall multiprocessing architecture, 3 CPU workers and 2 GPU workers]
-
-    Here is how a small pipeline with rule-based components and deep-learning components
-    is distributed between the workers:
-
-    [figure: batch flow of a small pipeline between CPU and GPU workers]
- - Examples - -------- - - ```python - docs = list( - pipeline.pipe( - [content1, content2, ...], - accelerator={ - "@accelerator": "multiprocessing", - "num_cpu_workers": 3, - "num_gpu_workers": 2, - "batch_size": 8, - }, - ) - ) - ``` - - Parameters - ---------- - batch_size: int - Number of documents to process at a time in a CPU/GPU worker - num_cpu_workers: int - Number of CPU workers. A CPU worker handles the non deep-learning components - and the preprocessing, collating and postprocessing of deep-learning components. - num_gpu_workers: Optional[int] - Number of GPU workers. A GPU worker handles the forward call of the - deep-learning components. - gpu_pipe_names: Optional[List[str]] - List of pipe names to accelerate on a GPUWorker, defaults to all pipes - that inherit from TrainablePipe + Deprecated: Use `docs.map_pipeline(model).set_processing(...)` instead """ def __init__( @@ -350,196 +27,3 @@ def __init__( self.gpu_pipe_names = gpu_pipe_names self.gpu_worker_devices = gpu_worker_devices self.cpu_worker_devices = cpu_worker_devices - - def __call__( - self, - inputs: Iterable[Any], - model: Any, - to_doc: ToDoc = FromDictFieldsToDoc("content"), - from_doc: FromDoc = lambda doc: doc, - ): - """ - Stream of documents to process. Each document can be a string or a tuple - - Parameters - ---------- - inputs - model - - Yields - ------ - Any - Processed outputs of the pipeline - """ - if torch.multiprocessing.get_start_method() != "spawn": - torch.multiprocessing.set_start_method("spawn", force=True) - - gpu_pipe_names = ( - [ - name - for name, component in model.pipeline - if isinstance(component, TrainablePipe) - ] - if self.gpu_pipe_names is None - else self.gpu_pipe_names - ) - - if not all(model.has_pipe(name) for name in gpu_pipe_names): - raise ValueError( - "GPU accelerated pipes {} could not be found in the model".format( - sorted(set(model.pipe_names) - set(gpu_pipe_names)) - ) - ) - - num_devices = torch.cuda.device_count() - print(f"Number of available devices: {num_devices}", flush=True) - - num_cpu_workers = self.num_cpu_workers - num_gpu_workers = self.num_gpu_workers - - if num_gpu_workers is None: - num_gpu_workers = num_devices if len(gpu_pipe_names) > 0 else 0 - - if num_cpu_workers is None: - num_cpu_workers = max( - min(mp.cpu_count() - num_gpu_workers, DEFAULT_MAX_CPU_WORKERS), 0 - ) - - if num_gpu_workers == 0: - gpu_pipe_names = [] - - gpu_worker_devices = ( - [ - torch.device(f"cuda:{gpu_idx * num_devices // num_gpu_workers}") - for gpu_idx in range(num_gpu_workers) - ] - if self.gpu_worker_devices is None - else self.gpu_worker_devices - ) - cpu_worker_devices = ( - ["cpu"] * num_cpu_workers - if self.cpu_worker_devices is None - else self.cpu_worker_devices - ) - assert len(cpu_worker_devices) == num_cpu_workers - assert len(gpu_worker_devices) == num_gpu_workers - if num_cpu_workers == 0: - ( - num_cpu_workers, - num_gpu_workers, - cpu_worker_devices, - gpu_worker_devices, - gpu_pipe_names, - ) = (num_gpu_workers, 0, gpu_worker_devices, [], []) - - debug(f"Number of CPU workers: {num_cpu_workers}") - debug(f"Number of GPU workers: {num_gpu_workers}") - - exchanger = Exchanger( - num_stages=len(gpu_pipe_names), - num_cpu_workers=num_cpu_workers, - num_gpu_workers=num_gpu_workers, - gpu_worker_devices=gpu_worker_devices, - ) - - cpu_workers = [] - gpu_workers = [] - model = model.to("cpu") - - for gpu_idx in range(num_gpu_workers): - gpu_workers.append( - GPUWorker( - gpu_idx=gpu_idx, - exchanger=exchanger, - gpu_pipe_names=gpu_pipe_names, - model=model, 
- device=gpu_worker_devices[gpu_idx], - ) - ) - - for cpu_idx in range(num_cpu_workers): - cpu_workers.append( - CPUWorker( - cpu_idx=cpu_idx, - exchanger=exchanger, - gpu_pipe_names=gpu_pipe_names, - model=model, - device=cpu_worker_devices[cpu_idx], - ) - ) - - for worker in (*cpu_workers, *gpu_workers): - worker.start() - - try: - num_max_enqueued = num_cpu_workers * 2 + 10 - # Number of input/output batch per process - total_inputs = [0] * num_cpu_workers - total_outputs = [0] * num_cpu_workers - outputs_iterator = exchanger.iter_results() - - cpu_worker_indices = list(range(num_cpu_workers)) - inputs_iterator = (to_doc(i) for i in inputs) - for i, pdfs_batch in enumerate(batchify(inputs_iterator, self.batch_size)): - if sum(total_inputs) - sum(total_outputs) >= num_max_enqueued: - outputs, cpu_idx, gpu_idx = next(outputs_iterator) - if isinstance(outputs, BaseException): - raise outputs # pragma: no cover - yield from (from_doc(o) for o in outputs) - total_outputs[cpu_idx] += 1 - - # Shuffle to ensure the first process does not receive all the documents - # in case of total_inputs - total_outputs equality - shuffle(cpu_worker_indices) - cpu_idx = min( - cpu_worker_indices, - key=lambda i: total_inputs[i] - total_outputs[i], - ) - exchanger.put_cpu(pdfs_batch, stage=0, idx=cpu_idx) - total_inputs[cpu_idx] += 1 - - while sum(total_outputs) < sum(total_inputs): - outputs, cpu_idx, gpu_idx = next(outputs_iterator) - if isinstance(outputs, BaseException): - raise outputs # pragma: no cover - yield from (from_doc(o) for o in outputs) - total_outputs[cpu_idx] += 1 - finally: - # Send gpu and cpu process the order to stop processing data - # We use the prioritized queue to ensure the stop signal is processed - # before the next batch of data - for i, worker in enumerate(gpu_workers): - exchanger.gpu_inputs_queues[i][-1].put(None) - debug("Asked gpu worker", i, "to stop processing data") - for i, worker in enumerate(cpu_workers): - exchanger.cpu_inputs_queues[i][-1].put(None) - debug("Asked cpu worker", i, "to stop processing data") - - # Enqueue a final non prioritized STOP signal to ensure there remains no - # data in the queues (cf drain loop in CPUWorker / GPUWorker) - for i, worker in enumerate(gpu_workers): - exchanger.gpu_inputs_queues[i][0].put(None) - debug("Asked gpu", i, "to end") - for i, worker in enumerate(gpu_workers): - worker.join(timeout=5) - debug("Joined gpu worker", i) - for i, worker in enumerate(cpu_workers): - exchanger.cpu_inputs_queues[i][0].put(None) - debug("Asked cpu", i, "to end") - for i, worker in enumerate(cpu_workers): - worker.join(timeout=1) - debug("Joined cpu worker", i) - - # If a worker is still alive, kill it - # This should not happen, but for a reason I cannot explain, it does in - # some CPU workers sometimes when we catch an error, even though each run - # method of the workers completes cleanly. Maybe this has something to do - # with the cleanup of these processes ? 
- for i, worker in enumerate(gpu_workers): # pragma: no cover - if worker.is_alive(): - print("Killing gpu worker", i) - worker.kill() - for i, worker in enumerate(cpu_workers): # pragma: no cover - if worker.is_alive(): - print("Killing cpu worker", i) - worker.kill() diff --git a/edspdf/accelerators/simple.py b/edspdf/accelerators/simple.py deleted file mode 100644 index 340dbebb..00000000 --- a/edspdf/accelerators/simple.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Any, Dict, Iterable - -import torch - -from ..registry import registry -from ..utils.collections import batchify -from .base import Accelerator, FromDictFieldsToDoc, FromDoc, ToDoc - - -@registry.accelerator.register("simple") -class SimpleAccelerator(Accelerator): - """ - This is the simplest accelerator which batches the documents and process each batch - on the main process (the one calling `.pipe()`). - - Examples - -------- - - ```python - docs = list(pipeline.pipe([content1, content2, ...])) - ``` - - or, if you want to override the model defined batch size - - ```python - docs = list(pipeline.pipe([content1, content2, ...], batch_size=8)) - ``` - - which is equivalent to passing a confit dict - - ```python - docs = list( - pipeline.pipe( - [content1, content2, ...], - accelerator={ - "@accelerator": "simple", - "batch_size": 8, - }, - ) - ) - ``` - - or the instantiated accelerator directly - - ```python - from edspdf.accelerators.simple import SimpleAccelerator - - accelerator = SimpleAccelerator(batch_size=8) - docs = list(pipeline.pipe([content1, content2, ...], accelerator=accelerator)) - ``` - - If you have a GPU, make sure to move the model to the appropriate device before - calling `.pipe()`. If you have multiple GPUs, use the - [multiprocessing][edspdf.accelerators.multiprocessing.MultiprocessingAccelerator] - accelerator instead. - - ```python - pipeline.to("cuda") - docs = list(pipeline.pipe([content1, content2, ...])) - ``` - - Parameters - ---------- - batch_size: int - The number of documents to process in each batch. 
- """ - - def __init__( - self, - *, - batch_size: int = 32, - ): - self.batch_size = batch_size - - def __call__( - self, - inputs: Iterable[Any], - model: Any, - to_doc: ToDoc = FromDictFieldsToDoc("content"), - from_doc: FromDoc = lambda doc: doc, - component_cfg: Dict[str, Dict[str, Any]] = None, - ): - docs = (to_doc(doc) for doc in inputs) - for batch in batchify(docs, batch_size=self.batch_size): - with torch.no_grad(), model.cache(), model.train(False): - for name, pipe in model.pipeline: - if name not in model._disabled: - if hasattr(pipe, "batch_process"): - batch = pipe.batch_process(batch) - else: - batch = [pipe(doc) for doc in batch] # type: ignore - yield from (from_doc(doc) for doc in batch) diff --git a/edspdf/data/__init__.py b/edspdf/data/__init__.py new file mode 100644 index 00000000..14fc755e --- /dev/null +++ b/edspdf/data/__init__.py @@ -0,0 +1,11 @@ +from typing import TYPE_CHECKING +from edspdf.utils.lazy_module import lazify + +lazify() + +if TYPE_CHECKING: + from .base import from_iterable, to_iterable + from .files import read_files, write_files + from .parquet import read_parquet, write_parquet + from .pandas import from_pandas, to_pandas + from .converters import get_dict2doc_converter, get_doc2dict_converter diff --git a/edspdf/data/base.py b/edspdf/data/base.py new file mode 100644 index 00000000..cbe5917e --- /dev/null +++ b/edspdf/data/base.py @@ -0,0 +1,180 @@ +from typing import ( + Any, + Callable, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + TypeVar, + Union, +) + +from edspdf.lazy_collection import LazyCollection + +from .converters import get_dict2doc_converter, get_doc2dict_converter + + +class BaseReader: + """ + The BaseReader servers as a base class for all readers. It expects two methods: + + - `read_main` method which is called in the main process and should return a + generator of fragments (like filenames) with their estimated size (number of + documents) + - `read_worker` method which is called in the worker processes and receives + batches of fragments and should return a list of dictionaries (one per + document), ready to be converted to a Doc object by the converter. + + Additionally, the subclass should define a `DATA_FIELDS` class attribute which + contains the names of all attributes that should not be copied when the reader is + copied to the worker processes. This is useful for example when the reader holds a + reference to a large object like a DataFrame that should not be copied to the + worker processes. 
+ """ + + DATA_FIELDS = () + + def read_main(self) -> Iterable[Tuple[Any, int]]: + raise NotImplementedError() + + def read_worker(self, fragment: Iterable[Any]) -> Iterable[Dict]: + raise NotImplementedError() + + def worker_copy(self): + # new reader without data, this will not call __init__ since we use __dict__ + # to set the data + reader = self.__class__.__new__(self.__class__) + state = { + k: v + for k, v in self.__dict__.items() + if k not in self.__class__.DATA_FIELDS + } + reader.__dict__ = state + return reader + + +T = TypeVar("T") + + +class BaseWriter: + def write_worker(self, records: Sequence[Any]) -> T: + raise NotImplementedError() + + def write_main(self, fragments: Iterable[T]): + raise NotImplementedError() + + def finalize(self): + return None, 0 + + +class IterableReader(BaseReader): + DATA_FIELDS = ("data",) + + def __init__(self, data: Iterable): + self.data = data + + super().__init__() + + def read_main(self) -> Iterable[Tuple[Any, int]]: + return ((item, 1) for item in self.data) + + def read_worker(self, fragments): + return [task for task in fragments] + + +def from_iterable( + data: Iterable, + converter: Union[str, Callable] = None, + **kwargs, +) -> LazyCollection: + """ + The IterableReader (or `edsnlp.data.from_iterable`) reads a list of Python objects ( + texts, dictionaries, ...) and yields documents by passing them through the + `converter` if given, or returns them as is. + + Example + ------- + ```{ .python .no-check } + + import edsnlp + + nlp = edsnlp.blank("eds") + nlp.add_pipe(...) + doc_iterator = edsnlp.data.from_iterable([{...}], nlp=nlp, converter=...) + annotated_docs = nlp.pipe(doc_iterator) + ``` + + !!! note "Generator vs list" + + `edsnlp.data.from_iterable` returns a + [LazyCollection][edspdf.lazy_collection.LazyCollection]. + To iterate over the documents multiple times efficiently or to access them by + index, you must convert it to a list + + ```{ .python .no-check } + docs = list(edsnlp.data.from_iterable([{...}], converter=...) + ``` + + Parameters + ---------- + data: Iterable + The data to read + converter: Optional[Union[str, Callable]] + Converter to use to convert the JSON rows of the data source to Doc objects + kwargs: + Additional keyword arguments to pass to the converter. These are documented + on the [Data schemas](/data/schemas) page. + + Returns + ------- + LazyCollection + """ + data = LazyCollection(reader=IterableReader(data)) + if converter: + converter, kwargs = get_dict2doc_converter(converter, kwargs) + data = data.map(converter, kwargs=kwargs) + return data + + +def to_iterable( + data: Union[Any, LazyCollection], + converter: Optional[Union[str, Callable]] = None, + **kwargs, +): + """ + `edsnlp.data.to_items` returns an iterator of documents, as converted by the + `converter`. In comparison to just iterating over a LazyCollection, this will + also apply the `converter` to the documents, which can lower the data transfer + overhead when using multiprocessing. + + Example + ------- + ```{ .python .no-check } + + import edsnlp + + nlp = edsnlp.blank("eds") + nlp.add_pipe(...) + + doc = nlp("My document with entities") + + edsnlp.data.to_items([doc], converter="omop") + ``` + + Parameters + ---------- + data: Union[Any, LazyCollection], + The data to write (either a list of documents or a LazyCollection). + converter: Optional[Union[str, Callable]] + Converter to use to convert the documents to dictionary objects. + kwargs: + Additional keyword arguments passed to the converter. 
These are documented + on the [Data schemas](/data/schemas) page. + """ + data = LazyCollection.ensure_lazy(data) + if converter: + converter, kwargs = get_doc2dict_converter(converter, kwargs) + data = data.map(converter, kwargs=kwargs) + + return data diff --git a/edspdf/data/converters.py b/edspdf/data/converters.py new file mode 100644 index 00000000..397be290 --- /dev/null +++ b/edspdf/data/converters.py @@ -0,0 +1,88 @@ +""" +Converters are used to convert documents between python dictionaries and Doc objects. +There are two types of converters: readers and writers. Readers convert dictionaries to +Doc objects, and writers convert Doc objects to dictionaries. +""" +import inspect +from copy import copy +from types import FunctionType +from typing import ( + Any, + Callable, + Dict, + Optional, + Tuple, +) + +from confit.registry import ValidatedFunction + +FILENAME = "__FILENAME__" +CONTENT = "__CONTENT__" + +SCHEMA = {} + + +def validate_kwargs(converter, kwargs): + converter: FunctionType = copy(converter) + spec = inspect.getfullargspec(converter) + first = spec.args[0] + converter.__annotations__[first] = Optional[Any] + converter.__defaults__ = (None, *(spec.defaults or ())[-len(spec.args) + 1 :]) + vd = ValidatedFunction(converter, {"arbitrary_types_allowed": True}) + model = vd.init_model_instance(**kwargs) + d = { + k: v + for k, v in model._iter() + if (k in model.__fields__ or model.__fields__[k].default_factory) + } + d.pop("v__duplicate_kwargs", None) # see pydantic ValidatedFunction code + d.pop(vd.v_args_name, None) + d.pop(first, None) + return {**(d.pop(vd.v_kwargs_name, None) or {}), **d} + + +def get_dict2doc_converter(converter: Callable, kwargs) -> Tuple[Callable, Dict]: + # kwargs_to_init = False + # if not callable(converter): + # available = edspdf.registry.factory.get_available() + # try: + # filtered = [ + # name + # for name in available + # if converter == name or (converter in name and "dict2doc" in name) + # ] + # converter = edspdf.registry.factory.get(filtered[0]) + # converter = converter(**kwargs).instantiate(nlp=None) + # kwargs = {} + # return converter, kwargs + # except (KeyError, IndexError): + # available = [v for v in available if "dict2doc" in v] + # raise ValueError( + # f"Cannot find converter for format {converter}. " + # f"Available converters are {', '.join(available)}" + # ) + # if isinstance(converter, type) or kwargs_to_init: + # return converter(**kwargs), {} + return converter, validate_kwargs(converter, kwargs) + + +def get_doc2dict_converter(converter: Callable, kwargs) -> Tuple[Callable, Dict]: + # if not callable(converter): + # available = edspdf.registry.factory.get_available() + # try: + # filtered = [ + # name + # for name in available + # if converter == name or (converter in name and "doc2dict" in name) + # ] + # converter = edspdf.registry.factory.get(filtered[0]) + # converter = converter(**kwargs).instantiate(nlp=None) + # kwargs = {} + # return converter, kwargs + # except (KeyError, IndexError): + # available = [v for v in available if "doc2dict" in v] + # raise ValueError( + # f"Cannot find converter for format {converter}. 
" + # f"Available converters are {', '.join(available)}" + # ) + return converter, validate_kwargs(converter, kwargs) diff --git a/edspdf/data/files.py b/edspdf/data/files.py new file mode 100644 index 00000000..94cda772 --- /dev/null +++ b/edspdf/data/files.py @@ -0,0 +1,337 @@ +# ruff: noqa: F401 +import json +import os +from collections import Counter +from pathlib import Path +from typing import ( + Any, + Callable, + List, + Optional, + Union, +) + +import pyarrow +import pyarrow.fs +from fsspec import AbstractFileSystem +from fsspec.implementations.arrow import ArrowFSWrapper +from loguru import logger + +from edspdf import registry +from edspdf.data.base import BaseReader, BaseWriter +from edspdf.data.converters import ( + CONTENT, + FILENAME, + get_dict2doc_converter, + get_doc2dict_converter, +) +from edspdf.lazy_collection import LazyCollection +from edspdf.utils.collections import flatten + + +class FileReader(BaseReader): + DATA_FIELDS = () + + def __init__( + self, + path: Union[str, Path], + *, + keep_ipynb_checkpoints: bool = False, + load_annotations: bool = False, + filesystem: Optional[Any] = None, + ): + super().__init__() + + if filesystem is None or (isinstance(path, str) and "://" in path): + path = ( + path + if isinstance(path, Path) or "://" in path + else f"file://{os.path.abspath(path)}" + ) + inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(path) + filesystem = filesystem or inferred_fs + assert inferred_fs.type_name == filesystem.type_name, ( + f"Protocol {inferred_fs.type_name} in path does not match " + f"filesystem {filesystem.type_name}" + ) + path = fs_path + + self.path = path + self.filesystem = ( + ArrowFSWrapper(filesystem) + if isinstance(filesystem, pyarrow.fs.FileSystem) + else filesystem + ) + self.load_annotations = load_annotations + if not self.filesystem.exists(path): + raise FileNotFoundError(f"Path {path} does not exist") + + self.files: List[str] = [ + file + for file in self.filesystem.glob(os.path.join(str(self.path), "*.pdf")) + if (keep_ipynb_checkpoints or ".ipynb_checkpoints" not in str(file)) + and ( + not load_annotations + or self.filesystem.exists(str(path).replace(".pdf", ".json")) + ) + ] + assert len(self.files), f"No .pdf files found in the directory {path}" + logger.info(f"The directory contains {len(self.files)} .pdf files.") + + def read_main(self): + return ((f, 1) for f in self.files) + + def read_worker(self, fragment): + tasks = [] + for path in fragment: + with self.filesystem.open(str(path), "rb") as f: + content = f.read() + + json_path = str(path).replace(".pdf", ".json") + + record = {"content": content} + if self.load_annotations and self.filesystem.exists(json_path): + with self.filesystem.open(json_path) as f: + record["annotations"] = json.load(f) + + record[FILENAME] = str(os.path.relpath(path, self.path)).rsplit(".", 1)[0] + record["id"] = record[FILENAME] + tasks.append(record) + return tasks + + +class FileWriter(BaseWriter): + def __init__( + self, + path: Union[str, Path], + *, + overwrite: bool = False, + filesystem: Optional[AbstractFileSystem] = None, + ): + fs_path = path + if filesystem is None or (isinstance(path, str) and "://" in path): + path = ( + path + if isinstance(path, Path) or "://" in path + else f"file://{os.path.abspath(path)}" + ) + inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(path) + filesystem = filesystem or inferred_fs + assert inferred_fs.type_name == filesystem.type_name, ( + f"Protocol {inferred_fs.type_name} in path does not match " + f"filesystem 
{filesystem.type_name}" + ) + path = fs_path + + self.path = path + self.filesystem = ( + ArrowFSWrapper(filesystem) + if isinstance(filesystem, pyarrow.fs.FileSystem) + else filesystem + ) + self.filesystem.mkdirs(fs_path, exist_ok=True) + + if self.filesystem.exists(self.path): + suffixes = Counter(f.suffix for f in self.filesystem.listdir(self.path)) + unsafe_suffixes = { + s: v for s, v in suffixes.items() if s == ".pdf" or s == ".json" + } + if unsafe_suffixes and not overwrite: + raise FileExistsError( + f"Directory {self.path} already exists and appear to contain " + "annotations:" + + "".join(f"\n -{s}: {v} files" for s, v in unsafe_suffixes.items()) + + "\nUse overwrite=True to write files anyway." + ) + + self.filesystem.mkdirs(path, exist_ok=True) + + super().__init__() + + def write_worker(self, records): + # If write as jsonl, we will perform the actual writing in the `write` method + results = [] + for rec in flatten(records): + filename = str(rec.pop(FILENAME)) + path = os.path.join(self.path, f"{filename}.pdf") + parent_dir = filename.rsplit("/", 1)[0] + if parent_dir and not self.filesystem.exists(parent_dir): + self.filesystem.makedirs(parent_dir, exist_ok=True) + if CONTENT in rec: + content = rec.pop(CONTENT) + with self.filesystem.open(path, "wb") as f: + f.write(content) + ann_path = str(path).replace(".pdf", ".json") + + with self.filesystem.open(ann_path, "w") as f: + json.dump(rec, f) + + results.append(path) + return results, len(results) + + def write_main(self, fragments): + return list(flatten(fragments)) + + +# noinspection PyIncorrectDocstring +@registry.readers.register("files") +def read_files( + path: Union[str, Path], + *, + keep_ipynb_checkpoints: bool = False, + load_annotations: bool = False, + converter: Optional[Union[str, Callable]] = None, + filesystem: Optional[Any] = None, + **kwargs, +) -> LazyCollection: + """ + The BratReader (or `edspdf.data.read_files`) reads a directory of BRAT files and + yields documents. At the moment, only entities and attributes are loaded. Relations + and events are not supported. + + Example + ------- + ```{ .python .no-check } + + import edspdf + + nlp = edspdf.blank("eds") + nlp.add_pipe(...) + doc_iterator = edspdf.data.read_files("path/to/brat/directory") + annotated_docs = nlp.pipe(doc_iterator) + ``` + + !!! note "Generator vs list" + + `edspdf.data.read_files` returns a + [LazyCollection][edspdf.core.lazy_collection.LazyCollection]. + To iterate over the documents multiple times efficiently or to access them by + index, you must convert it to a list : + + ```{ .python .no-check } + docs = list(edspdf.data.read_files("path/to/brat/directory")) + ``` + + !!! warning "True/False attributes" + + Boolean values are not supported by the BRAT editor, and are stored as empty + (key: empty value) if true, and not stored otherwise. This means that False + values will not be assigned to attributes by default, which can be problematic + when deciding if an entity is negated or not : is the entity not negated, or + has the negation attribute not been annotated ? + + To avoid this issue, you can use the `bool_attributes` argument to specify + which attributes should be considered as boolean when reading a BRAT dataset. + These attributes will be assigned a value of `True` if they are present, and + `False` otherwise. 
+ + ```{ .python .no-check } + doc_iterator = edspdf.data.read_files( + "path/to/brat/directory", + # Mapping from 'BRAT attribute name' to 'Doc attribute name' + span_attributes={"Negation": "negated"}, + bool_attributes=["negated"], # Missing values will be set to False + ) + ``` + + Parameters + ---------- + path : Union[str, Path] + Path to the directory containing the BRAT files (will recursively look for + files in subdirectories). + nlp : Optional[PipelineProtocol] + The pipeline object (optional and likely not needed, prefer to use the + `tokenizer` directly argument instead). + keep_ipynb_checkpoints : bool + Whether to keep files in the `.ipynb_checkpoints` directories. + load_annotations : bool + Whether to load annotations from the `.json` files that share the same name as + the `.pdf` files. + converter : Optional[Union[str, Callable]] + Converter to use to convert the dictionary objects to documents. + filesystem: Optional[AbstractFileSystem] + The filesystem to use to write the files. If not set, the local filesystem + will be used. + + + Returns + ------- + LazyCollection + """ + data = LazyCollection( + reader=FileReader( + path, + keep_ipynb_checkpoints=keep_ipynb_checkpoints, + load_annotations=load_annotations, + filesystem=filesystem, + ) + ) + if converter: + converter, kwargs = get_dict2doc_converter(converter, kwargs) + data = data.map(converter, kwargs=kwargs) + return data + + +@registry.writers.register("files") +def write_files( + data: Union[Any, LazyCollection], + path: Union[str, Path], + *, + overwrite: bool = False, + converter: Union[str, Callable], + filesystem: Optional[AbstractFileSystem] = None, + **kwargs, +) -> None: + """ + `edspdf.data.write_files` writes a list of documents using the BRAT/File + format in a directory. The BRAT files will be named after the `note_id` attribute of + the documents, and subdirectories will be created if the name contains `/` + characters. + + Example + ------- + ```{ .python .no-check } + + import edspdf + + nlp = edspdf.blank("eds") + nlp.add_pipe(...) + + doc = nlp("My document with entities") + + edspdf.data.write_files([doc], "path/to/brat/directory") + ``` + + !!! warning "Overwriting files" + + By default, `write_files` will raise an error if the directory already exists + and contains files with `.json` or `.pdf` suffixes. This is to avoid overwriting + existing annotations. To allow overwriting existing files, use `overwrite=True`. + + Parameters + ---------- + data: Union[Any, LazyCollection], + The data to write (either a list of documents or a LazyCollection). + path: Union[str, Path] + Path to the directory containing the BRAT files (will recursively look for + files in subdirectories). + overwrite: bool + Whether to overwrite existing directories. + converter: Optional[Union[str, Callable]] + Converter to use to convert the documents to dictionary objects. + filesystem: Optional[AbstractFileSystem] + The filesystem to use to write the files. If not set, the local filesystem + will be used. 
+ """ + data = LazyCollection.ensure_lazy(data) + if converter: + converter, kwargs = get_doc2dict_converter(converter, kwargs) + data = data.map(converter, kwargs=kwargs) + + return data.write( + FileWriter( + path=path, + filesystem=filesystem, + overwrite=overwrite, + ) + ) diff --git a/edspdf/data/pandas.py b/edspdf/data/pandas.py new file mode 100644 index 00000000..c97fedea --- /dev/null +++ b/edspdf/data/pandas.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +from typing import Any, Callable, Iterable, Optional, Tuple, Union + +import pandas as pd + +from edspdf import registry +from edspdf.data.base import BaseReader, BaseWriter +from edspdf.data.converters import ( + FILENAME, + get_dict2doc_converter, + get_doc2dict_converter, +) +from edspdf.lazy_collection import LazyCollection +from edspdf.utils.collections import dl_to_ld, flatten, ld_to_dl + + +class PandasReader(BaseReader): + DATA_FIELDS = ("data",) + + def __init__( + self, + data: pd.DataFrame, + **kwargs, + ): + assert isinstance(data, pd.DataFrame) + self.data = data + + super().__init__(**kwargs) + + def read_main(self) -> Iterable[Tuple[Any, int]]: + return ((item, 1) for item in dl_to_ld(dict(self.data))) + + def read_worker(self, fragments): + return [task for task in fragments] + + +@registry.readers.register("pandas") +def from_pandas( + data, + converter: Union[str, Callable], + **kwargs, +) -> LazyCollection: + """ + The PandasReader (or `edspdf.data.from_pandas`) handles reading from a table and + yields documents. At the moment, only entities and attributes are loaded. Relations + and events are not supported. + + Example + ------- + ```{ .python .no-check } + + import edspdf + + nlp = edspdf.blank("eds") + nlp.add_pipe(...) + doc_iterator = edspdf.data.from_pandas(df, nlp=nlp, converter="omop") + annotated_docs = nlp.pipe(doc_iterator) + ``` + + !!! note "Generator vs list" + + `edspdf.data.from_pandas` returns a + [LazyCollection][edspdf.core.lazy_collection.LazyCollection]. + To iterate over the documents multiple times efficiently or to access them by + index, you must convert it to a list + + ```{ .python .no-check } + docs = list(edspdf.data.from_pandas(df, converter="omop")) + ``` + + Parameters + ---------- + data: pd.DataFrame + Pandas object + converter: Optional[Union[str, Callable]] + Converter to use to convert the rows of the DataFrame to Doc objects + kwargs: + Additional keyword arguments passed to the converter. These are documented + on the [Data schemas](/data/schemas) page. 
+ + Returns + ------- + LazyCollection + """ + + data = LazyCollection(reader=PandasReader(data)) + if converter: + converter, kwargs = get_dict2doc_converter(converter, kwargs) + data = data.map(converter, kwargs=kwargs) + return data + + +class PandasWriter(BaseWriter): + def __init__(self, dtypes: Optional[dict] = None): + self.dtypes = dtypes + + def write_worker(self, records): + # If write as jsonl, we will perform the actual writing in the `write` method + for rec in records: + if isinstance(rec, dict): + rec.pop(FILENAME, None) + return records, len(records) + + def write_main(self, fragments): + import pandas as pd + + columns = ld_to_dl(flatten(fragments)) + res = pd.DataFrame(columns) + return res.astype(self.dtypes) if self.dtypes else res + + +@registry.writers.register("pandas") +def to_pandas( + data: Union[Any, LazyCollection], + converter: Optional[Union[str, Callable]], + dtypes: Optional[dict] = None, + **kwargs, +) -> pd.DataFrame: + """ + `edspdf.data.to_pandas` writes a list of documents as a pandas table. + + Example + ------- + ```{ .python .no-check } + + import edspdf + + nlp = edspdf.blank("eds") + nlp.add_pipe(...) + + doc = nlp("My document with entities") + + edspdf.data.to_pandas([doc], converter="omop") + ``` + + Parameters + ---------- + data: Union[Any, LazyCollection], + The data to write (either a list of documents or a LazyCollection). + converter: Optional[Union[str, Callable]] + Converter to use to convert the documents to dictionary objects before storing + them in the dataframe. + dtypes: Optional[dict] + Dictionary of column names to dtypes. This is passed to `pd.DataFrame.astype`. + kwargs: + Additional keyword arguments passed to the converter. These are documented + on the [Data schemas](/data/schemas) page. + """ + data = LazyCollection.ensure_lazy(data) + if converter: + converter, kwargs = get_doc2dict_converter(converter, kwargs) + data = data.map(converter, kwargs=kwargs) + + return data.write(PandasWriter(dtypes)) diff --git a/edspdf/data/parquet.py b/edspdf/data/parquet.py new file mode 100644 index 00000000..813ff984 --- /dev/null +++ b/edspdf/data/parquet.py @@ -0,0 +1,226 @@ +import os +from itertools import chain +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Optional, TypeVar, Union + +import pyarrow.dataset +import pyarrow.fs +import pyarrow.parquet +from pyarrow.dataset import ParquetFileFragment + +from edspdf.data.base import BaseReader, BaseWriter +from edspdf.data.converters import ( + FILENAME, + get_dict2doc_converter, + get_doc2dict_converter, +) +from edspdf.lazy_collection import LazyCollection +from edspdf.structures import PDFDoc, registry +from edspdf.utils.collections import dl_to_ld, flatten, ld_to_dl + + +class ParquetReader(BaseReader): + DATA_FIELDS = ("dataset",) + + def __init__( + self, + path: Union[str, Path], + *, + read_in_worker: bool, + filesystem: Optional[pyarrow.fs.FileSystem] = None, + ): + super().__init__() + # Either the filesystem has not been passed + # or the path is a URL (e.g. 
s3://) => we need to infer the filesystem + fs_path = path + if filesystem is None or (isinstance(path, str) and "://" in path): + path = ( + path + if isinstance(path, Path) or "://" in path + else f"file://{os.path.abspath(path)}" + ) + inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(path) + filesystem = filesystem or inferred_fs + assert inferred_fs.type_name == filesystem.type_name, ( + f"Protocol {inferred_fs.type_name} in path does not match " + f"filesystem {filesystem.type_name}" + ) + self.read_in_worker = read_in_worker + self.dataset = pyarrow.dataset.dataset( + fs_path, format="parquet", filesystem=filesystem + ) + + def read_main(self): + fragments: List[ParquetFileFragment] = self.dataset.get_fragments() + if self.read_in_worker: + # read in worker -> each task is a file to read from + return ((f, f.metadata.num_rows) for f in fragments) + else: + # read in worker -> each task is a non yet parsed line + return ( + (line, 1) + for f in fragments + for batch in f.to_table().to_batches(1024) + for line in dl_to_ld(batch.to_pydict()) + ) + + def read_worker(self, tasks): + if self.read_in_worker: + tasks = list( + chain.from_iterable( + dl_to_ld(batch.to_pydict()) + for task in tasks + for batch in task.to_table().to_batches(1024) + ) + ) + return tasks + + +T = TypeVar("T") + + +class ParquetWriter(BaseWriter): + def __init__( + self, + path: Union[str, Path], + num_rows_per_file: int, + overwrite: bool, + write_in_worker: bool, + accumulate: bool = True, + filesystem: Optional[pyarrow.fs.FileSystem] = None, + ): + super().__init__() + fs_path = path + if filesystem is None or (isinstance(path, str) and "://" in path): + path = ( + path + if isinstance(path, Path) or "://" in path + else f"file://{os.path.abspath(path)}" + ) + inferred_fs, fs_path = pyarrow.fs.FileSystem.from_uri(path) + filesystem = filesystem or inferred_fs + assert inferred_fs.type_name == filesystem.type_name, ( + f"Protocol {inferred_fs.type_name} in path does not match " + f"filesystem {filesystem.type_name}" + ) + path = fs_path + # Check that filesystem has the same protocol as indicated by path + filesystem.create_dir(fs_path, recursive=True) + if overwrite is False: + dataset = pyarrow.dataset.dataset( + fs_path, format="parquet", filesystem=filesystem + ) + if len(list(dataset.get_fragments())): + raise FileExistsError( + f"Directory {fs_path} already exists and is not empty. " + "Use overwrite=True to overwrite." + ) + self.filesystem = filesystem + self.path = path + self.write_in_worker = write_in_worker + self.batch = [] + self.num_rows_per_file = num_rows_per_file + self.closed = False + self.finalized = False + self.accumulate = accumulate + if not self.accumulate: + self.finalize = super().finalize + + def write_worker(self, records, last=False): + # Results will contain a batches of samples ready to be written (or None if + # write_in_worker is True) and they have already been written. 
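+        # Records are accumulated in self.batch until it holds num_rows_per_file rows;
+        # full batches are then converted to pyarrow Tables and either written here
+        # (write_in_worker=True) or returned to be written in the main process.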
+ results = [] + count = 0 + + for rec in records: + if isinstance(rec, dict): + rec.pop(FILENAME, None) + + # While there is something to write + greedy = last or not self.accumulate + while len(records) or greedy and len(self.batch): + n_to_fill = self.num_rows_per_file - len(self.batch) + self.batch.extend(records[:n_to_fill]) + records = records[n_to_fill:] + if greedy or len(self.batch) >= self.num_rows_per_file: + fragment = pyarrow.Table.from_pydict(ld_to_dl(flatten(self.batch))) + count += len(self.batch) + self.batch = [] + if self.write_in_worker: + pyarrow.parquet.write_to_dataset( + table=fragment, + root_path=self.path, + filesystem=self.filesystem, + ) + fragment = None + results.append(fragment) + return results, count + + def finalize(self): + if not self.finalized: + self.finalized = True + return self.write_worker([], last=True) + + def write_main(self, fragments: Iterable[List[Union[pyarrow.Table, Path]]]): + for table in flatten(fragments): + if not self.write_in_worker: + pyarrow.parquet.write_to_dataset( + table=table, + root_path=self.path, + filesystem=self.filesystem, + ) + return pyarrow.dataset.dataset( + self.path, format="parquet", filesystem=self.filesystem + ) + + +@registry.readers.register("parquet") +def read_parquet( + path: Union[str, Path], + converter: Union[str, Callable], + *, + read_in_worker: bool = False, + filesystem: Optional[pyarrow.fs.FileSystem] = None, + **kwargs, +) -> LazyCollection: + data = LazyCollection( + reader=ParquetReader( + path, + read_in_worker=read_in_worker, + filesystem=filesystem, + ) + ) + if converter: + converter, kwargs = get_dict2doc_converter(converter, kwargs) + data = data.map(converter, kwargs=kwargs) + return data + + +@registry.writers.register("parquet") +def write_parquet( + data: Union[Any, LazyCollection], + path: Union[str, Path], + *, + write_in_worker: bool = False, + num_rows_per_file: int = 1024, + overwrite: bool = False, + filesystem: Optional[pyarrow.fs.FileSystem] = None, + accumulate: bool = True, + converter: Optional[Union[str, Callable[[PDFDoc], Dict]]], + **kwargs, +) -> None: + data = LazyCollection.ensure_lazy(data) + if converter: + converter, kwargs = get_doc2dict_converter(converter, kwargs) + data = data.map(converter, kwargs=kwargs) + + return data.write( + ParquetWriter( + path, + num_rows_per_file=num_rows_per_file, + overwrite=overwrite, + write_in_worker=write_in_worker, + accumulate=accumulate, + filesystem=filesystem, + ) + ) diff --git a/edspdf/lazy_collection.py b/edspdf/lazy_collection.py new file mode 100644 index 00000000..e3b60b9f --- /dev/null +++ b/edspdf/lazy_collection.py @@ -0,0 +1,345 @@ +from __future__ import annotations + +import contextlib +from functools import wraps +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Container, + Dict, + Iterable, + List, + Optional, + Tuple, +) + +from typing_extensions import Literal + +import edspdf.data + +if TYPE_CHECKING: + import torch + + from edspdf import Pipeline + from edspdf.data.base import BaseReader, BaseWriter + from edspdf.trainable_pipe import TrainablePipe + +INFER = type("INFER", (), {"__repr__": lambda self: "INFER"})() + + +def with_non_default_args(fn: Callable) -> Callable: + @wraps(fn) + def wrapper(self, **kwargs): + return fn(self, **kwargs, _non_default_args=kwargs.keys()) + + return wrapper + + +class MetaLazyCollection(type): + def __getattr__(self, item): + if item in edspdf.data.__all__: + fn = getattr(edspdf.data, item) + setattr(self, item, fn) + return fn + raise AttributeError(item) + + 
def __dir__(self): # pragma: no cover + return (*super().__dir__(), *edspdf.data.__all__) + + +class LazyCollection(metaclass=MetaLazyCollection): + def __init__( + self, + reader: Optional[BaseReader] = None, + writer: Optional[BaseWriter] = None, + pipeline: List[Any] = [], + config={}, + ): + self.reader = reader + self.writer = writer + self.pipeline: List[Tuple[str, Callable, Dict]] = pipeline + self.config = config + + @property + def batch_size(self): + return self.config.get("batch_size", 1) + + @property + def batch_by(self): + return self.config.get("batch_by", "docs") + + @property + def sort_chunks(self): + return self.config.get("sort_chunks", False) + + @property + def split_into_batches_after(self): + return self.config.get("split_into_batches_after") + + @property + def chunk_size(self): + return self.config.get("chunk_size", self.config.get("batch_size", 128)) + + @property + def disable_implicit_parallelism(self): + return self.config.get("disable_implicit_parallelism", True) + + @property + def num_cpu_workers(self): + return self.config.get("num_cpu_workers") + + @property + def num_gpu_workers(self): + return self.config.get("num_gpu_workers") + + @property + def gpu_pipe_names(self): + return self.config.get("gpu_pipe_names") + + @property + def gpu_worker_devices(self): + return self.config.get("gpu_worker_devices") + + @property + def cpu_worker_devices(self): + return self.config.get("cpu_worker_devices") + + @property + def backend(self): + return self.config.get("backend") + + @property + def show_progress(self): + return self.config.get("show_progress") + + @property + def process_start_method(self): + return self.config.get("process_start_method") + + @with_non_default_args + def set_processing( + self, + batch_size: int = 1, + batch_by: Literal["docs", "pages", "content_boxes"] = "docs", + chunk_size: int = INFER, + sort_chunks: bool = False, + split_into_batches_after: str = INFER, + num_cpu_workers: Optional[int] = INFER, + num_gpu_workers: Optional[int] = INFER, + disable_implicit_parallelism: bool = True, + backend: Optional[Literal["simple", "multiprocessing"]] = INFER, + gpu_pipe_names: Optional[List[str]] = INFER, + show_progress: bool = False, + process_start_method: Optional[Literal["fork", "spawn"]] = INFER, + gpu_worker_devices: Optional[List[str]] = INFER, + cpu_worker_devices: Optional[List[str]] = INFER, + _non_default_args: Iterable[str] = (), + ) -> "LazyCollection": + """ + Parameters + ---------- + batch_size: int + Number of documents to process at a time in a GPU worker (or in the + main process if no workers are used). + batch_by: Literal["docs", "pages", "content_boxes"] + How to compute the batch size: + + - "docs" (default) is the number of documents. + - "pages" is the total number of pages in the documents. + - "content_boxes" is the total number of content boxes in the documents + chunk_size: int + Number of documents to build before splitting into batches. Only used + with "simple" and "multiprocessing" backends. This is also the number of + documents that will be passed through the first components of the pipeline + until a GPU worker is used (then the chunks will be split according to the + `batch_size` and `batch_by` arguments). + + By default, the chunk size is equal to the batch size, or 128 if the batch + size is not set. + sort_chunks: bool + Whether to sort the documents by size before splitting into batches. + split_into_batches_after: str + The name of the component after which to split the documents into batches. 
+            Only used with "simple" and "multiprocessing" backends.
+            By default, the documents are split into batches as soon as the inputs
+            are converted into PDFDoc objects.
+        num_cpu_workers: int
+            Number of CPU workers. A CPU worker handles the non deep-learning components
+            and the preprocessing, collating and postprocessing of deep-learning
+            components. If no GPU workers are used, the CPU workers also handle the
+            forward call of the deep-learning components.
+        num_gpu_workers: Optional[int]
+            Number of GPU workers. A GPU worker handles the forward call of the
+            deep-learning components. Only used with "multiprocessing" backend.
+        disable_implicit_parallelism: bool
+            Whether to disable OpenMP and Huggingface tokenizers implicit parallelism in
+            multiprocessing mode. Defaults to True.
+        gpu_pipe_names: Optional[List[str]]
+            List of pipe names to accelerate on a GPUWorker, defaults to all pipes
+            that inherit from TrainablePipe. Only used with "multiprocessing" backend.
+            Inferred from the pipeline if not set.
+        backend: Optional[Literal["simple", "multiprocessing", "spark"]]
+            The backend to use for parallel processing. If not set, the backend is
+            automatically selected based on the input data and the number of workers.
+
+            - "simple" is the default backend and is used when `num_cpu_workers` is 1
+              and `num_gpu_workers` is 0.
+            - "multiprocessing" is used when `num_cpu_workers` is greater than 1 or
+              `num_gpu_workers` is greater than 0.
+            - "spark" is used when the input data is a Spark dataframe and the output
+              writer is a Spark writer.
+        show_progress: Optional[bool]
+            Whether to show progress bars (only applicable with "simple" and
+            "multiprocessing" backends).
+        process_start_method: Optional[Literal["fork", "spawn"]]
+            Whether to use "fork" or "spawn" as the start method for the multiprocessing
+            backend. The default is "fork" on Unix systems and "spawn" on Windows.
+
+            - "fork" is the default start method on Unix systems and is the fastest
+              start method, but it is not available on Windows, can cause issues
+              with CUDA and is not safe when using multiple threads.
+            - "spawn" is the default start method on Windows and is the safest start
+              method, but it is slower than "fork" since the worker processes must
+              re-import their modules.
+        gpu_worker_devices: Optional[List[str]]
+            List of GPU devices to use for the GPU workers. Defaults to all available
+            devices, one worker per device. Only used with "multiprocessing" backend.
+        cpu_worker_devices: Optional[List[str]]
+            List of GPU devices to use for the CPU workers. Used for debugging purposes.
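+
+        Example
+        -------
+        A minimal sketch, where `model` is an existing pipeline and the paths and
+        converter are placeholders:
+
+        ```{ .python .no-check }
+        docs = edspdf.data.read_files("path/to/files/directory")
+        docs = model.pipe(docs).set_processing(
+            batch_size=4,
+            num_gpu_workers=1,
+            show_progress=True,
+        )
+        edspdf.data.write_parquet(docs, "path/to/output", converter=...)
+        ```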
+ + Returns + ------- + LazyCollection + """ + kwargs = {k: v for k, v in locals().items() if k in _non_default_args} + return LazyCollection( + reader=self.reader, + writer=self.writer, + pipeline=self.pipeline, + config={ + **self.config, + **{k: v for k, v in kwargs.items() if v is not INFER}, + }, + ) + + @classmethod + def ensure_lazy(cls, data): + from edspdf.data.base import IterableReader + + if isinstance(data, cls): + return data + return cls(reader=IterableReader(data)) + + def map(self, pipe, name: Optional[str] = None, kwargs={}) -> "LazyCollection": + return LazyCollection( + reader=self.reader, + writer=self.writer, + pipeline=[*self.pipeline, (name, pipe, kwargs)], + config=self.config, + ) + + def map_pipeline(self, model: Pipeline) -> "LazyCollection": + new_steps = [] + for name, pipe, kwargs in self.pipeline: + new_steps.append((name, pipe, kwargs)) + + new_steps.append((None, model.ensure_doc, {})) + + for name, pipe in model.pipeline: + if name not in model._disabled: + new_steps.append((name, pipe, {})) + config = ( + {**self.config, "batch_size": model.batch_size} + if self.batch_size is None + else self.config + ) + return LazyCollection( + reader=self.reader, + writer=self.writer, + pipeline=new_steps, + config=config, + ) + + def write(self, writer: BaseWriter, execute: bool = True) -> Any: + lc = LazyCollection( + reader=self.reader, + writer=writer, + pipeline=self.pipeline, + config=self.config, + ) + return lc.execute() if execute else lc + + def execute(self): + import edspdf.processing + + backend = self.backend + if backend is None: + if ( + self.num_cpu_workers is not None + and self.num_cpu_workers > 1 + or self.num_gpu_workers is not None + and self.num_gpu_workers > 0 + ): + backend = "multiprocessing" + else: + backend = "simple" + execute = getattr(edspdf.processing, f"execute_{backend}_backend") + return execute(self) + + def __iter__(self): + return iter(self.execute()) + + @contextlib.contextmanager + def cache(self): + for name, pipe, *_ in self.pipeline: + if hasattr(pipe, "enable_cache"): + pipe.enable_cache() + yield + for name, pipe, *_ in self.pipeline: + if hasattr(pipe, "disable_cache"): + pipe.disable_cache() + + def torch_components( + self, disable: Container[str] = () + ) -> Iterable[Tuple[str, "TrainablePipe"]]: + """ + Yields components that are PyTorch modules. + + Parameters + ---------- + disable: Container[str] + The names of disabled components, which will be skipped. 
+ + Returns + ------- + Iterable[Tuple[str, TrainablePipe]] + """ + for name, pipe, *_ in self.pipeline: + if name not in disable and hasattr(pipe, "forward"): + yield name, pipe + + def to(self, device: Union[str, Optional["torch.device"]] = None): # noqa F821 + """Moves the pipeline to a given device""" + for name, pipe, *_ in self.torch_components(): + pipe.to(device) + return self + + def worker_copy(self): + return LazyCollection( + reader=self.reader.worker_copy(), + writer=self.writer, + pipeline=self.pipeline, + config=self.config, + ) + + def __dir__(self): # pragma: no cover + return (*super().__dir__(), *edspdf.data.__all__) + + def __getattr__(self, item): + return getattr(LazyCollection, item).__get__(self) + + +if TYPE_CHECKING: + # just to add read/from_* and write/to_* methods to the static type hints + LazyCollection = edspdf.data # noqa: F811 diff --git a/edspdf/pipeline.py b/edspdf/pipeline.py index 05e1d226..83ecec1f 100644 --- a/edspdf/pipeline.py +++ b/edspdf/pipeline.py @@ -1,13 +1,15 @@ import functools +import importlib +import inspect import json import os import shutil import warnings -from collections import defaultdict from contextlib import contextmanager from enum import Enum from pathlib import Path from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -24,14 +26,13 @@ from confit import Config from confit.errors import ConfitValidationError, patch_errors -from confit.utils.collections import join_path, split_path from confit.utils.xjson import Reference -from pydantic import parse_obj_as from typing_extensions import Literal import edspdf -from .accelerators.base import Accelerator, FromDoc, ToDoc +from .data.converters import FILENAME +from .lazy_collection import LazyCollection from .registry import CurriedFactory, registry from .structures import PDFDoc from .utils.collections import ( @@ -41,6 +42,9 @@ multi_tee, ) +if TYPE_CHECKING: + import torch + EMPTY_LIST = FrozenList() @@ -210,17 +214,16 @@ def add_pipe( pipe = factory if hasattr(pipe, "name"): if name is not None and name != pipe.name: - raise ValueError( + warnings.warn( "The provided name does not match the name of the component." ) + pipe.name = name else: name = pipe.name - else: - if name is None: - raise ValueError( - "The component does not have a name, so you must provide one", - ) - pipe.name = name + if name is None: + raise ValueError( + "The component does not have a name, so you must provide one", + ) assert sum([before is not None, after is not None, first]) <= 1, ( "You can only use one of before, after, or first", ) @@ -236,6 +239,18 @@ def add_pipe( self._components.insert(insertion_idx, (name, pipe)) return pipe + def ensure_doc(self, doc): + return ( + doc + if isinstance(doc, PDFDoc) + else PDFDoc(content=doc) + if isinstance(doc, bytes) + else PDFDoc( + content=doc["content"], + id=doc.get("id") or doc.get(FILENAME), + ) + ) + def __call__(self, doc: Any) -> PDFDoc: """ Apply each component successively on a document. @@ -265,20 +280,20 @@ def __call__(self, doc: Any) -> PDFDoc: def pipe( self, - inputs: Any, + inputs: Union[LazyCollection, Iterable], batch_size: Optional[int] = None, *, - accelerator: Optional[Union[str, Accelerator]] = None, - to_doc: Optional[ToDoc] = None, - from_doc: FromDoc = lambda doc: doc, - ) -> Iterable[PDFDoc]: + accelerator: Any = None, + to_doc: Any = None, + from_doc: Any = None, + ) -> LazyCollection: """ Process a stream of documents by applying each component successively on batches of documents. 
Parameters ---------- - inputs: Iterable[Union[str, PDFDoc]] + inputs: Union[LazyCollection, Iterable] The inputs to create the PDFDocs from, or the PDFDocs directly. batch_size: Optional[int] The batch size to use. If not provided, the batch size of the pipeline @@ -296,43 +311,102 @@ def pipe( Returns ------- - Iterable[PDFDoc] + LazyCollection """ if batch_size is None: batch_size = self.batch_size - if accelerator is None: - accelerator = "simple" - if isinstance(accelerator, str): - accelerator = {"@accelerator": accelerator, "batch_size": batch_size} - if isinstance(accelerator, dict): - accelerator = Config(accelerator).resolve(registry=registry) - - kwargs = { - "inputs": inputs, - "model": self, - "to_doc": parse_obj_as(Optional[ToDoc], to_doc), - "from_doc": parse_obj_as(Optional[FromDoc], from_doc), - } - for k, v in list(kwargs.items()): - if v is None: - del kwargs[k] - - with self.train(False): - return accelerator(**kwargs) + lazy_collection = LazyCollection.ensure_lazy(inputs) + + if to_doc is not None: + warnings.warn( + "The `to_doc` argument is deprecated. " + "Please use the returned value's `map` method or the read/from_{} " + "method's converter argument instead.", + DeprecationWarning, + ) + if isinstance(to_doc, str): + to_doc = {"content_field": to_doc} + if isinstance(to_doc, dict): + to_doc_dict = to_doc + + def to_doc(doc): + return PDFDoc( + content=doc[to_doc_dict["content_field"]], + id=doc[to_doc_dict["id_field"]] + if "id_field" in to_doc_dict + else None, + ) + + if not callable(to_doc): + raise ValueError( + "The `to_doc` argument must be a callable or a dictionary", + ) + lazy_collection = lazy_collection.map(to_doc) + + lazy_collection = lazy_collection.map_pipeline(self).set_processing( + batch_size=batch_size + ) + + if accelerator is not None: + warnings.warn( + "The `accelerator` argument is deprecated. " + "Please use the returned value's `set_processing` method instead.", + DeprecationWarning, + ) + if isinstance(accelerator, str): + kwargs = {} + backend = accelerator + elif isinstance(accelerator, dict): + kwargs = dict(accelerator) + backend = kwargs.pop("@accelerator", "simple") + elif "Accelerator" in type(accelerator).__name__: + backend = ( + "multiprocessing" + if "Multiprocessing" in type(accelerator).__name__ + else "simple" + ) + kwargs = accelerator.__dict__ + lazy_collection.set_processing( + backend=backend, + **kwargs, + ) + if from_doc is not None: + warnings.warn( + "The `from_doc` argument is deprecated. 
" + "Please use the returned value's `map` method or the write/to_{} " + "method's converter argument instead.", + DeprecationWarning, + ) + if isinstance(from_doc, dict): + from_doc_dict = from_doc + + def from_doc(doc): + return {k: getattr(doc, v) for k, v in from_doc_dict.items()} + + if not callable(from_doc): + raise ValueError( + "The `from_doc` argument must be a callable or a dictionary", + ) + lazy_collection = lazy_collection.map(from_doc) + + return lazy_collection @contextmanager def cache(self): """ Enable caching for all (trainable) components in the pipeline """ - was_not_cached = self._cache is None - if was_not_cached: - self._cache = {} + to_disable = set() + for name, pipe in self.trainable_pipes(): + if getattr(pipe, "_current_cache_id", None) is None: + pipe.enable_cache() + to_disable.add(name) yield - if was_not_cached: - self._cache = None + for name, pipe in self.trainable_pipes(): + if name in to_disable: + pipe.disable_cache() def trainable_pipes( self, disable: Sequence[str] = () @@ -353,7 +427,7 @@ def trainable_pipes( if name not in disable and hasattr(pipe, "batch_process"): yield name, pipe - def post_init(self, gold_data: Iterable[PDFDoc], exclude: Optional[set] = None): + def post_init(self, gold_data: Iterable[PDFDoc], exclude: Optional[Set] = None): """ Completes the initialization of the pipeline by calling the post_init method of all components that have one. @@ -365,7 +439,7 @@ def post_init(self, gold_data: Iterable[PDFDoc], exclude: Optional[set] = None): gold_data: Iterable[PDFDoc] The documents to use for initialization. Each component will not necessarily see all the data. - exclude: Optional[set] + exclude: Optional[Set] The names of components to exclude from initialization. This argument will be gradually updated with the names of initialized components @@ -580,7 +654,7 @@ def collate( for name, component in self.pipeline: if name in batch: component_inputs = batch[name] - batch[name] = component.collate(component_inputs, device) + batch[name] = component.collate(component_inputs) return batch def parameters(self): @@ -600,7 +674,7 @@ def named_parameters(self): seen.add(param) yield f"{name}.{param_name}", param - def to(self, device: Optional["torch.device"] = None): # noqa F821 + def to(self, device: Union[str, Optional["torch.device"]] = None): # noqa F821 """Moves the pipeline to a given device""" for name, component in self.trainable_pipes(): component.to(device) @@ -630,7 +704,7 @@ def __exit__(ctx_self, type, value, traceback): return context() - def save( + def to_disk( self, path: Union[str, Path], *, exclude: Optional[Set[str]] = None ) -> None: """ @@ -648,30 +722,6 @@ def save( process. 
This list will be gradually filled in place as components are saved """ - - def save_tensors(path: Path): - import safetensors.torch - - shutil.rmtree(path, ignore_errors=True) - os.makedirs(path, exist_ok=True) - tensors = defaultdict(list) - tensor_to_group = defaultdict(list) - for pipe_name, pipe in self.trainable_pipes(disable=exclude): - for key, tensor in pipe.state_dict(keep_vars=True).items(): - full_key = join_path((pipe_name, *split_path(key))) - tensors[tensor].append(full_key) - tensor_to_group[tensor].append(pipe_name) - group_to_tensors = defaultdict(set) - for tensor, group in tensor_to_group.items(): - group_to_tensors["+".join(sorted(set(group)))].add(tensor) - for group, group_tensors in group_to_tensors.items(): - sub_path = path / f"{group}.safetensors" - tensor_dict = { - "+".join(tensors[p]): p - for p in {p.data_ptr(): p for p in group_tensors}.values() - } - safetensors.torch.save_file(tensor_dict, sub_path) - exclude = set() if exclude is None else exclude path = Path(path) if isinstance(path, str) else path @@ -690,18 +740,29 @@ def save_tensors(path: Path): if "config" not in exclude: self.config.to_disk(path / "config.cfg") - extra_exclude = set(exclude) - for pipe_name, pipe in self._components: - if hasattr(pipe, "save_extra_data") and pipe_name not in extra_exclude: - pipe.save_extra_data(path / pipe_name, exclude=extra_exclude) - if "tensors" not in exclude: - save_tensors(path / "tensors") + pwd = os.getcwd() + overrides = {"components": {}} + try: + os.chdir(path) + for pipe_name, pipe in self._components: + if hasattr(pipe, "to_disk") and pipe_name not in exclude: + pipe_overrides = pipe.to_disk(Path(pipe_name), exclude=exclude) + overrides["components"][pipe_name] = pipe_overrides + finally: + os.chdir(pwd) + + config = self.config.merge(overrides) + + if "config" not in exclude: + config.to_disk(path / "config.cfg") + + save = to_disk - def load_state_from_disk( + def from_disk( self, path: Union[str, Path], *, - exclude: Set[str] = None, + exclude: Optional[Union[str, Sequence[str]]] = None, device: Optional[Union[str, "torch.device"]] = "cpu", # noqa F821 ) -> "Pipeline": """ @@ -711,77 +772,47 @@ def load_state_from_disk( ---------- path: Union[str, Path] The path to the directory to load the pipeline from - exclude: Set[str] + exclude: Optional[Union[str, Sequence[str]]] The names of the components, or attributes to exclude from the loading - process. This list will be gradually filled in place as components are - loaded + process. 
+ device: Optional[Union[str, "torch.device"]] + Device to use when loading the tensors """ def deserialize_meta(path: Path) -> None: if path.exists(): - data = json.loads(path.read_text()) + with open(path, "r") as f: + data = json.load(f) self.meta.update(data) + # self.meta always overrides meta["vectors"] with the metadata + # from self.vocab.vectors, so set the name directly + + exclude = ( + set() + if exclude is None + else {exclude} + if isinstance(exclude, str) + else set(exclude) + ) - def deserialize_tensors(path: Path): - import safetensors.torch - - trainable_components = dict(self.trainable_pipes()) - for file_name in path.iterdir(): - pipe_names = file_name.stem.split("+") - if any(pipe_name in trainable_components for pipe_name in pipe_names): - # We only load tensors in one of the pipes since parameters - # are expected to be shared - pipe = trainable_components[pipe_names[0]] - tensor_dict = {} - for keys, tensor in safetensors.torch.load_file( - file_name, device=device - ).items(): - split_keys = [split_path(key) for key in keys.split("+")] - key = next(key for key in split_keys if key[0] == pipe_names[0]) - tensor_dict[join_path(key[1:])] = tensor - # Non-strict because tensors of a given pipeline can be shared - # between multiple files - print(f"Loading tensors of {pipe_names[0]} from {file_name}") - extra_tensors = set(tensor_dict) - set( - pipe.state_dict(keep_vars=True).keys() - ) - if extra_tensors: - warnings.warn( - f"{file_name} contains tensors that are not in the state" - f"dict of {pipe_names[0]}: {sorted(extra_tensors)}" - ) - pipe.load_state_dict(tensor_dict, strict=False) - - exclude = set() if exclude is None else exclude - + path = (Path(path) if isinstance(path, str) else path).absolute() if "meta" not in exclude: deserialize_meta(path / "meta.json") - extra_exclude = set(exclude) - for name, proc in self._components: - if hasattr(proc, "load_extra_data") and name not in extra_exclude: - proc.load_extra_data(path / name, extra_exclude) - - if "tensors" not in exclude: - deserialize_tensors(path / "tensors") + pwd = os.getcwd() + try: + os.chdir(path) + for name, proc in self._components: + if hasattr(proc, "from_disk") and name not in exclude: + proc.from_disk(Path(name), exclude=exclude) + # Convert to list here in case exclude is (default) tuple + exclude.add(name) + finally: + os.chdir(pwd) self._path = path # type: ignore[assignment] return self - @classmethod - def load( - cls, - path: Union[str, Path], - *, - exclude: Optional[Set[str]] = None, - device: Optional[Union[str, "torch.device"]] = "cpu", # noqa F821 - ): - path = Path(path) if isinstance(path, str) else path - config = Config.from_disk(path / "config.cfg") - self = Pipeline.from_config(config) - self.load_state_from_disk(path, exclude=exclude, device=device) - return self - # override config property getter to remove "factory" key from components @property def cfg(self) -> Config: @@ -830,12 +861,10 @@ def __exit__(ctx_self, type, value, traceback): if enable is None and disable is None: raise ValueError("Expected either `enable` or `disable`") - if isinstance(disable, str): - disable = [disable] + disable = [disable] if isinstance(disable, str) else disable pipe_names = set(self.pipe_names) if enable is not None: - if isinstance(enable, str): - enable = [enable] + enable = [enable] if isinstance(enable, str) else enable if set(enable) - pipe_names: raise ValueError( "Enabled pipes {} not found in pipeline.".format( @@ -863,24 +892,26 @@ def package( self, name: Optional[str] = None, 
root_dir: Union[str, Path] = ".", + build_dir: Union[str, Path] = "build", + dist_dir: Union[str, Path] = "dist", artifacts_name: str = "artifacts", - check_dependencies: bool = False, project_type: Optional[Literal["poetry", "setuptools"]] = None, - version: str = "0.1.0", + version: Optional[str] = None, metadata: Optional[Dict[str, Any]] = {}, distributions: Optional[Sequence[Literal["wheel", "sdist"]]] = ["wheel"], config_settings: Optional[Mapping[str, Union[str, Sequence[str]]]] = None, isolation: bool = True, skip_build_dependency_check: bool = False, ): - from .utils.package import package + from edspdf.utils.package import package return package( pipeline=self, name=name, root_dir=root_dir, + build_dir=build_dir, + dist_dir=dist_dir, artifacts_name=artifacts_name, - check_dependencies=check_dependencies, project_type=project_type, version=version, metadata=metadata, @@ -892,19 +923,99 @@ def package( def load( - config: Union[Path, str, Config], - device: Optional[Union[str, "torch.device"]] = "cpu", # noqa F821 -) -> Pipeline: - error = "The load function expects a Config or a path to a config file" - if isinstance(config, (Path, str)): - path = Path(config) - if path.is_dir(): - return Pipeline.load(path, device=device) - elif path.is_file(): - config = Config.from_disk(path) - else: - raise ValueError(error) - elif not isinstance(config, Config): + model: Union[Path, str, Config], + overrides: Optional[Dict[str, Any]] = None, + *, + exclude: Optional[Union[str, Iterable[str]]] = None, + device: Optional[Union[str, "torch.device"]] = "cpu", +): + """ + Load a pipeline from a config file or a directory. + + Examples + -------- + + ```{ .python .no-check } + import edspdf + + nlp = edspdf.load( + "path/to/config.cfg", + overrides={"components": {"my_component": {"arg": "value"}}}, + ) + ``` + + Parameters + ---------- + model: Union[Path, str, Config] + The config to use for the pipeline, or the path to a config file or a directory. + overrides: Optional[Dict[str, Any]] + Overrides to apply to the config when loading the pipeline. These are the + same parameters as the ones used when initializing the pipeline. + exclude: Optional[Union[str, Iterable[str]]] + The names of the components, or attributes to exclude from the loading + process. :warning: The `exclude` argument will be mutated in place. + device: Optional[Union[str, "torch.device"]] + Device to use when loading the tensors + + Returns + ------- + Pipeline + """ + error = ( + "The load function expects either :\n" + "- a confit Config object\n" + "- the path of a config file (.cfg file)\n" + "- the path of a trained model\n" + "- the name of an installed pipeline package\n" + f"but got {model!r} which is neither" + ) + if isinstance(model, (Path, str)): + path = Path(model) + is_dir = path.is_dir() + is_config = path.is_file() and path.suffix == ".cfg" + try: + module = importlib.import_module(model) + is_package = True + except (ImportError, AttributeError, TypeError): + module = None + is_package = False + if is_dir and is_package: + warnings.warn( + "The path provided is both a directory and a package : edspdf will " + "load the package. To load from the directory instead, please pass the " + f'path as "./{path}" instead.' 
+ ) + if is_dir: + path = (Path(path) if isinstance(path, str) else path).absolute() + config = Config.from_disk(path / "config.cfg") + if overrides: + config = config.merge(overrides) + pwd = os.getcwd() + try: + os.chdir(path) + nlp = Pipeline.from_config(config) + nlp.from_disk(path, exclude=exclude, device=device) + finally: + os.chdir(pwd) + return nlp + elif is_config: + model = Config.from_disk(path) + elif is_package: + # Load as package + available_kwargs = { + "overrides": overrides, + "exclude": exclude, + "device": device, + } + signature_kwargs = inspect.signature(module.load).parameters + kwargs = { + name: available_kwargs[name] + for name in signature_kwargs + if name in available_kwargs + } + return module.load(**kwargs) + + if not isinstance(model, Config): raise ValueError(error) - return Pipeline.from_config(config) + return Pipeline.from_config(model) diff --git a/edspdf/pipes/aggregators/simple.py b/edspdf/pipes/aggregators/simple.py index 2e46ae57..f5682aac 100644 --- a/edspdf/pipes/aggregators/simple.py +++ b/edspdf/pipes/aggregators/simple.py @@ -1,5 +1,5 @@ -from itertools import groupby -from typing import Dict, List +from collections import defaultdict +from typing import Dict, List, Union import numpy as np @@ -45,8 +45,13 @@ class SimpleAggregator: @factory = "simple-aggregator" new_line_threshold = 0.2 new_paragraph_threshold = 1.5 - label_map = { body = "text", table = "text" } - + # To build the "text" label, we will aggregate lines from + # "title", "body" and "table" and output "title" lines in a + # separate field "title" as well. + label_map = { + "text" : [ "title", "body", "table" ], + "title" : "title", + } ... ``` @@ -77,8 +82,9 @@ class SimpleAggregator: lines to consider them as being on separate paragraphs and thus add a newline character between them. label_map: Dict - A dictionary mapping labels to new labels. This is useful to group labels - together, for instance, to output both "body" and "table" as "text". + A dictionary mapping from new labels to old labels. + This is useful to group labels together, for instance, to output both "body" + and "table" as "text". 
""" def __init__( @@ -88,11 +94,14 @@ def __init__( sort: bool = False, new_line_threshold: float = 0.2, new_paragraph_threshold: float = 1.5, - label_map: Dict = {}, + label_map: Dict[str, Union[str, List[str]]] = {}, ) -> None: self.name = name self.sort = sort - self.label_map = dict(label_map) + self.label_map = { + label: [old_labels] if not isinstance(old_labels, list) else old_labels + for label, old_labels in label_map.items() + } self.new_line_threshold = new_line_threshold self.new_paragraph_threshold = new_paragraph_threshold @@ -107,12 +116,22 @@ def __call__(self, doc: PDFDoc) -> PDFDoc: all_lines, key=lambda b: (b.label, b.page_num, b.y1 // row_height, b.x0), ) - else: - all_lines = sorted(all_lines, key=lambda b: b.label) texts = {} styles = {} - for label, lines in groupby(all_lines, key=lambda b: b.label): + + inv_label_map = defaultdict(list) + for new_label, old_labels in self.label_map.items(): + for old_label in old_labels: + inv_label_map[old_label].append(new_label) + + lines_per_label = defaultdict(list) + lines_per_label.update({k: [] for k in self.label_map}) + for line in all_lines: + for new_label in inv_label_map.get(line.label, [line.label]): + lines_per_label[new_label].append(line) + + for label, lines in lines_per_label.items(): styles[label] = [] text = "" lines: List[TextBox] = list(lines) diff --git a/edspdf/pipes/classifiers/trainable.py b/edspdf/pipes/classifiers/trainable.py index f879af48..1f71e79a 100644 --- a/edspdf/pipes/classifiers/trainable.py +++ b/edspdf/pipes/classifiers/trainable.py @@ -155,9 +155,9 @@ def preprocess_supervised(self, doc: PDFDoc) -> Dict[str, Any]: ], } - def collate(self, batch, device: torch.device) -> Dict: + def collate(self, batch) -> Dict: collated = { - "embedding": self.embedding.collate(batch["embedding"], device), + "embedding": self.embedding.collate(batch["embedding"]), "doc_id": batch["doc_id"], } if "labels" in batch: @@ -167,7 +167,6 @@ def collate(self, batch, device: torch.device) -> Dict: batch["labels"], data_dims=("line",), full_names=("sample", "page", "line"), - device=device, dtype=torch.long, ), } @@ -182,7 +181,9 @@ def forward(self, batch: Dict) -> Dict: output = {"loss": 0, "mask": embeddings.mask} # Label prediction / learning - logits = self.classifier(embeddings).refold("line") + logits = self.classifier(embeddings.to(self.classifier.weight.dtype)).refold( + "line" + ) if "labels" in batch: targets = batch["labels"].refold(logits.data_dims) output["label_loss"] = ( @@ -208,28 +209,28 @@ def postprocess(self, docs: Sequence[PDFDoc], output: Dict) -> Sequence[PDFDoc]: b.label = self.label_voc.decode(label) if b.text != "" else None return docs - def save_extra_data(self, path: Path, exclude: Set): + def to_disk(self, path: Path, exclude: Set): if self.name in exclude: return exclude.add(self.name) - self.embedding.save_extra_data(path / "embedding", exclude) - os.makedirs(path, exist_ok=True) with (path / "label_voc.json").open("w") as f: json.dump(self.label_voc.indices, f) - def load_extra_data(self, path: Path, exclude: Set): + return super().to_disk(path, exclude) + + def from_disk(self, path: Path, exclude: Set): if self.name in exclude: return exclude.add(self.name) - self.embedding.load_extra_data(path / "embedding", exclude) - label_voc_indices = dict(self.label_voc.indices) with (path / "label_voc.json").open("r") as f: self.label_voc.indices = json.load(f) self.update_weights_from_vocab_(label_voc_indices) + + super().from_disk(path, exclude) diff --git 
a/edspdf/pipes/embeddings/box_layout_embedding.py b/edspdf/pipes/embeddings/box_layout_embedding.py index 0bd8c29f..e27661a4 100644 --- a/edspdf/pipes/embeddings/box_layout_embedding.py +++ b/edspdf/pipes/embeddings/box_layout_embedding.py @@ -9,7 +9,7 @@ BoxLayoutPreprocessor, ) from edspdf.registry import registry -from edspdf.trainable_pipe import NestedSequences, TrainablePipe +from edspdf.trainable_pipe import TrainablePipe @registry.factory.register("box-layout-embedding") @@ -71,8 +71,8 @@ def __init__( def preprocess(self, doc): return self.box_preprocessor.preprocess(doc) - def collate(self, batch: NestedSequences, device: torch.device) -> BoxLayoutBatch: - return self.box_preprocessor.collate(batch, device) + def collate(self, batch) -> BoxLayoutBatch: + return self.box_preprocessor.collate(batch) @classmethod def _make_embed(cls, n_positions, size, mode): diff --git a/edspdf/pipes/embeddings/box_layout_preprocessor.py b/edspdf/pipes/embeddings/box_layout_preprocessor.py index ae9715f3..10559f6f 100644 --- a/edspdf/pipes/embeddings/box_layout_preprocessor.py +++ b/edspdf/pipes/embeddings/box_layout_preprocessor.py @@ -74,11 +74,10 @@ def preprocess(self, doc: PDFDoc, supervision: bool = False): "last_page": [[b.page_num == last_p for b in p.text_boxes] for p in pages], } - def collate(self, batch, device: torch.device) -> BoxLayoutBatch: + def collate(self, batch) -> BoxLayoutBatch: kw = { "full_names": ["sample", "page", "line"], "data_dims": ["line"], - "device": device, } return { diff --git a/edspdf/pipes/embeddings/huggingface_embedding.py b/edspdf/pipes/embeddings/huggingface_embedding.py index 16ccf076..58a3e3ad 100644 --- a/edspdf/pipes/embeddings/huggingface_embedding.py +++ b/edspdf/pipes/embeddings/huggingface_embedding.py @@ -1,8 +1,12 @@ import math +import sys +from typing import Optional, Set import torch +from confit import validate_arguments from foldedtensor import as_folded_tensor from transformers import AutoImageProcessor, AutoModel, AutoTokenizer +from transformers import BitsAndBytesConfig as BitsAndBytesConfig_ from typing_extensions import Literal from edspdf import TrainablePipe, registry @@ -10,6 +14,8 @@ from edspdf.pipes.embeddings import EmbeddingOutput from edspdf.structures import PDFDoc +BitsAndBytesConfig = validate_arguments(BitsAndBytesConfig_) + def compute_contextualization_scores(windows): ramp = torch.arange(0, windows.shape[1], 1) @@ -108,6 +114,11 @@ class HuggingfaceEmbedding(TrainablePipe[EmbeddingOutput]): The maximum number of tokens that can be processed by the model on a single device. This does not affect the results but can be used to reduce the memory usage of the model, at the cost of a longer processing time. 
+ quantization_config: Optional[BitsAndBytesConfig] + The quantization configuration to use when loading the model + kwargs: + Additional keyword arguments to pass to the Huggingface + `AutoModel.from_pretrained` method """ def __init__( @@ -119,7 +130,9 @@ def __init__( window: int = 510, stride: int = 255, line_pooling: Literal["mean", "max", "sum"] = "mean", - max_tokens_per_device: int = 128 * 128, + max_tokens_per_device: int = sys.maxsize, + quantization_config: Optional[BitsAndBytesConfig] = None, + **kwargs, ): super().__init__(pipeline, name) self.use_image = use_image @@ -129,7 +142,11 @@ def __init__( else None ) self.tokenizer = AutoTokenizer.from_pretrained(model) - self.hf_model = AutoModel.from_pretrained(model) + self.hf_model = AutoModel.from_pretrained( + model, + quantization_config=quantization_config, + **kwargs, + ) self.output_size = self.hf_model.config.hidden_size self.window = window self.stride = stride @@ -148,14 +165,21 @@ def preprocess(self, doc: PDFDoc): for page in doc.pages: # Preprocess it using LayoutLMv3 + width = page.width + height = page.height + + ratio = width / height + width, height = (1000, 1000 / ratio) if width > 1000 else (width, height) + width, height = (1000 * ratio, 1000) if height > 1000 else (width, height) + prep = self.tokenizer( text=[line.text for line in page.text_boxes], boxes=[ ( - int(line.x0 * line.page.width), - int(line.y0 * line.page.height), - int(line.x1 * line.page.width), - int(line.y1 * line.page.height), + int(line.x0 * width), + int(line.y0 * height), + int(line.x1 * width), + int(line.y1 * height), ) for line in page.text_boxes ], @@ -181,7 +205,7 @@ def preprocess(self, doc: PDFDoc): return res - def collate(self, batch, device): + def collate(self, batch): # Flatten most of these arrays to process batches page per page and # not sample per sample @@ -214,7 +238,9 @@ def collate(self, batch, device): data_dims=("window", "token"), dtype=torch.long, ) - indexer = torch.zeros(windows.max() + 1, dtype=torch.long) + indexer = torch.zeros( + (windows.max() + 1) if windows.numel() else 0, dtype=torch.long + ) # Sort each occurrence of an initial token by its contextualization score: # We can only use the amax reduction, so to retrieve the best occurrence, we @@ -262,7 +288,7 @@ def collate(self, batch, device): data_dims=("token",), dtype=torch.long, ) - last_after_one = max(1, len(line_window_offsets_flat) - 1) + last_after_one = max(0, len(line_window_offsets_flat) - 1) line_window_offsets_flat = as_folded_tensor( # discard the last offset, since we start from 0 and add each line length data=torch.as_tensor(line_window_offsets_flat[:last_after_one]), @@ -270,42 +296,48 @@ def collate(self, batch, device): full_names=("sample", "page", "line"), lengths=line_window_indices.lengths[:-1], ) - kw = dict( full_names=("sample", "page", "subword"), data_dims=("subword",), - device=device, ) collated = { "input_ids": as_folded_tensor(batch["input_ids"], **kw, dtype=torch.long), "bbox": as_folded_tensor(batch["bbox"], **kw, dtype=torch.long), - "windows": windows.to(device), - "indexer": indexer[line_window_indices].to(device), - "line_window_indices": indexer[line_window_indices].as_tensor().to(device), - "line_window_offsets_flat": line_window_offsets_flat.to(device), + "windows": windows, + "indexer": indexer[line_window_indices], + "line_window_indices": indexer[line_window_indices].as_tensor(), + "line_window_offsets_flat": line_window_offsets_flat, } if self.use_image: - collated["pixel_values"] = ( - torch.stack( - [ - 
torch.from_numpy(page_pixels) - for sample_pages in batch["pixel_values"] - for page_pixels in sample_pages - ], - dim=0, - ) - .repeat_interleave(torch.as_tensor(windows_count_per_page), dim=0) - .to(device) + collated["pixel_values"] = torch.as_tensor( + [ + page_pixels + for sample_pages in batch["pixel_values"] + for page_pixels in sample_pages + ], + ).repeat_interleave( + torch.as_tensor(windows_count_per_page, dtype=torch.long), dim=0 ) return collated def forward(self, batch): + if 0 in batch["input_ids"].shape: + return { + "embeddings": batch["line_window_offsets_flat"].view( + *batch["line_window_offsets_flat"].shape, self.output_size + ), + } + windows = batch["windows"] kwargs = dict( input_ids=batch["input_ids"].as_tensor()[windows], bbox=batch["bbox"].as_tensor()[windows], attention_mask=windows.mask, - pixel_values=batch.get("pixel_values"), + pixel_values=( + batch.get("pixel_values").to(next(self.parameters()).dtype) + if self.use_image + else None + ), ) num_windows_per_batch = self.max_tokens_per_device // ( windows.shape[1] @@ -340,3 +372,16 @@ def forward(self, batch): mode=self.line_pooling, ) return {"embeddings": line_embedding} + + def to_disk(self, path, *, exclude: Optional[Set[str]]): + repr_id = object.__repr__(self) + if repr_id in exclude: + return + for obj in (self.tokenizer, self.image_processor, self.hf_model): + if obj is not None: + obj.save_pretrained(path) + for param in self.hf_model.parameters(): + exclude.add(object.__repr__(param)) + cfg = super().to_disk(path, exclude=exclude) or {} + cfg["model"] = f"./{path.as_posix()}" + return cfg diff --git a/edspdf/pipes/embeddings/simple_text_embedding.py b/edspdf/pipes/embeddings/simple_text_embedding.py index cc2d8691..f7674102 100644 --- a/edspdf/pipes/embeddings/simple_text_embedding.py +++ b/edspdf/pipes/embeddings/simple_text_embedding.py @@ -154,7 +154,7 @@ def post_init(self, gold_data, exclude: set): self.update_weights_from_vocab_(vocab_items_before) - def save_extra_data(self, path: Path, exclude: Set): + def to_disk(self, path: Path, exclude: Set): if self.name in exclude: return @@ -169,7 +169,9 @@ def save_extra_data(self, path: Path, exclude: Set): with (path / "norm_voc.json").open("w") as f: json.dump(self.norm_voc.indices, f) - def load_extra_data(self, path: Path, exclude: Set): + return super().to_disk(path, exclude) + + def from_disk(self, path: Path, exclude: Set): if self.name in exclude: return @@ -191,6 +193,8 @@ def load_extra_data(self, path: Path, exclude: Set): self.update_weights_from_vocab_(vocab_items_before) + super().from_disk(path, exclude) + def preprocess(self, doc: PDFDoc): tokens_shape = [] tokens_prefix = [] @@ -228,10 +232,9 @@ def preprocess(self, doc: PDFDoc): "tokens_norm": tokens_norm, } - def collate(self, batch, device: torch.device) -> BoxTextEmbeddingInputBatch: + def collate(self, batch) -> BoxTextEmbeddingInputBatch: kwargs = dict( dtype=torch.long, - device=device, data_dims=("word",), full_names=( "sample", diff --git a/edspdf/pipes/embeddings/sub_box_cnn_pooler.py b/edspdf/pipes/embeddings/sub_box_cnn_pooler.py index a917281e..03edbbb9 100644 --- a/edspdf/pipes/embeddings/sub_box_cnn_pooler.py +++ b/edspdf/pipes/embeddings/sub_box_cnn_pooler.py @@ -74,6 +74,15 @@ def forward(self, batch: Any) -> EmbeddingOutput: embeddings = self.embedding.module_forward(batch["embedding"])[ "embeddings" ].refold("line", "word") + if 0 in embeddings.shape: + return { + "embeddings": as_folded_tensor( + data=torch.zeros(0, self.output_size, device=embeddings.device), + 
lengths=embeddings.lengths[:-1], # pooled on the last dim + data_dims=["line"], # fully flattened + full_names=["sample", "page", "line"], + ) + } # sample word dim -> sample dim word box_token_embeddings = embeddings.as_tensor().permute(0, 2, 1) diff --git a/edspdf/pipes/extractors/pdfminer.py b/edspdf/pipes/extractors/pdfminer.py index 3e645398..b822adf7 100644 --- a/edspdf/pipes/extractors/pdfminer.py +++ b/edspdf/pipes/extractors/pdfminer.py @@ -185,12 +185,10 @@ def __call__(self, doc: Union[PDFDoc, bytes]) -> PDFDoc: if self.render_pages: # See https://pypdfium2.readthedocs.io/en/stable/python_api.html#user-unit - images = pypdfium2.PdfDocument(content).render_topil( - scale=self.render_dpi / 72 - ) - for page, image in zip(pages, images): + pdfium_doc = pypdfium2.PdfDocument(content) + for page, pdfium_page in zip(pages, pdfium_doc): + image = pdfium_page.render(scale=self.render_dpi / 72).to_pil() np_img = np.array(image) - print("NP IMG", np_img.shape) page.image = np_img return doc diff --git a/edspdf/processing/__init__.py b/edspdf/processing/__init__.py new file mode 100644 index 00000000..65a80d6e --- /dev/null +++ b/edspdf/processing/__init__.py @@ -0,0 +1,9 @@ +from typing import TYPE_CHECKING + +from edspdf.utils.lazy_module import lazify + +lazify() + +if TYPE_CHECKING: + from .simple import execute_simple_backend + from .multiprocessing import execute_multiprocessing_backend diff --git a/edspdf/processing/multiprocessing.py b/edspdf/processing/multiprocessing.py new file mode 100644 index 00000000..0fbbe129 --- /dev/null +++ b/edspdf/processing/multiprocessing.py @@ -0,0 +1,933 @@ +from __future__ import annotations + +import copyreg +import gc +import io +import logging +import multiprocessing +import multiprocessing.reduction +import os +import sys +import tempfile +import warnings +from contextlib import nullcontext +from multiprocessing.connection import wait +from random import shuffle +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Optional, + Tuple, + Union, +) + +import dill +from typing_extensions import TypedDict + +from edspdf.lazy_collection import LazyCollection +from edspdf.utils.collections import batchify, flatten + +batch_size_fns = { + "content_boxes": lambda batch: sum(len(doc.content_boxes) for doc in batch), + "pages": lambda batch: sum(len(doc.pages) for doc in batch), + "docs": len, +} + +doc_size_fns = { + "content_boxes": lambda doc: len(doc.content_boxes), +} + +if TYPE_CHECKING: + import torch + + from edspdf.trainable_pipe import TrainablePipe + +Stage = TypedDict( + "Stage", + { + "cpu_components": List[Tuple[str, Callable, Dict]], + "gpu_component": Optional[Any], + }, +) + + +def apply_basic_pipes(docs, pipes): + for name, pipe, kwargs in pipes: + if hasattr(pipe, "batch_process"): + docs = pipe.batch_process(docs) + else: + docs = [pipe(doc, **kwargs) for doc in docs] + return docs + + +class ForkingPickler(dill.Pickler): + """ + ForkingPickler that uses dill instead of pickle to transfer objects between + processes. 
+ """ + + _extra_reducers = {} + _copyreg_dispatch_table = copyreg.dispatch_table + + def __new__(cls, *args, **kwargs): + result = dill.Pickler.__new__(ForkingPickler) + # Python would not call __init__ if the original + # multiprocessing.reduction.ForkingPickler called, leading to a call to this + # monkey-patched __new__ method, because [original cls] != [type of result] + # (see https://docs.python.org/3/reference/datamodel.html#basic-customization) + # so we force the call to __init__ here + if not isinstance(result, cls): + result.__init__(*args, **kwargs) + return result + + def __init__(self, *args, **kwds): + super().__init__(*args, **kwds) + self.dispatch_table = self._copyreg_dispatch_table.copy() + self.dispatch_table.update(self._extra_reducers) + + @classmethod + def register(cls, type, reduce): + """Register a reduce function for a type.""" + cls._extra_reducers[type] = reduce + + @classmethod + def dumps(cls, obj, protocol=None, *args, **kwds): + buf = io.BytesIO() + cls(buf, protocol, *args, **kwds).dump(obj) + return buf.getbuffer() + + loads = dill.loads + + +def replace_pickler(): + """ + Replace the default pickler used by multiprocessing with dill. + "multiprocess" didn't work for obscure reasons (maybe the reducers / dispatchers + are not propagated between multiprocessing and multiprocess => torch specific + reducers might be missing ?), so this patches multiprocessing directly. + directly. + + For some reason I do not explain, this has a massive impact on the performance of + the multiprocessing backend. With the original pickler, the performance can be + up to 2x slower than with our custom one. + """ + old_pickler = multiprocessing.reduction.ForkingPickler + + before = ( + dict(ForkingPickler._extra_reducers), + old_pickler.__new__, + old_pickler.dumps, + old_pickler.loads, + old_pickler.register, + ) + + old_pickler.__new__ = ForkingPickler.__new__ + old_pickler.dumps = ForkingPickler.dumps + old_pickler.loads = ForkingPickler.loads + old_pickler.register = ForkingPickler.register + ForkingPickler._extra_reducers.update( + multiprocessing.reduction.ForkingPickler._extra_reducers + ) + + def revert(): + ( + ForkingPickler._extra_reducers, + old_pickler.__new__, + old_pickler.dumps, + old_pickler.loads, + old_pickler.register, + ) = before + + return revert + + +# Should we check if the multiprocessing module of edspdf +# is responsible for this child process before replacing the pickler ? 
+if ( + multiprocessing.current_process() != "MainProcess" + or hasattr(multiprocessing, "parent_process") + and multiprocessing.parent_process() is not None +): + replace_pickler() + +DEBUG = True + +debug = ( + (lambda *args, flush=False, **kwargs: print(*args, **kwargs, flush=True)) + if DEBUG + else lambda *args, **kwargs: None +) + +try: # pragma: no cover + import torch + + # Torch may still be imported as a namespace package, so we can access the + # torch.save and torch.load functions + torch_save = torch.save + torch_load = torch.load + + MAP_LOCATION = None + + try: + from accelerate.hooks import AlignDevicesHook + + # We need to replace the "execution_device" attribute of the AlignDevicesHook + # using map_location when unpickling the lazy collection + + def save_align_devices_hook(pickler: Any, obj: Any): + pickler.save_reduce(load_align_devices_hook, (obj.__dict__,), obj=obj) + + def load_align_devices_hook(state): + state["execution_device"] = MAP_LOCATION + new_obj = AlignDevicesHook.__new__(AlignDevicesHook) + new_obj.__dict__.update(state) + return new_obj + + except ImportError: + AlignDevicesHook = None + + def dump(*args, **kwargs): + # We need to replace the "execution_device" attribute of the AlignDevicesHook + # using map_location when pickling the lazy collection + old = None + try: + if AlignDevicesHook is not None: + old = dill.Pickler.dispatch.get(AlignDevicesHook) + dill.Pickler.dispatch[AlignDevicesHook] = save_align_devices_hook + dill.settings["recurse"] = True + return torch_save(*args, pickle_module=dill, **kwargs) + finally: + dill.settings["recurse"] = False + if AlignDevicesHook is not None: + del dill.Pickler.dispatch[AlignDevicesHook] + if old is not None: # pragma: no cover + dill.Pickler.dispatch[AlignDevicesHook] = old + + def load(*args, map_location=None, **kwargs): + global MAP_LOCATION + MAP_LOCATION = map_location + if torch.__version__ >= "2.1" and isinstance(args[0], str): + kwargs["mmap"] = True + # with open(args[0], "rb") as f: + # result = dill.load(f, **kwargs) + try: + if torch.__version__ < "2.0.0": + pickle = torch_load.__globals__["pickle"] + torch_load.__globals__["pickle"] = dill + result = torch_load( + *args, + pickle_module=dill, + map_location=map_location, + **kwargs, + ) + finally: + import pickle + + torch_load.__globals__["pickle"] = pickle + MAP_LOCATION = None + return result + +except (ImportError, AttributeError): # pragma: no cover + + def load(file, *args, map_location=None, **kwargs): + # check if path + if isinstance(file, str): + with open(file, "rb") as f: + return dill.load(f, *args, **kwargs) + return dill.load(file, *args, **kwargs) + + dump = dill.dump + + +class Exchanger: + def __init__( + self, + mp: multiprocessing.context.BaseContext, + num_stages: int, + num_gpu_workers: int, + num_cpu_workers: int, + gpu_worker_devices: List[Any], + ): + # queue for cpu input tasks + self.gpu_worker_devices = gpu_worker_devices + self.num_cpu_workers = num_cpu_workers + self.num_gpu_workers = num_gpu_workers + # We add prioritized queue at the end for STOP signals + self.cpu_inputs_queues = [ + [mp.Queue()] + [mp.SimpleQueue() for _ in range(num_stages + 1)] + # The input queue is not shared between processes, since calling `wait` + # on a queue reader from multiple processes may lead to a deadlock + for _ in range(num_cpu_workers) + ] + self.gpu_inputs_queues = [ + [mp.Queue() for _ in range(num_stages + 1)] for _ in range(num_gpu_workers) + ] + self.outputs_queue = mp.Queue() + self.num_stages = num_stages + + # 
noinspection PyUnresolvedReferences + def get_cpu_task(self, idx): + queue_readers = wait([queue._reader for queue in self.cpu_inputs_queues[idx]]) + stage, queue = next( + (stage, q) + for stage, q in reversed(list(enumerate(self.cpu_inputs_queues[idx]))) + if q._reader in queue_readers + ) + item = queue.get() + return stage, item + + def put_cpu(self, item, stage, idx): + return self.cpu_inputs_queues[idx][stage].put(item) + + def get_gpu_task(self, idx): + queue_readers = wait([queue._reader for queue in self.gpu_inputs_queues[idx]]) + stage, queue = next( + (stage, q) + for stage, q in reversed(list(enumerate(self.gpu_inputs_queues[idx]))) + if q._reader in queue_readers + ) + item = queue.get() + return stage, item + + def put_gpu(self, item, stage, idx): + return self.gpu_inputs_queues[idx][stage].put(item) + + def put_results(self, items): + self.outputs_queue.put(items) + + def iter_results(self): + for out in iter(self.outputs_queue.get, None): + yield out + + +class CPUWorker: + def __init__( + self, + cpu_idx: int, + exchanger: Exchanger, + gpu_pipe_names: List[str], + lazy_collection_path: str, + device: Union[str, "torch.device"], + ): + super(CPUWorker, self).__init__() + + self.cpu_idx = cpu_idx + self.exchanger = exchanger + self.gpu_pipe_names = gpu_pipe_names + self.lazy_collection_path = lazy_collection_path + self.device = device + + def run(self): + # Cannot pass torch tensor during init i think ? otherwise i get + # ValueError: bad value(s) in fds_to_keep + # mp._prctl_pr_set_pdeathsig(signal.SIGINT) + + def read_tasks(): + next_batch_id = self.cpu_idx + expect_new_tasks = True + + while expect_new_tasks or len(active_batches) > 0: + stage, task = self.exchanger.get_cpu_task( + idx=self.cpu_idx, + ) + # stage, task = next(iterator) + # Prioritized STOP signal: something bad happened in another process + # -> stop listening to input queues and raise StopIteration (return) + if task is None and stage == self.exchanger.num_stages + 1: + return + # Non prioritized STOP signal: there are no more tasks to process + # and we should smoothly stop (wait that there are no more active + # tasks, and finalize the writer) + if stage == 0 and task is None: + expect_new_tasks = False + continue + + # If first stage, we receive tasks that may require batching + # again => we split them into chunks + if stage == 0: + task_id, fragments = task + chunks = list( + batchify(lc.reader.read_worker(fragments), lc.chunk_size) + ) + for chunk_idx, docs in enumerate(chunks): + # If we sort by size, we must first create the documents + # to have features against which we will sort + docs = apply_basic_pipes(docs, preprocess_pipes) + + if lc.sort_chunks: + docs.sort( + key=doc_size_fns.get( + lc.sort_chunks, doc_size_fns["content_boxes"] + ) + ) + + batches = [ + batch + for batch in batchify( + docs, + batch_size=lc.batch_size, + formula=batch_size_fns[lc.batch_by], + ) + ] + + for batch_idx, batch in enumerate(batches): + assert len(batch) > 0 + batch_id = next_batch_id + + # We mark the task id only for the last batch of a task + # since the purpose of storing the task id is to know + # when the worker has finished processing the task, + # which is true only when the last batch has been + # processed + active_batches[batch_id] = ( + batch, + task_id + if (batch_idx == len(batches) - 1) + and (chunk_idx == len(chunks) - 1) + else None, + ) + next_batch_id += num_cpu + # gpu_idx = None + # batch_id = we have just created a new batch + # result from the last stage = None + yield stage, (None, 
batch_id, None) + else: + yield stage, task + + try: + lc: LazyCollection = load( + self.lazy_collection_path, map_location=self.device + ) + preprocess_pipes = [] + num_cpu = self.exchanger.num_cpu_workers + split_into_batches_after = lc.split_into_batches_after + if ( + split_into_batches_after is None + or lc.batch_by != "docs" + or lc.sort_chunks + ): + split_into_batches_after = next( + (p[0] for p in lc.pipeline if p[0] is not None), None + ) + is_before_split = split_into_batches_after is not None + + stages: List[Stage] = [{"cpu_components": [], "gpu_component": None}] + for name, pipe, *rest in lc.pipeline: + if name in self.gpu_pipe_names: + is_before_split = False + stages[-1]["gpu_component"] = pipe + stages.append({"cpu_components": [], "gpu_component": None}) + else: + if is_before_split: + preprocess_pipes.append((name, pipe, *rest)) + else: + stages[-1]["cpu_components"].append((name, pipe, *rest)) + if name is split_into_batches_after: + is_before_split = False + + # Start at cpu_idx to avoid having all workers sending their + # first batch (0 % num_device, cf below) to the same gpu + active_batches = {} + + logging.info(f"Starting {self} on {os.getpid()}") + + # Inform the main process that we are ready + self.exchanger.put_results((None, 0, None, None)) + + for stage, (gpu_idx, batch_id, result) in read_tasks(): + docs, task_id = active_batches.pop(batch_id) + for name, pipe, *rest in lc.pipeline: + if hasattr(pipe, "enable_cache"): + pipe.enable_cache(batch_id) + if stage > 0: + gpu_pipe = stages[stage - 1]["gpu_component"] + docs = gpu_pipe.postprocess(docs, result) # type: ignore + + docs = apply_basic_pipes(docs, stages[stage]["cpu_components"]) + + gpu_pipe: "TrainablePipe" = stages[stage]["gpu_component"] + if gpu_pipe is not None: + preprocessed = gpu_pipe.make_batch(docs) # type: ignore + active_batches[batch_id] = (docs, task_id) + if gpu_idx is None: + gpu_idx = batch_id % len(self.exchanger.gpu_worker_devices) + collated = gpu_pipe.collate(preprocessed) + collated = gpu_pipe.batch_to_device( + collated, + device=self.exchanger.gpu_worker_devices[gpu_idx], + ) + self.exchanger.put_gpu( + item=(self.cpu_idx, batch_id, collated), + idx=gpu_idx, + stage=stage, + ) + else: + for name, pipe, *rest in lc.pipeline: + if hasattr(pipe, "disable_cache"): + pipe.disable_cache(batch_id) + results, count = ( + lc.writer.write_worker(docs) + if lc.writer is not None + else (docs, len(docs)) + ) + self.exchanger.put_results( + ( + results, + count, + self.cpu_idx, + task_id, + ) + ) + + results, count = lc.writer.finalize() if lc.writer is not None else ([], 0) + self.exchanger.put_results((results, count, self.cpu_idx, "finalize")) + + except BaseException as e: # pragma: no cover + import traceback + + print(f"Error in {self}:\n{traceback.format_exc()}", flush=True) + self.exchanger.put_results((e, 0, self.cpu_idx, None)) + # We need to drain the queues of GPUWorker fed inputs (pre-moved to GPU) + # to ensure no tensor allocated on producer processes (CPUWorker via + # collate) are left in consumer processes + task = True # anything but None + stage = None + while (stage, task) != (0, None): + try: + stage, task = self.exchanger.get_cpu_task(self.cpu_idx) + finally: + pass + + def __repr__(self): + return f"" + + +class GPUWorker: + def __init__( + self, + gpu_idx, + exchanger: Exchanger, + gpu_pipe_names: List[str], + lazy_collection_path: str, + device: Union[str, "torch.device"], + ): + super().__init__() + + self.device = device + self.gpu_idx = gpu_idx + self.exchanger 
= exchanger + + self.gpu_pipe_names = gpu_pipe_names + self.lazy_collection_path = lazy_collection_path + + def run(self): + import torch + + # mp._prctl_pr_set_pdeathsig(signal.SIGINT) + try: + lc = load(self.lazy_collection_path, map_location=self.device) + stage_components = [ + pipe + # move_to_device(pipe, self.device) + for name, pipe, *_ in lc.pipeline + if name in self.gpu_pipe_names + ] + + del lc + logging.info(f"Starting {self} on {os.getpid()}") + + # Inform the main process that we are ready + self.exchanger.put_results((None, 0, None, None)) + + with torch.no_grad(): + while True: + stage, task = self.exchanger.get_gpu_task(self.gpu_idx) + if task is None: + break + + cpu_idx, batch_id, batch = task + pipe = stage_components[stage] + pipe.enable_cache(batch_id) + res = pipe.module_forward(batch) + self.exchanger.put_cpu( + item=( + self.gpu_idx, + batch_id, + { + k: v.to("cpu") if hasattr(v, "to") else v + for k, v in res.items() + }, + ), + stage=stage + 1, + idx=cpu_idx, + ) + if stage == len(stage_components) - 1: + pipe.disable_cache(batch_id) + del batch, task + + task = batch = res = None # noqa + except BaseException as e: # pragma: no cover + import traceback + + print(f"Error in {self}:\n{traceback.format_exc()}", flush=True) + self.exchanger.put_results((e, 0, None, None)) + + from edspdf.trainable_pipe import _caches + + task = batch = res = None # noqa + _caches.clear() + gc.collect() + sys.modules["torch"].cuda.empty_cache() + + # We need to drain the queues of CPUWorker fed inputs (pre-moved to GPU) + # to ensure no tensor allocated on producer processes (CPUWorker via + # collate) are left in consumer processes + stage = None + task = None + while (stage, task) != (0, None): + try: + stage, task = self.exchanger.get_gpu_task(self.gpu_idx) + finally: + pass + + def __repr__(self): + return f"" + + +DEFAULT_MAX_CPU_WORKERS = 4 + + +def execute_multiprocessing_backend( + lc: LazyCollection, +): + """ + If you have multiple CPU cores, and optionally multiple GPUs, we provide the + `multiprocessing` backend that allows to run the inference on multiple + processes. + + This accelerator dispatches the batches between multiple workers + (data-parallelism), and distribute the computation of a given batch on one or two + workers (model-parallelism): + + - a `CPUWorker` which handles the non deep-learning components and the + preprocessing, collating and postprocessing of deep-learning components + - a `GPUWorker` which handles the forward call of the deep-learning components + + If no GPU is available, no `GPUWorker` is started, and the `CPUWorkers` handle + the forward call of the deep-learning components as well. + + The advantage of dedicating a worker to the deep-learning components is that it + allows to prepare multiple batches in parallel in multiple `CPUWorker`, and ensure + that the `GPUWorker` never wait for a batch to be ready. + + The overall architecture described in the following figure, for 3 CPU workers and 2 + GPU workers. + +
+    [figure: overall architecture of the multiprocessing backend, 3 CPU workers
+    and 2 GPU workers; see docs/assets/images/multiprocessing.png added in this PR]
+
+    Here is how a small pipeline with rule-based components and deep-learning components
+    is distributed between the workers:
+
+    [figure: distribution of a small pipeline between the CPU and GPU workers]
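Before the implementation below, a hedged usage sketch (editor's addition, mirroring the tests added later in this diff) of how this backend is reached through the new `edspdf.data` API; the model path and PDF folder are placeholders:

```python
# Editor's sketch, not part of the patch: processing a folder of PDFs with
# several workers. The set_processing arguments mirror those used in
# tests/core/test_data.py and tests/core/test_pipeline.py.
import edspdf

model = edspdf.load("path/to/model")                 # hypothetical trained pipeline
docs = edspdf.data.read_files("file:///path/to/pdfs")
docs = docs.map_pipeline(model)
docs = docs.set_processing(
    num_cpu_workers=2,         # preprocessing / postprocessing workers
    num_gpu_workers=0,         # with no GPU workers, CPU workers also run the forward pass
    batch_by="content_boxes",  # measure batch size in content boxes rather than docs
    chunk_size=3,
    show_progress=True,
)
df = docs.to_pandas(
    converter=lambda doc: [{"id": doc.id, "n_boxes": len(doc.content_boxes)}]
)
```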
+ + """ + try: + TrainablePipe = sys.modules["edspdf.trainable_pipe"].TrainablePipe + except (KeyError, AttributeError): # pragma: no cover + TrainablePipe = None + + steps = lc.pipeline + num_cpu_workers = lc.num_cpu_workers + num_gpu_workers = lc.num_gpu_workers + show_progress = lc.show_progress + process_start_method = lc.process_start_method + + # Infer which pipes should be accelerated on GPU + gpu_steps_candidates = ( + [name for name, component, *_ in steps if isinstance(component, TrainablePipe)] + if TrainablePipe is not None + else [] + ) + gpu_pipe_names = ( + gpu_steps_candidates if lc.gpu_pipe_names is None else lc.gpu_pipe_names + ) + if set(gpu_pipe_names) - set(gpu_steps_candidates): + raise ValueError( + "GPU accelerated pipes {} could not be found in the model".format( + sorted(set(gpu_pipe_names) - set(gpu_steps_candidates)) + ) + ) + + old_environ = { + k: os.environ.get(k) for k in ("TOKENIZERS_PARALLELISM", "OMP_NUM_THREADS") + } + if lc.disable_implicit_parallelism: + os.environ["TOKENIZERS_PARALLELISM"] = "false" + os.environ["OMP_NUM_THREADS"] = "1" + + requires_gpu = ( + num_gpu_workers is None + and len(gpu_pipe_names) + or num_gpu_workers is not None + and num_gpu_workers > 0 + ) + + num_devices = 0 + if requires_gpu: + import torch + + num_devices = torch.cuda.device_count() + logging.info(f"Number of available devices: {num_devices}") + + if num_gpu_workers is None: + num_gpu_workers = num_devices + else: + num_gpu_workers = 0 + + if any(gpu_steps_candidates): + if process_start_method == "fork": + warnings.warn( + "Using fork start method with GPU workers may lead to deadlocks. " + "Consider using process_start_method='spawn' instead." + ) + + process_start_method = process_start_method or "spawn" + + default_method = multiprocessing.get_start_method() + if process_start_method is not None and default_method != process_start_method: + logging.info(f"Switching process start method to {process_start_method}") + + mp = multiprocessing.get_context(process_start_method) + max_workers = max(min(mp.cpu_count() - num_gpu_workers, DEFAULT_MAX_CPU_WORKERS), 0) + num_cpu_workers = ( + (num_gpu_workers or max_workers) + if num_cpu_workers is None + else max_workers + num_cpu_workers + 1 + if num_cpu_workers < 0 + else num_cpu_workers + ) + + if num_gpu_workers == 0: + gpu_pipe_names = [] + + gpu_worker_devices = ( + [ + f"cuda:{gpu_idx * num_devices // num_gpu_workers}" + for gpu_idx in range(num_gpu_workers) + ] + if requires_gpu and lc.gpu_worker_devices is None + else [] + if lc.gpu_worker_devices is None + else lc.gpu_worker_devices + ) + cpu_worker_devices = ( + ["cpu"] * num_cpu_workers + if lc.cpu_worker_devices is None + else lc.cpu_worker_devices + ) + assert len(cpu_worker_devices) == num_cpu_workers + assert len(gpu_worker_devices) == num_gpu_workers + if num_cpu_workers == 0: # pragma: no cover + ( + num_cpu_workers, + num_gpu_workers, + cpu_worker_devices, + gpu_worker_devices, + gpu_pipe_names, + ) = (num_gpu_workers, 0, gpu_worker_devices, [], []) + + exchanger = Exchanger( + mp, + num_stages=len(gpu_pipe_names), + num_cpu_workers=num_cpu_workers, + num_gpu_workers=num_gpu_workers, + gpu_worker_devices=gpu_worker_devices, + ) + + lc = lc.to("cpu") + + cpu_workers = [] + gpu_workers = [] + + with tempfile.NamedTemporaryFile(delete=False) as fp: + dump(lc.worker_copy(), fp) + fp.close() + + revert_pickler = replace_pickler() + + for gpu_idx in range(num_gpu_workers): + gpu_workers.append( + mp.Process( + target=GPUWorker.run, + args=( + GPUWorker( + 
gpu_idx=gpu_idx, + exchanger=exchanger, + gpu_pipe_names=gpu_pipe_names, + lazy_collection_path=fp.name, + device=gpu_worker_devices[gpu_idx], + ), + ), + ) + ) + + for cpu_idx in range(num_cpu_workers): + cpu_workers.append( + mp.Process( + target=CPUWorker.run, + args=( + CPUWorker( + cpu_idx=cpu_idx, + exchanger=exchanger, + gpu_pipe_names=gpu_pipe_names, + lazy_collection_path=fp.name, + device=cpu_worker_devices[cpu_idx], + ), + ), + ) + ) + + logging.info(f"Main PID {os.getpid()}") + + logging.info( + f"Starting {num_cpu_workers} cpu workers and {num_gpu_workers} gpu workers on " + f"{gpu_worker_devices}, with accelerated pipes: {gpu_pipe_names}", + ) + + for worker in (*cpu_workers, *gpu_workers): + worker.start() + + logging.info("Workers are ready") + + for i in range(len((*cpu_workers, *gpu_workers))): + outputs, count, cpu_idx, output_task_id = exchanger.outputs_queue.get() + if isinstance(outputs, BaseException): # pragma: no cover + raise outputs + + os.unlink(fp.name) + + num_max_enqueued = 1 + # Number of input/output batch per process + outputs_iterator = exchanger.iter_results() + + cpu_worker_indices = list(range(num_cpu_workers)) + inputs_iterator = lc.reader.read_main() + active_chunks = [{} for i in cpu_worker_indices] + non_finalized = {i for i in cpu_worker_indices} + max_workload = lc.chunk_size * num_max_enqueued + + bar = nullcontext() + if show_progress: + from tqdm import tqdm + + bar = tqdm(smoothing=0.1, mininterval=5.0) + + def get_and_process_output(): + outputs, count, cpu_idx, output_task_id = next(outputs_iterator) + if output_task_id == "finalize": + non_finalized.discard(cpu_idx) + if isinstance(outputs, BaseException): # pragma: no cover + raise outputs + if show_progress: + bar.update(count) + if count > 0: + yield outputs + if output_task_id is not None: + active_chunks[cpu_idx].pop(output_task_id, None) + + def process(): + try: + with bar: + for input_task_id, items in enumerate( + batchify( + iterable=inputs_iterator, + batch_size=lc.chunk_size, + drop_last=False, + formula=lambda x: sum(item[1] for item in x), + ) + ): + batch = [item[0] for item in items] + batch_size = sum(item[1] for item in items) + + while all(sum(wl.values()) >= max_workload for wl in active_chunks): + yield from get_and_process_output() + + # Shuffle to ensure the first process does not receive all the + # documents in case of workload equality + shuffle(cpu_worker_indices) + cpu_idx = min( + cpu_worker_indices, + key=lambda i: sum(active_chunks[i].values()), + ) + exchanger.put_cpu((input_task_id, batch), stage=0, idx=cpu_idx) + active_chunks[cpu_idx][input_task_id] = batch_size + + # Inform the CPU workers that there are no more tasks to process + for i, worker in enumerate(cpu_workers): + exchanger.cpu_inputs_queues[i][0].put(None) + + while any(active_chunks): + yield from get_and_process_output() + + while len(non_finalized): + yield from get_and_process_output() + + finally: + revert_pickler() + + for k, v in old_environ.items(): + os.environ.pop(k, None) + if v is not None: + os.environ[k] = v + + # Send gpu and cpu process the order to stop processing data + # We use the prioritized queue to ensure the stop signal is processed + # before the next batch of data + for i, worker in enumerate(gpu_workers): + exchanger.gpu_inputs_queues[i][-1].put(None) + for i, worker in enumerate(cpu_workers): + exchanger.cpu_inputs_queues[i][-1].put(None) + + # Enqueue a final non prioritized STOP signal to ensure there remains no + # data in the queues (cf drain loop in CPUWorker / 
GPUWorker) + for i, worker in enumerate(gpu_workers): + exchanger.gpu_inputs_queues[i][0].put(None) + for i, worker in enumerate(gpu_workers): + worker.join(timeout=5) + for i, worker in enumerate(cpu_workers): + exchanger.cpu_inputs_queues[i][0].put(None) + for i, worker in enumerate(cpu_workers): + worker.join(timeout=1) + + # If a worker is still alive, kill it + # This should not happen, but for a reason I cannot explain, it does in + # some CPU workers sometimes when we catch an error, even though each run + # method of the workers completes cleanly. Maybe this has something to do + # with the cleanup of these processes ? + for i, worker in enumerate(gpu_workers): # pragma: no cover + if worker.is_alive(): + logging.error(f"Killing ") + worker.kill() + for i, worker in enumerate(cpu_workers): # pragma: no cover + if worker.is_alive(): + logging.error(f"Killing ") + worker.kill() + + for queue_group in ( + *exchanger.cpu_inputs_queues, + *exchanger.gpu_inputs_queues, + [exchanger.outputs_queue], + ): + for queue in queue_group: + if hasattr(queue, "cancel_join_thread"): + queue.cancel_join_thread() + + gen = process() + return lc.writer.write_main(gen) if lc.writer is not None else flatten(gen) diff --git a/edspdf/processing/simple.py b/edspdf/processing/simple.py new file mode 100644 index 00000000..71372413 --- /dev/null +++ b/edspdf/processing/simple.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +import sys +from contextlib import nullcontext +from typing import TYPE_CHECKING + +from edspdf.utils.collections import batchify, flatten + +if TYPE_CHECKING: + from edspdf.lazy_collection import LazyCollection + +batch_size_fns = { + "content_boxes": lambda batch: sum(len(doc.content_boxes) for doc in batch), + "pages": lambda batch: sum(len(doc.pages) for doc in batch), + "docs": len, +} + +doc_size_fns = { + "content_boxes": lambda doc: len(doc.content_boxes), +} + + +def apply_basic_pipes(docs, pipes): + for name, pipe, kwargs in pipes: + if hasattr(pipe, "batch_process"): + docs = pipe.batch_process(docs) + else: + docs = [pipe(doc, **kwargs) for doc in docs] + return docs + + +def execute_simple_backend( + lc: LazyCollection, +): + """ + This is the default execution mode which batches the documents and processes each + batch on the current process in a sequential manner. 
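An editor's sketch (not part of the patch) of the default path: when no `set_processing(...)` configuration is given, consuming the lazy collection runs through this sequential backend on the current process. The model path and PDF bytes are placeholders.

```python
# Editor's sketch, not part of the patch: without any set_processing(...) call,
# a LazyCollection built from an in-memory iterable is executed by this simple,
# sequential backend when it is consumed.
import edspdf
from edspdf import PDFDoc

model = edspdf.load("path/to/model")   # hypothetical pipeline path
pdf_bytes = b"..."                     # placeholder PDF content

docs = edspdf.data.from_iterable(
    [pdf_bytes],
    converter=lambda content: PDFDoc(content=content),
)
docs = docs.map_pipeline(model)
results = list(docs.to_iterable(converter=lambda doc: len(doc.content_boxes)))
```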
+ """ + try: + no_grad = sys.modules["torch"].no_grad + except (KeyError, AttributeError): + no_grad = nullcontext + reader = lc.reader + writer = lc.writer + show_progress = lc.show_progress + + split_into_batches_after = lc.split_into_batches_after + if split_into_batches_after is None or lc.batch_by != "docs" or lc.sort_chunks: + split_into_batches_after = next( + (p[0] for p in lc.pipeline if p[0] is not None), None + ) + names = [step[0] for step in lc.pipeline] + [None] + chunk_components = lc.pipeline[: names.index(split_into_batches_after)] + batch_components = lc.pipeline[names.index(split_into_batches_after) :] + + def process(): + bar = nullcontext() + if show_progress: + from tqdm import tqdm + + bar = tqdm(smoothing=0.1, mininterval=5.0) + + with bar: + for docs in batchify( + ( + subtask + for task, count in reader.read_main() + for subtask in reader.read_worker([task]) + ), + batch_size=lc.chunk_size, + ): + docs = apply_basic_pipes(docs, chunk_components) + + if lc.sort_chunks: + docs.sort( + key=doc_size_fns.get( + lc.sort_chunks, doc_size_fns["content_boxes"] + ) + ) + + batches = [ + batch + for batch in batchify( + docs, + batch_size=lc.batch_size, + formula=batch_size_fns.get(lc.batch_by, len), + ) + ] + + for batch in batches: + with no_grad(), lc.cache(): + batch = apply_basic_pipes(batch, batch_components) + + if writer is not None: + result, count = writer.write_worker(batch) + if show_progress: + bar.update(count) + yield result + else: + if show_progress: + bar.update(len(batch)) + yield batch + if writer is not None: + result, count = writer.finalize() + if show_progress: + bar.update(count) + if count: + yield result + + gen = process() + return flatten(gen) if writer is None else writer.write_main(gen) diff --git a/edspdf/registry.py b/edspdf/registry.py index 641b0eb2..5219b4cd 100644 --- a/edspdf/registry.py +++ b/edspdf/registry.py @@ -220,3 +220,5 @@ class registry(RegistryCollection): misc = Registry(("edspdf", "misc"), entry_points=True) adapter = Registry(("edspdf", "adapter"), entry_points=True) accelerator = Registry(("edspdf", "accelerator"), entry_points=True) + readers = Registry(("edspdf", "readers"), entry_points=True) + writers = Registry(("edspdf", "writers"), entry_points=True) diff --git a/edspdf/trainable_pipe.py b/edspdf/trainable_pipe.py index e87ae84a..e1979baa 100644 --- a/edspdf/trainable_pipe.py +++ b/edspdf/trainable_pipe.py @@ -1,28 +1,34 @@ +import os from abc import ABCMeta from enum import Enum from functools import wraps -from pathlib import Path from typing import ( Any, + Callable, Dict, Generic, Iterable, Optional, Sequence, + Set, + Tuple, TypeVar, Union, ) +import safetensors.torch import torch from edspdf.pipeline import Pipeline from edspdf.structures import PDFDoc from edspdf.utils.collections import batch_compress_dict, decompress_dict -NestedSequences = Dict[str, Union["NestedSequences", Sequence]] -NestedTensors = Dict[str, Union["NestedSequences", torch.Tensor]] -InputBatch = TypeVar("InputBatch", bound=NestedTensors) -OutputBatch = TypeVar("OutputBatch", bound=NestedTensors) +BatchInput = TypeVar("BatchInput", bound=Dict[str, Any]) +BatchOutput = TypeVar("BatchOutput", bound=Dict[str, Any]) +Scorer = Callable[[Sequence[Tuple[PDFDoc, PDFDoc]]], Union[float, Dict[str, Any]]] + +ALL_CACHES = object() +_caches = {} class CacheEnum(str, Enum): @@ -36,19 +42,24 @@ def hash_batch(batch): return hash(tuple(id(item) for item in batch)) elif not isinstance(batch, dict): return id(batch) - return hash((tuple(batch.keys()), 
tuple(map(hash_batch, batch.values())))) + if "__batch_hash__" in batch: + return batch["__batch_hash__"] + batch_hash = hash((tuple(batch.keys()), tuple(map(hash_batch, batch.values())))) + batch["__batch_hash__"] = batch_hash + return batch_hash def cached_preprocess(fn): @wraps(fn) def wrapped(self: "TrainablePipe", doc: PDFDoc): - if self.pipeline is None or self.pipeline._cache is None: + if self._current_cache_id is None: return fn(self, doc) - cache_id = (id(self), "preprocess", id(doc)) - if cache_id in self.pipeline._cache: - return self.pipeline._cache[cache_id] + cache_key = ("preprocess", f"{type(self)}<{id(self)}>: {id(doc)}") + cache = _caches[self._current_cache_id] + if cache_key in cache: + return cache[cache_key] res = fn(self, doc) - self.pipeline._cache[cache_id] = res + cache[cache_key] = res return res return wrapped @@ -57,31 +68,30 @@ def wrapped(self: "TrainablePipe", doc: PDFDoc): def cached_preprocess_supervised(fn): @wraps(fn) def wrapped(self: "TrainablePipe", doc: PDFDoc): - if self.pipeline is None or self.pipeline._cache is None: + if self._current_cache_id is None: return fn(self, doc) - cache_id = (id(self), "preprocess_supervised", id(doc)) - if cache_id in self.pipeline._cache: - return self.pipeline._cache[cache_id] + cache_key = ("preprocess_supervised", f"{type(self)}<{id(self)}>: {id(doc)}") + cache = _caches[self._current_cache_id] + if cache_key in cache: + return cache[cache_key] res = fn(self, doc) - self.pipeline._cache[cache_id] = res + cache[cache_key] = res return res return wrapped def cached_collate(fn): - import torch - @wraps(fn) - def wrapped(self: "TrainablePipe", batch: Dict, device: torch.device): - if self.pipeline is None or self.pipeline._cache is None: - return fn(self, batch, device) - cache_id = (id(self), "collate", hash_batch(batch)) - if cache_id in self.pipeline._cache: - return self.pipeline._cache[cache_id] - res = fn(self, batch, device) - self.pipeline._cache[cache_id] = res - res["cache_id"] = cache_id + def wrapped(self: "TrainablePipe", batch: Dict): + if self._current_cache_id is None: + return fn(self, batch) + cache_key = ("collate", f"{type(self)}<{id(self)}>: {hash_batch(batch)}") + cache = _caches[self._current_cache_id] + if cache_key in cache: + return cache[cache_key] + res = fn(self, batch) + cache[cache_key] = res return res return wrapped @@ -90,13 +100,35 @@ def wrapped(self: "TrainablePipe", batch: Dict, device: torch.device): def cached_forward(fn): @wraps(fn) def wrapped(self: "TrainablePipe", batch): - if self.pipeline is None or self.pipeline._cache is None: + # Convert args and kwargs to a dictionary matching fn signature + if self._current_cache_id is None: return fn(self, batch) - cache_id = (id(self), "collate", hash_batch(batch)) - if cache_id in self.pipeline._cache: - return self.pipeline._cache[cache_id] + cache_key = ("forward", f"{type(self)}<{id(self)}>: {hash_batch(batch)}") + cache = _caches[self._current_cache_id] + if cache_key in cache: + return cache[cache_key] res = fn(self, batch) - self.pipeline._cache[cache_id] = res + cache[cache_key] = res + return res + + return wrapped + + +def cached_batch_to_device(fn): + @wraps(fn) + def wrapped(self: "TrainablePipe", batch, device): + # Convert args and kwargs to a dictionary matching fn signature + if self._current_cache_id is None: + return fn(self, batch, device) + cache_key = ( + "batch_to_device", + f"{type(self)}<{id(self)}>: {hash_batch(batch)}", + ) + cache = _caches[self._current_cache_id] + if cache_key in cache: + return 
cache[cache_key] + res = fn(self, batch, device) + cache[cache_key] = res return res return wrapped @@ -112,6 +144,10 @@ def __new__(mcs, name, bases, class_dict): ) if "collate" in class_dict: class_dict["collate"] = cached_collate(class_dict["collate"]) + if "batch_to_device" in class_dict: + class_dict["batch_to_device"] = cached_batch_to_device( + class_dict["batch_to_device"] + ) if "forward" in class_dict: class_dict["forward"] = cached_forward(class_dict["forward"]) @@ -120,7 +156,7 @@ def __new__(mcs, name, bases, class_dict): class TrainablePipe( torch.nn.Module, - Generic[OutputBatch], + Generic[BatchOutput], metaclass=TrainablePipeMeta, ): """ @@ -133,15 +169,30 @@ class TrainablePipe( for components that share a common subcomponent. """ + call_super_init = True + def __init__(self, pipeline: Optional[Pipeline], name: Optional[str]): super().__init__() - self.pipeline = pipeline self.name = name - self.cfg = {} - self._preprocess_cache = {} - self._preprocess_supervised_cache = {} - self._collate_cache = {} - self._forward_cache = {} + self._current_cache_id = None + + def enable_cache(self, cache_id="default"): + self._current_cache_id = cache_id + _caches.setdefault(cache_id, {}) + for name, component in self.named_component_children(): + if hasattr(component, "enable_cache"): + component.enable_cache(cache_id) + + def disable_cache(self, cache_id=ALL_CACHES): + if cache_id is ALL_CACHES: + _caches.clear() + else: + if cache_id in _caches: + del _caches[cache_id] + self._current_cache_id = None + for name, component in self.named_component_children(): + if hasattr(component, "disable_cache"): + component.disable_cache(cache_id) @property def device(self): @@ -152,45 +203,7 @@ def named_component_children(self): if isinstance(module, TrainablePipe): yield name, module - def save_extra_data(self, path: Path, exclude: set): - """ - Dumps vocabularies indices to json files - - Parameters - ---------- - path: Path - Path to the directory where the files will be saved - exclude: Set - The set of component names to exclude from saving - This is useful when components are repeated in the pipeline. - """ - if self.name in exclude: - return - exclude.add(self.name) - for name, component in self.named_component_children(): - if hasattr(component, "save_extra_data"): - component.save_extra_data(path / name, exclude) - - def load_extra_data(self, path: Path, exclude: set): - """ - Loads vocabularies indices from json files - - Parameters - ---------- - path: Path - Path to the directory where the files will be loaded - exclude: Set - The set of component names to exclude from loading - This is useful when components are repeated in the pipeline. - """ - if self.name in exclude: - return - exclude.add(self.name) - for name, component in self.named_component_children(): - if hasattr(component, "load_extra_data"): - component.load_extra_data(path / name, exclude) - - def post_init(self, gold_data: Iterable[PDFDoc], exclude: set): + def post_init(self, gold_data: Iterable[PDFDoc], exclude: Set[str]): """ This method completes the attributes of the component, by looking at some documents. 
It is especially useful to build vocabularies or detect the labels @@ -205,9 +218,10 @@ def post_init(self, gold_data: Iterable[PDFDoc], exclude: set): This argument will be gradually updated with the names of initialized components """ - if self.name in exclude: + repr_id = object.__repr__(self) + if repr_id in exclude: return - exclude.add(self.name) + exclude.add(repr_id) for name, component in self.named_component_children(): if hasattr(component, "post_init"): component.post_init(gold_data, exclude=exclude) @@ -233,52 +247,79 @@ def preprocess(self, doc: PDFDoc) -> Dict[str, Any]: for name, component in self.named_component_children() } - def collate(self, batch: NestedSequences, device: torch.device) -> InputBatch: + def collate(self, batch: Dict[str, Any]) -> BatchInput: """ Collate the batch of features into a single batch of tensors that can be used by the forward method of the component. Parameters ---------- - batch: NestedSequences + batch: Dict[str, Any] Batch of features - device: torch.device - Device on which the tensors should be moved Returns ------- - InputBatch + BatchInput Dictionary (optionally nested) containing the collated tensors """ return { - name: component.collate(batch[name], device) + name: component.collate(batch[name]) for name, component in self.named_component_children() } - def forward(self, batch: InputBatch) -> OutputBatch: + def batch_to_device( + self, + batch: BatchInput, + device: Optional[Union[str, torch.device]], + ) -> BatchInput: """ - Perform the forward pass of the neural network, i.e, apply transformations - over the collated features to compute new embeddings, probabilities, losses, etc + Move the batch of tensors to the specified device. Parameters ---------- - batch: InputBatch + batch: BatchInput + Batch of tensors + device: Optional[Union[str, torch.device]] + Device to move the tensors to + + Returns + ------- + BatchInput + """ + return { + name: ( + value.to(device) + if hasattr(value, "to") + else getattr(self, name).batch_to_device(value, device=device) + if hasattr(self, name) + else value + ) + for name, value in batch.items() + } + + def forward(self, batch: BatchInput) -> BatchOutput: + """ + Perform the forward pass of the neural network. + + Parameters + ---------- + batch: BatchInput Batch of tensors (nested dictionary) computed by the collate method Returns ------- - OutputBatch + BatchOutput """ raise NotImplementedError() - def module_forward(self, batch: InputBatch) -> OutputBatch: + def module_forward(self, *args, **kwargs): """ This is a wrapper around `torch.nn.Module.__call__` to avoid conflict with the [`TrainablePipe.__call__`][edspdf.trainable_pipe.TrainablePipe.__call__] method. 
""" - return torch.nn.Module.__call__(self, batch) + return torch.nn.Module.__call__(self, *args, **kwargs) def make_batch( self, @@ -323,9 +364,11 @@ def batch_process(self, docs: Sequence[PDFDoc]) -> Sequence[PDFDoc]: Sequence[PDFDoc] Batch of updated documents """ + device = self.device with torch.no_grad(): batch = self.make_batch(docs) - inputs = self.collate(batch, device=self.device) + inputs = self.collate(batch) + inputs = self.batch_to_device(inputs, device=device) if hasattr(self, "compiled"): res = self.compiled(inputs) else: @@ -334,7 +377,7 @@ def batch_process(self, docs: Sequence[PDFDoc]) -> Sequence[PDFDoc]: return docs def postprocess( - self, docs: Sequence[PDFDoc], batch: OutputBatch + self, docs: Sequence[PDFDoc], batch: BatchOutput ) -> Sequence[PDFDoc]: """ Update the documents with the predictions of the neural network, for instance @@ -346,7 +389,7 @@ def postprocess( ---------- docs: Sequence[PDFDoc] Batch of documents - batch: OutputBatch + batch: BatchOutput Batch of predictions, as returned by the forward method Returns @@ -391,3 +434,41 @@ def __call__(self, doc: PDFDoc) -> PDFDoc: PDFDoc """ return self.batch_process([doc])[0] + + def to_disk(self, path, exclude: Optional[Set[str]]): + if object.__repr__(self) in exclude: + return + exclude.add(object.__repr__(self)) + overrides = {} + for name, component in self.named_component_children(): + if hasattr(component, "to_disk"): + pipe_overrides = component.to_disk(path / name, exclude=exclude) + if pipe_overrides: + overrides[name] = pipe_overrides + tensor_dict = { + n: p + for n, p in self.named_parameters() + if object.__repr__(p) not in exclude + } + os.makedirs(path, exist_ok=True) + safetensors.torch.save_file(tensor_dict, path / "parameters.safetensors") + exclude.update(object.__repr__(p) for p in tensor_dict.values()) + return overrides + + def from_disk(self, path, exclude: Optional[Set[str]]): + if object.__repr__(self) in exclude: + return + exclude.add(object.__repr__(self)) + for name, component in self.named_component_children(): + if hasattr(component, "from_disk"): + component.from_disk(path / name, exclude=exclude) + tensor_dict = safetensors.torch.load_file(path / "parameters.safetensors") + self.load_state_dict(tensor_dict, strict=False) + + @property + def load_extra_data(self): + return self.from_disk + + @property + def save_extra_data(self): + return self.to_disk diff --git a/edspdf/utils/collections.py b/edspdf/utils/collections.py index f0c37db9..7ffe7861 100644 --- a/edspdf/utils/collections.py +++ b/edspdf/utils/collections.py @@ -1,8 +1,18 @@ import copy import itertools -import math from collections import defaultdict -from typing import Any, Dict, Iterable, List, Mapping, Sequence, TypeVar +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Mapping, + Sequence, + TypeVar, + Union, +) It = TypeVar("It", bound=Iterable) T = TypeVar("T") @@ -37,16 +47,46 @@ def nest_dict(flat: Dict[str, Any]) -> Dict[str, Any]: def ld_to_dl(ld: Iterable[Mapping[str, T]]) -> Dict[str, List[T]]: + """ + Convert a list of dictionaries to a dictionary of lists + + Parameters + ---------- + ld: Iterable[Mapping[str, T]] + The list of dictionaries + + Returns + ------- + Dict[str, List[T]] + The dictionary of lists + """ ld = list(ld) - return {k: [dic[k] for dic in ld] for k in ld[0]} + return {k: [dic.get(k) for dic in ld] for k in (ld[0] if len(ld) else ())} + + +def dl_to_ld(dl: Mapping[str, Sequence[Any]]) -> Iterator[Dict[str, Any]]: + """ + Convert a dictionary of 
lists to a list of dictionaries + Parameters + ---------- + dl: Mapping[str, Sequence[Any]] + The dictionary of lists -def dl_to_ld(dl: Mapping[str, Sequence[T]]) -> List[Dict[str, T]]: - return [dict(zip(dl, t)) for t in zip(*dl.values())] + Returns + ------- + List[Dict[str, Any]] + The list of dictionaries + """ + return (dict(zip(dl, t)) for t in zip(*dl.values())) -def flatten(seq: Sequence[Sequence["T"]]) -> List["T"]: - return list(itertools.chain.from_iterable(seq)) +def flatten(items): + for item in items: + if isinstance(item, list): + yield from flatten(item) + else: + yield item FLATTEN_TEMPLATE = """\ @@ -64,16 +104,17 @@ def rec(current, path): keys[id(current)].append(path) return for key, value in current.items(): - rec(value, (*path, key)) + if not key.startswith("$"): + rec(value, (*path, key)) rec(obj, ()) code = FLATTEN_TEMPLATE.format( "{" + "\n".join( - "'{}': root{},".format( - "|".join(map("/".join, key_list)), - "".join(f"['{k}']" for k in key_list[0]), + "{}: root{},".format( + repr("|".join(map("/".join, key_list))), + "".join(f"[{repr(k)}]" for k in key_list[0]), ) for key_list in keys.values() ) @@ -83,6 +124,19 @@ def rec(current, path): class batch_compress_dict: + """ + Compress a sequence of dictionaries in which values that occur multiple times are + deduplicated. The corresponding keys will be merged into a single string using + the "|" character as a separator. + This is useful to preserve referential identities when decompressing the dictionary + after it has been serialized and deserialized. + + Parameters + ---------- + seq: Iterable[Dict[str, Any]] + Sequence of dictionaries to compress + """ + __slots__ = ("flatten", "seq") def __init__(self, seq: Iterable[Dict[str, Any]]): @@ -109,7 +163,23 @@ def __next__(self) -> Dict[str, List]: return self.flatten(item) -def decompress_dict(seq): +def decompress_dict(seq: Union[Iterable[Dict[str, Any]], Dict[str, Any]]): + """ + Decompress a dictionary of lists into a sequence of dictionaries. + This function assumes that the dictionary structure was obtained using the + `batch_compress_dict` class. + Keys that were merged into a single string using the "|" character as a separator + will be split into a nested dictionary structure. + + Parameters + ---------- + seq: Union[Iterable[Dict[str, Any]], Dict[str, Any]] + The dictionary to decompress or a sequence of dictionaries to decompress + + Returns + ------- + + """ obj = ld_to_dl(seq) if isinstance(seq, Sequence) else seq res = {} for key, value in obj.items(): @@ -122,26 +192,33 @@ def decompress_dict(seq): return res -class batchify(Iterable[List[T]]): - def __init__(self, iterable: Iterable[T], batch_size: int): - self.iterable = iter(iterable) - self.batch_size = batch_size - try: - self.length = math.ceil(len(iterable) / batch_size) - except (AttributeError, TypeError): - pass - - def __len__(self): - return self.length - - def __iter__(self): - return self - - def __next__(self): - batch = list(itertools.islice(self.iterable, self.batch_size)) - if len(batch) == 0: - raise StopIteration() - return batch +def batchify( + iterable: Iterable[T], + batch_size: int, + drop_last: bool = False, + formula: Callable = len, +) -> Iterable[List[T]]: + """ + Yields batch that contain at most `batch_size` elements. + If an item contains more than `batch_size` elements, it will be yielded as a single + batch. 
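A quick editor's illustration of the `formula` argument, which measures batch size with a user-provided function instead of counting items:

```python
# Editor's illustration: cap batches by total character count instead of item
# count. An item that exceeds the cap on its own is still yielded as one batch.
from edspdf.utils.collections import batchify

items = ["a", "bb", "ccc", "dddd"]
batches = list(
    batchify(items, batch_size=4, formula=lambda batch: sum(len(x) for x in batch))
)
assert batches == [["a", "bb"], ["ccc"], ["dddd"]]
```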
+ + Parameters + ---------- + iterable: Iterable[T] + batch_size: int + drop_last: bool + formula: Callable + """ + batch = [] + for item in iterable: + next_size = formula(batch + [item]) + if next_size > batch_size and len(batch) > 0: + yield batch + batch = [] + batch.append(item) + if len(batch) > 0 and not drop_last: + yield batch def get_attr_item(base, attr): diff --git a/edspdf/utils/lazy_module.py b/edspdf/utils/lazy_module.py new file mode 100644 index 00000000..ddc75d61 --- /dev/null +++ b/edspdf/utils/lazy_module.py @@ -0,0 +1,108 @@ +# flake8: noqa: F811 +import ast +import importlib +import inspect +import os + + +def lazify(): + def _get_module_paths(file): + """ + Reads the content of the current file, parses it with ast and store the + import path for future potential imports. This is useful to only import + the module that is requested and avoid loading all the modules at once, since + some of them are quite heavy, or contain dependencies that are not always + available. + + For instance: + > from .trainable.span_qualifier.factory import create_component as + span_qualifier is stored in the cache as: + > module_paths["span_qualifier"] = "trainable.span_qualifier.factory" + + Returns + ------- + Dict[str, Tuple[str, str]] + The absolute path of the current file. + """ + module_path = os.path.abspath(file) + with open(module_path, "r") as f: + module_content = f.read() + module_ast = ast.parse(module_content) + module_paths = {} + for node in module_ast.body: + # Lookup TYPE_CHECKING + if not ( + isinstance(node, ast.If) + and ( + ( + isinstance(node.test, ast.Name) + and node.test.id == "TYPE_CHECKING" + ) + or ( + isinstance(node.test, ast.Attribute) + and node.test.attr == "TYPE_CHECKING" + ) + ) + ): + continue + for import_node in node.body: + if isinstance(import_node, ast.ImportFrom): + for name in import_node.names: + module_paths[name.asname or name.name] = ( + import_node.module, + name.name, + ) + + return module_paths + + def __getattr__(name): + """ + Imports the actual module if it is in the module_paths dict. + + Parameters + ---------- + name + + Returns + ------- + + """ + if name in module_paths: + module_path, module_name = module_paths[name] + result = getattr( + importlib.__import__( + module_path, + fromlist=[module_name], + globals=module_globals, + level=1, + ), + module_name, + ) + module_globals[name] = result + return result + raise AttributeError(f"module {__name__} has no attribute {name}") + + def __dir__(): + """ + Returns the list of available modules. 
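For context, this is the calling pattern that `lazify()` enables, copied from `edspdf/processing/__init__.py` earlier in this diff (editor's recap, not new patch content):

```python
# Editor's recap of the calling pattern (see edspdf/processing/__init__.py):
# imports declared under TYPE_CHECKING are only executed when the attribute is
# first accessed, thanks to the module-level __getattr__ installed by lazify().
from typing import TYPE_CHECKING

from edspdf.utils.lazy_module import lazify

lazify()

if TYPE_CHECKING:
    from .simple import execute_simple_backend
    from .multiprocessing import execute_multiprocessing_backend
```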
+ + Returns + ------- + List[str] + """ + return __all__ + + # Access upper frame + module_globals = inspect.currentframe().f_back.f_globals + + module_paths = _get_module_paths(module_globals["__file__"]) + + __all__ = list(module_paths.keys()) + + module_globals.update( + { + "__getattr__": __getattr__, + "__dir__": __dir__, + "__all__": __all__, + } + ) diff --git a/edspdf/utils/package.py b/edspdf/utils/package.py index 8440fa65..31a7c132 100644 --- a/edspdf/utils/package.py +++ b/edspdf/utils/package.py @@ -1,4 +1,3 @@ -import io import os import re import shutil @@ -6,7 +5,6 @@ import sys from contextlib import contextmanager from pathlib import Path -from types import FunctionType from typing import ( TYPE_CHECKING, Any, @@ -14,20 +12,13 @@ Mapping, Optional, Sequence, - Tuple, - Type, Union, ) import build -import dill import toml from build.__main__ import build_package, build_package_via_sdist from confit import Cli -from dill._dill import save_function as dill_save_function -from dill._dill import save_type as dill_save_type -from importlib_metadata import PackageNotFoundError -from importlib_metadata import version as get_version from loguru import logger from typing_extensions import Literal @@ -35,57 +26,6 @@ py_version = f"{sys.version_info.major}.{sys.version_info.minor}" - -def get_package(obj_type: Type): - # Retrieve the __package__ attribute of the module of a type, if possible. - # And returns the package version as well - try: - module_name = obj_type.__module__ - if module_name == "__main__": - raise Exception(f"Could not find package of type {obj_type}") - module = __import__(module_name, fromlist=["__package__"]) - package = module.__package__ - try: - version = get_version(package) - except (PackageNotFoundError, ValueError): - return None - return package, version - except (ImportError, AttributeError): - raise Exception(f"Cound not find package of type {obj_type}") - - -def save_type(pickler, obj, *args, **kwargs): - package_name = get_package(obj) - if package_name is not None: - pickler.packages.add(package_name) - dill_save_type(pickler, obj, *args, **kwargs) - - -def save_function(pickler, obj, *args, **kwargs): - package_name = get_package(obj) - if package_name is not None: - pickler.packages.add(package_name) - return dill_save_function(pickler, obj, *args, **kwargs) - - -class PackagingPickler(dill.Pickler): - dispatch = dill.Pickler.dispatch.copy() - - dispatch[FunctionType] = save_function - dispatch[type] = save_type - - def __init__(self, *args, **kwargs): - self.file = io.BytesIO() - super().__init__(self.file, *args, **kwargs) - self.packages = set() - - -def get_deep_dependencies(obj): - pickler = PackagingPickler() - pickler.dump(obj) - return sorted(pickler.packages) - - app = Cli(pretty_exceptions_show_locals=False, pretty_exceptions_enable=False) @@ -132,6 +72,8 @@ def validate(cls, value, config=None): # Initialize the builder try: builder = SdistBuilder(poetry, None, None) + # Get the list of files to include + files = builder.find_files_to_add() except ModuleOrPackageNotFound: if not poetry.package.packages: print([]) @@ -139,15 +81,13 @@ def validate(cls, value, config=None): print([ {k: v for k, v in { - "include": include._include, - "from": include.source, - "formats": include.formats, - }.items()} + "include": getattr(include, '_include'), + "from": getattr(include, 'source', None), + "formats": getattr(include, 'formats', None), + }.items() if v} for include in builder._module.includes ]) -# Get the list of files to include -files = 
builder.find_files_to_add() # Print the list of files for file in files: @@ -161,12 +101,16 @@ def validate(cls, value, config=None): import edspdf from pathlib import Path +from typing import Optional, Dict, Any __version__ = {__version__} -def load(device: "torch.device" = "cpu") -> edspdf.Pipeline: +def load( + overrides: Optional[Dict[str, Any]] = None, + device: "torch.device" = "cpu" +) -> edspdf.Pipeline: artifacts_path = Path(__file__).parent / "{artifacts_dir}" - model = edspdf.load(artifacts_path, device=device) + model = edspdf.load(artifacts_path, overrides=overrides, device=device) return model """ @@ -194,13 +138,12 @@ def __init__( self, pyproject: Optional[Dict[str, Any]], pipeline: Union[Path, "edspdf.Pipeline"], - version: str, + version: Optional[str], name: Optional[ModuleName], root_dir: Path = ".", - build_name: Path = "build", - out_dir: Path = "dist", + build_dir: Path = "build", + dist_dir: Path = "dist", artifacts_name: ModuleName = "artifacts", - dependencies: Optional[Sequence[Tuple[str, str]]] = None, metadata: Optional[Dict[str, Any]] = {}, ): self.poetry_bin_path = ( @@ -212,13 +155,13 @@ def __init__( self.name = name self.pyproject = pyproject self.root_dir = root_dir.resolve() - self.dependencies = dependencies self.pipeline = pipeline self.artifacts_name = artifacts_name - self.out_dir = self.root_dir / out_dir + self.dist_dir = ( + dist_dir if Path(dist_dir).is_absolute() else self.root_dir / dist_dir + ) with self.ensure_pyproject(metadata): - python_executable = ( Path(self.poetry_bin_path).read_text().split("\n")[0][2:] ) @@ -236,7 +179,9 @@ def __init__( out = result.stdout.decode().strip().split("\n") self.poetry_packages = eval(out[0]) - self.build_dir = root_dir / build_name / self.name + self.build_dir = ( + build_dir if Path(build_dir).is_absolute() else root_dir / build_dir + ) / self.name self.file_paths = [self.root_dir / file_path for file_path in out[1:]] logger.info(f"root_dir: {self.root_dir}") @@ -262,13 +207,9 @@ def ensure_pyproject(self, metadata): "poetry": { **metadata, "name": self.name, - "version": self.version, + "version": self.version or "0.1.0", "dependencies": { "python": f">={py_version},<4.0", - **{ - dep_name: f"^{dep_version}" - for dep_name, dep_version in self.dependencies - }, }, }, }, @@ -319,7 +260,7 @@ def build( distributions = ["wheel"] build_call( srcdir=self.build_dir, - outdir=self.out_dir, + outdir=self.dist_dir, distributions=distributions, config_settings=config_settings, isolation=isolation, @@ -335,12 +276,13 @@ def update_pyproject(self): f"project" ) - old_version = self.pyproject["tool"]["poetry"]["version"] - self.pyproject["tool"]["poetry"]["version"] = self.version - logger.info( - f"Replaced project version {old_version!r} with {self.version!r} in poetry " - f"based project" - ) + if self.version is not None: + old_version = self.pyproject["tool"]["poetry"]["version"] + self.pyproject["tool"]["poetry"]["version"] = self.version + logger.info( + f"Replaced project version {old_version!r} with {self.version!r} in " + f"poetry based project" + ) # Adding artifacts to include in pyproject.toml snake_name = snake_case(self.name.lower()) @@ -381,7 +323,7 @@ def make_src_dir(self): build_artifacts_dir, ) else: - self.pipeline.save(build_artifacts_dir) + self.pipeline.to_disk(build_artifacts_dir) os.makedirs(package_dir, exist_ok=True) with open(package_dir / "__init__.py", mode="a") as f: f.write( @@ -397,10 +339,11 @@ def package( pipeline: Union[Path, "edspdf.Pipeline"], name: Optional[ModuleName] = 
None, root_dir: Path = ".", + build_dir: Path = "build", + dist_dir: Path = "dist", artifacts_name: ModuleName = "artifacts", - check_dependencies: bool = False, project_type: Optional[Literal["poetry", "setuptools"]] = None, - version: str = "0.1.0", + version: Optional[str] = None, metadata: Optional[Dict[str, Any]] = {}, distributions: Optional[Sequence[Literal["wheel", "sdist"]]] = ["wheel"], config_settings: Optional[Mapping[str, Union[str, Sequence[str]]]] = None, @@ -411,21 +354,12 @@ def package( pyproject_path = root_dir / "pyproject.toml" if not pyproject_path.exists(): - check_dependencies = True if name is None: raise ValueError( f"No pyproject.toml could be found in the root directory {root_dir}, " f"you need to create one, or fill the name parameter." ) - dependencies = None - if check_dependencies: - if isinstance(pipeline, Path): - pipeline = edspdf.load(pipeline) - dependencies = get_deep_dependencies(pipeline) - for dep in dependencies: - print("DEPENDENCY", dep[0].ljust(30), dep[1]) - root_dir = root_dir.resolve() pyproject = None @@ -442,8 +376,9 @@ def package( name=name, version=version, root_dir=root_dir, + build_dir=build_dir, + dist_dir=dist_dir, artifacts_name=artifacts_name, - dependencies=dependencies, metadata=metadata, ) else: diff --git a/edspdf/visualization/annotations.py b/edspdf/visualization/annotations.py index cd5d0a8d..cf1770cc 100644 --- a/edspdf/visualization/annotations.py +++ b/edspdf/visualization/annotations.py @@ -59,7 +59,7 @@ def show_annotations( """ pdf_doc = pdfium.PdfDocument(pdf) - pages = list(pdf_doc.render_topil(scale=2)) + pages = list([page.render(scale=2).to_pil() for page in pdf_doc]) unique_labels = list(dict.fromkeys([box.label for box in annotations])) if colors is None: diff --git a/pyproject.toml b/pyproject.toml index ed6024d6..037daa39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,22 +18,28 @@ dependencies = [ "anyascii>=0.3.2", "scikit-learn>=1.0.2,<2.0.0", "pydantic>=1.2,<2.0.0", - "catalogue~=2.0", - "networkx~=2.6", - "confit>=0.4.3,<1.0.0", - "foldedtensor>=0.3.1,<1.0.0", + "catalogue>=2.0", + "networkx>=2.6", + "confit>=0.5.3,<1.0.0", + "fsspec<2023.1.0 ; python_version<'3.8'", + "fsspec ; python_version>='3.8'", + "foldedtensor>=0.3.3", "torch>1.0.0", "accelerate>=0.12.0,<1.0.0", - "tqdm~=4.64.1", + "tqdm>=4.64", "regex", - "pdfminer.six>=20220319", - "pypdfium2~=2.7", - "rich-logger>=0.3.0,<1.0.0", - "safetensors~=0.3.1", + "pdfminer.six>=20220319,<20231228 ; python_version<'3.8'", + "pdfminer.six ; python_version>='3.8'", + "pypdfium2>=4.0", + "rich-logger>=0.3", + "safetensors>=0.3", "anyascii>=0.3.2", - "attrs~=23.1", + "attrs>=23.1", "build>=0.10.0", + "pyarrow", "loguru", + "toml", + "dill", ] @@ -50,9 +56,9 @@ dependencies = [ "mypy>=1.0.0", "streamlit>=1.19", "coverage>=6.5.0", - "datasets~=2.10", + "datasets>=2.10", "huggingface_hub>=0.8.1", - "transformers~=4.30", + "transformers>=4.30", ] [tool.hatch.envs.default.scripts] @@ -139,19 +145,17 @@ omit-covered-files = false concurrency = ["multiprocessing"] [tool.coverage.report] -omit = [ - "tests/*", +include = [ + "edspdf/*", ] -# omit = [ -# "edspdf/accelerators/multiprocessing.py", -# ] -exclude_also = [ +exclude_lines = [ "def __repr__", "if __name__ == .__main__.:", "@overload", "pragma: no cover", "raise .*Error", "raise .*Exception", + "warn\\(", "if __name__ == .__main__.:", "if (self[.])?name in exclude:", "if TYPE_CHECKING:", diff --git a/tests/conftest.py b/tests/conftest.py index 289e7831..dc274392 100644 --- a/tests/conftest.py +++ 
b/tests/conftest.py
@@ -1,6 +1,4 @@
-import copy
 import os
-from functools import lru_cache
 from pathlib import Path
 
 import pytest
@@ -53,18 +51,6 @@ def error_pdf():
     return path.read_bytes()
 
 
-@lru_cache(maxsize=1)
-def make_pdfdoc(pdf):
-    from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor
-
-    return PdfMinerExtractor(render_pages=True)(pdf)
-
-
-@fixture()
-def pdfdoc(pdf):
-    return copy.deepcopy(make_pdfdoc(pdf))
-
-
 @fixture(scope="session")
 def dummy_dataset(tmpdir_factory, pdf):
     tmp_path = tmpdir_factory.mktemp("datasets")
diff --git a/tests/core/test_data.py b/tests/core/test_data.py
new file mode 100644
index 00000000..a695f66d
--- /dev/null
+++ b/tests/core/test_data.py
@@ -0,0 +1,160 @@
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+import edspdf
+import edspdf.accelerators.multiprocessing
+from edspdf import PDFDoc
+from edspdf.data.converters import CONTENT, FILENAME
+from edspdf.utils.collections import flatten
+
+
+def box_converter(x):
+    return [
+        {
+            "id": x.id,
+            "page_num": b.page_num,
+            "x0": b.x0,
+            "x1": b.x1,
+            "y0": b.y0,
+            "y1": b.y1,
+        }
+        for b in x.content_boxes
+    ]
+
+
+def full_file_converter(x):
+    return {
+        FILENAME: x.id,
+        CONTENT: x.content,
+        "annotations": [
+            {
+                "page_num": b.page_num,
+                "x0": b.x0,
+                "x1": b.x1,
+                "y0": b.y0,
+                "y1": b.y1,
+            }
+            for b in x.content_boxes
+        ],
+    }
+
+
+@pytest.mark.parametrize("write_mode", ["parquet", "pandas", "iterable", "files"])
+@pytest.mark.parametrize("num_cpu_workers", [1, 2])
+@pytest.mark.parametrize("write_in_worker", [False, True])
+def test_write_data(
+    frozen_pipeline,
+    tmp_path,
+    change_test_dir,
+    write_mode,
+    num_cpu_workers,
+    write_in_worker,
+):
+    docs = edspdf.data.read_files("file://" + os.path.abspath("../resources"))
+    docs = docs.map_pipeline(frozen_pipeline)
+    docs = docs.set_processing(
+        num_cpu_workers=num_cpu_workers,
+        gpu_pipe_names=[],
+        batch_by="content_boxes",
+        chunk_size=3,
+        sort_chunks=True,
+    )
+    if write_mode == "parquet":
+        docs.write_parquet(
+            "file://" + str(tmp_path / "parquet" / "test.parquet"),
+            converter=box_converter,
+            write_in_worker=write_in_worker,
+        )
+        df = pd.read_parquet("file://" + str(tmp_path / "parquet" / "test.parquet"))
+    elif write_mode == "pandas":
+        if write_in_worker:
+            pytest.skip()
+        df = docs.to_pandas(converter=box_converter)
+    elif write_mode == "iterable":
+        if write_in_worker:
+            pytest.skip()
+        df = pd.DataFrame(flatten(docs.to_iterable(converter=box_converter)))
+    else:
+        if write_in_worker:
+            pytest.skip()
+        docs.write_files(
+            tmp_path / "files",
+            converter=full_file_converter,
+        )
+        records = []
+        for f in (tmp_path / "files").rglob("*.json"):
+            records.extend(json.loads(f.read_text())["annotations"])
+        df = pd.DataFrame(records)
+    assert len(df) == 91
+
+
+@pytest.fixture(scope="module")
+def parquet_file(tmp_path_factory, request):
+    os.chdir(request.fspath.dirname)
+    tmp_path = tmp_path_factory.mktemp("test_input_parquet")
+    path = tmp_path / "input_test.pq"
+    docs = edspdf.data.read_files("file://" + os.path.abspath("../resources"))
+    docs.write_parquet(
+        path,
+        converter=lambda x: {
+            "content": x["content"],
+            "id": x["id"],
+        },
+    )
+    os.chdir(request.config.invocation_dir)
+    return path
+
+
+@pytest.mark.parametrize("read_mode", ["parquet", "pandas", "iterable", "files"])
+@pytest.mark.parametrize("num_cpu_workers", [1, 2])
+@pytest.mark.parametrize("read_in_worker", [False, True])
+def test_read_data(
+    frozen_pipeline,
+    tmp_path,
+    parquet_file,
+    change_test_dir,
+    read_mode,
+    num_cpu_workers,
+    read_in_worker,
+):
+    if read_mode == "files":
+        docs = edspdf.data.read_files(
+            "file://" + os.path.abspath("../resources"),
+            converter=lambda x: PDFDoc(id=x["id"], content=x["content"]),
+            # read_in_worker=True,
+        )
+        if read_in_worker:
+            pytest.skip()
+    elif read_mode == "parquet":
+        docs = edspdf.data.read_parquet(
+            parquet_file,
+            converter=lambda x: x["content"],
+            read_in_worker=True,
+        )
+    elif read_mode == "pandas":
+        if read_in_worker:
+            pytest.skip()
+        docs = edspdf.data.from_pandas(
+            pd.read_parquet(parquet_file),
+            converter=lambda x: x["content"],
+        )
+    else:
+        if read_in_worker:
+            pytest.skip()
+        docs = edspdf.data.from_iterable(
+            f.read_bytes() for f in Path("../resources").rglob("*.pdf")
+        )
+    docs = docs.map_pipeline(frozen_pipeline)
+    docs = docs.set_processing(
+        num_cpu_workers=num_cpu_workers,
+        show_progress=True,
+        batch_by="content_boxes",
+        chunk_size=3,
+        sort_chunks=True,
+    )
+    df = docs.to_pandas(converter=box_converter)
+    assert len(df) == 91
diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py
index 554ef205..637a76a4 100644
--- a/tests/core/test_pipeline.py
+++ b/tests/core/test_pipeline.py
@@ -1,6 +1,7 @@
 import copy
 from itertools import chain
 from pathlib import Path
+from time import sleep
 
 import datasets
 import pytest
@@ -214,7 +215,7 @@ def score(golds, preds):
     )
 
     assert type(pipeline(pdf)) == PDFDoc
-    assert len(list(pipeline.pipe([pdf] * 4))) == 4
+    assert len(list(pipeline.pipe([pdf] * 4).set_processing(show_progress=True))) == 4
 
     data = list(make_segmentation_adapter(dummy_dataset)(pipeline))
     with pipeline.select_pipes(enable=["classifier"]):
@@ -223,11 +224,13 @@ def score(golds, preds):
 
 
 def test_cache(pipeline: Pipeline, dummy_dataset: Path, pdf: bytes):
+    from edspdf.trainable_pipe import _caches
+
     pipeline(pdf)
 
     with pipeline.cache():
         pipeline(pdf)
-        assert len(pipeline._cache) > 0
+        assert len(_caches["default"]) > 0
 
     assert pipeline._cache is None
 
@@ -249,11 +252,11 @@ def test_different_names(pipeline: Pipeline):
     extractor = PdfMinerExtractor(pipeline=pipeline, name="custom_name")
 
-    with pytest.raises(ValueError) as exc_info:
+    with pytest.warns() as record:
         pipeline.add_pipe(extractor, name="extractor")
 
     assert "The provided name does not match the name of the component." in str(
-        exc_info.value
+        record[0].message
     )
@@ -335,19 +338,21 @@ def test_multiprocessing_accelerator(frozen_pipeline, pdf, letter_pdf):
 
 
 def error_pipe(doc: PDFDoc):
+    sleep(0.1)
     if doc.id == "pdf-3":
         raise ValueError("error")
 
     return doc
 
 
-def test_multiprocessing_gpu_stub(frozen_pipeline, pdf, letter_pdf):
+def test_deprecated_multiprocessing_gpu_stub(frozen_pipeline, pdf, letter_pdf):
     edspdf.accelerators.multiprocessing.MAX_NUM_PROCESSES = 2
-    accelerator = edspdf.accelerators.multiprocessing.MultiprocessingAccelerator(
-        batch_size=2,
-        num_gpu_workers=1,
-        num_cpu_workers=1,
-        gpu_worker_devices=["cpu"],
-    )
+    accelerator = {
+        "@accelerator": "multiprocessing",
+        "batch_size": 2,
+        "num_gpu_workers": 1,
+        "num_cpu_workers": 1,
+        "gpu_worker_devices": ["cpu"],
+    }
     list(
         frozen_pipeline.pipe(
             chain.from_iterable(
@@ -364,6 +369,29 @@
     )
 
 
+def test_multiprocessing_gpu_stub(frozen_pipeline, pdf, letter_pdf):
+    edspdf.accelerators.multiprocessing.MAX_NUM_PROCESSES = 2
+    iterator = chain.from_iterable(
+        [
+            {"content": pdf},
+            {"content": letter_pdf},
+        ]
+        for i in range(5)
+    )
+    docs = edspdf.data.from_iterable(
+        iterator, converter=lambda x: PDFDoc(content=x["content"])
+    )
+    docs = docs.map_pipeline(frozen_pipeline)
+    docs = docs.set_processing(
+        batch_size=2,
+        num_gpu_workers=1,
+        num_cpu_workers=1,
+        gpu_worker_devices=["cpu"],
+        batch_by="content_boxes",
+    )
+    docs = list(docs.to_iterable(converter=lambda x: {"text": x.aggregated_texts}))
+
+
 def test_multiprocessing_rb_error(pipeline, pdf, letter_pdf):
     edspdf.accelerators.multiprocessing.MAX_NUM_PROCESSES = 2
     pipeline.add_pipe(error_pipe, name="error", after="extractor")
@@ -375,7 +403,7 @@
                     {"content": pdf, "id": f"pdf-{i}"},
                     {"content": letter_pdf, "id": f"letter-{i}"},
                 ]
-                for i in range(5)
+                for i in range(200)
             ),
             accelerator="multiprocessing",
             batch_size=2,
@@ -391,13 +419,14 @@ def __init__(self, *args, **kwargs):
     def preprocess(self, doc):
         return {"num_boxes": len(doc.content_boxes), "doc_id": doc.id}
 
-    def collate(self, batch, device):
+    def collate(self, batch):
         return {
-            "num_boxes": torch.tensor(batch["num_boxes"], device=device),
+            "num_boxes": torch.tensor(batch["num_boxes"]),
             "doc_id": batch["doc_id"],
         }
 
     def forward(self, batch):
+        sleep(0.1)
         if "pdf-1" in batch["doc_id"]:
             raise RuntimeError("Deep learning error")
         return {}
@@ -426,10 +455,14 @@ def test_multiprocessing_ml_error(pipeline, pdf, letter_pdf):
                     {"content": pdf, "id": f"pdf-{i}"},
                     {"content": letter_pdf, "id": f"letter-{i}"},
                 ]
-                for i in range(5)
+                for i in range(200)
             ),
             accelerator=accelerator,
             to_doc={"content_field": "content", "id_field": "id"},
         )
     )
     assert "Deep learning error" in str(e.value)
+
+
+def test_apply_on_empty_pdf(error_pdf, frozen_pipeline):
+    assert len(frozen_pipeline(error_pdf).content_boxes) == 0
diff --git a/tests/pipes/aggregators/test_simple.py b/tests/pipes/aggregators/test_simple.py
index f6f0eb12..cda90f65 100644
--- a/tests/pipes/aggregators/test_simple.py
+++ b/tests/pipes/aggregators/test_simple.py
@@ -63,7 +63,13 @@ def test_no_style():
 
 def test_styled_pdfminer_aggregation(styles_pdf):
     extractor = PdfMinerExtractor(extract_style=True)
-    aggregator = SimpleAggregator()
+    aggregator = SimpleAggregator(
+        sort=True,
+        label_map={
+            "header": ["header"],
+            "body": "body",
+        },
+    )
 
     doc = extractor(styles_pdf)
     for b, label in zip(doc.text_boxes, cycle(["header", "body"])):
diff --git a/tests/pipes/embeddings/test_custom.py b/tests/pipes/embeddings/test_custom.py
index 31f5a6f3..b0571177 100644
--- a/tests/pipes/embeddings/test_custom.py
+++ b/tests/pipes/embeddings/test_custom.py
@@ -3,9 +3,10 @@
 from edspdf.pipes.embeddings.embedding_combiner import EmbeddingCombiner
 from edspdf.pipes.embeddings.simple_text_embedding import SimpleTextEmbedding
 from edspdf.pipes.embeddings.sub_box_cnn_pooler import SubBoxCNNPooler
+from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor
 
 
-def test_custom_embedding(pdfdoc, tmp_path):
+def test_custom_embedding(pdf, error_pdf, tmp_path):
     embedding = BoxTransformer(
         num_heads=4,
         dropout_p=0.1,
@@ -35,8 +36,14 @@ def test_custom_embedding(pdfdoc, tmp_path):
         ),
     )
     str(embedding)
+
+    extractor = PdfMinerExtractor(render_pages=True)
+    pdfdoc = extractor(pdf)
     pdfdoc.text_boxes[0].text = "Very long word of 150 letters : " + "x" * 150
     embedding.post_init([pdfdoc], set())
     embedding(pdfdoc)
     embedding.save_extra_data(tmp_path, set())
     embedding.load_extra_data(tmp_path, set())
+
+    # Test empty document
+    embedding(extractor(error_pdf))
diff --git a/tests/pipes/embeddings/test_huggingface.py b/tests/pipes/embeddings/test_huggingface.py
index e82f386d..3e619b4e 100644
--- a/tests/pipes/embeddings/test_huggingface.py
+++ b/tests/pipes/embeddings/test_huggingface.py
@@ -1,7 +1,8 @@
 from edspdf.pipes.embeddings.huggingface_embedding import HuggingfaceEmbedding
+from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor
 
 
-def test_huggingface_embedding(pdfdoc):
+def test_huggingface_embedding(pdf, error_pdf):
     embedding = HuggingfaceEmbedding(
         pipeline=None,
         name="huggingface",
@@ -15,4 +16,7 @@ def test_huggingface_embedding(pdfdoc):
         "height": embedding.hf_model.config.input_size,
         "width": embedding.hf_model.config.input_size,
     }
-    embedding(pdfdoc)
+
+    extractor = PdfMinerExtractor(render_pages=True)
+    embedding(extractor(pdf))
+    embedding(extractor(error_pdf))
diff --git a/tests/utils/test_package.py b/tests/utils/test_package.py
index 28168b12..1743db9f 100644
--- a/tests/utils/test_package.py
+++ b/tests/utils/test_package.py
@@ -5,6 +5,7 @@
 import pytest
 import torch
 
+import edspdf
 from edspdf.utils.package import package
 
 
@@ -82,7 +83,6 @@ def test_package_with_files(frozen_pipeline, tmp_path, package_name):
             name=package_name,
             pipeline=tmp_path / "model",
             root_dir=tmp_path,
-            check_dependencies=True,
             version="0.1.0",
             distributions=None,
             metadata={
@@ -133,16 +133,23 @@ def test_package_with_files(frozen_pipeline, tmp_path, package_name):
 
 import edspdf
 from pathlib import Path
+from typing import Optional, Dict, Any
 
 __version__ = '0.1.0'
 
-def load(device: "torch.device" = "cpu") -> edspdf.Pipeline:
+def load(
+    overrides: Optional[Dict[str, Any]] = None,
+    device: "torch.device" = "cpu"
+) -> edspdf.Pipeline:
     artifacts_path = Path(__file__).parent / "artifacts"
-    model = edspdf.load(artifacts_path, device=device)
+    model = edspdf.load(artifacts_path, overrides=overrides, device=device)
     return model
 """
     )
 
+    module.load()
+    edspdf.load(module_name)
+
 
 @pytest.fixture(scope="session", autouse=True)
 def clean_after():