hooks: add hook for Hugging Face datasets

Add hook for `datasets` to collect its source .py files for TorchScript/JIT. Add a basic dataset loading test that demonstrates the need for the hook.
eggertm · Dec 23, 2023 · 400dec0 · 400dec0
1 parent c7aeece
commit 400dec0
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 0 deletions.
diff --git a/news/676.new.12.rst b/news/676.new.12.rst
@@ -0,0 +1,2 @@
+Add hook for Hugging Face ``datasets`` to collect its source .py files for
+TorchScript/JIT.
diff --git a/src/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-datasets.py b/src/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-datasets.py
@@ -0,0 +1,14 @@
+# ------------------------------------------------------------------
+# Copyright (c) 2023 PyInstaller Development Team.
+#
+# This file is distributed under the terms of the GNU General Public
+# License (version 2.0 or later).
+#
+# The full license is available in LICENSE.GPL.txt, distributed with
+# this software.
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+# ------------------------------------------------------------------
+
+# Collect source .py files alongside the PYZ for TorchScript/JIT. Requires PyInstaller >= 5.3; no-op in older versions.
+module_collection_mode = 'pyz+py'
diff --git a/src/_pyinstaller_hooks_contrib/tests/test_deep_learning.py b/src/_pyinstaller_hooks_contrib/tests/test_deep_learning.py
@@ -305,3 +305,22 @@ def test_detectron2(pyi_builder):
         #trainer = DefaultTrainer(cfg)
         #print(trainer)
     """)
+
+
+# Hugging Face datasets: list Hub datasets, then download squad (76 MB train, 10 MB validation) and print one sample.
+@importorskip('datasets')
+@onedir_only
+def test_datasets_download_squad(pyi_builder):
+    pyi_builder.test_source("""
+        from datasets import load_dataset
+        from huggingface_hub import list_datasets
+
+        # Print all the available datasets
+        available_datasets = [dataset.id for dataset in list_datasets()]
+        print("Available datasets:", len(available_datasets))
+
+        # Load a dataset and print the first example in the training set
+        print("Loading squad dataset...")
+        squad_dataset = load_dataset('squad')
+        print("First sample:", squad_dataset['train'][0])
+    """)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Add hook for Hugging Face ``datasets`` to collect its source .py files for
		TorchScript/JIT.