From 400dec0842d9b30e81c394e31951c16cdff7e930 Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Sat, 16 Dec 2023 19:36:33 +0100 Subject: [PATCH] hooks: add hook for Hugging Face datasets Add hook for `datasets` to collect its source .py files for torchscript/JIT. Add a basic dataset loading test that demonstrates the need for that. --- news/676.new.12.rst | 2 ++ .../hooks/stdhooks/hook-datasets.py | 14 ++++++++++++++ .../tests/test_deep_learning.py | 19 +++++++++++++++++++ 3 files changed, 35 insertions(+) create mode 100644 news/676.new.12.rst create mode 100644 src/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-datasets.py diff --git a/news/676.new.12.rst b/news/676.new.12.rst new file mode 100644 index 00000000..aad40d42 --- /dev/null +++ b/news/676.new.12.rst @@ -0,0 +1,2 @@ +Add hook for Hugging Face ``datasets`` to collect its source .py files for +TorchScript/JIT. diff --git a/src/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-datasets.py b/src/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-datasets.py new file mode 100644 index 00000000..6f47b491 --- /dev/null +++ b/src/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-datasets.py @@ -0,0 +1,14 @@ +# ------------------------------------------------------------------ +# Copyright (c) 2023 PyInstaller Development Team. +# +# This file is distributed under the terms of the GNU General Public +# License (version 2.0 or later). +# +# The full license is available in LICENSE.GPL.txt, distributed with +# this software. +# +# SPDX-License-Identifier: GPL-2.0-or-later +# ------------------------------------------------------------------ + +# Collect source .py files for JIT/torchscript. Requires PyInstaller >= 5.3, no-op in older versions. 
+module_collection_mode = 'pyz+py'
diff --git a/src/_pyinstaller_hooks_contrib/tests/test_deep_learning.py b/src/_pyinstaller_hooks_contrib/tests/test_deep_learning.py
index 9094ba64..45822aa2 100644
--- a/src/_pyinstaller_hooks_contrib/tests/test_deep_learning.py
+++ b/src/_pyinstaller_hooks_contrib/tests/test_deep_learning.py
@@ -305,3 +305,22 @@ def test_detectron2(pyi_builder):
         #trainer = DefaultTrainer(cfg)
         #print(trainer)
     """)
+
+
+# Hugging Face datasets: list a few Hub datasets, then download squad dataset (76 MB train, 10 MB validation)
+@importorskip('datasets')
+@onedir_only
+def test_datasets_download_squad(pyi_builder):
+    pyi_builder.test_source("""
+        from datasets import load_dataset
+        from huggingface_hub import list_datasets
+
+        # Print a bounded sample of available datasets; unbounded list_datasets() pages through the whole Hub.
+        available_datasets = [dataset.id for dataset in list_datasets(limit=10)]
+        print("Available datasets:", len(available_datasets))
+
+        # Load a dataset and print the first example in the training set
+        print("Loading squad dataset...")
+        squad_dataset = load_dataset('squad')
+        print("First sample:", squad_dataset['train'][0])
+    """)