Skip to content

Commit

Permalink
hooks: add hook for Hugging Face datasets
Browse files Browse the repository at this point in the history
Add hook for Hugging Face `datasets` to collect its source .py files for
TorchScript/JIT.

Add a basic dataset-loading test that demonstrates the need for collecting these source files.
  • Loading branch information
rokm committed Dec 23, 2023
1 parent c7aeece commit 400dec0
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 0 deletions.
2 changes: 2 additions & 0 deletions news/676.new.12.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Add hook for Hugging Face ``datasets`` to collect its source .py files for
TorchScript/JIT.
14 changes: 14 additions & 0 deletions src/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# ------------------------------------------------------------------
# Copyright (c) 2023 PyInstaller Development Team.
#
# This file is distributed under the terms of the GNU General Public
# License (version 2.0 or later).
#
# The full license is available in LICENSE.GPL.txt, distributed with
# this software.
#
# SPDX-License-Identifier: GPL-2.0-or-later
# ------------------------------------------------------------------

# Hugging Face `datasets`: collect the package's source .py files for JIT/torchscript.
# The 'pyz+py' mode name suggests that the sources are collected in addition to the modules stored in the
# PYZ archive - NOTE(review): confirm against the PyInstaller documentation of `module_collection_mode`.
# Requires PyInstaller >= 5.3, no-op in older versions.
module_collection_mode = 'pyz+py'
19 changes: 19 additions & 0 deletions src/_pyinstaller_hooks_contrib/tests/test_deep_learning.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,3 +305,22 @@ def test_detectron2(pyi_builder):
#trainer = DefaultTrainer(cfg)
#print(trainer)
""")


# Hugging Face datasets: Download squad dataset (76 MB train, 10 MB validation)
@importorskip('datasets')
@onedir_only
def test_datasets_download_squad(pyi_builder):
    """
    Run a basic Hugging Face ``datasets`` loading program through ``pyi_builder.test_source``.

    The embedded program lists the datasets available via ``huggingface_hub.list_datasets``, prints how
    many there are, then loads the ``squad`` dataset with ``datasets.load_dataset`` and prints the first
    sample of its training split. Loading the dataset downloads it (see the sizes noted above the test),
    so network access is needed.
    """
    # NOTE(review): the embedded script is indented to match the call site, which presumes that
    # `pyi_builder.test_source` dedents its argument before writing the script - confirm against the fixture.
    pyi_builder.test_source("""
        from datasets import load_dataset
        from huggingface_hub import list_datasets
        # Print all the available datasets
        available_datasets = [dataset.id for dataset in list_datasets()]
        print("Available datasets:", len(available_datasets))
        # Load a dataset and print the first example in the training set
        print("Loading squad dataset...")
        squad_dataset = load_dataset('squad')
        print("First sample:", squad_dataset['train'][0])
    """)

0 comments on commit 400dec0

Please sign in to comment.