From 12598ea9b85fd10f510f808ce8384b2169566fd8 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Tue, 20 Aug 2024 12:05:57 -0700 Subject: [PATCH 01/11] Add docs for using the family file driver with PyNWB (#1949) * Fix #1948 Add docs for using the family file driver with PyNWB --------- Co-authored-by: Steph Prince <40640337+stephprince@users.noreply.github.com> Co-authored-by: Ben Dichter Co-authored-by: Steph Prince <40640337+stephprince@users.noreply.github.com> --- CHANGELOG.md | 1 + .../advanced_io/plot_iterative_write.py | 2 + .../{linking_data.py => plot_linking_data.py} | 151 ++++++++++++++++-- 3 files changed, 140 insertions(+), 14 deletions(-) rename docs/gallery/advanced_io/{linking_data.py => plot_linking_data.py} (60%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d3dae88c..82370cff4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Documentation and tutorial enhancements - Added pre-release pull request instructions to release process documentation @stephprince [#1928](https://github.com/NeurodataWithoutBorders/pynwb/pull/1928) +- Added section on how to use the `family` driver in `h5py` for splitting data across multiple files @oruebel [#1949](https://github.com/NeurodataWithoutBorders/pynwb/pull/1949) ### Bug fixes - Fixed `can_read` method to return False if no nwbfile version can be found @stephprince [#1934](https://github.com/NeurodataWithoutBorders/pynwb/pull/1934) diff --git a/docs/gallery/advanced_io/plot_iterative_write.py b/docs/gallery/advanced_io/plot_iterative_write.py index 958981a0b..bb629e14d 100644 --- a/docs/gallery/advanced_io/plot_iterative_write.py +++ b/docs/gallery/advanced_io/plot_iterative_write.py @@ -1,4 +1,6 @@ """ +.. _iterative_write: + Iterative Data Write ==================== diff --git a/docs/gallery/advanced_io/linking_data.py b/docs/gallery/advanced_io/plot_linking_data.py similarity index 60% rename from docs/gallery/advanced_io/linking_data.py rename to docs/gallery/advanced_io/plot_linking_data.py index 2f79d1488..88ba7e10f 100644 --- a/docs/gallery/advanced_io/linking_data.py +++ b/docs/gallery/advanced_io/plot_linking_data.py @@ -13,7 +13,7 @@ HDF5 files with NWB data files via external links. To make things more concrete, let's look at the following use case. We want to simultaneously record multiple data streams during data acquisition. Using the concept of external links allows us to save each data stream to an external HDF5 files during data acquisition and to -afterwards link the data into a single NWB file. In this case, each recording becomes represented by a +afterward link the data into a single NWB file. In this case, each recording becomes represented by a separate file-system object that can be set as read-only once the experiment is done. In the following we are using :py:meth:`~pynwb.base.TimeSeries` as an example, but the same approach works for other NWBContainers as well. @@ -42,7 +42,7 @@ Creating test data ---------------------------- +^^^^^^^^^^^^^^^^^^ In the following we are creating two :py:meth:`~pynwb.base.TimeSeries` each written to a separate file. We then show how we can integrate these files into a single NWBFile. 
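Note: the hunks below only show changed lines, so the surrounding test-data setup of the renamed tutorial is not visible in this patch. As a rough sketch of what that setup amounts to (names such as ``data_ts`` and the session description are illustrative placeholders, not the verbatim tutorial code), the same array is written as a TimeSeries into two standalone NWB files that the later linking steps reuse:

from datetime import datetime
from uuid import uuid4

import numpy as np
from dateutil.tz import tzlocal

from pynwb import NWBFile, NWBHDF5IO, TimeSeries

start_time = datetime(2017, 4, 3, 11, tzinfo=tzlocal())
data = np.arange(1000).reshape((100, 10))
timestamps = np.arange(100, dtype=float)

for filename in ("external1_example.nwb", "external2_example.nwb"):
    nwbfile = NWBFile(
        session_description="demonstrate external files",
        identifier=str(uuid4()),
        session_start_time=start_time,
    )
    # each file holds its own copy of the TimeSeries; the linking examples below point at these datasets
    nwbfile.add_acquisition(
        TimeSeries(name="data_ts", data=data, unit="n.a.", timestamps=timestamps)
    )
    with NWBHDF5IO(filename, mode="w") as io:
        io.write(nwbfile)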
@@ -61,7 +61,7 @@ # Create the base data start_time = datetime(2017, 4, 3, 11, tzinfo=tzlocal()) data = np.arange(1000).reshape((100, 10)) -timestamps = np.arange(100) +timestamps = np.arange(100, dtype=float) filename1 = "external1_example.nwb" filename2 = "external2_example.nwb" filename3 = "external_linkcontainer_example.nwb" @@ -105,12 +105,12 @@ ##################### # Linking to select datasets -# -------------------------- +# ^^^^^^^^^^^^^^^^^^^^^^^^^^ # #################### # Step 1: Create the new NWBFile -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Create the first file nwbfile4 = NWBFile( @@ -122,7 +122,7 @@ #################### # Step 2: Get the dataset you want to link to -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Now let's open our test files and retrieve our timeseries. # @@ -134,7 +134,7 @@ #################### # Step 3: Create the object you want to link to the data -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # To link to the dataset we can simply assign the data object (here `` timeseries_1.data``) to a new ``TimeSeries`` @@ -167,7 +167,7 @@ #################### # Step 4: Write the data -# ^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~ # with NWBHDF5IO(filename4, "w") as io4: # Use link_data=True to specify default behavior to link rather than copy data @@ -185,7 +185,7 @@ #################### # Linking to whole Containers -# --------------------------- +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # # Appending to files and linking is made possible by passing around the same # :py:class:`~hdmf.build.manager.BuildManager`. You can get a manager to pass around @@ -203,7 +203,7 @@ #################### # Step 1: Get the container object you want to link to -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Now let's open our test files and retrieve our timeseries. # @@ -219,7 +219,7 @@ #################### # Step 2: Add the container to another NWBFile -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # To integrate both :py:meth:`~pynwb.base.TimeSeries` into a single file we simply create a new # :py:meth:`~pynwb.file.NWBFile` and add our existing :py:meth:`~pynwb.base.TimeSeries` to it. PyNWB's # :py:class:`~pynwb.NWBHDF5IO` backend then automatically detects that the TimeSeries have already @@ -247,7 +247,7 @@ # ------------------------------ # # Using the :py:func:`~pynwb.file.NWBFile.copy` method allows us to easily create a shallow copy -# of a whole NWB:N file with links to all data in the original file. For example, we may want to +# of a whole NWB file with links to all data in the original file. For example, we may want to # store processed data in a new file separate from the raw data, while still being able to access # the raw data. See the :ref:`scratch` tutorial for a detailed example. # @@ -259,5 +259,128 @@ # External links are convenient but to share data we may want to hand a single file with all the # data to our collaborator rather than having to collect all relevant files. To do this, # :py:class:`~hdmf.backends.hdf5.h5tools.HDF5IO` (and in turn :py:class:`~pynwb.NWBHDF5IO`) -# provide the convenience function :py:meth:`~hdmf.backends.hdf5.h5tools.HDF5IO.copy_file`, -# which copies an HDF5 file and resolves all external links. 
+# provide the convenience function :py:meth:`~hdmf.backends.hdf5.h5tools.HDF5IO.export`,
+# which can copy the file and resolve all external links.
+
+
+####################
+# Automatically splitting large data across multiple HDF5 files
+# -------------------------------------------------------------------
+#
+# For extremely large datasets it can be useful to split data across multiple files, e.g., in cases where
+# the file system does not allow for large files. While we can
+# achieve this by writing different components (e.g., :py:meth:`~pynwb.base.TimeSeries`) to different files as described above,
+# this option does not allow splitting data from single datasets. An alternative option is to use the
+# ``family`` driver in ``h5py`` to automatically split the NWB file into a collection of many HDF5 files.
+# The ``family`` driver stores the file on disk as a series of fixed-length chunks (each in its own file).
+# In practice, to write very large arrays, we can combine this approach with :ref:`iterative_write` to
+# avoid having to load all data into memory. In the example shown here we use a manual approach to
+# iterative write by using :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO` to create an empty dataset and
+# then filling in the data afterward.
+
+####################
+# Step 1: Create the NWBFile as usual
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+from pynwb import NWBFile
+from pynwb.base import TimeSeries
+from datetime import datetime
+from hdmf.backends.hdf5 import H5DataIO
+import numpy as np
+
+# Create an NWBFile object
+nwbfile = NWBFile(session_description='example file family',
+                  identifier=str(uuid4()),
+                  session_start_time=datetime.now().astimezone())
+
+# Create the data as an empty dataset so that we can write to it later
+data = H5DataIO(maxshape=(None, 10),  # make the first dimension expandable
+                dtype=np.float32,  # create the data as float32
+                shape=(0, 10),  # initial data shape to initialize as empty dataset
+                chunks=(1000, 10)
+                )
+
+# Create a TimeSeries object
+time_series = TimeSeries(name='example_timeseries',
+                         data=data,
+                         starting_time=0.0,
+                         rate=1.0,
+                         unit='mV')
+
+# Add the TimeSeries to the NWBFile
+nwbfile.add_acquisition(time_series)
+
+####################
+# Step 2: Open the new file with the `family` driver and write
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# Here we need to open the file with `h5py` first to set up the driver, and then we can use
+# that file with :py:class:`pynwb.NWBHDF5IO`. This is required because :py:class:`pynwb.NWBHDF5IO`
+# currently does not support passing the `memb_size` option required by the `family` driver.
+
+import h5py
+from pynwb import NWBHDF5IO
+
+# Define the size of the individual files, determining the number of files to create
+# chunk_size = 1 * 1024**3  # 1GB per file
+chunk_size = 1024**2  # 1MB just for testing
+
+# filename pattern
+filename_pattern = 'family_nwb_file_%d.nwb'
+
+# Create the HDF5 file using the family driver
+with h5py.File(name=filename_pattern, mode='w', driver='family', memb_size=chunk_size) as f:
+
+    # Use NWBHDF5IO to write the NWBFile to the HDF5 file
+    with NWBHDF5IO(file=f, mode='w') as io:
+        io.write(nwbfile)
+
+    # Write new data iteratively to the file
+    for i in range(10):
+        start_index = i * 1000
+        stop_index = start_index + 1000
+        data.dataset.resize((stop_index, 10))  # Resize the dataset
+        data.dataset[start_index:stop_index, :] = i  # Set the additional values
+
+####################
+# .. note::
+#
+#    Alternatively, we could have also used the :ref:`iterative_write` features to write the data
+#    iteratively, directly as part of the `io.write` call, instead of manually afterward.
+
+####################
+# Step 3: Read a file written with the family driver
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+
+
+# Open the HDF5 file using the family driver
+with h5py.File(name=filename_pattern, mode='r', driver='family', memb_size=chunk_size) as f:
+    # Use NWBHDF5IO to read the NWBFile from the HDF5 file
+    with NWBHDF5IO(file=f, manager=None, mode='r') as io:
+        nwbfile = io.read()
+        print(nwbfile)
+
+
+####################
+# .. note::
+#
+#    The filename you provide when using the ``family`` driver must contain a printf-style integer format code
+#    (e.g., `%d`), which will be replaced by the file sequence number.
+#
+# .. note::
+#
+#    The ``memb_size`` parameter must be set on both write and read. As such, reading the file requires
+#    the user to know the ``memb_size`` that was used for writing.
+#
+# .. warning::
+#
+#    The DANDI archive may not support NWB files that are split in this fashion.
+#
+# .. note::
+#
+#    Other file drivers, e.g., ``split`` or ``multi``, could be used in a similar fashion.
+#    However, not all HDF5 drivers are supported by the high-level API of
+#    ``h5py`` and as such may require a more complex setup via the
+#    low-level HDF5 API in ``h5py``.
+#
+

From 2d00afe30b3116c714e53a39522488e97c9160c5 Mon Sep 17 00:00:00 2001
From: Steph Prince <40640337+stephprince@users.noreply.github.com>
Date: Tue, 20 Aug 2024 12:56:52 -0700
Subject: [PATCH 02/11] update ruff linter settings (#1950)

* update linter settings
* Update `pyproject.toml`
* exclude notebooks from ruff linter
* Fix ruff formatting in plot_linking_data

---------

Co-authored-by: Ryan Ly
Co-authored-by: Oliver Ruebel
---
 docs/gallery/advanced_io/plot_linking_data.py | 4 ++--
 pyproject.toml                                | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/gallery/advanced_io/plot_linking_data.py b/docs/gallery/advanced_io/plot_linking_data.py
index 88ba7e10f..00dfe5056 100644
--- a/docs/gallery/advanced_io/plot_linking_data.py
+++ b/docs/gallery/advanced_io/plot_linking_data.py
@@ -268,8 +268,8 @@
 # -------------------------------------------------------------------
 #
 # For extremely large datasets it can be useful to split data across multiple files, e.g., in cases where
-# the file system does not allow for large files. While we can
-# achieve this by writing different components (e.g., :py:meth:`~pynwb.base.TimeSeries`) to different files as described above,
+# the file system does not allow for large files. While we can achieve this by writing different
+# components (e.g., :py:meth:`~pynwb.base.TimeSeries`) to different files as described above,
 # this option does not allow splitting data from single datasets. An alternative option is to use the
 # ``family`` driver in ``h5py`` to automatically split the NWB file into a collection of many HDF5 files.
 # The ``family`` driver stores the file on disk as a series of fixed-length chunks (each in its own file).
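Editor's aside on the note in the tutorial above: the alternative it mentions, using the iterative write machinery instead of resizing the dataset after ``io.write``, is not spelled out in the patch. A hedged sketch of that variant (the generator, block sizes, and file names are illustrative assumptions, not part of the patch) wraps a generator in a ``DataChunkIterator`` so the blocks are streamed into the family-driver file during the write call:

from datetime import datetime
from uuid import uuid4

import h5py
import numpy as np
from hdmf.backends.hdf5 import H5DataIO
from hdmf.data_utils import DataChunkIterator

from pynwb import NWBFile, NWBHDF5IO, TimeSeries


def iter_blocks(n_blocks=10, block_rows=1000, n_cols=10):
    # produce the data in manageable blocks instead of materializing the full array in memory
    for i in range(n_blocks):
        yield np.full((block_rows, n_cols), fill_value=float(i), dtype=np.float32)


wrapped_data = H5DataIO(
    data=DataChunkIterator(data=iter_blocks(), maxshape=(None, 10), dtype=np.dtype("float32")),
    chunks=(1000, 10),
)

nwbfile = NWBFile(
    session_description="family file written iteratively",
    identifier=str(uuid4()),
    session_start_time=datetime.now().astimezone(),
)
nwbfile.add_acquisition(
    TimeSeries(name="example_timeseries", data=wrapped_data, starting_time=0.0, rate=1.0, unit="mV")
)

# as in the tutorial, memb_size must be identical when the file is read back
with h5py.File(name="family_iter_example_%d.nwb", mode="w", driver="family", memb_size=1024**2) as f:
    with NWBHDF5IO(file=f, mode="w") as io:
        io.write(nwbfile)  # blocks are pulled from the iterator and written one at a time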
diff --git a/pyproject.toml b/pyproject.toml index 4873b52e1..befa3bb0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ omit = [ ] [tool.ruff] -select = ["E", "F", "T100", "T201", "T203"] +lint.select = ["E", "F", "T100", "T201", "T203"] exclude = [ ".git", ".tox", @@ -100,12 +100,13 @@ exclude = [ "dist/", "src/nwb-schema", "docs/source/conf.py", + "docs/notebooks/*", "src/pynwb/_due.py", "test.py" # remove when pytest comes along ] line-length = 120 -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "tests/read_dandi/*" = ["T201"] "docs/gallery/*" = ["E402", "T201"] "src/*/__init__.py" = ["F401"] @@ -115,6 +116,6 @@ line-length = 120 # "test_gallery.py" = ["T201"] # Uncomment when test_gallery.py is created -[tool.ruff.mccabe] +[tool.ruff.lint.mccabe] max-complexity = 17 From 609eaac0830c7e0c57d1c2051718d91ef0314a4f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 4 Sep 2024 01:01:43 +0000 Subject: [PATCH 03/11] Bump actions/download-artifact from 3 to 4.1.7 in /.github/workflows (#1955) Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index aa121acb4..ac463134b 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -283,7 +283,7 @@ jobs: python-version: '3.12' - name: Download wheel and source distributions from artifact - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: distributions path: dist From aaa4d65f0cc45621aaea90dbddfa810bd19a973d Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Wed, 4 Sep 2024 13:55:12 -0700 Subject: [PATCH 04/11] Numpy 2.0 (#1956) * Update pyproject.toml * Update requirements-min.txt * Update requirements.txt * Update requirements-min.txt * Update requirements-min.txt * Update requirements.txt * Update pyproject.toml * Update export.rst * Update export.rst * Update export.rst * Update CHANGELOG.md --- CHANGELOG.md | 3 +++ docs/source/export.rst | 10 ++++------ pyproject.toml | 4 ++-- requirements-min.txt | 2 +- requirements.txt | 4 ++-- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 82370cff4..5350d081f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ## PyNWB 2.8.2 (Upcoming) +### Enhancements and minor changes +- Added support for numpy 2.0. @mavaylon1 [#1956](https://github.com/NeurodataWithoutBorders/pynwb/pull/1956) + ### Documentation and tutorial enhancements - Added pre-release pull request instructions to release process documentation @stephprince [#1928](https://github.com/NeurodataWithoutBorders/pynwb/pull/1928) - Added section on how to use the `family` driver in `h5py` for splitting data across multiple files @oruebel [#1949](https://github.com/NeurodataWithoutBorders/pynwb/pull/1949) diff --git a/docs/source/export.rst b/docs/source/export.rst index 490cd346e..218184f9b 100644 --- a/docs/source/export.rst +++ b/docs/source/export.rst @@ -53,14 +53,12 @@ on the :py:class:`~pynwb.file.NWBFile` before exporting. How do I create a copy of an NWB file with different data layouts (e.g., applying compression)? --------------------------------------------------------------------------------------------------------- -Use the `h5repack `_ command line tool from the HDF5 Group. 
-See also this `h5repack tutorial `_. +Use the `h5repack `_ command line tool from the HDF5 Group. How do I create a copy of an NWB file with different controls over how links are treated and whether copies are deep or shallow? --------------------------------------------------------------------------------------------------------------------------------- -Use the `h5copy `_ command line tool from the HDF5 Group. -See also this `h5copy tutorial `_. +Use the `h5copy `_ command line tool from the HDF5 Group. How do I generate new object IDs for a newly exported NWB file? @@ -101,8 +99,8 @@ For example: export_io.export(src_io=read_io, nwbfile=nwbfile, write_args={'link_data': False}) # copy linked datasets # the written file will contain no links to external datasets -You can also the `h5copy `_ command line tool \ -from the HDF5 Group. See also this `h5copy tutorial `_. +You can also the `h5copy `_ command line tool \ +from the HDF5 Group. How do I write a newly instantiated ``NWBFile`` to two different file paths? diff --git a/pyproject.toml b/pyproject.toml index befa3bb0f..3ab85a4ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,8 +34,8 @@ classifiers = [ ] dependencies = [ "h5py>=2.10", - "hdmf>=3.14.0", - "numpy>=1.18, <2.0", # pin below 2.0 until HDMF supports numpy 2.0 + "hdmf>=3.14.3", + "numpy>=1.18", "pandas>=1.1.5", "python-dateutil>=2.7.3", ] diff --git a/requirements-min.txt b/requirements-min.txt index a047d81c7..eef051b25 100644 --- a/requirements-min.txt +++ b/requirements-min.txt @@ -1,6 +1,6 @@ # minimum versions of package dependencies for installing PyNWB h5py==2.10 # support for selection of datasets with list of indices added in 2.10 -hdmf==3.14.0 +hdmf==3.14.3 numpy==1.18 pandas==1.1.5 python-dateutil==2.7.3 diff --git a/requirements.txt b/requirements.txt index 5b3c49ded..27716cf5a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # pinned dependencies to reproduce an entire development environment to use PyNWB h5py==3.11.0 -hdmf==3.14.0 -numpy==1.26.4 +hdmf==3.14.3 +numpy==2.1.1 pandas==2.2.2 python-dateutil==2.9.0.post0 From 71ef8e204573366e82ef37a648acac03659e4131 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Thu, 5 Sep 2024 13:13:22 -0700 Subject: [PATCH 05/11] bump upload-artifact version (#1957) --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index ac463134b..e365d78cf 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -64,7 +64,7 @@ jobs: - name: Upload distribution as a workspace artifact if: ${{ matrix.upload-wheels }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: distributions path: dist From 44ef205770ccbd10e3a2815260f11c911cd63106 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Thu, 5 Sep 2024 16:30:33 -0700 Subject: [PATCH 06/11] Expose cache_spec option in NWBHDF5IO.export (#1959) * Expose cache_spec option in NWBHDF5IO.export * Update changelog --- CHANGELOG.md | 1 + src/pynwb/__init__.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5350d081f..8c465adec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ ### Bug fixes - Fixed `can_read` method to return False if no nwbfile version can be found @stephprince [#1934](https://github.com/NeurodataWithoutBorders/pynwb/pull/1934) - Changed `epoch_tags` to be a NWBFile property 
instead of constructor argument. @stephprince [#1935](https://github.com/NeurodataWithoutBorders/pynwb/pull/1935) +- Exposed option to not cache the spec in `NWBHDF5IO.export`. @rly [#1959](https://github.com/NeurodataWithoutBorders/pynwb/pull/1959) ## PyNWB 2.8.1 (July 3, 2024) diff --git a/src/pynwb/__init__.py b/src/pynwb/__init__.py index 278e48948..727838821 100644 --- a/src/pynwb/__init__.py +++ b/src/pynwb/__init__.py @@ -368,7 +368,9 @@ def read(self, **kwargs): 'default': None}, {'name': 'write_args', 'type': dict, 'doc': 'arguments to pass to :py:meth:`~hdmf.backends.io.HDMFIO.write_builder`', - 'default': None}) + 'default': None}, + {'name': 'cache_spec', 'type': bool, 'doc': 'whether to cache the specification to file', + 'default': True}) def export(self, **kwargs): """ Export an NWB file to a new NWB file using the HDF5 backend. From 1178e0d1022220703183e6ebe73e32c4c58543f8 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Mon, 9 Sep 2024 09:54:52 -0700 Subject: [PATCH 07/11] Fix numpy version for py39 (#1963) --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 27716cf5a..6d7a17623 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # pinned dependencies to reproduce an entire development environment to use PyNWB h5py==3.11.0 hdmf==3.14.3 -numpy==2.1.1 +numpy==2.1.1; python_version > "3.9" # numpy 2.1+ is not compatible with py3.9 +numpy==2.0.2; python_version == "3.9" pandas==2.2.2 python-dateutil==2.9.0.post0 From 61965684ec2d33b15e59d3bef3696a6ad9651108 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Mon, 9 Sep 2024 10:25:39 -0700 Subject: [PATCH 08/11] update cached namespace retrieval in validation tests (#1961) * filter out warnings when getting namespaces in test.py * make get_cached_namespaces function public * replace get_namespaces function * update code block in docstring * update CHANGELOG.md --------- Co-authored-by: Ryan Ly --- CHANGELOG.md | 1 + src/pynwb/validate.py | 26 +++++++++++++++----------- test.py | 13 ++----------- tests/integration/ros3/test_ros3.py | 4 ++-- 4 files changed, 20 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c465adec..62dcfe688 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Enhancements and minor changes - Added support for numpy 2.0. @mavaylon1 [#1956](https://github.com/NeurodataWithoutBorders/pynwb/pull/1956) +- Make `get_cached_namespaces_to_validate` a public function @stephprince [#1961](https://github.com/NeurodataWithoutBorders/pynwb/pull/1961) ### Documentation and tutorial enhancements - Added pre-release pull request instructions to release process documentation @stephprince [#1928](https://github.com/NeurodataWithoutBorders/pynwb/pull/1928) diff --git a/src/pynwb/validate.py b/src/pynwb/validate.py index aecfb2556..880f860a6 100644 --- a/src/pynwb/validate.py +++ b/src/pynwb/validate.py @@ -29,7 +29,7 @@ def _validate_helper(io: HDMFIO, namespace: str = CORE_NAMESPACE) -> list: return validator.validate(builder) -def _get_cached_namespaces_to_validate( +def get_cached_namespaces_to_validate( path: str, driver: Optional[str] = None, aws_region: Optional[str] = None, ) -> Tuple[List[str], BuildManager, Dict[str, str]]: """ @@ -39,14 +39,18 @@ def _get_cached_namespaces_to_validate( ------- The following example illustrates how we can use this function to validate against namespaces cached in a file. 
This is useful, e.g., when a file was created using an extension - >>> from pynwb import validate - >>> from pynwb.validate import _get_cached_namespaces_to_validate - >>> path = "my_nwb_file.nwb" - >>> validate_namespaces, manager, cached_namespaces = _get_cached_namespaces_to_validate(path) - >>> with NWBHDF5IO(path, "r", manager=manager) as reader: - >>> errors = [] - >>> for ns in validate_namespaces: - >>> errors += validate(io=reader, namespace=ns) + + .. code-block:: python + + from pynwb import validate + from pynwb.validate import get_cached_namespaces_to_validate + path = "my_nwb_file.nwb" + validate_namespaces, manager, cached_namespaces = get_cached_namespaces_to_validate(path) + with NWBHDF5IO(path, "r", manager=manager) as reader: + errors = [] + for ns in validate_namespaces: + errors += validate(io=reader, namespace=ns) + :param path: Path for the NWB file :return: Tuple with: - List of strings with the most specific namespace(s) to use for validation. @@ -149,7 +153,7 @@ def validate(**kwargs): io_kwargs = dict(path=path, mode="r", driver=driver) if use_cached_namespaces: - cached_namespaces, manager, namespace_dependencies = _get_cached_namespaces_to_validate( + cached_namespaces, manager, namespace_dependencies = get_cached_namespaces_to_validate( path=path, driver=driver ) io_kwargs.update(manager=manager) @@ -231,7 +235,7 @@ def validate_cli(): if args.list_namespaces: for path in args.paths: - cached_namespaces, _, _ = _get_cached_namespaces_to_validate(path=path) + cached_namespaces, _, _ = get_cached_namespaces_to_validate(path=path) print("\n".join(cached_namespaces)) else: validation_errors, validation_status = validate( diff --git a/test.py b/test.py index 5bddb7c7d..0d9e25990 100644 --- a/test.py +++ b/test.py @@ -153,6 +153,7 @@ def validate_nwbs(): examples_nwbs = glob.glob('*.nwb') import pynwb + from pynwb.validate import get_cached_namespaces_to_validate for nwb in examples_nwbs: try: @@ -171,17 +172,7 @@ def validate_nwbs(): for err in errors: print("Error: %s" % err) - def get_namespaces(nwbfile): - comp = run(["python", "-m", "pynwb.validate", - "--list-namespaces", nwbfile], - stdout=PIPE, stderr=STDOUT, universal_newlines=True, timeout=30) - - if comp.returncode != 0: - return [] - - return comp.stdout.split() - - namespaces = get_namespaces(nwb) + namespaces, _, _ = get_cached_namespaces_to_validate(nwb) if len(namespaces) == 0: FAILURES += 1 diff --git a/tests/integration/ros3/test_ros3.py b/tests/integration/ros3/test_ros3.py index 95a891760..2571e6199 100644 --- a/tests/integration/ros3/test_ros3.py +++ b/tests/integration/ros3/test_ros3.py @@ -1,6 +1,6 @@ from pynwb import NWBHDF5IO from pynwb import validate -from pynwb.validate import _get_cached_namespaces_to_validate +from pynwb.validate import get_cached_namespaces_to_validate from pynwb.testing import TestCase import urllib.request import h5py @@ -85,7 +85,7 @@ def test_dandi_get_cached_namespaces(self): ) } } - found_namespaces, _, found_namespace_dependencies = _get_cached_namespaces_to_validate( + found_namespaces, _, found_namespace_dependencies = get_cached_namespaces_to_validate( path=self.s3_test_path, driver="ros3" ) From b9f9e5a3e9d915acb18521cd760768f323138933 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Mon, 9 Sep 2024 11:12:11 -0700 Subject: [PATCH 09/11] Prepare for release of PyNWB 2.8.2 (#1960) * update changelog * update dependencies * update requirements-doc.txt * revert opt requirements update * update numpy requirement * 
update environment * add family driver file validation, ignore dandi file validation * revert warning filtering in validation * Update CHANGELOG.md * Update environment-ros3.yml --- CHANGELOG.md | 2 +- environment-ros3.yml | 6 +++--- test.py | 39 +++++++++++++++++++++++++++++++-------- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62dcfe688..597636cd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # PyNWB Changelog -## PyNWB 2.8.2 (Upcoming) +## PyNWB 2.8.2 (September 9, 2024) ### Enhancements and minor changes - Added support for numpy 2.0. @mavaylon1 [#1956](https://github.com/NeurodataWithoutBorders/pynwb/pull/1956) diff --git a/environment-ros3.yml b/environment-ros3.yml index 84031808f..081408f19 100644 --- a/environment-ros3.yml +++ b/environment-ros3.yml @@ -6,9 +6,9 @@ channels: dependencies: - python==3.12 - h5py==3.11.0 - - hdmf==3.14.1 - - matplotlib==3.8.0 - - numpy==1.26.4 + - hdmf==3.14.3 + - matplotlib==3.8.4 + - numpy==2.1.1 - pandas==2.2.2 - python-dateutil==2.9.0 - setuptools diff --git a/test.py b/test.py index 0d9e25990..f64fcd75d 100644 --- a/test.py +++ b/test.py @@ -3,6 +3,7 @@ import re import argparse import glob +import h5py import inspect import logging import os.path @@ -152,6 +153,9 @@ def validate_nwbs(): logging.info('running validation tests on NWB files') examples_nwbs = glob.glob('*.nwb') + # exclude files downloaded from dandi, validation of those files is handled by dandisets-health-status checks + examples_nwbs = [x for x in examples_nwbs if not x.startswith('sub-')] + import pynwb from pynwb.validate import get_cached_namespaces_to_validate @@ -162,15 +166,34 @@ def validate_nwbs(): ws = list() with warnings.catch_warnings(record=True) as tmp: logging.info("Validating with pynwb.validate method.") - with pynwb.NWBHDF5IO(nwb, mode='r') as io: - errors = pynwb.validate(io) - TOTAL += 1 + is_family_nwb_file = False + try: + with pynwb.NWBHDF5IO(nwb, mode='r') as io: + errors = pynwb.validate(io) + except OSError as e: + # if the file was created with the family driver, need to use the family driver to open it + if 'family driver should be used' in str(e): + is_family_nwb_file = True + match = re.search(r'(\d+)', nwb) + filename_pattern = nwb[:match.start()] + '%d' + nwb[match.end():] # infer the filename pattern + memb_size = 1024**2 # note: the memb_size must be the same as the one used to create the file + with h5py.File(filename_pattern, mode='r', driver='family', memb_size=memb_size) as f: + with pynwb.NWBHDF5IO(file=f, manager=None, mode='r') as io: + errors = pynwb.validate(io) + else: + raise e + + TOTAL += 1 + + if errors: + FAILURES += 1 + ERRORS += 1 + for err in errors: + print("Error: %s" % err) - if errors: - FAILURES += 1 - ERRORS += 1 - for err in errors: - print("Error: %s" % err) + # if file was created with family driver, skip pynwb.validate CLI because not yet supported + if is_family_nwb_file: + continue namespaces, _, _ = get_cached_namespaces_to_validate(nwb) From 17adccf5ff34ea16db292f838c5b210eebdf64d6 Mon Sep 17 00:00:00 2001 From: Jonny Saunders Date: Tue, 17 Sep 2024 00:12:19 -0700 Subject: [PATCH 10/11] cache __TYPE_MAP and init submodules (#1931) Co-authored-by: Ryan Ly Co-authored-by: Matthew Avaylon --- .gitignore | 3 + CHANGELOG.md | 5 + pyproject.toml | 2 +- src/pynwb/__init__.py | 137 +++++++++++++++++---- tests/back_compat/test_import_structure.py | 5 - 5 files changed, 120 insertions(+), 32 deletions(-) diff --git a/.gitignore b/.gitignore index 
c0a2aca3e..95f08686e 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,6 @@ tests/coverage/htmlcov # Version _version.py + +.core_typemap_version +core_typemap.pkl diff --git a/CHANGELOG.md b/CHANGELOG.md index 597636cd4..e5909f577 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # PyNWB Changelog +## PyNWB 2.8.3 (Upcoming) + +### Performance +- Cache global type map to speed import 3X. @sneakers-the-rat [#1931](https://github.com/NeurodataWithoutBorders/pynwb/pull/1931) + ## PyNWB 2.8.2 (September 9, 2024) ### Enhancements and minor changes diff --git a/pyproject.toml b/pyproject.toml index 3ab85a4ae..f798f2b5a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,7 +98,7 @@ exclude = [ "__pycache__", "build/", "dist/", - "src/nwb-schema", + "src/pynwb/nwb-schema", "docs/source/conf.py", "docs/notebooks/*", "src/pynwb/_due.py", diff --git a/src/pynwb/__init__.py b/src/pynwb/__init__.py index 727838821..1d109abe3 100644 --- a/src/pynwb/__init__.py +++ b/src/pynwb/__init__.py @@ -4,6 +4,8 @@ import os.path from pathlib import Path from copy import deepcopy +import subprocess +import pickle from warnings import warn import h5py @@ -23,6 +25,16 @@ from .spec import NWBDatasetSpec, NWBGroupSpec, NWBNamespace # noqa E402 from .validate import validate # noqa: F401, E402 +try: + # see https://effigies.gitlab.io/posts/python-packaging-2023/ + from ._version import __version__ +except ImportError: # pragma: no cover + # this is a relatively slower method for getting the version string + from importlib.metadata import version # noqa: E402 + + __version__ = version("pynwb") + del version + @docval({'name': 'config_path', 'type': str, 'doc': 'Path to the configuration file.'}, {'name': 'type_map', 'type': TypeMap, 'doc': 'The TypeMap.', 'default': None}, @@ -51,7 +63,7 @@ def unload_type_config(**kwargs): type_map = kwargs['type_map'] or get_type_map() hdmf_unload_type_config(type_map=type_map) -def __get_resources(): +def __get_resources() -> dict: try: from importlib.resources import files except ImportError: @@ -61,27 +73,35 @@ def __get_resources(): __location_of_this_file = files(__name__) __core_ns_file_name = 'nwb.namespace.yaml' __schema_dir = 'nwb-schema/core' + cached_core_typemap = __location_of_this_file / 'core_typemap.pkl' + cached_version_indicator = __location_of_this_file / '.core_typemap_version' ret = dict() ret['namespace_path'] = str(__location_of_this_file / __schema_dir / __core_ns_file_name) + ret['cached_typemap_path'] = str(cached_core_typemap) + ret['cached_version_indicator'] = str(cached_version_indicator) return ret def _get_resources(): # LEGACY: Needed to support legacy implementation. + # TODO: Remove this in PyNWB 3.0. + warn("The function '_get_resources' is deprecated and will be removed in a future release.", DeprecationWarning) return __get_resources() -# a global namespace catalog -global __NS_CATALOG +# a global type map global __TYPE_MAP -__NS_CATALOG = NamespaceCatalog(NWBGroupSpec, NWBDatasetSpec, NWBNamespace) +__ns_catalog = NamespaceCatalog(NWBGroupSpec, NWBDatasetSpec, NWBNamespace) hdmf_typemap = hdmf.common.get_type_map() -__TYPE_MAP = TypeMap(__NS_CATALOG) +__TYPE_MAP = TypeMap(__ns_catalog) __TYPE_MAP.merge(hdmf_typemap, ns_catalog=True) +# load the core namespace, i.e. 
base NWB specification +__resources = __get_resources() + @docval({'name': 'extensions', 'type': (str, TypeMap, list), 'doc': 'a path to a namespace, a TypeMap, or a list consisting of paths to namespaces and TypeMaps', @@ -139,22 +159,95 @@ def load_namespaces(**kwargs): namespace_path = getargs('namespace_path', kwargs) return __TYPE_MAP.load_namespaces(namespace_path) +def available_namespaces(): + """Returns all namespaces registered in the namespace catalog""" + return __TYPE_MAP.namespace_catalog.namespaces -# load the core namespace, i.e. base NWB specification -__resources = __get_resources() -if os.path.exists(__resources['namespace_path']): - load_namespaces(__resources['namespace_path']) -else: - raise RuntimeError( - "'core' is not a registered namespace. If you installed PyNWB locally using a git clone, you need to " - "use the --recurse_submodules flag when cloning. See developer installation instructions here: " - "https://pynwb.readthedocs.io/en/stable/install_developers.html#install-from-git-repository" - ) +def __git_cmd(*args) -> subprocess.CompletedProcess: + """ + Call git with the package as the directory regardless of cwd. + + Since any folder within a git repo works, don't try to ascend to the top, since + if we're *not* actually in a git repo we're only guaranteed to know about + the inner `pynwb` directory. + """ + parent_dir = str(Path(__file__).parent) + result = subprocess.run(["git", "-C", parent_dir, *args], capture_output=True) + return result + + +def __clone_submodules(): + if __git_cmd('rev-parse').returncode == 0: + warn( + 'NWB core schema not found in cloned installation, initializing submodules...', + stacklevel=1) + res = __git_cmd('submodule', 'update', '--init', '--recursive') + if not res.returncode == 0: # pragma: no cover + raise RuntimeError( + 'Exception while initializing submodules, got:\n' + 'stdout:\n' + ('-'*20) + res.stdout + "\nstderr:\n" + ('-'*20) + res.stderr) + else: # pragma: no cover + raise RuntimeError("Package is not installed from a git repository, can't clone submodules") + + +def __load_core_namespace(final:bool=False): + """ + Load the core namespace into __TYPE_MAP, + either by loading a pickled version or creating one anew and pickling it. -def available_namespaces(): - """Returns all namespaces registered in the namespace catalog""" - return __NS_CATALOG.namespaces + We keep a dotfile next to it that tracks what version of pynwb created it, + so that we invalidate it when the code changes. + + Args: + final (bool): This function tries again if the submodules aren't cloned, + but it shouldn't go into an infinite loop. + If final is ``True``, don't recurse. 
+ """ + global __TYPE_MAP + global __resources + + # if we have a version indicator file and it doesn't match the current version, + # scrap the cached typemap + if os.path.exists(__resources['cached_version_indicator']): + with open(__resources['cached_version_indicator'], 'r') as f: + cached_version = f.read().strip() + if cached_version != __version__: + Path(__resources['cached_typemap_path']).unlink(missing_ok=True) + else: + # remove any cached typemap, forcing re-creation + Path(__resources['cached_typemap_path']).unlink(missing_ok=True) + + # load pickled typemap if we have one + if os.path.exists(__resources['cached_typemap_path']): + with open(__resources['cached_typemap_path'], 'rb') as f: + __TYPE_MAP = pickle.load(f) # type: TypeMap + + # otherwise make a new one and cache it + elif os.path.exists(__resources['namespace_path']): + load_namespaces(__resources['namespace_path']) + with open(__resources['cached_typemap_path'], 'wb') as f: + pickle.dump(__TYPE_MAP, f, protocol=pickle.HIGHEST_PROTOCOL) + with open(__resources['cached_version_indicator'], 'w') as f: + f.write(__version__) + + # otherwise, we don't have the schema and try and initialize from submodules, + # afterwards trying to load the namespace again + else: + try: + __clone_submodules() + except (FileNotFoundError, OSError, RuntimeError) as e: # pragma: no cover + if 'core' not in available_namespaces(): + warn( + "'core' is not a registered namespace. If you installed PyNWB locally using a git clone, " + "you need to use the --recurse_submodules flag when cloning. " + "See developer installation instructions here: " + "https://pynwb.readthedocs.io/en/stable/install_developers.html#install-from-git-repository\n" + f"Got exception: \n{e}" + ) + if not final: + __load_core_namespace(final=True) +__load_core_namespace() # a function to register a container classes with the global map @@ -427,15 +520,7 @@ def export(self, **kwargs): from hdmf.data_utils import DataChunkIterator # noqa: F401,E402 from hdmf.backends.hdf5 import H5DataIO # noqa: F401,E402 -try: - # see https://effigies.gitlab.io/posts/python-packaging-2023/ - from ._version import __version__ -except ImportError: # pragma: no cover - # this is a relatively slower method for getting the version string - from importlib.metadata import version # noqa: E402 - __version__ = version("pynwb") - del version from ._due import due, BibTeX # noqa: E402 due.cite( diff --git a/tests/back_compat/test_import_structure.py b/tests/back_compat/test_import_structure.py index 36831929d..81c4acf90 100644 --- a/tests/back_compat/test_import_structure.py +++ b/tests/back_compat/test_import_structure.py @@ -30,19 +30,14 @@ def test_outer_import_structure(self): "TimeSeries", "TypeMap", "_HDF5IO", - "__NS_CATALOG", - "__TYPE_MAP", "__builtins__", "__cached__", "__doc__", "__file__", - "__get_resources", - "__io", "__loader__", "__name__", "__package__", "__path__", - "__resources", "__spec__", "__version__", "_due", From dc98e84e73154cd13d532eed09a86878dcfbf738 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Thu, 19 Sep 2024 14:38:10 -0700 Subject: [PATCH 11/11] Try to fix inspector tests after removal of requirements.txt (#1967) --- .github/workflows/run_inspector_tests.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_inspector_tests.yml b/.github/workflows/run_inspector_tests.yml index 120e30a79..ece4e000c 100644 --- a/.github/workflows/run_inspector_tests.yml +++ b/.github/workflows/run_inspector_tests.yml @@ -33,8 +33,10 @@ jobs: 
python -m pip list git clone https://github.com/NeurodataWithoutBorders/nwbinspector.git cd nwbinspector - python -m pip install -r requirements.txt pytest - python -m pip install . # this might install a pinned version of pynwb instead of the current one + python -m pip install pytest + python -m pip install ".[dandi]" # this might install a pinned version of pynwb instead of the current one + # Download testing data and set config path + dandi download "https://gui-staging.dandiarchive.org/#/dandiset/204919" cd .. python -m pip uninstall -y pynwb # uninstall the pinned version of pynwb python -m pip install . # reinstall current branch of pynwb
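Editor's aside, closing out the series: the typemap-caching commit above (PATCH 10/11) keeps its cache files next to the installed package. A small hedged sketch for inspecting that cache (it assumes pynwb is installed in a writable location so the pickle was created on first import; output will vary by environment):

from pathlib import Path

import pynwb

pkg_dir = Path(pynwb.__file__).parent
cached_typemap = pkg_dir / "core_typemap.pkl"           # pickled TypeMap written on first import
version_indicator = pkg_dir / ".core_typemap_version"   # pynwb version that produced the pickle

print(cached_typemap.exists())
print(version_indicator.read_text().strip() == pynwb.__version__)
print(pynwb.available_namespaces())  # 'core' should be listed once the namespace is loaded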