diff --git a/.github/workflows/run_coverage.yml b/.github/workflows/run_coverage.yml index acbc3bd05..18dc00903 100644 --- a/.github/workflows/run_coverage.yml +++ b/.github/workflows/run_coverage.yml @@ -78,8 +78,10 @@ jobs: python -m coverage report -m - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: flags: integration files: coverage.xml fail_ci_if_error: true + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 5befd21e7..a06d0280a 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -8,7 +8,7 @@ version: 2 build: os: ubuntu-20.04 tools: - python: '3.8' + python: '3.11' # Build documentation in the docs/ directory with Sphinx sphinx: diff --git a/CHANGELOG.md b/CHANGELOG.md index e5a155c5a..1180bb644 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,11 +17,17 @@ - Expose `starting_time` in `mock_ElectricalSeries`. @h-mayorquin [#1805](https://github.com/NeurodataWithoutBorders/pynwb/pull/1805) - Enhance `get_data_in_units()` to work with objects that have a `channel_conversion` attribute like the `ElectricalSeries`. @h-mayorquin [#1806](https://github.com/NeurodataWithoutBorders/pynwb/pull/1806) - Refactor validation CLI tests to use `{sys.executable} -m coverage` to use the same Python version and run correctly on Debian systems. @yarikoptic [#1811](https://github.com/NeurodataWithoutBorders/pynwb/pull/1811) +- Fixed tests to address newly caught validation errors. @rly [#1839](https://github.com/NeurodataWithoutBorders/pynwb/pull/1839) ### Bug fixes - Fix bug where namespaces were loaded in "w-" mode. @h-mayorquin [#1795](https://github.com/NeurodataWithoutBorders/pynwb/pull/1795) - Fix bug where pynwb version was reported as "unknown" to readthedocs @stephprince [#1810](https://github.com/NeurodataWithoutBorders/pynwb/pull/1810) +### Documentation and tutorial enhancements +- Add RemFile to streaming tutorial. @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761) +- Fix typos and improve clarify throughout tutorials. @zm711 [#1825](https://github.com/NeurodataWithoutBorders/pynwb/pull/1825) +- Add Zarr IO tutorial @bendichter [#1834](https://github.com/NeurodataWithoutBorders/pynwb/pull/1834) + ## PyNWB 2.5.0 (August 18, 2023) ### Enhancements and minor changes diff --git a/docs/gallery/advanced_io/linking_data.py b/docs/gallery/advanced_io/linking_data.py index 82824f6cd..2f79d1488 100644 --- a/docs/gallery/advanced_io/linking_data.py +++ b/docs/gallery/advanced_io/linking_data.py @@ -221,7 +221,7 @@ # Step 2: Add the container to another NWBFile # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # To integrate both :py:meth:`~pynwb.base.TimeSeries` into a single file we simply create a new -# :py:meth:`~pynwb.file.NWBFile` and our existing :py:meth:`~pynwb.base.TimeSeries` to it. PyNWB's +# :py:meth:`~pynwb.file.NWBFile` and add our existing :py:meth:`~pynwb.base.TimeSeries` to it. PyNWB's # :py:class:`~pynwb.NWBHDF5IO` backend then automatically detects that the TimeSeries have already # been written to another file and will create external links for us. # diff --git a/docs/gallery/advanced_io/plot_editing.py b/docs/gallery/advanced_io/plot_editing.py new file mode 100644 index 000000000..e45e3b887 --- /dev/null +++ b/docs/gallery/advanced_io/plot_editing.py @@ -0,0 +1,161 @@ +""" +.. 
_editing:
+
+Editing NWB files
+=================
+
+This tutorial demonstrates how to edit NWB files in-place to make small changes to
+existing containers. To add or remove containers from an NWB file, see
+:ref:`modifying_data`. How and whether it is possible to edit an NWB file depends on the
+storage backend and the type of edit.
+
+.. warning::
+
+    Manually editing an existing NWB file can make the file invalid if you are not
+    careful. We highly recommend making a copy before editing and running a validation
+    check on the file after editing it. See :ref:`validating`.
+
+
+Editing datasets
+----------------
+When reading an HDF5 NWB file, PyNWB exposes :py:class:`h5py.Dataset` objects, which can
+be edited in place. For this to work, you must open the file in read/write mode
+(``"r+"`` or ``"a"``).
+
+First, let's create an NWB file with data:
+"""
+from pynwb import NWBHDF5IO, NWBFile, TimeSeries
+from datetime import datetime
+from dateutil.tz import tzlocal
+import numpy as np
+
+nwbfile = NWBFile(
+    session_description="my first synthetic recording",
+    identifier="EXAMPLE_ID",
+    session_start_time=datetime.now(tzlocal()),
+    session_id="LONELYMTN",
+)
+
+nwbfile.add_acquisition(
+    TimeSeries(
+        name="synthetic_timeseries",
+        description="Random values",
+        data=np.random.randn(100, 100),
+        unit="m",
+        rate=10e3,
+    )
+)
+
+with NWBHDF5IO("test_edit.nwb", "w") as io:
+    io.write(nwbfile)
+
+##############################################
+# Now, let's edit the values of the dataset.
+
+with NWBHDF5IO("test_edit.nwb", "r+") as io:
+    nwbfile = io.read()
+    nwbfile.acquisition["synthetic_timeseries"].data[:10] = 0.0
+
+
+##############################################
+# You can edit the attributes of that dataset through the ``attrs`` attribute:
+
+with NWBHDF5IO("test_edit.nwb", "r+") as io:
+    nwbfile = io.read()
+    nwbfile.acquisition["synthetic_timeseries"].data.attrs["unit"] = "volts"
+
+##############################################
+# Changing the shape of a dataset
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Whether it is possible to change the shape of a dataset depends on how the dataset was
+# created. If the dataset was created with a flexible shape, then it is possible to
+# change the shape in-place. Creating a dataset with a flexible shape is done by specifying the
+# ``maxshape`` argument of the :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO` class
+# constructor. Using a ``None`` value for a component of the ``maxshape`` tuple allows
+# the size of the corresponding dimension to grow, so that it can later be resized to an
+# arbitrary length along that dimension. Chunking is required for datasets with flexible shapes.
+# Setting ``maxshape`` therefore automatically sets chunking to ``True`` if it is not specified.
+#
+# First, let's create an NWB file with a dataset with a flexible shape:
+
+from hdmf.backends.hdf5.h5_utils import H5DataIO
+
+nwbfile = NWBFile(
+    session_description="my first synthetic recording",
+    identifier="EXAMPLE_ID",
+    session_start_time=datetime.now(tzlocal()),
+    session_id="LONELYMTN",
+)
+
+data_io = H5DataIO(data=np.random.randn(100, 100), maxshape=(None, 100))
+
+nwbfile.add_acquisition(
+    TimeSeries(
+        name="synthetic_timeseries",
+        description="Random values",
+        data=data_io,
+        unit="m",
+        rate=10e3,
+    )
+)
+
+with NWBHDF5IO("test_edit2.nwb", "w") as io:
+    io.write(nwbfile)
+
+##############################################
+# The ``None`` value in the first component of ``maxshape`` means that the
+# first dimension of the dataset is unlimited. By setting the second dimension
+# of ``maxshape`` to ``100``, that dimension is fixed to be no larger than ``100``.
+# If you do not specify a ``maxshape``, then the shape of the dataset will be fixed
+# to the shape that the dataset was created with. Here, you can change the shape of
+# the first dimension of this dataset.
+
+
+with NWBHDF5IO("test_edit2.nwb", "r+") as io:
+    nwbfile = io.read()
+    nwbfile.acquisition["synthetic_timeseries"].data.resize((200, 100))
+
+##############################################
+# This will change the shape of the dataset in-place. If you try to change the shape of
+# a dataset with a fixed shape, you will get an error.
+#
+# .. note::
+#    There are several types of dataset edits that cannot be done in-place: changing the
+#    shape of a dataset with a fixed shape, or changing the datatype, compression,
+#    chunking, max-shape, or fill-value of a dataset. For any of these, we recommend using
+#    the :py:meth:`pynwb.NWBHDF5IO.export` method to export the data to a new file. See
+#    :ref:`modifying_data` for more information.
+#
+# Editing groups
+# --------------
+# Editing of groups is not yet supported in PyNWB.
+# To edit the attributes of a group, open the file and edit it using :py:mod:`h5py`:
+
+import h5py
+
+with h5py.File("test_edit.nwb", "r+") as f:
+    f["acquisition"]["synthetic_timeseries"].attrs["description"] = "Random values in volts"
+
+##############################################
+# .. warning::
+#    Be careful not to edit values that will bring the file out of compliance with the
+#    NWB specification.
+#
+# Renaming groups and datasets
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Rename groups and datasets in-place using the :py:meth:`~h5py.Group.move` method. For example, to rename
+# the ``"synthetic_timeseries"`` group:
+
+with h5py.File("test_edit.nwb", "r+") as f:
+    f["acquisition"].move("synthetic_timeseries", "synthetic_timeseries_renamed")
+
+##############################################
+# You can use this same technique to move a group or dataset to a different location in
+# the file. For example, to move the ``"synthetic_timeseries_renamed"`` group to the
+# ``"analysis"`` group:
+
+with h5py.File("test_edit.nwb", "r+") as f:
+    f["acquisition"].move(
+        "synthetic_timeseries_renamed",
+        "/analysis/synthetic_timeseries_renamed",
+    )
diff --git a/docs/gallery/advanced_io/plot_iterative_write.py b/docs/gallery/advanced_io/plot_iterative_write.py
index c461cddf8..958981a0b 100644
--- a/docs/gallery/advanced_io/plot_iterative_write.py
+++ b/docs/gallery/advanced_io/plot_iterative_write.py
@@ -17,7 +17,7 @@
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# In the typical write process, datasets are created and written as a whole. In contrast,
-# iterative data write refers to the writing of the content of a dataset in an incremental,
+# iterative data write refers to the writing of the contents of a dataset in an incremental,
# iterative fashion.

####################
@@ -32,10 +32,10 @@
# to avoid this problem by writing the data one-subblock-at-a-time, so that we only need to hold
# a small subset of the array in memory at any given time.
# * **Data streaming** In the context of streaming data we are faced with several issues: -# **1)** data is not available in memory but arrives in subblocks as the stream progresses +# **1)** data is not available in-memory but arrives in subblocks as the stream progresses # **2)** caching the data of a stream in-memory is often prohibitively expensive and volatile # **3)** the total size of the data is often unknown ahead of time. -# Iterative data write allows us to address issues 1) and 2) by enabling us to save data to +# Iterative data write allows us to address issues 1) and 2) by enabling us to save data to a # file incrementally as it arrives from the data stream. Issue 3) is addressed in the HDF5 # storage backend via support for chunking, enabling the creation of resizable arrays. # @@ -44,7 +44,7 @@ # data source. # # * **Sparse data arrays** In order to reduce storage size of sparse arrays a challenge is that while -# the data array (e.g., a matrix) may be large, only few values are set. To avoid storage overhead +# the data array (e.g., a matrix) may be large, only a few values are set. To avoid storage overhead # for storing the full array we can employ (in HDF5) a combination of chunking, compression, and # and iterative data write to significantly reduce storage cost for sparse data. # @@ -161,7 +161,7 @@ def write_test_file(filename, data, close_io=True): # # Here we use a simple data generator but PyNWB does not make any assumptions about what happens # inside the generator. Instead of creating data programmatically, you may hence, e.g., receive -# data from an acquisition system (or other source). We can, hence, use the same approach to write streaming data. +# data from an acquisition system (or other source). We can use the same approach to write streaming data. #################### # Step 1: Define the data generator @@ -208,7 +208,7 @@ def iter_sin(chunk_length=10, max_chunks=100): #################### # Discussion # ^^^^^^^^^^ -# Note, we here actually do not know how long our timeseries will be. +# Note, here we don't actually know how long our timeseries will be. print( "maxshape=%s, recommended_data_shape=%s, dtype=%s" @@ -218,7 +218,7 @@ def iter_sin(chunk_length=10, max_chunks=100): #################### # As we can see :py:class:`~hdmf.data_utils.DataChunkIterator` automatically recommends # in its ``maxshape`` that the first dimensions of our array should be unlimited (``None``) and the second -# dimension be ``10`` (i.e., the length of our chunk. Since :py:class:`~hdmf.data_utils.DataChunkIterator` +# dimension should be ``10`` (i.e., the length of our chunk. Since :py:class:`~hdmf.data_utils.DataChunkIterator` # has no way of knowing the minimum size of the array it automatically recommends the size of the first # chunk as the minimum size (i.e, ``(1, 10)``) and also infers the data type automatically from the first chunk. # To further customize this behavior we may also define the ``maxshape``, ``dtype``, and ``buffer_size`` when @@ -227,8 +227,8 @@ def iter_sin(chunk_length=10, max_chunks=100): # .. tip:: # # We here used :py:class:`~hdmf.data_utils.DataChunkIterator` to conveniently wrap our data stream. 
-# :py:class:`~hdmf.data_utils.DataChunkIterator` assumes that our generators yields in **consecutive order** -# **single** complete element along the **first dimension** of our a array (i.e., iterate over the first +# :py:class:`~hdmf.data_utils.DataChunkIterator` assumes that our generator yields in **consecutive order** +# a **single** complete element along the **first dimension** of our array (i.e., iterate over the first # axis and yield one-element-at-a-time). This behavior is useful in many practical cases. However, if # this strategy does not match our needs, then using :py:class:`~hdmf.data_utils.GenericDataChunkIterator` # or implementing your own derived :py:class:`~hdmf.data_utils.AbstractDataChunkIterator` may be more @@ -266,7 +266,7 @@ def __next__(self): """ Return in each iteration a fully occupied data chunk of self.chunk_shape values at a random location within the matrix. Chunks are non-overlapping. REMEMBER: h5py does not support all - fancy indexing that numpy does so we need to make sure our selection can be + the fancy indexing that numpy does so we need to make sure our selection can be handled by the backend. """ if self.__chunks_created < self.num_chunks: @@ -289,7 +289,7 @@ def __next__(self): next = __next__ def recommended_chunk_shape(self): - # Here we can optionally recommend what a good chunking should be. + # Here we can optionally recommend what a good chunking could be. return self.chunk_shape def recommended_data_shape(self): @@ -379,7 +379,7 @@ def maxshape(self): # Now lets check out the size of our data file and compare it against the expected full size of our matrix import os -expected_size = xsize * ysize * 8 # This is the full size of our matrix in byte +expected_size = xsize * ysize * 8 # This is the full size of our matrix in bytes occupied_size = num_values * 8 # Number of non-zero values in out matrix file_size = os.stat( "basic_sparse_iterwrite_example.nwb" @@ -420,14 +420,14 @@ def maxshape(self): # A slight overhead (here 0.08MB) is expected because our file contains also the additional objects from # the NWBFile, plus some overhead for managing all the HDF5 metadata for all objects. # * **3) vs 2):** Adding compression does not yield any improvement here. This is expected, because, again we -# selected the chunking here in a way that we already allocated the minimum amount of storage to represent our data +# selected the chunking here in a way that we already allocated the minimum amount of storage to represent our data # and lossless compression of random data is not efficient. # * **4) vs 2):** When we increase our chunk size to ``(100,100)`` (i.e., ``100x`` larger than the chunks produced by -# our matrix generator) we observe an according roughly ``100x`` increase in file size. This is expected +# our matrix generator) we observe an accordingly roughly ``100x`` increase in file size. This is expected # since our chunks now do not align perfectly with the occupied data and each occupied chunk is allocated fully. # * **5) vs 4):** When using compression for the larger chunks we see a significant reduction # in file size (``1.14MB`` vs. ``80MB``). This is because the allocated chunks now contain in addition to the random -# values large areas of constant fillvalues, which compress easily. +# values large areas of constant fill values, which compress easily. 
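#
# To make these trade-offs concrete, the sketch below shows how chunked, compressed storage
# can be requested by wrapping the data in :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO`
# before adding it to a ``TimeSeries``. This is an illustrative sketch only, not the exact
# code used to produce the comparison above; the dataset name, chunk shape, and compression
# settings are assumptions, and the ``data`` argument could just as well be a data chunk
# iterator instead of an in-memory array.

import numpy as np
from hdmf.backends.hdf5.h5_utils import H5DataIO
from pynwb import TimeSeries

# Request chunked, gzip-compressed storage with a fill value of 0 for unwritten chunks.
compressed_data = H5DataIO(
    data=np.random.randn(100, 100),       # stand-in for a sparse matrix or an iterator
    chunks=(10, 10),                      # storage chunk shape (illustrative)
    compression="gzip",                   # lossless compression
    compression_opts=4,                   # gzip compression level
    fillvalue=0,                          # value assumed for chunks that are never written
)

compressed_series = TimeSeries(
    name="sparse_timeseries_compressed",  # hypothetical name
    data=compressed_data,
    unit="n/a",
    rate=10.0,
)

####################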
#
# **Advantages:**
#
# * Only the data chunks in the HDF5 file that contain non-default values are ever being allocated
# * The overall size of our file is reduced significantly
# * Reduced I/O load
-# * On read users can use the array as usual
+# * On read, users can use the array as usual
#
# .. tip::
#
-#    With great power comes great responsibility **!** I/O and storage cost will depend among others on the chunk size,
-#    compression options, and the write pattern, i.e., the number and structure of the
+#    With great power comes great responsibility **!** I/O and storage cost will depend, among other factors,
+#    on the chunk size, compression options, and the write pattern, i.e., the number and structure of the
#    :py:class:`~hdmf.data_utils.DataChunk` objects written. For example, using ``(1,1)`` chunks and writing them
#    one value at a time would result in poor I/O performance in most practical cases, because of the large number of
#    chunks and large number of small I/O operations required.
@@ -471,7 +471,7 @@
#
# When converting large data files, a typical problem is that it is often too expensive to load all the data
# into memory. This example is very similar to the data generator example only that instead of generating
-# data on-the-fly in memory we are loading data from a file one-chunk-at-a-time in our generator.
+# data on-the-fly in-memory we are loading data from a file one-chunk-at-a-time in our generator.
#

####################
@@ -568,7 +568,7 @@ def iter_largearray(filename, shape, dtype="float64"):
# In practice, data from recording devices may be distributed across many files, e.g., one file per time range
# or one file per recording channel. Using iterative data write provides an elegant solution to this problem
# as it allows us to process large arrays one-subarray-at-a-time. To make things more interesting we'll show
-# this for the case where each recording channel (i.e, the second dimension of our ``TimeSeries``) is broken up
+# this for the case where each recording channel (i.e., the second dimension of our ``TimeSeries``) is broken up
# across files.

####################
diff --git a/docs/gallery/advanced_io/plot_zarr_io.py b/docs/gallery/advanced_io/plot_zarr_io.py
new file mode 100644
index 000000000..b61fe4a03
--- /dev/null
+++ b/docs/gallery/advanced_io/plot_zarr_io.py
@@ -0,0 +1,98 @@
+"""
+Zarr IO
+=======
+
+Zarr is an alternative backend option for NWB files. It is a Python package that
+provides an implementation of chunked, compressed, N-dimensional arrays. Zarr is a good
+option for large datasets because, like HDF5, it is designed to store data on disk and
+only load the data into memory when needed. Zarr is also a good option for parallel
+computing because it supports concurrent reads and writes.
+
+Note that the Zarr native storage formats are optimized for cloud storage
+(e.g., S3). For very large files, Zarr will create many files, which can lead to
+issues for traditional file systems (that are not cloud object stores) due to limitations
+on the number of files per directory (this affects local disk, GDrive, Dropbox, etc.).
+
+Zarr read and write is provided by the :hdmf-zarr:`hdmf-zarr<>` package. First, create
+an NWBFile using PyNWB.
+""" + +# sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnail_plot_nwbzarrio.png' + + +from datetime import datetime +from dateutil.tz import tzlocal + +import numpy as np +from pynwb import NWBFile, TimeSeries + +# Create the NWBFile. Substitute your NWBFile generation here. +nwbfile = NWBFile( + session_description="my first synthetic recording", + identifier="EXAMPLE_ID", + session_start_time=datetime.now(tzlocal()), + session_id="LONELYMTN", +) + +####################################################################################### +# Dataset Configuration +# --------------------- +# Like HDF5, Zarr provides options to chunk and compress datasets. To leverage these +# features, replace all :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO` with the analogous +# :py:class:`~hdmf_zarr.utils.ZarrDataIO`, which takes compressors specified by the +# :py:mod:`numcodecs` library. For example, here is an example :py:class:`.TimeSeries` +# where the ``data`` Dataset is compressed with a Blosc-zstd compressor: + +from numcodecs import Blosc +from hdmf_zarr import ZarrDataIO + +data_with_zarr_data_io = ZarrDataIO( + data=np.random.randn(100, 100), + chunks=(10, 10), + fillvalue=0, + compressor=Blosc(cname='zstd', clevel=3, shuffle=Blosc.SHUFFLE) +) + +####################################################################################### +# Now add it to the :py:class:`.NWBFile`. + +nwbfile.add_acquisition( + TimeSeries( + name="synthetic_timeseries", + data=data_with_zarr_data_io, + unit="m", + rate=10e3, + ) +) + +####################################################################################### +# Writing to Zarr +# --------------- +# To write NWB files to Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with +# :py:class:`hdmf_zarr.nwb.NWBZarrIO`. + +from hdmf_zarr.nwb import NWBZarrIO +import os + +path = "zarr_tutorial.nwb.zarr" +absolute_path = os.path.abspath(path) +with NWBZarrIO(path=path, mode="w") as io: + io.write(nwbfile) + +####################################################################################### +# .. note:: +# The main reason for using the ``absolute_path`` here is for testing purposes to +# ensure links and references work as expected. Otherwise, using the relative path +# here instead is fine. +# +# Reading from Zarr +# ----------------- +# To read NWB files from Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with the analogous +# :py:class:`hdmf_zarr.nwb.NWBZarrIO`. + +with NWBZarrIO(path=absolute_path, mode="r") as io: + read_nwbfile = io.read() + +####################################################################################### +# .. note:: +# For more information, see the :hdmf-zarr:`hdmf-zarr documentation<>`. diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index fb34b498b..760e2da71 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -90,6 +90,9 @@ # `fsspec documentation on known implementations `_ # for a full updated list of supported store formats. # +# One downside of this fsspec method is that fsspec is not optimized for reading HDF5 files, and so streaming data +# using this method can be slow. A faster alternative is ``remfile`` described below. 
+# # Streaming Method 2: ROS3 # ------------------------ # ROS3 stands for "read only S3" and is a driver created by the HDF5 Group that allows HDF5 to read HDF5 files stored @@ -120,19 +123,52 @@ # # pip uninstall h5py # conda install -c conda-forge "h5py>=3.2" +# +# Besides the extra burden of installing h5py from a non-PyPI source, one downside of this ROS3 method is that +# this method does not support automatic retries in case the connection fails. +################################################## +# Method 3: remfile +# ----------------- +# ``remfile`` is another library that enables indexing and streaming of files in s3. remfile is simple and fast, +# especially for the initial load of the nwb file and for accessing small pieces of data. The caveats of ``remfile`` +# are that it is a very new project that has not been tested in a variety of use-cases and caching options are +# limited compared to ``fsspec``. `remfile` is a simple, lightweight dependency with a very small codebase. +# +# You can install ``remfile`` with pip: +# +# .. code-block:: bash +# +# pip install remfile +# + +import h5py +from pynwb import NWBHDF5IO +import remfile + +rem_file = remfile.File(s3_url) + +with h5py.File(rem_file, "r") as h5py_file: + with NWBHDF5IO(file=h5py_file, load_namespaces=True) as io: + nwbfile = io.read() + print(nwbfile.acquisition["lick_times"].time_series["lick_left_times"].data[:]) + ################################################## # Which streaming method to choose? # --------------------------------- # # From a user perspective, once opened, the :py:class:`~pynwb.file.NWBFile` works the same with -# both fsspec and ros3. However, in general, we currently recommend using fsspec for streaming -# NWB files because it is more performant and reliable than ros3. In particular fsspec: +# fsspec, ros3, or remfile. However, in general, we currently recommend using fsspec for streaming +# NWB files because it is more performant and reliable than ros3 and more widely tested than remfile. +# However, if you are experiencing long wait times for the initial file load on your network, you +# may want to try remfile. +# +# Advantages of fsspec include: # # 1. supports caching, which will dramatically speed up repeated requests for the # same region of data, # 2. automatically retries when s3 fails to return, which helps avoid errors when accessing data due to -# intermittent errors in connections with S3, +# intermittent errors in connections with S3 (remfile does this as well), # 3. works also with other storage backends (e.g., GoogleDrive or Dropbox, not just S3) and file formats, and # 4. in our experience appears to provide faster out-of-the-box performance than the ros3 driver. diff --git a/docs/gallery/general/add_remove_containers.py b/docs/gallery/general/add_remove_containers.py index 26708f639..90ed8f324 100644 --- a/docs/gallery/general/add_remove_containers.py +++ b/docs/gallery/general/add_remove_containers.py @@ -70,31 +70,13 @@ # file path, and it is not possible to remove objects from an NWB file. You can use the # :py:meth:`NWBHDF5IO.export ` method, detailed below, to modify an NWB file in these ways. # -# .. warning:: -# -# NWB datasets that have been written to disk are read as :py:class:`h5py.Dataset ` objects. -# Directly modifying the data in these :py:class:`h5py.Dataset ` objects immediately -# modifies the data on disk -# (the :py:meth:`NWBHDF5IO.write ` method does not need to be called and the -# :py:class:`~pynwb.NWBHDF5IO` instance does not need to be closed). 
Directly modifying datasets in this way -# can lead to files that do not validate or cannot be opened, so take caution when using this method. -# Note: only chunked datasets or datasets with ``maxshape`` set can be resized. -# See the `h5py chunked storage documentation `_ -# for more details. - -############################################################################### -# .. note:: -# -# It is not possible to modify the attributes (fields) of an NWB container in memory. - -############################################################################### # Exporting a written NWB file to a new file path -# --------------------------------------------------- +# ----------------------------------------------- # Use the :py:meth:`NWBHDF5IO.export ` method to read data from an existing NWB file, # modify the data, and write the modified data to a new file path. Modifications to the data can be additions or # removals of objects, such as :py:class:`~pynwb.base.TimeSeries` objects. This is especially useful if you -# have raw data and processed data in the same NWB file and you want to create a new NWB file with all of the -# contents of the original file except for the raw data for sharing with collaborators. +# have raw data and processed data in the same NWB file and you want to create a new NWB file with all the contents of +# the original file except for the raw data for sharing with collaborators. # # To remove existing containers, use the :py:class:`~hdmf.utils.LabelledDict.pop` method on any # :py:class:`~hdmf.utils.LabelledDict` object, such as ``NWBFile.acquisition``, ``NWBFile.processing``, @@ -200,7 +182,7 @@ export_io.export(src_io=read_io, nwbfile=read_nwbfile) ############################################################################### -# More information about export -# --------------------------------- # For more information about the export functionality, see :ref:`export` # and the PyNWB documentation for :py:meth:`NWBHDF5IO.export `. +# +# For more information about editing a file in place, see :ref:`editing`. diff --git a/docs/gallery/general/extensions.py b/docs/gallery/general/extensions.py index 66645a660..4ec8f4749 100644 --- a/docs/gallery/general/extensions.py +++ b/docs/gallery/general/extensions.py @@ -100,7 +100,7 @@ # Using extensions # ----------------------------------------------------- # -# After an extension has been created, it can be used by downstream codes for reading and writing data. +# After an extension has been created, it can be used by downstream code for reading and writing data. # There are two main mechanisms for reading and writing extension data with PyNWB. # The first involves defining new :py:class:`~pynwb.core.NWBContainer` classes that are then mapped # to the neurodata types in the extension. @@ -167,7 +167,7 @@ def __init__(self, **kwargs): # By default, extensions are cached to file so that your NWB file will carry the extensions needed to read the file # with it. # -# To demonstrate this, first we will make some fake data using our extensions. +# To demonstrate this, first we will make some simulated data using our extensions. from datetime import datetime @@ -370,17 +370,17 @@ class PotatoSack(MultiContainerInterface): nwb = io.read() print(nwb.get_processing_module()["potato_sack"].get_potato("big_potato").weight) # note: you can call get_processing_module() with or without the module name as -# an argument. however, if there is more than one module, the name is required. 
-# here, there is more than one potato, so the name of the potato is required as -# an argument to get get_potato +# an argument. However, if there is more than one module, the name is required. +# Here, there is more than one potato, so the name of the potato is required as +# an argument to get_potato #################### # Example: Cortical Surface Mesh # ----------------------------------------------------- # # Here we show how to create extensions by creating a data class for a -# cortical surface mesh. This data type is particularly important for ECoG data, we need to know where each electrode is -# with respect to the gyri and sulci. Surface mesh objects contain two types of data: +# cortical surface mesh. This data type is particularly important for ECoG data, since we need to know where +# each electrode is with respect to the gyri and sulci. Surface mesh objects contain two types of data: # # 1. `vertices`, which is an (n, 3) matrix of floats that represents points in 3D space # diff --git a/docs/gallery/general/object_id.py b/docs/gallery/general/object_id.py index 481cbb36a..206142715 100644 --- a/docs/gallery/general/object_id.py +++ b/docs/gallery/general/object_id.py @@ -32,7 +32,7 @@ session_start_time=start_time, ) -# make some fake data +# make some simulated data timestamps = np.linspace(0, 100, 1024) data = ( np.sin(0.333 * timestamps) diff --git a/docs/source/conf.py b/docs/source/conf.py index 143d9d2c6..5725bd816 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -148,6 +148,8 @@ def __call__(self, filename): 'fsspec': ("https://filesystem-spec.readthedocs.io/en/latest/", None), 'nwbwidgets': ("https://nwb-widgets.readthedocs.io/en/latest/", None), 'nwb-overview': ("https://nwb-overview.readthedocs.io/en/latest/", None), + 'hdmf-zarr': ("https://hdmf-zarr.readthedocs.io/en/latest/", None), + 'numcodecs': ("https://numcodecs.readthedocs.io/en/latest/", None), } extlinks = { @@ -159,6 +161,7 @@ def __call__(self, filename): 'hdmf-docs': ('https://hdmf.readthedocs.io/en/stable/%s', '%s'), 'dandi': ('https://www.dandiarchive.org/%s', '%s'), "nwbinspector": ("https://nwbinspector.readthedocs.io/en/dev/%s", "%s"), + 'hdmf-zarr': ('https://hdmf-zarr.readthedocs.io/en/latest/%s', '%s'), } # Add any paths that contain templates here, relative to this directory. 
diff --git a/docs/source/figures/gallery_thumbnail_plot_nwbzarrio.png b/docs/source/figures/gallery_thumbnail_plot_nwbzarrio.png new file mode 100644 index 000000000..8926a47ff Binary files /dev/null and b/docs/source/figures/gallery_thumbnail_plot_nwbzarrio.png differ diff --git a/docs/source/overview_citing.rst b/docs/source/overview_citing.rst index bc72e017c..8fda20363 100644 --- a/docs/source/overview_citing.rst +++ b/docs/source/overview_citing.rst @@ -35,7 +35,7 @@ If you use PyNWB in your research, please use the following citation: Using RRID ---------- -* ResourceID: `SCR_017452 `_ +* ResourceID: `SCR_017452 `_ * Proper Citation: **(PyNWB, RRID:SCR_017452)** diff --git a/environment-ros3.yml b/environment-ros3.yml index 2bf2678d2..c84b4c090 100644 --- a/environment-ros3.yml +++ b/environment-ros3.yml @@ -16,3 +16,6 @@ dependencies: - fsspec==2023.6.0 - requests==2.28.1 - aiohttp==3.8.3 + - pip + - pip: + - remfile==0.1.9 diff --git a/requirements-doc.txt b/requirements-doc.txt index 2050f4439..c37aee646 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -12,3 +12,4 @@ dataframe_image # used to render large dataframe as image in the sphinx galler lxml # used by dataframe_image when using the matplotlib backend hdf5plugin dandi>=0.46.6 +hdmf-zarr diff --git a/requirements-min.txt b/requirements-min.txt index 8f52348f1..816d53d43 100644 --- a/requirements-min.txt +++ b/requirements-min.txt @@ -1,6 +1,6 @@ # minimum versions of package dependencies for installing PyNWB h5py==2.10 # support for selection of datasets with list of indices added in 2.10 -hdmf==3.9.0 +hdmf==3.12.0 numpy==1.18 pandas==1.1.5 python-dateutil==2.7.3 diff --git a/requirements.txt b/requirements.txt index 2ad7b813e..d09ec7425 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # pinned dependencies to reproduce an entire development environment to use PyNWB -h5py==3.8.0 -hdmf==3.9.0 -numpy==1.24.2 -pandas==2.0.0 +h5py==3.10.0 +hdmf==3.12.0 +numpy==1.26.1 +pandas==2.1.2 python-dateutil==2.8.2 -setuptools==65.5.1 +setuptools==65.5.1 \ No newline at end of file diff --git a/setup.py b/setup.py index 90aebf55f..0e48c269a 100755 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ reqs = [ 'h5py>=2.10', - 'hdmf>=3.9.0', + 'hdmf>=3.12.0', 'numpy>=1.16', 'pandas>=1.1.5', 'python-dateutil>=2.7.3', diff --git a/src/pynwb/validate.py b/src/pynwb/validate.py index 62aa41426..827249cbb 100644 --- a/src/pynwb/validate.py +++ b/src/pynwb/validate.py @@ -120,7 +120,11 @@ def _get_cached_namespaces_to_validate( is_method=False, ) def validate(**kwargs): - """Validate NWB file(s) against a namespace or its cached namespaces.""" + """Validate NWB file(s) against a namespace or its cached namespaces. + + NOTE: If an io object is provided and no namespace name is specified, then the file will be validated + against the core namespace, even if use_cached_namespaces is True. + """ from . 
import NWBHDF5IO # TODO: modularize to avoid circular import io, paths, use_cached_namespaces, namespace, verbose, driver = getargs( diff --git a/tests/back_compat/test_read.py b/tests/back_compat/test_read.py index 792d26e7a..16a119690 100644 --- a/tests/back_compat/test_read.py +++ b/tests/back_compat/test_read.py @@ -29,6 +29,12 @@ class TestReadOldVersions(TestCase): "- expected an array of shape '[None]', got non-array data 'one publication'")], '1.0.3_str_pub.nwb': [("root/general/related_publications (general/related_publications): incorrect shape " "- expected an array of shape '[None]', got non-array data 'one publication'")], + '1.5.1_timeseries_no_data.nwb': [("TimeSeries/data/data (acquisition/test_timeseries/data): argument missing")], + '1.5.1_timeseries_no_unit.nwb': [("TimeSeries/data/unit (acquisition/test_timeseries/data): argument missing")], + '1.5.1_imageseries_no_data.nwb': [("ImageSeries/data/data (acquisition/test_imageseries/data): " + "argument missing")], + '1.5.1_imageseries_no_unit.nwb': [("ImageSeries/data/unit (acquisition/test_imageseries/data): " + "argument missing")], } def get_io(self, path): diff --git a/tests/integration/hdf5/test_misc.py b/tests/integration/hdf5/test_misc.py index 6afd7971e..cd9ab1706 100644 --- a/tests/integration/hdf5/test_misc.py +++ b/tests/integration/hdf5/test_misc.py @@ -109,20 +109,36 @@ class TestDecompositionSeriesIO(NWBH5IOMixin, TestCase): def setUpContainer(self): """ Return the test DecompositionSeries to read/write """ - self.timeseries = TimeSeries(name='dummy timeseries', description='desc', - data=np.ones((3, 3)), unit='flibs', - timestamps=np.ones((3,))) - bands = DynamicTable(name='bands', description='band info for LFPSpectralAnalysis', columns=[ - VectorData(name='band_name', description='name of bands', data=['alpha', 'beta', 'gamma']), - VectorData(name='band_limits', description='low and high cutoffs in Hz', data=np.ones((3, 2))) - ]) - spec_anal = DecompositionSeries(name='LFPSpectralAnalysis', - description='my description', - data=np.ones((3, 3, 3)), - timestamps=np.ones((3,)), - source_timeseries=self.timeseries, - metric='amplitude', - bands=bands) + self.timeseries = TimeSeries( + name='dummy timeseries', + description='desc', + data=np.ones((3, 3)), + unit='flibs', + timestamps=np.ones((3,)), + ) + bands = DynamicTable( + name='bands', + description='band info for LFPSpectralAnalysis', + columns=[ + VectorData(name='band_name', description='name of bands', data=['alpha', 'beta', 'gamma']), + VectorData(name='band_limits', description='low and high cutoffs in Hz', data=np.ones((3, 2))), + VectorData(name='band_mean', description='mean gaussian filters in Hz', data=np.ones((3,))), + VectorData( + name='band_stdev', + description='standard deviation of gaussian filters in Hz', + data=np.ones((3,)) + ), + ], + ) + spec_anal = DecompositionSeries( + name='LFPSpectralAnalysis', + description='my description', + data=np.ones((3, 3, 3)), + timestamps=np.ones((3,)), + source_timeseries=self.timeseries, + metric='amplitude', + bands=bands, + ) return spec_anal @@ -144,27 +160,48 @@ def make_electrode_table(self): """ Make an electrode table, electrode group, and device """ self.table = get_electrode_table() self.dev1 = Device(name='dev1') - self.group = ElectrodeGroup(name='tetrode1', - description='tetrode description', - location='tetrode location', - device=self.dev1) - for i in range(4): + self.group = ElectrodeGroup( + name='tetrode1', + description='tetrode description', + location='tetrode location', + 
device=self.dev1 + ) + for _ in range(4): self.table.add_row(location='CA1', group=self.group, group_name='tetrode1') def setUpContainer(self): """ Return the test ElectricalSeries to read/write """ self.make_electrode_table(self) - region = DynamicTableRegion(name='source_channels', - data=[0, 2], - description='the first and third electrodes', - table=self.table) + region = DynamicTableRegion( + name='source_channels', + data=[0, 2], + description='the first and third electrodes', + table=self.table + ) data = np.random.randn(100, 2, 30) timestamps = np.arange(100)/100 - ds = DecompositionSeries(name='test_DS', - data=data, - source_channels=region, - timestamps=timestamps, - metric='amplitude') + bands = DynamicTable( + name='bands', + description='band info for LFPSpectralAnalysis', + columns=[ + VectorData(name='band_name', description='name of bands', data=['alpha', 'beta', 'gamma']), + VectorData(name='band_limits', description='low and high cutoffs in Hz', data=np.ones((3, 2))), + VectorData(name='band_mean', description='mean gaussian filters in Hz', data=np.ones((3,))), + VectorData( + name='band_stdev', + description='standard deviation of gaussian filters in Hz', + data=np.ones((3,)) + ), + ], + ) + ds = DecompositionSeries( + name='test_DS', + data=data, + source_channels=region, + timestamps=timestamps, + metric='amplitude', + bands=bands, + ) return ds def addContainer(self, nwbfile): diff --git a/tests/integration/ros3/test_ros3.py b/tests/integration/ros3/test_ros3.py index c2f7b562d..95a891760 100644 --- a/tests/integration/ros3/test_ros3.py +++ b/tests/integration/ros3/test_ros3.py @@ -4,6 +4,7 @@ from pynwb.testing import TestCase import urllib.request import h5py +import warnings class TestRos3Streaming(TestCase): @@ -28,16 +29,28 @@ def setUp(self): def test_read(self): s3_path = 'https://dandiarchive.s3.amazonaws.com/ros3test.nwb' - with NWBHDF5IO(s3_path, mode='r', driver='ros3') as io: - nwbfile = io.read() - test_data = nwbfile.acquisition['ts_name'].data[:] - self.assertEqual(len(test_data), 3) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=r"Ignoring cached namespace .*", + category=UserWarning, + ) + with NWBHDF5IO(s3_path, mode='r', driver='ros3') as io: + nwbfile = io.read() + test_data = nwbfile.acquisition['ts_name'].data[:] + self.assertEqual(len(test_data), 3) def test_dandi_read(self): - with NWBHDF5IO(path=self.s3_test_path, mode='r', driver='ros3') as io: - nwbfile = io.read() - test_data = nwbfile.acquisition['TestData'].data[:] - self.assertEqual(len(test_data), 3) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=r"Ignoring cached namespace .*", + category=UserWarning, + ) + with NWBHDF5IO(path=self.s3_test_path, mode='r', driver='ros3') as io: + nwbfile = io.read() + test_data = nwbfile.acquisition['TestData'].data[:] + self.assertEqual(len(test_data), 3) def test_dandi_get_cached_namespaces(self): expected_namespaces = ["core"] diff --git a/tests/unit/test_misc.py b/tests/unit/test_misc.py index 99e0d6f87..9350d1d2e 100644 --- a/tests/unit/test_misc.py +++ b/tests/unit/test_misc.py @@ -33,7 +33,13 @@ def test_init(self): timestamps=[1., 2., 3.]) bands = DynamicTable(name='bands', description='band info for LFPSpectralAnalysis', columns=[ VectorData(name='band_name', description='name of bands', data=['alpha', 'beta', 'gamma']), - VectorData(name='band_limits', description='low and high cutoffs in Hz', data=np.ones((3, 2))) + VectorData(name='band_limits', description='low and 
high cutoffs in Hz', data=np.ones((3, 2))), + VectorData(name='band_mean', description='mean gaussian filters in Hz', data=np.ones((3,))), + VectorData( + name='band_stdev', + description='standard deviation of gaussian filters in Hz', + data=np.ones((3,)) + ), ]) spec_anal = DecompositionSeries(name='LFPSpectralAnalysis', description='my description', @@ -49,6 +55,8 @@ def test_init(self): np.testing.assert_equal(spec_anal.timestamps, [1., 2., 3.]) self.assertEqual(spec_anal.bands['band_name'].data, ['alpha', 'beta', 'gamma']) np.testing.assert_equal(spec_anal.bands['band_limits'].data, np.ones((3, 2))) + np.testing.assert_equal(spec_anal.bands['band_mean'].data, np.ones((3,))) + np.testing.assert_equal(spec_anal.bands['band_stdev'].data, np.ones((3,))) self.assertEqual(spec_anal.source_timeseries, timeseries) self.assertEqual(spec_anal.metric, 'amplitude')
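The test updates above reflect that the ``bands`` table of a ``DecompositionSeries`` is expected to provide
``band_mean`` and ``band_stdev`` columns in addition to ``band_name`` and ``band_limits``. The sketch below shows
the corresponding user-facing construction; the names and values mirror the tests and are purely illustrative.

import numpy as np
from hdmf.common import DynamicTable, VectorData
from pynwb import TimeSeries
from pynwb.misc import DecompositionSeries

# Source series that the decomposition was computed from (toy values).
source_series = TimeSeries(
    name="dummy timeseries",
    description="desc",
    data=np.ones((3, 3)),
    unit="flibs",
    timestamps=[1.0, 2.0, 3.0],
)

# Band table with all four expected columns.
bands = DynamicTable(
    name="bands",
    description="band info for LFPSpectralAnalysis",
    columns=[
        VectorData(name="band_name", description="name of bands", data=["alpha", "beta", "gamma"]),
        VectorData(name="band_limits", description="low and high cutoffs in Hz", data=np.ones((3, 2))),
        VectorData(name="band_mean", description="mean gaussian filters in Hz", data=np.ones((3,))),
        VectorData(name="band_stdev", description="standard deviation of gaussian filters in Hz", data=np.ones((3,))),
    ],
)

spec_anal = DecompositionSeries(
    name="LFPSpectralAnalysis",
    description="my description",
    data=np.ones((3, 3, 3)),
    timestamps=[1.0, 2.0, 3.0],
    source_timeseries=source_series,
    metric="amplitude",
    bands=bands,
)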