
Commit

Merge branch 'dev' into nwb-schema-2.7.0
rly committed Feb 5, 2024
2 parents 6f436d1 + dd6baaa commit 40e760c
Showing 24 changed files with 463 additions and 103 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/run_coverage.yml
@@ -78,8 +78,10 @@ jobs:
python -m coverage report -m
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
with:
flags: integration
files: coverage.xml
fail_ci_if_error: true
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -8,7 +8,7 @@ version: 2
build:
os: ubuntu-20.04
tools:
python: '3.8'
python: '3.11'

# Build documentation in the docs/ directory with Sphinx
sphinx:
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -17,11 +17,17 @@
- Expose `starting_time` in `mock_ElectricalSeries`. @h-mayorquin [#1805](https://github.com/NeurodataWithoutBorders/pynwb/pull/1805)
- Enhance `get_data_in_units()` to work with objects that have a `channel_conversion` attribute like the `ElectricalSeries`. @h-mayorquin [#1806](https://github.com/NeurodataWithoutBorders/pynwb/pull/1806)
- Refactor validation CLI tests to use `{sys.executable} -m coverage` to use the same Python version and run correctly on Debian systems. @yarikoptic [#1811](https://github.com/NeurodataWithoutBorders/pynwb/pull/1811)
- Fixed tests to address newly caught validation errors. @rly [#1839](https://github.com/NeurodataWithoutBorders/pynwb/pull/1839)

### Bug fixes
- Fix bug where namespaces were loaded in "w-" mode. @h-mayorquin [#1795](https://github.com/NeurodataWithoutBorders/pynwb/pull/1795)
- Fix bug where pynwb version was reported as "unknown" to readthedocs @stephprince [#1810](https://github.com/NeurodataWithoutBorders/pynwb/pull/1810)

### Documentation and tutorial enhancements
- Add RemFile to streaming tutorial. @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761)
- Fix typos and improve clarity throughout tutorials. @zm711 [#1825](https://github.com/NeurodataWithoutBorders/pynwb/pull/1825)
- Add Zarr IO tutorial @bendichter [#1834](https://github.com/NeurodataWithoutBorders/pynwb/pull/1834)

## PyNWB 2.5.0 (August 18, 2023)

### Enhancements and minor changes
2 changes: 1 addition & 1 deletion docs/gallery/advanced_io/linking_data.py
@@ -221,7 +221,7 @@
# Step 2: Add the container to another NWBFile
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# To integrate both :py:meth:`~pynwb.base.TimeSeries` into a single file we simply create a new
# :py:meth:`~pynwb.file.NWBFile` and our existing :py:meth:`~pynwb.base.TimeSeries` to it. PyNWB's
# :py:meth:`~pynwb.file.NWBFile` and add our existing :py:meth:`~pynwb.base.TimeSeries` to it. PyNWB's
# :py:class:`~pynwb.NWBHDF5IO` backend then automatically detects that the TimeSeries have already
# been written to another file and will create external links for us.
#
161 changes: 161 additions & 0 deletions docs/gallery/advanced_io/plot_editing.py
@@ -0,0 +1,161 @@
"""
.. _editing:
Editing NWB files
=================
This tutorial demonstrates how to edit NWB files in-place to make small changes to
existing containers. To add or remove containers from an NWB file, see
:ref:`modifying_data`. How and whether it is possible to edit an NWB file depends on the
storage backend and the type of edit.
.. warning::
Manually editing an existing NWB file can make the file invalid if you are not
careful. We highly recommend making a copy before editing and running a validation
check on the file after editing it. See :ref:`validating`.
Editing datasets
----------------
When reading an HDF5 NWB file, PyNWB exposes :py:class:`h5py.Dataset` objects, which can
be edited in place. For this to work, you must open the file in read/write mode
(``"r+"`` or ``"a"``).
First, let's create an NWB file with data:
"""
from pynwb import NWBHDF5IO, NWBFile, TimeSeries
from datetime import datetime
from dateutil.tz import tzlocal
import numpy as np

nwbfile = NWBFile(
session_description="my first synthetic recording",
identifier="EXAMPLE_ID",
session_start_time=datetime.now(tzlocal()),
session_id="LONELYMTN",
)

nwbfile.add_acquisition(
TimeSeries(
name="synthetic_timeseries",
description="Random values",
data=np.random.randn(100, 100),
unit="m",
rate=10e3,
)
)

with NWBHDF5IO("test_edit.nwb", "w") as io:
io.write(nwbfile)

##############################################
# Now, let's edit the values of the dataset

with NWBHDF5IO("test_edit.nwb", "r+") as io:
nwbfile = io.read()
nwbfile.acquisition["synthetic_timeseries"].data[:10] = 0.0


##############################################
# You can edit the attributes of that dataset through the ``attrs`` attribute:

with NWBHDF5IO("test_edit.nwb", "r+") as io:
nwbfile = io.read()
nwbfile.acquisition["synthetic_timeseries"].data.attrs["unit"] = "volts"

##############################################
# Changing the shape of a dataset
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Whether it is possible to change the shape of a dataset depends on how the dataset was
# created. If the dataset was created with a flexible shape, then it is possible to
# change in-place. Creating a dataset with a flexible shape is done by specifying the
# ``maxshape`` argument of the :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO` class
# constructor. Using a ``None`` value for a component of the ``maxshape`` tuple allows
# the size of the corresponding dimension to grow, so that it can later be resized to any length
# in that dimension. Chunking is required for datasets with flexible shapes, so setting ``maxshape``
# automatically sets chunking to ``True`` if not specified.
#
# First, let's create an NWB file with a dataset with a flexible shape:

from hdmf.backends.hdf5.h5_utils import H5DataIO

nwbfile = NWBFile(
session_description="my first synthetic recording",
identifier="EXAMPLE_ID",
session_start_time=datetime.now(tzlocal()),
session_id="LONELYMTN",
)

data_io = H5DataIO(data=np.random.randn(100, 100), maxshape=(None, 100))

nwbfile.add_acquisition(
TimeSeries(
name="synthetic_timeseries",
description="Random values",
data=data_io,
unit="m",
rate=10e3,
)
)

with NWBHDF5IO("test_edit2.nwb", "w") as io:
io.write(nwbfile)

##############################################
# The ``None`` value in the first component of ``maxshape`` means that the
# first dimension of the dataset is unlimited. By setting the second dimension
# of ``maxshape`` to ``100``, that dimension is fixed to be no larger than ``100``.
# If you do not specify a ``maxshape``, then the shape of the dataset will be fixed
# to the shape that the dataset was created with. Here, you can change the shape of
# the first dimension of this dataset.


with NWBHDF5IO("test_edit2.nwb", "r+") as io:
nwbfile = io.read()
nwbfile.acquisition["synthetic_timeseries"].data.resize((200, 100))

##############################################
# This will change the shape of the dataset in-place. If you try to change the shape of
# a dataset with a fixed shape, you will get an error.
#
# .. note::
# There are several types of dataset edits that cannot be done in-place: changing the
# shape of a dataset with a fixed shape, or changing the datatype, compression,
# chunking, max-shape, or fill-value of a dataset. For any of these, we recommend using
# the :py:meth:`pynwb.NWBHDF5IO.export` method to export the data to a new file. See
# :ref:`modifying_data` for more information.
#
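# As a rough sketch of that export workflow (the output file name
# ``"test_edit_export.nwb"`` is an illustrative choice, not part of the original
# tutorial), the data can be copied into a fresh file like this:

with NWBHDF5IO("test_edit.nwb", "r") as read_io:
    nwbfile = read_io.read()
    with NWBHDF5IO("test_edit_export.nwb", "w") as export_io:
        # link_data=False copies the datasets instead of linking back to the source file
        export_io.export(src_io=read_io, nwbfile=nwbfile, write_args={"link_data": False})

##############################################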
# Editing groups
# --------------
# Editing of groups is not yet supported in PyNWB.
# To edit the attributes of a group, open the file and edit it using :py:mod:`h5py`:

import h5py

with h5py.File("test_edit.nwb", "r+") as f:
f["acquisition"]["synthetic_timeseries"].attrs["description"] = "Random values in volts"

##############################################
# .. warning::
# Be careful not to edit values that will bring the file out of compliance with the
# NWB specification.
#
# Renaming groups and datasets
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Rename groups and datasets in-place using the :py:meth:`~h5py.Group.move` method. For example, to rename
# the ``"synthetic_timeseries"`` group:

with h5py.File("test_edit.nwb", "r+") as f:
f["acquisition"].move("synthetic_timeseries", "synthetic_timeseries_renamed")

##############################################
# You can use this same technique to move a group or dataset to a different location in
# the file. For example, to move the ``"synthetic_timeseries_renamed"`` group to the
# ``"analysis"`` group:

with h5py.File("test_edit.nwb", "r+") as f:
f["acquisition"].move(
"synthetic_timeseries_renamed",
"/analysis/synthetic_timeseries_renamed",
)
40 changes: 20 additions & 20 deletions docs/gallery/advanced_io/plot_iterative_write.py
@@ -17,7 +17,7 @@
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# In the typical write process, datasets are created and written as a whole. In contrast,
# iterative data write refers to the writing of the content of a dataset in an incremental,
# iterative data write refers to the writing of the contents of a dataset in an incremental,
# iterative fashion.

####################
@@ -32,10 +32,10 @@
# to avoid this problem by writing the data one-subblock-at-a-time, so that we only need to hold
# a small subset of the array in memory at any given time.
# * **Data streaming** In the context of streaming data we are faced with several issues:
# **1)** data is not available in memory but arrives in subblocks as the stream progresses
# **1)** data is not available in-memory but arrives in subblocks as the stream progresses
# **2)** caching the data of a stream in-memory is often prohibitively expensive and volatile
# **3)** the total size of the data is often unknown ahead of time.
# Iterative data write allows us to address issues 1) and 2) by enabling us to save data to
# Iterative data write allows us to address issues 1) and 2) by enabling us to save data to a
# file incrementally as it arrives from the data stream. Issue 3) is addressed in the HDF5
# storage backend via support for chunking, enabling the creation of resizable arrays.
#
@@ -44,7 +44,7 @@
# data source.
#
# * **Sparse data arrays** In order to reduce storage size of sparse arrays a challenge is that while
# the data array (e.g., a matrix) may be large, only few values are set. To avoid storage overhead
# the data array (e.g., a matrix) may be large, only a few values are set. To avoid storage overhead
# for storing the full array we can employ (in HDF5) a combination of chunking, compression,
# and iterative data write to significantly reduce storage cost for sparse data.
#
@@ -161,7 +161,7 @@ def write_test_file(filename, data, close_io=True):
#
# Here we use a simple data generator but PyNWB does not make any assumptions about what happens
# inside the generator. Instead of creating data programmatically, you may hence, e.g., receive
# data from an acquisition system (or other source). We can, hence, use the same approach to write streaming data.
# data from an acquisition system (or other source). We can use the same approach to write streaming data.
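# As a compact sketch of this pattern (the ``example_stream`` generator and its
# parameters below are illustrative placeholders; the tutorial's own generator and
# write steps follow):

from hdmf.data_utils import DataChunkIterator
from pynwb import TimeSeries
import numpy as np


def example_stream(n_chunks=100, chunk_length=10):
    # Hypothetical data source: yield one row of the final array per iteration
    for _ in range(n_chunks):
        yield np.random.randn(chunk_length)


wrapped_stream = DataChunkIterator(data=example_stream(), maxshape=(None, 10))
streamed_series = TimeSeries(
    name="streamed_series",
    data=wrapped_stream,  # the iterator is consumed at write time, one chunk at a time
    unit="n/a",
    rate=10.0,
    description="Sketch of wrapping a generator for iterative write",
)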

####################
# Step 1: Define the data generator
Expand Down Expand Up @@ -208,7 +208,7 @@ def iter_sin(chunk_length=10, max_chunks=100):
####################
# Discussion
# ^^^^^^^^^^
# Note, we here actually do not know how long our timeseries will be.
# Note, here we don't actually know how long our timeseries will be.

print(
"maxshape=%s, recommended_data_shape=%s, dtype=%s"
@@ -218,7 +218,7 @@ def iter_sin(chunk_length=10, max_chunks=100):
####################
# As we can see, :py:class:`~hdmf.data_utils.DataChunkIterator` automatically recommends
# in its ``maxshape`` that the first dimension of our array should be unlimited (``None``) and the second
# dimension be ``10`` (i.e., the length of our chunk. Since :py:class:`~hdmf.data_utils.DataChunkIterator`
# dimension should be ``10`` (i.e., the length of our chunk). Since :py:class:`~hdmf.data_utils.DataChunkIterator`
# has no way of knowing the minimum size of the array, it automatically recommends the size of the first
# chunk as the minimum size (i.e., ``(1, 10)``) and also infers the data type automatically from the first chunk.
# To further customize this behavior we may also define the ``maxshape``, ``dtype``, and ``buffer_size`` when
@@ -227,8 +227,8 @@ def iter_sin(chunk_length=10, max_chunks=100):
# .. tip::
#
# We here used :py:class:`~hdmf.data_utils.DataChunkIterator` to conveniently wrap our data stream.
# :py:class:`~hdmf.data_utils.DataChunkIterator` assumes that our generators yields in **consecutive order**
# **single** complete element along the **first dimension** of our a array (i.e., iterate over the first
# :py:class:`~hdmf.data_utils.DataChunkIterator` assumes that our generator yields in **consecutive order**
# a **single** complete element along the **first dimension** of our array (i.e., iterate over the first
# axis and yield one-element-at-a-time). This behavior is useful in many practical cases. However, if
# this strategy does not match our needs, then using :py:class:`~hdmf.data_utils.GenericDataChunkIterator`
# or implementing your own derived :py:class:`~hdmf.data_utils.AbstractDataChunkIterator` may be more
@@ -266,7 +266,7 @@ def __next__(self):
"""
Return in each iteration a fully occupied data chunk of self.chunk_shape values at a random
location within the matrix. Chunks are non-overlapping. REMEMBER: h5py does not support all
fancy indexing that numpy does so we need to make sure our selection can be
the fancy indexing that numpy does so we need to make sure our selection can be
handled by the backend.
"""
if self.__chunks_created < self.num_chunks:
@@ -289,7 +289,7 @@ def __next__(self):
next = __next__

def recommended_chunk_shape(self):
# Here we can optionally recommend what a good chunking should be.
# Here we can optionally recommend what a good chunking could be.
return self.chunk_shape

def recommended_data_shape(self):
@@ -379,7 +379,7 @@ def maxshape(self):
# Now let's check out the size of our data file and compare it against the expected full size of our matrix
import os

expected_size = xsize * ysize * 8 # This is the full size of our matrix in byte
expected_size = xsize * ysize * 8 # This is the full size of our matrix in bytes
occupied_size = num_values * 8 # Size of the non-zero values in our matrix in bytes
file_size = os.stat(
"basic_sparse_iterwrite_example.nwb"
@@ -420,27 +420,27 @@ def maxshape(self):
# A slight overhead (here 0.08MB) is expected because our file also contains the additional objects from
# the NWBFile, plus some overhead for managing all the HDF5 metadata for all objects.
# * **3) vs 2):** Adding compression does not yield any improvement here. This is expected, because, again we
# selected the chunking here in a way that we already allocated the minimum amount of storage to represent our data
# selected the chunking here in a way that we already allocated the minimum amount of storage to represent our data
# and lossless compression of random data is not efficient.
# * **4) vs 2):** When we increase our chunk size to ``(100,100)`` (i.e., ``100x`` larger than the chunks produced by
# our matrix generator) we observe an according roughly ``100x`` increase in file size. This is expected
# our matrix generator) we observe an accordingly roughly ``100x`` increase in file size. This is expected
# since our chunks now do not align perfectly with the occupied data and each occupied chunk is allocated fully.
# * **5) vs 4):** When using compression for the larger chunks we see a significant reduction
# in file size (``1.14MB`` vs. ``80MB``). This is because the allocated chunks now contain in addition to the random
# values large areas of constant fillvalues, which compress easily.
# values large areas of constant fill values, which compress easily.
#
# **Advantages:**
#
# * We only need to hold one :py:class:`~hdmf.data_utils.DataChunk` in memory at any given time
# * Only the data chunks in the HDF5 file that contain non-default values are ever being allocated
# * The overall size of our file is reduced significantly
# * Reduced I/O load
# * On read users can use the array as usual
# * On read, users can use the array as usual
#
# .. tip::
#
# With great power comes great responsibility **!** I/O and storage cost will depend among others on the chunk size,
# compression options, and the write pattern, i.e., the number and structure of the
# With great power comes great responsibility **!** I/O and storage cost will depend, among other factors,
# on the chunk size, compression options, and the write pattern, i.e., the number and structure of the
# :py:class:`~hdmf.data_utils.DataChunk` objects written. For example, using ``(1,1)`` chunks and writing them
# one value at a time would result in poor I/O performance in most practical cases, because of the large number of
# chunks and large number of small I/O operations required.
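# A brief sketch of how such options are requested (the array below is a stand-in
# for the tutorial's sparse-matrix iterator, and the ``chunks`` and
# ``compression_opts`` values are illustrative):

from hdmf.backends.hdf5.h5_utils import H5DataIO
import numpy as np

wrapped_sparse = H5DataIO(
    data=np.random.randn(1000, 1000),  # placeholder data; a DataChunkIterator also works here
    chunks=(100, 100),                 # allocate storage in 100 x 100 blocks
    compression="gzip",                # losslessly compress each allocated chunk
    compression_opts=4,
)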
@@ -471,7 +471,7 @@ def maxshape(self):
#
# When converting large data files, a typical problem is that it is often too expensive to load all the data
# into memory. This example is very similar to the data generator example, except that instead of generating
# data on-the-fly in memory we are loading data from a file one-chunk-at-a-time in our generator.
# data on-the-fly in-memory we are loading data from a file one-chunk-at-a-time in our generator.
#
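# A minimal sketch of such a generator (the ``numpy.memmap`` call and raw-binary file
# layout are assumptions for illustration; the tutorial's own ``iter_largearray`` helper
# follows below):

import numpy as np


def iter_rows_from_file(filename, shape, dtype="float64"):
    # Memory-map the file so that only the requested row is read into memory
    arr = np.memmap(filename, dtype=dtype, mode="r", shape=shape)
    for i in range(shape[0]):
        yield np.array(arr[i, :])  # copy one row at a time out of the memmap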

####################
@@ -568,7 +568,7 @@ def iter_largearray(filename, shape, dtype="float64"):
# In practice, data from recording devices may be distributed across many files, e.g., one file per time range
# or one file per recording channel. Using iterative data write provides an elegant solution to this problem
# as it allows us to process large arrays one-subarray-at-a-time. To make things more interesting we'll show
# this for the case where each recording channel (i.e, the second dimension of our ``TimeSeries``) is broken up
# this for the case where each recording channel (i.e., the second dimension of our ``TimeSeries``) is broken up
# across files.
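# As a sketch of one possible approach (the file layout, names, and dtype are
# assumptions; it mirrors the ``AbstractDataChunkIterator`` pattern of the
# sparse-matrix example above, returning one ``DataChunk`` per channel whose
# selection targets that channel's column):

from hdmf.data_utils import AbstractDataChunkIterator, DataChunk
import numpy as np


class ChannelFileIterator(AbstractDataChunkIterator):
    """Read one per-channel data file per iteration (illustrative sketch)."""

    def __init__(self, channel_files, num_timesteps):
        self.channel_files = channel_files
        self.num_timesteps = num_timesteps
        self.__curr_channel = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.__curr_channel >= len(self.channel_files):
            raise StopIteration
        i = self.__curr_channel
        self.__curr_channel += 1
        data = np.load(self.channel_files[i]).reshape(self.num_timesteps, 1)
        # Write this chunk into column i of the target dataset
        return DataChunk(data=data, selection=np.s_[0:self.num_timesteps, i:i + 1])

    next = __next__

    def recommended_chunk_shape(self):
        return (self.num_timesteps, 1)

    def recommended_data_shape(self):
        return (self.num_timesteps, len(self.channel_files))

    @property
    def dtype(self):
        return np.dtype("float64")

    @property
    def maxshape(self):
        return (self.num_timesteps, len(self.channel_files))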

####################
