Skip to content

Commit

Permalink
Fix #1948 Add docs for using the family file driver with PyNWB
Browse files Browse the repository at this point in the history
  • Loading branch information
oruebel committed Aug 19, 2024
1 parent 3792136 commit 9be7a91
Showing 1 changed file with 103 additions and 12 deletions.
115 changes: 103 additions & 12 deletions docs/gallery/advanced_io/linking_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
Creating test data
---------------------------
^^^^^^^^^^^^^^^^^^
In the following we are creating two :py:class:`~pynwb.base.TimeSeries` each written to a separate file.
We then show how we can integrate these files into a single NWBFile.
Expand Down Expand Up @@ -105,12 +105,12 @@

#####################
# Linking to select datasets
# --------------------------
# ^^^^^^^^^^^^^^^^^^^^^^^^^^
#

####################
# Step 1: Create the new NWBFile
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Create the first file
nwbfile4 = NWBFile(
Expand All @@ -122,7 +122,7 @@

####################
# Step 2: Get the dataset you want to link to
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Now let's open our test files and retrieve our timeseries.
#

Expand All @@ -134,7 +134,7 @@

####################
# Step 3: Create the object you want to link to the data
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# To link to the dataset we can simply assign the data object (here `` timeseries_1.data``) to a new ``TimeSeries``

Expand Down Expand Up @@ -167,7 +167,7 @@

####################
# Step 4: Write the data
# ^^^^^^^^^^^^^^^^^^^^^^^
# ~~~~~~~~~~~~~~~~~~~~~~~~
#
with NWBHDF5IO(filename4, "w") as io4:
# Use link_data=True to specify default behavior to link rather than copy data
Expand All @@ -185,7 +185,7 @@

####################
# Linking to whole Containers
# ---------------------------
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Appending to files and linking is made possible by passing around the same
# :py:class:`~hdmf.build.manager.BuildManager`. You can get a manager to pass around
Expand All @@ -203,7 +203,7 @@

####################
# Step 1: Get the container object you want to link to
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Now let's open our test files and retrieve our timeseries.
#

Expand All @@ -219,7 +219,7 @@

####################
# Step 2: Add the container to another NWBFile
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To integrate both :py:meth:`~pynwb.base.TimeSeries` into a single file we simply create a new
# :py:meth:`~pynwb.file.NWBFile` and add our existing :py:meth:`~pynwb.base.TimeSeries` to it. PyNWB's
# :py:class:`~pynwb.NWBHDF5IO` backend then automatically detects that the TimeSeries have already
Expand Down Expand Up @@ -247,7 +247,7 @@
# ------------------------------
#
# Using the :py:func:`~pynwb.file.NWBFile.copy` method allows us to easily create a shallow copy
# of a whole NWB:N file with links to all data in the original file. For example, we may want to
# of a whole NWB file with links to all data in the original file. For example, we may want to
# store processed data in a new file separate from the raw data, while still being able to access
# the raw data. See the :ref:`scratch` tutorial for a detailed example.
#
Expand All @@ -259,5 +259,96 @@
# External links are convenient but to share data we may want to hand a single file with all the
# data to our collaborator rather than having to collect all relevant files. To do this,
# :py:class:`~hdmf.backends.hdf5.h5tools.HDF5IO` (and in turn :py:class:`~pynwb.NWBHDF5IO`)
# provide the convenience function :py:meth:`~hdmf.backends.hdf5.h5tools.HDF5IO.copy_file`,
# which copies an HDF5 file and resolves all external links.
# provide the convenience function :py:meth:`~hdmf.backends.hdf5.h5tools.HDF5IO.export`,
# which can copy the file and resolve all external links.


####################
# Automatically split large data across multiple HDF5 files
# -------------------------------------------------------------------
#
# For extremely large datasets it can be useful to split data across multiple files, e.g., in cases where
# the file system does not allow for large files. While we can
# achieve this by writing different components (e.g., `TimeSeries`) to different files as described above,
# this option does not allow splitting data from single datasets. An alternative option is to use the
# `family` driver in `h5py` to automatically split the NWB file into a collection of many HDF5 files.
# The `family` driver stores the file on disk as a series of fixed-length chunks (each in its own file).
#

####################
# Step 1: Create the `NWBFile` as usual
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Imports for this self-contained example. ``uuid4`` is needed below for the
# file identifier but was missing from the original import block, which would
# raise a NameError when this section is run on its own.
from datetime import datetime
from uuid import uuid4

import numpy as np

from pynwb import NWBFile
from pynwb.base import TimeSeries

# Create an NWBFile object with a globally unique identifier and a
# timezone-aware session start time.
nwbfile = NWBFile(
    description='example file family',
    identifier=str(uuid4()),
    session_start_time=datetime.now().astimezone(),
)

# Create some example data
data = np.random.rand(500000)  # Example large dataset
timestamps = np.arange(500000) / 1000.0  # Example timestamps in seconds

# Create a TimeSeries object holding the example data
time_series = TimeSeries(
    name='example_timeseries',
    data=data,
    unit='mV',
    timestamps=timestamps,
)

# Add the TimeSeries to the acquisition group of the NWBFile
nwbfile.add_acquisition(time_series)

####################
# Step 2: Open the new file with the `family` driver and write
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Here we need to open the file with `h5py` first to set up the driver, and then we can use
# that file with :py:class:`pynwb.NWBHDF5IO`. This is required, because :py:class:`pynwb.NWBHDF5IO`
# currently does not support passing the `memb_size` option required by the `family` driver.

# Open the file with h5py first so that the family-driver options (notably
# ``memb_size``) can be set; NWBHDF5IO does not currently expose them.
import h5py
from pynwb import NWBHDF5IO

# Define the size of the individual member files; this (together with the total
# data size) determines how many files the family driver creates.
# chunk_size = 1 * 1024**3  # 1GB per file — a realistic production value
chunk_size = 1024 * 1024  # 1MB per file, kept small here for testing

# Create the HDF5 file using the family driver. The ``%d`` in the filename is
# replaced by the member-file sequence number (0, 1, 2, ...).
with h5py.File('family_nwb_file_%d.h5', 'w', driver='family', memb_size=chunk_size) as f:

    # Hand the already-open h5py.File to NWBHDF5IO to write the NWBFile
    with NWBHDF5IO(file=f, mode='w') as io:
        io.write(nwbfile)

####################
# Step 3: Read a file written with the `family` driver
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#


# Open the HDF5 file using the family driver. ``memb_size`` must be the same
# value that was used when the file was written.
with h5py.File('family_nwb_file_%d.h5', 'r', driver='family', memb_size=chunk_size) as f:
    # Use NWBHDF5IO to read the NWBFile from the HDF5 file.
    # (``manager=None`` is the default and has been dropped as redundant.)
    with NWBHDF5IO(file=f, mode='r') as io:
        nwbfile = io.read()
        print(nwbfile)


####################
# .. note::
#
# The filename you provide when using the `family` driver must contain a printf-style integer format code
# (e.g., ``%d``), which will be replaced by the file sequence number.
#
# .. note::
#
# The `memb_size` parameter must be set on both write and read. As such, reading the file requires
# the user to know the `memb_size` that was used for writing the file.
#
# .. note::
#
# The DANDI archive may not support NWB files that are split in this fashion.

0 comments on commit 9be7a91

Please sign in to comment.