Merge branch 'dev' into link_error
mavaylon1 authored Nov 18, 2024
2 parents 241bb50 + b69c3e0 commit 9d2f673
Showing 14 changed files with 205 additions and 53 deletions.
13 changes: 0 additions & 13 deletions .github/workflows/black.yml

This file was deleted.

4 changes: 2 additions & 2 deletions .readthedocs.yaml
@@ -6,9 +6,9 @@
version: 2

build:
- os: ubuntu-20.04
+ os: ubuntu-22.04
tools:
- python: '3.9'
+ python: '3.12'

# Build documentation in the docs/ directory with Sphinx
sphinx:
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,9 +1,15 @@
# HDMF-ZARR Changelog

## 1.0.0 (Upcoming)

### Enhancements
* Added initial refactor of export, supporting references and internal/external links from Zarr to Zarr. This introduces breaking changes that may render existing exported files invalid. This update removes '.' as the default object file source. @mavaylon1 [#194](https://github.com/hdmf-dev/hdmf-zarr/pull/194)
* Added support for Pathlib paths. @mavaylon1 [#212](https://github.com/hdmf-dev/hdmf-zarr/pull/212)
* Updated packages used for testing and readthedocs configuration. @mavaylon1, @rly [#214](https://github.com/hdmf-dev/hdmf-zarr/pull/214)
+ * Add `force_overwrite` parameter for `ZarrIO.__init__` to allow overwriting an existing file or directory. @oruebel [#229](https://github.com/hdmf-dev/hdmf-zarr/pull/229)

+ ### Bug Fixes
+ * Fix reading of cached specs and caching of specs during export. @rly [#232](https://github.com/hdmf-dev/hdmf-zarr/pull/232)

## 0.9.0 (September 16, 2024)
### Enhancements
@@ -12,6 +18,7 @@
* Added test for opening file with consolidated metadata from DANDI. @mavaylon1 [#206](https://github.com/hdmf-dev/hdmf-zarr/pull/206)
* Add dimension labels compatible with xarray. @mavaylon1 [#207](https://github.com/hdmf-dev/hdmf-zarr/pull/207)
* Added link_data --> clear_cache relationship to support repacking Zarr NWB files. [#215](https://github.com/hdmf-dev/hdmf-zarr/pull/215)
+ * Added `NWBZarrIO.read_nwb` convenience method to simplify reading an NWB file. @oruebel [#226](https://github.com/hdmf-dev/hdmf-zarr/pull/226)

## 0.8.0 (June 4, 2024)
### Bug Fixes
1 change: 0 additions & 1 deletion docs/source/installation.rst
@@ -32,7 +32,6 @@ in a Conda environment. Normally we don't need to install ``hdmf`` directly, but
conda create --name hdmf-zarr-test python=3.9
conda activate hdmf-zarr-test
conda install h5py
git clone --recurse-submodules https://github.com/hdmf-dev/hdmf.git
cd hdmf
2 changes: 1 addition & 1 deletion requirements-dev.txt
@@ -12,4 +12,4 @@ pytest==7.4.3
pytest-cov==4.1.0
python-dateutil==2.8.2
ruff==0.1.3
- tox==4.11.3
+ tox==4.11.3
6 changes: 3 additions & 3 deletions requirements-opt.txt
@@ -1,3 +1,3 @@
- tqdm==4.66.4
- fsspec==2024.6.0
- s3fs==2024.6.0
+ tqdm==4.67.0
+ fsspec==2024.10.0
+ s3fs==2024.10.0
12 changes: 6 additions & 6 deletions requirements.txt
@@ -1,7 +1,7 @@
# pinned dependencies to reproduce an entire development environment to use HDMF-ZARR
- hdmf==3.14.2
- zarr==2.16.1
- pynwb==2.5.0
- numpy==2.0.0
- numcodecs==0.12.1
- threadpoolctl==3.2.0
+ hdmf==3.14.5
+ zarr==2.18.3
+ pynwb==2.8.2
+ numpy==2.1.3
+ numcodecs==0.13.1
+ threadpoolctl==3.5.0
53 changes: 44 additions & 9 deletions src/hdmf_zarr/backend.py
@@ -1,6 +1,7 @@
"""Module with the Zarr-based I/O-backend for HDMF"""
# Python imports
import os
+ import shutil
import warnings
import numpy as np
import tempfile
@@ -91,7 +92,7 @@ def can_read(path):
'doc': 'the path to the Zarr file or a supported Zarr store'},
{'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O', 'default': None},
{'name': 'mode', 'type': str,
- 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "w-") '
+ 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "r-"). '
'the mode r- is used to force open without consolidated metadata in read only mode.'},
{'name': 'synchronizer', 'type': (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool),
'doc': 'Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.',
@@ -102,11 +103,18 @@ def can_read(path):
'default': None},
{'name': 'storage_options', 'type': dict,
'doc': 'Zarr storage options to read remote folders',
- 'default': None})
+ 'default': None},
+ {'name': 'force_overwrite',
+  'type': bool,
+  'doc': "force overwriting an existing object when in 'w' mode. The existing file or directory"
+         " will be deleted before opening (even if the object is not Zarr, e.g., an HDF5 file)",
+  'default': False}
+ )
def __init__(self, **kwargs):
self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
- path, manager, mode, synchronizer, object_codec_class, storage_options = popargs(
-     'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', 'storage_options', kwargs)
+ path, manager, mode, synchronizer, object_codec_class, storage_options, force_overwrite = popargs(
+     'path', 'manager', 'mode', 'synchronizer', 'object_codec_class',
+     'storage_options', 'force_overwrite', kwargs)
if manager is None:
manager = BuildManager(TypeMap(NamespaceCatalog()))
if isinstance(synchronizer, bool):
@@ -118,6 +126,7 @@ def __init__(self, **kwargs):
else:
self.__synchronizer = synchronizer
self.__mode = mode
+ self.__force_overwrite = force_overwrite
if isinstance(path, Path):
path = str(path)
self.__path = path
@@ -160,25 +169,44 @@ def synchronizer(self):
def object_codec_class(self):
return self.__codec_cls

+ @property
+ def mode(self):
+     """
+     The mode specified by the user when creating the ZarrIO instance.
+     NOTE: The Zarr library may not honor the mode. E.g., DirectoryStore in Zarr uses
+     append mode and does not allow setting a file to read-only mode.
+     """
+     return self.__mode

def open(self):
"""Open the Zarr file"""
if self.__file is None:
+ # Allow overwriting an existing file (e.g., an HDF5 file). Zarr will normally fail if the
+ # existing object at the path is a file. So if we are in `w` mode we need to delete the file first
+ if self.mode == 'w' and self.__force_overwrite:
+     if isinstance(self.path, (str, Path)) and os.path.exists(self.path):
+         if os.path.isdir(self.path):  # directory
+             shutil.rmtree(self.path)
+         else:  # File
+             os.remove(self.path)

# Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'.
# As a result, when in other modes, the file will not use consolidated metadata.
- if self.__mode != 'r':
+ if self.mode != 'r':
# When we consolidate metadata, we use ConsolidatedMetadataStore.
# This interface does not allow for setting items.
# In the doc string, it says it is "read only". As a result, we cannot use r+ with consolidate_metadata.
# r- is only an internal mode in ZarrIO to force the use of regular open. For Zarr we need to
# use the regular mode r when r- is specified
- mode_to_use = self.__mode if self.__mode != 'r-' else 'r'
+ mode_to_use = self.mode if self.mode != 'r-' else 'r'
self.__file = zarr.open(store=self.path,
mode=mode_to_use,
synchronizer=self.__synchronizer,
storage_options=self.__storage_options)
else:
self.__file = self.__open_file_consolidated(store=self.path,
- mode=self.__mode,
+ mode=self.mode,
synchronizer=self.__synchronizer,
storage_options=self.__storage_options)
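
A minimal usage sketch of the new `force_overwrite` behavior (hypothetical path and example container, not part of this diff; `Path` objects are accepted per the Pathlib support noted in the changelog):

from pathlib import Path
from hdmf.common import DynamicTable, VectorData, get_manager
from hdmf_zarr import ZarrIO

# build a small example container to write
table = DynamicTable(
    name="example_table",
    description="demo table",
    columns=[VectorData(name="col1", description="demo column", data=[1, 2, 3])],
)

# mode='w' with force_overwrite=True deletes whatever already exists at the path,
# even a non-Zarr object such as an old HDF5 file, before opening
with ZarrIO(Path("demo.zarr"), manager=get_manager(), mode="w", force_overwrite=True) as io:
    io.write(table)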

@@ -343,9 +371,9 @@ def export(self, **kwargs):
"""Export data read from a file from any backend to Zarr.
See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details.
"""
- if self.__mode != 'w':
+ if self.mode != 'w':
raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'."
- % (self.source, self.__mode))
+ % (self.source, self.mode))

src_io = getargs('src_io', kwargs)
write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs)
@@ -371,6 +399,13 @@ def export(self, **kwargs):
ckwargs['clear_cache'] = True
super().export(**ckwargs)
if cache_spec:
+ # add any namespaces from the src_io that have not yet been loaded
+ for namespace in src_io.manager.namespace_catalog.namespaces:
+     if namespace not in self.manager.namespace_catalog.namespaces:
+         self.manager.namespace_catalog.add_namespace(
+             name=namespace,
+             namespace=src_io.manager.namespace_catalog.get_namespace(namespace)
+         )
self.__cache_spec()

def get_written(self, builder, check_on_disk=False):
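Given these changes, a Zarr-to-Zarr export with spec caching might look roughly like the following sketch (hypothetical file names; `cache_spec` defaults to True):

from hdmf.common import get_manager
from hdmf_zarr import ZarrIO

with ZarrIO("source.zarr", manager=get_manager(), mode="r") as src_io:
    with ZarrIO("exported.zarr", mode="w") as export_io:
        # with cache_spec=True, namespaces loaded by src_io but not yet known
        # to the destination's BuildManager are now carried over as well
        export_io.export(src_io=src_io, cache_spec=True)

The destination must be opened with mode 'w'; any other mode raises UnsupportedOperation per the check above.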
28 changes: 27 additions & 1 deletion src/hdmf_zarr/nwb.py
@@ -1,6 +1,7 @@
"""Module with Zarr backend for NWB for integration with PyNWB"""
from warnings import warn
- from .backend import ZarrIO
+ from pathlib import Path
+ from .backend import ZarrIO, SUPPORTED_ZARR_STORES

from hdmf.utils import (docval,
popargs,
@@ -63,5 +64,30 @@ def export(self, **kwargs):
kwargs['container'] = nwbfile
super().export(**kwargs)

+ @staticmethod
+ @docval({'name': 'path',
+          'type': (str, Path, *SUPPORTED_ZARR_STORES),
+          'doc': 'the path to the Zarr file or a supported Zarr store'},
+         is_method=False)
+ def read_nwb(**kwargs):
+     """
+     Helper factory method for reading an NWB file and returning the NWBFile object
+     """
+     # Retrieve the filepath
+     path = popargs('path', kwargs)
+     if isinstance(path, Path):
+         path = str(path)
+     # determine default storage options to use when opening a file from S3
+     storage_options = {}
+     if isinstance(path, str) and path.startswith("s3://"):
+         storage_options = dict(anon=True)
+
+     # open the file with NWBZarrIO and read the file
+     io = NWBZarrIO(path=path, mode="r", load_namespaces=True, storage_options=storage_options)
+     nwbfile = io.read()
+
+     # return the NWBFile object
+     return nwbfile

except ImportError:
warn("PyNWB is not installed. Support for NWBZarrIO is disabled.")
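The new method can be used roughly as follows (the local path is hypothetical; the S3 URL is the one exercised by the tests below). Note that, as in the code above, the underlying NWBZarrIO is left open so data can still be read lazily:

from hdmf_zarr import NWBZarrIO

# local Zarr NWB file; a str or pathlib.Path works
nwbfile = NWBZarrIO.read_nwb("path/to/file.nwb.zarr")

# S3 URL; anonymous access (storage_options=dict(anon=True)) is applied automatically
nwbfile = NWBZarrIO.read_nwb(
    "s3://aind-open-data/ecephys_625749_2022-08-03_15-15-06_nwb_2023-05-16_16-34-55/"
    "ecephys_625749_2022-08-03_15-15-06_nwb/"
    "ecephys_625749_2022-08-03_15-15-06_experiment1_recording1.nwb.zarr/"
)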
16 changes: 9 additions & 7 deletions src/hdmf_zarr/utils.py
@@ -5,6 +5,7 @@
import math
import json
import logging
+ import os
from collections import deque
from collections.abc import Iterable
from typing import Optional, Union, Literal, Tuple, Dict, Any
@@ -373,12 +374,12 @@ class ZarrSpecReader(SpecReader):
Class to read format specs from Zarr
"""

- @docval({'name': 'group', 'type': Group, 'doc': 'the Zarr file to read specs from'},
-         {'name': 'source', 'type': str, 'doc': 'the path spec files are relative to', 'default': '.'})
+ @docval({'name': 'group', 'type': Group, 'doc': 'the Zarr file to read specs from'})
def __init__(self, **kwargs):
-     self.__group, source = getargs('group', 'source', kwargs)
-     super_kwargs = {'source': source}
-     super(ZarrSpecReader, self).__init__(**super_kwargs)
+     self.__group = getargs('group', kwargs)
+     source = "%s:%s" % (os.path.abspath(self.__group.store.path), self.__group.name)
+     super().__init__(source=source)
+     self.__cache = None

def __read(self, path):
s = self.__group[path][0]
@@ -391,8 +392,9 @@ def read_spec(self, spec_path):

def read_namespace(self, ns_path):
"""Read a namespace from the given path"""
- ret = self.__read(ns_path)
- ret = ret['namespaces']
+ if self.__cache is None:
+     self.__cache = self.__read(ns_path)
+ ret = self.__cache['namespaces']
return ret


9 changes: 6 additions & 3 deletions tests/unit/base_tests_zarrio.py
@@ -96,7 +96,10 @@ def setUp(self):

def tearDown(self):
if os.path.exists(self.store):
- shutil.rmtree(self.store)
+ if os.path.isdir(self.store):
+     shutil.rmtree(self.store)
+ else:  # in case a test created a file instead of a directory
+     os.remove(self.store)

def createReferenceBuilder(self):
data_1 = np.arange(100, 200, 10).reshape(2, 5)
@@ -116,9 +119,9 @@ def createReferenceBuilder(self):
'ref_dataset': dataset_ref})
return builder

- def create_zarr(self, consolidate_metadata=True):
+ def create_zarr(self, consolidate_metadata=True, force_overwrite=False, mode='a'):
builder = self.createReferenceBuilder()
- writer = ZarrIO(self.store, mode='a')
+ writer = ZarrIO(self.store, mode=mode, force_overwrite=force_overwrite)
writer.write_builder(builder, consolidate_metadata)
writer.close()

28 changes: 21 additions & 7 deletions tests/unit/test_fsspec_streaming.py
@@ -8,17 +8,21 @@


class TestFSSpecStreaming(unittest.TestCase):
- @unittest.skipIf(not HAVE_FSSPEC, "fsspec not installed")
- def test_fsspec_streaming(self):
+
+ def setUp(self):
# PLACEHOLDER test file from Allen Institute for Neural Dynamics
# TODO: store a small test file and use it to speed up testing
- remote_path = (
+ self.s3_aind_path = (
"s3://aind-open-data/ecephys_625749_2022-08-03_15-15-06_nwb_2023-05-16_16-34-55/"
"ecephys_625749_2022-08-03_15-15-06_nwb/"
"ecephys_625749_2022-08-03_15-15-06_experiment1_recording1.nwb.zarr/"
)
+ # DANDISET: 000719/icephys_9_27_2024
+ self.https_s3_path = "https://dandiarchive.s3.amazonaws.com/zarr/7515c603-9940-4598-aa1b-8bf32dc9b10c/"

- with NWBZarrIO(remote_path, mode="r", storage_options=dict(anon=True)) as io:
+ @unittest.skipIf(not HAVE_FSSPEC, "fsspec not installed")
+ def test_fsspec_streaming(self):
+     with NWBZarrIO(self.s3_aind_path, mode="r", storage_options=dict(anon=True)) as io:
nwbfile = io.read()

self.assertEqual(nwbfile.identifier, "ecephys_625749_2022-08-03_15-15-06")
@@ -32,10 +36,20 @@ def test_s3_open_with_consolidated_(self):
"""
The file is a Zarr file with consolidated metadata.
"""
s3_path = "https://dandiarchive.s3.amazonaws.com/zarr/ccefbc9f-30e7-4a4c-b044-5b59d300040b/"
with NWBZarrIO(s3_path, mode='r') as read_io:
with NWBZarrIO(self.https_s3_path, mode='r') as read_io:
read_io.open()
self.assertIsInstance(read_io.file.store, zarr.storage.ConsolidatedMetadataStore)
- with NWBZarrIO(s3_path, mode='-r') as read_io:
+ with NWBZarrIO(self.https_s3_path, mode='-r') as read_io:
read_io.open()
self.assertIsInstance(read_io.file.store, zarr.storage.FSStore)


+ @unittest.skipIf(not HAVE_FSSPEC, "fsspec not installed")
+ def test_fsspec_streaming_via_read_nwb(self):
+     """
+     Test reading from S3 using the convenience method NWBZarrIO.read_nwb
+     """
+     # Test with an s3:// URL
+     nwbfile = NWBZarrIO.read_nwb(self.s3_aind_path)
+     self.assertEqual(nwbfile.identifier, "ecephys_625749_2022-08-03_15-15-06")
+     self.assertEqual(nwbfile.institution, "AIND")
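
A rough local illustration of the consolidated-metadata behavior these tests exercise (hypothetical file, e.g., one written earlier with ZarrIO, which consolidates metadata on write by default; 'r-' is the mode documented in backend.py):

import zarr
from hdmf_zarr import ZarrIO

with ZarrIO("demo.zarr", mode="r") as io:   # consolidated metadata is used when present
    io.open()
    assert isinstance(io.file.store, zarr.storage.ConsolidatedMetadataStore)

with ZarrIO("demo.zarr", mode="r-") as io:  # force a regular, non-consolidated open
    io.open()
    assert not isinstance(io.file.store, zarr.storage.ConsolidatedMetadataStore)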