From 110f803a4689a96fc9eca672828075086378ebbf Mon Sep 17 00:00:00 2001
From: Jeremy Magland
Date: Fri, 5 Apr 2024 17:49:25 -0400
Subject: [PATCH 1/3] update readme

---
 README.md | 84 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 67 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 879608d..468e4c0 100644
--- a/README.md
+++ b/README.md
@@ -6,14 +6,17 @@
 
 :warning: Please note, LINDI is currently under development and should not yet be used in practice.
 
-LINDI is a Python library that facilitates handling NWB (Neurodata Without Borders) files in an efficient, flexible manner, especially when dealing with large datasets on remote servers. The goal is to enable composition of NWB files by integrating data from multiple sources without the need to copy or move large datasets.
+**HDF5 as Zarr as JSON for NWB**
 
-LINDI features include:
+LINDI provides a JSON representation of NWB (Neurodata Without Borders) data where the large data chunks are stored separately from the main metadata. This enables efficient storage, composition, and sharing of NWB files on cloud systems such as [DANDI](https://www.dandiarchive.org/) without duplicating the large data blobs.
+
+LINDI provides:
 
 - A specification for representing arbitrary HDF5 files as Zarr stores. This handles scalar datasets, references, soft links, and compound data types for datasets.
-- A Zarr wrapper for remote or local HDF5 files (LindiH5ZarrStore). This involves pointers to remote files for remote data chunks.
-- A function for generating a reference file system .zarr.json file from a Zarr store. This is inspired by [kerchunk](https://github.com/fsspec/kerchunk).
-- An h5py-like interface for accessing these Zarr stores that can be used with [pynwb](https://pynwb.readthedocs.io/en/stable/). Both read and write operations are supported.
+- A Zarr wrapper for remote or local HDF5 files (LindiH5ZarrStore).
+- A mechanism for creating .zarr.json (or .nwb.json) files that reference data chunks in external files, inspired by [kerchunk](https://github.com/fsspec/kerchunk).
+- An h5py-like interface for reading from and writing to these data sources that can be used with [pynwb](https://pynwb.readthedocs.io/en/stable/).
+- A mechanism for uploading and downloading these data sources to and from cloud storage, including DANDI.
 
 This project was inspired by [kerchunk](https://github.com/fsspec/kerchunk) and [hdmf-zarr](https://hdmf-zarr.readthedocs.io/en/latest/index.html) and depends on [zarr](https://zarr.readthedocs.io/en/stable/), [h5py](https://www.h5py.org/), [remfile](https://github.com/magland/remfile) and [numcodecs](https://numcodecs.readthedocs.io/en/stable/).
 
@@ -23,23 +26,29 @@ This project was inspired by [kerchunk](https://github.com/fsspec/kerchunk) and
 ## Installation
 
 ```bash
 pip install lindi
 ```
 
-Or install from source
+Or from source
 
 ```bash
 cd lindi
 pip install -e .
 ```
 
-## Example usage
+## Use cases
 
-```python
-# examples/example1.py
+* Represent a remote NWB/HDF5 file as a .nwb.json file.
+* Read a local or remote .nwb.json file using pynwb or other tools.
+* Edit a .nwb.json file using pynwb or other tools.
+* Add datasets to a .nwb.json file using a local staging area.
+* Upload a .nwb.json file to a cloud storage service such as DANDI.
+
+### Represent a remote NWB/HDF5 file as a .nwb.json file
 
+```python
 import json
 import pynwb
 import lindi
 
-# Define the URL for a remote NWB file
+# URL of the remote NWB file
 h5_url = "https://api.dandiarchive.org/api/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/download/"
 
 # Create a read-only Zarr store as a wrapper for the h5 file
@@ -61,18 +70,16 @@ with pynwb.NWBHDF5IO(file=client, mode="r") as io:
     print(nwbfile)
 ```
 
-Or if you already have a .zarr.json file prepared (loading is much faster)
+### Read a local or remote .nwb.json file using pynwb or other tools
 
 ```python
-# examples/example2.py
-
 import pynwb
 import lindi
 
-# Define the URL for a remote .zarr.json file
+# URL of the remote .zarr.json file
 url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
 
-# Load the h5py-like client from the reference file system
+# Load the h5py-like client for the reference file system
 client = lindi.LindiH5pyFile.from_reference_file_system(url)
 
 # Open using pynwb
@@ -81,9 +88,52 @@ with pynwb.NWBHDF5IO(file=client, mode="r") as io:
     print(nwbfile)
 ```
 
-## Mixing and matching data from multiple sources
+### Edit a .nwb.json file using pynwb or other tools
+
+```python
+import json
+import lindi
+
+# URL of the remote .zarr.json file
+url = 'https://lindi.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
+
+# Load the h5py-like client for the reference file system
+# in read-write mode
+client = lindi.LindiH5pyFile.from_reference_file_system(url, mode="r+")
+
+# Edit an attribute
+client.attrs['new_attribute'] = 'new_value'
+
+# Save the changes to a new .nwb.json file
+rfs_new = client.to_reference_file_system()
+with open('new.nwb.json', 'w') as f:
+    f.write(json.dumps(rfs_new, indent=2, sort_keys=True))
+```
+
+### Add datasets to a .nwb.json file using a local staging area
+
+```python
+import lindi
+
+# URL of the remote .zarr.json file
+url = 'https://lindi.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
+
+# Load the h5py-like client for the reference file system
+# in read-write mode with a staging area
+with lindi.StagingArea.create(base_dir='lindi_staging') as staging_area:
+    client = lindi.LindiH5pyFile.from_reference_file_system(
+        url,
+        mode="r+",
+        staging_area=staging_area
+    )
+    # add datasets to client using pynwb or other tools
+    # upload the changes to the remote .nwb.json file
+```
+
+### Upload a .nwb.json file to a cloud storage service such as DANDI
+
+See [this example](https://github.com/magland/lindi-dandi/blob/main/devel/lindi_test_2.py).
 
-Once we have NWB files represented by relatively small reference file systems (e.g., .zarr.json files), we can begin to mix and match data from multiple sources. More on this to come.
 
 ## For developers
 

From 66b089606d181857742f8143e9c237a124503bcd Mon Sep 17 00:00:00 2001
From: Jeremy Magland
Date: Sat, 13 Apr 2024 07:42:22 -0400
Subject: [PATCH 2/3] Update README.md

---
 README.md | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 468e4c0..0185614 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ LINDI provides:
 
 - A specification for representing arbitrary HDF5 files as Zarr stores. This handles scalar datasets, references, soft links, and compound data types for datasets.
 - A Zarr wrapper for remote or local HDF5 files (LindiH5ZarrStore).
-- A mechanism for creating .zarr.json (or .nwb.json) files that reference data chunks in external files, inspired by [kerchunk](https://github.com/fsspec/kerchunk).
+- A mechanism for creating .lindi.json (or .nwb.lindi.json) files that reference data chunks in external files, inspired by [kerchunk](https://github.com/fsspec/kerchunk).
 - An h5py-like interface for reading from and writing to these data sources that can be used with [pynwb](https://pynwb.readthedocs.io/en/stable/).
 - A mechanism for uploading and downloading these data sources to and from cloud storage, including DANDI.
 
@@ -35,13 +35,13 @@ pip install -e .
 
 ## Use cases
 
-* Represent a remote NWB/HDF5 file as a .nwb.json file.
-* Read a local or remote .nwb.json file using pynwb or other tools.
-* Edit a .nwb.json file using pynwb or other tools.
-* Add datasets to a .nwb.json file using a local staging area.
-* Upload a .nwb.json file to a cloud storage service such as DANDI.
+* Represent a remote NWB/HDF5 file as a .nwb.lindi.json file.
+* Read a local or remote .nwb.lindi.json file using pynwb or other tools.
+* Edit a .nwb.lindi.json file using pynwb or other tools.
+* Add datasets to a .nwb.lindi.json file using a local staging area.
+* Upload a .nwb.lindi.json file to a cloud storage service such as DANDI.
 
-### Represent a remote NWB/HDF5 file as a .nwb.json file
+### Represent a remote NWB/HDF5 file as a .nwb.lindi.json file
 
 ```python
 import json
@@ -58,7 +58,7 @@ store = lindi.LindiH5ZarrStore.from_file(h5_url)
 rfs = store.to_reference_file_system()
 
 # Save it to a file for later use
-with open("example.zarr.json", "w") as f:
+with open("example.lindi.json", "w") as f:
     json.dump(rfs, f, indent=2)
 
 # Create an h5py-like client from the reference file system
@@ -70,13 +70,13 @@ with pynwb.NWBHDF5IO(file=client, mode="r") as io:
     print(nwbfile)
 ```
 
-### Read a local or remote .nwb.json file using pynwb or other tools
+### Read a local or remote .nwb.lindi.json file using pynwb or other tools
 
 ```python
 import pynwb
 import lindi
 
-# URL of the remote .zarr.json file
+# URL of the remote .lindi.json file
 url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
 
 # Load the h5py-like client for the reference file system
@@ -88,13 +88,13 @@ with pynwb.NWBHDF5IO(file=client, mode="r") as io:
     print(nwbfile)
 ```
 
-### Edit a .nwb.json file using pynwb or other tools
+### Edit a .nwb.lindi.json file using pynwb or other tools
 
 ```python
 import json
 import lindi
 
-# URL of the remote .zarr.json file
+# URL of the remote .lindi.json file
 url = 'https://lindi.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
 
 # Load the h5py-like client for the reference file system
@@ -104,18 +104,18 @@ client = lindi.LindiH5pyFile.from_reference_file_system(url, mode="r+")
 # Edit an attribute
 client.attrs['new_attribute'] = 'new_value'
 
-# Save the changes to a new .nwb.json file
+# Save the changes to a new .nwb.lindi.json file
 rfs_new = client.to_reference_file_system()
-with open('new.nwb.json', 'w') as f:
+with open('new.nwb.lindi.json', 'w') as f:
     f.write(json.dumps(rfs_new, indent=2, sort_keys=True))
 ```
 
-### Add datasets to a .nwb.json file using a local staging area
+### Add datasets to a .nwb.lindi.json file using a local staging area
 
 ```python
 import lindi
 
-# URL of the remote .zarr.json file
+# URL of the remote .lindi.json file
 url = 'https://lindi.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
 
 # Load the h5py-like client for the reference file system
@@ -127,14 +127,13 @@ with lindi.StagingArea.create(base_dir='lindi_staging') as staging_area:
         staging_area=staging_area
     )
     # add datasets to client using pynwb or other tools
-    # upload the changes to the remote .nwb.json file
+    # upload the changes to the remote .nwb.lindi.json file
 ```
 
-### Upload a .nwb.json file to a cloud storage service such as DANDI
+### Upload a .nwb.lindi.json file to a cloud storage service such as DANDI
 
 See [this example](https://github.com/magland/lindi-dandi/blob/main/devel/lindi_test_2.py).
 
-
 ## For developers
 
 [Special Zarr annotations used by LINDI](docs/special_zarr_annotations.md)

From 599502ffee58211f7fb9a5d51068a578486d93ce Mon Sep 17 00:00:00 2001
From: Jeremy Magland
Date: Tue, 16 Apr 2024 13:06:09 -0400
Subject: [PATCH 3/3] be consistent with .lindi.json and .nwb.lindi.json

---
 .gitignore                           |  2 +-
 README.md                            | 36 ++++++++++++++++++------------------
 examples/example1.py                 |  2 +-
 examples/example2.py                 |  2 +-
 examples/example_edit_nwb.py         |  4 ++--
 lindi/LindiH5pyFile/LindiH5pyFile.py |  4 ++--
 tests/test_core.py                   |  2 +-
 tests/test_remote_data.py            |  4 ++--
 tests/test_store.py                  |  2 +-
 9 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/.gitignore b/.gitignore
index 008c4a2..9a71519 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-*.zarr.json
+*.lindi.json
 *.nwb
 
 .coverage
diff --git a/README.md b/README.md
index 468e4c0..ae2dcf5 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ LINDI provides:
 
 - A specification for representing arbitrary HDF5 files as Zarr stores. This handles scalar datasets, references, soft links, and compound data types for datasets.
 - A Zarr wrapper for remote or local HDF5 files (LindiH5ZarrStore).
-- A mechanism for creating .zarr.json (or .nwb.json) files that reference data chunks in external files, inspired by [kerchunk](https://github.com/fsspec/kerchunk).
+- A mechanism for creating .lindi.json (or .nwb.lindi.json) files that reference data chunks in external files, inspired by [kerchunk](https://github.com/fsspec/kerchunk).
 - An h5py-like interface for reading from and writing to these data sources that can be used with [pynwb](https://pynwb.readthedocs.io/en/stable/).
 - A mechanism for uploading and downloading these data sources to and from cloud storage, including DANDI.
 
@@ -35,13 +35,13 @@ pip install -e .
 
 ## Use cases
 
-* Represent a remote NWB/HDF5 file as a .nwb.json file.
-* Read a local or remote .nwb.json file using pynwb or other tools.
-* Edit a .nwb.json file using pynwb or other tools.
-* Add datasets to a .nwb.json file using a local staging area.
-* Upload a .nwb.json file to a cloud storage service such as DANDI.
+* Represent a remote NWB/HDF5 file as a .nwb.lindi.json file.
+* Read a local or remote .nwb.lindi.json file using pynwb or other tools.
+* Edit a .nwb.lindi.json file using pynwb or other tools.
+* Add datasets to a .nwb.lindi.json file using a local staging area.
+* Upload a .nwb.lindi.json file to a cloud storage service such as DANDI.
 
-### Represent a remote NWB/HDF5 file as a .nwb.json file
+### Represent a remote NWB/HDF5 file as a .nwb.lindi.json file
 
 ```python
 import json
@@ -58,7 +58,7 @@ store = lindi.LindiH5ZarrStore.from_file(h5_url)
 rfs = store.to_reference_file_system()
 
 # Save it to a file for later use
-with open("example.zarr.json", "w") as f:
+with open("example.lindi.json", "w") as f:
     json.dump(rfs, f, indent=2)
 
 # Create an h5py-like client from the reference file system
@@ -70,13 +70,13 @@ with pynwb.NWBHDF5IO(file=client, mode="r") as io:
     print(nwbfile)
 ```
 
-### Read a local or remote .nwb.json file using pynwb or other tools
+### Read a local or remote .nwb.lindi.json file using pynwb or other tools
 
 ```python
 import pynwb
 import lindi
 
-# URL of the remote .zarr.json file
+# URL of the remote .nwb.lindi.json file
 url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
 
 # Load the h5py-like client for the reference file system
@@ -88,13 +88,13 @@ with pynwb.NWBHDF5IO(file=client, mode="r") as io:
     print(nwbfile)
 ```
 
-### Edit a .nwb.json file using pynwb or other tools
+### Edit a .nwb.lindi.json file using pynwb or other tools
 
 ```python
 import json
 import lindi
 
-# URL of the remote .zarr.json file
+# URL of the remote .nwb.lindi.json file
 url = 'https://lindi.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
 
 # Load the h5py-like client for the reference file system
@@ -104,18 +104,18 @@ client = lindi.LindiH5pyFile.from_reference_file_system(url, mode="r+")
 # Edit an attribute
 client.attrs['new_attribute'] = 'new_value'
 
-# Save the changes to a new .nwb.json file
+# Save the changes to a new .nwb.lindi.json file
 rfs_new = client.to_reference_file_system()
-with open('new.nwb.json', 'w') as f:
+with open('new.nwb.lindi.json', 'w') as f:
     f.write(json.dumps(rfs_new, indent=2, sort_keys=True))
 ```
 
-### Add datasets to a .nwb.json file using a local staging area
+### Add datasets to a .nwb.lindi.json file using a local staging area
 
 ```python
 import lindi
 
-# URL of the remote .zarr.json file
+# URL of the remote .nwb.lindi.json file
 url = 'https://lindi.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
 
 # Load the h5py-like client for the reference file system
@@ -127,14 +127,13 @@ with lindi.StagingArea.create(base_dir='lindi_staging') as staging_area:
         staging_area=staging_area
    )
     # add datasets to client using pynwb or other tools
-    # upload the changes to the remote .nwb.json file
+    # upload the changes to the remote .nwb.lindi.json file
 ```
 
-### Upload a .nwb.json file to a cloud storage service such as DANDI
+### Upload a .nwb.lindi.json file to a cloud storage service such as DANDI
 
 See [this example](https://github.com/magland/lindi-dandi/blob/main/devel/lindi_test_2.py).
diff --git a/examples/example1.py b/examples/example1.py
index f5c1b6b..b55d7c2 100644
--- a/examples/example1.py
+++ b/examples/example1.py
@@ -12,7 +12,7 @@
 rfs = store.to_reference_file_system()
 
 # Save it to a file for later use
-with open("example.zarr.json", "w") as f:
+with open("example.nwb.lindi.json", "w") as f:
     json.dump(rfs, f, indent=2)
 
 # Create an h5py-like client from the reference file system
diff --git a/examples/example2.py b/examples/example2.py
index 915085e..069cbd2 100644
--- a/examples/example2.py
+++ b/examples/example2.py
@@ -1,7 +1,7 @@
 import pynwb
 import lindi
 
-# Define the URL for a remote .zarr.json file
+# Define the URL for a remote .nwb.lindi.json file
 url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
 
 # Load the h5py-like client from the reference file system
diff --git a/examples/example_edit_nwb.py b/examples/example_edit_nwb.py
index 735def0..e1c6e7b 100644
--- a/examples/example_edit_nwb.py
+++ b/examples/example_edit_nwb.py
@@ -3,7 +3,7 @@
 import pynwb
 
 
-# Define the URL for a remote .zarr.json file
+# Define the URL for a remote .nwb.lindi.json file
 url = 'https://kerchunk.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
 
 # Load the h5py-like client from the reference file system
@@ -20,7 +20,7 @@
 
 # Optionally write to a file
 # import json
-# with open('new.zarr.json', 'w') as f:
+# with open('new.nwb.lindi.json', 'w') as f:
 #     json.dump(rfs_new, f)
 
 # Load a new h5py-like client from the new reference file system
diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py
index d25f2fb..717ebe8 100644
--- a/lindi/LindiH5pyFile/LindiH5pyFile.py
+++ b/lindi/LindiH5pyFile/LindiH5pyFile.py
@@ -34,7 +34,7 @@ def from_reference_file_system(rfs: Union[dict, str], mode: Literal["r", "r+"] =
         ----------
         rfs : Union[dict, str]
             The reference file system. This can be a dictionary or a URL or path
-            to a .zarr.json file.
+            to a .lindi.json file.
         mode : Literal["r", "r+"], optional
             The mode to open the file object in, by default "r". If the mode is
             "r", the file object will be read-only. If the mode is "r+", the
@@ -47,7 +47,7 @@ def from_reference_file_system(rfs: Union[dict, str], mode: Literal["r", "r+"] =
         if isinstance(rfs, str):
             if rfs.startswith("http") or rfs.startswith("https"):
                 with tempfile.TemporaryDirectory() as tmpdir:
-                    filename = f"{tmpdir}/temp.zarr.json"
+                    filename = f"{tmpdir}/temp.lindi.json"
                     _download_file(rfs, filename)
                     with open(filename, "r") as f:
                         data = json.load(f)
diff --git a/tests/test_core.py b/tests/test_core.py
index 16be391..3d4d787 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -303,7 +303,7 @@ def test_reference_file_system_to_file():
         with h5py.File(filename, "w") as f:
             f.create_dataset("X", data=[1, 2, 3])
         with LindiH5ZarrStore.from_file(filename, url=filename) as store:
-            rfs_fname = f'{tmpdir}/test.zarr.json'
+            rfs_fname = f'{tmpdir}/test.lindi.json'
             store.to_file(rfs_fname)
             client = lindi.LindiH5pyFile.from_reference_file_system(rfs_fname)
             X = client["X"]
diff --git a/tests/test_remote_data.py b/tests/test_remote_data.py
index 8ecdbea..6a0dee8 100644
--- a/tests/test_remote_data.py
+++ b/tests/test_remote_data.py
@@ -18,7 +18,7 @@ def test_remote_data_1():
     rfs = store.to_reference_file_system()
 
     # Save it to a file for later use
-    with open("example.zarr.json", "w") as f:
+    with open("example.nwb.lindi.json", "w") as f:
         json.dump(rfs, f, indent=2)
 
     # Create an h5py-like client from the reference file system
@@ -34,7 +34,7 @@ def test_remote_data_2():
     import pynwb
 
-    # Define the URL for a remote .zarr.json file
+    # Define the URL for a remote .nwb.lindi.json file
     url = 'https://lindi.neurosift.org/dandi/dandisets/000939/assets/11f512ba-5bcf-4230-a8cb-dc8d36db38cb/zarr.json'
 
     # Load the h5py-like client from the reference file system
diff --git a/tests/test_store.py b/tests/test_store.py
index aff4667..8d7b66b 100644
--- a/tests/test_store.py
+++ b/tests/test_store.py
@@ -13,7 +13,7 @@ def test_store():
             group1.create_group("group2")
             group1.create_dataset("dataset2", data=[4, 5, 6])
         with lindi.LindiH5ZarrStore.from_file(filename, url=filename) as store:
-            store.to_file(f"{tmpdir}/test.zarr.json")  # for coverage
+            store.to_file(f"{tmpdir}/test.lindi.json")  # for coverage
             a = store.listdir('')
             assert _lists_are_equal_as_sets(a, ['dataset1', 'group1'])
             b = store.listdir('group1')