Skip to content

Commit

Permalink
Merge branch 'main' into codec
Browse files Browse the repository at this point in the history
  • Loading branch information
magland authored Apr 16, 2024
2 parents 35c6722 + bc71a9d commit 55cce74
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 17 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

:warning: Please note, LINDI is currently under development and should not yet be used in practice.

For a more up-to-date introduction to LINDI, see the [README on the dev branch](https://github.com/NeurodataWithoutBorders/lindi/tree/dev).

LINDI is a Python library that facilitates handling NWB (Neurodata Without Borders) files in an efficient, flexible manner, especially when dealing with large datasets on remote servers. The goal is to enable composition of NWB files by integrating data from multiple sources without the need to copy or move large datasets.

LINDI features include:
Expand Down
3 changes: 3 additions & 0 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from ..conversion.reformat_json import reformat_json
from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs
from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data
from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore


@dataclass
Expand Down Expand Up @@ -576,6 +577,8 @@ def _process_dataset(key):

# Process the groups recursively starting with the root group
_process_group("", self._h5f)

LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(ret)
return ret


Expand Down
9 changes: 9 additions & 0 deletions lindi/LindiH5pyFile/LindiH5pyDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,15 @@ def fletcher32(self):
else:
raise Exception(f'Unexpected dataset object type: {type(self._dataset_object)}')

@property
def chunks(self):
    """Chunk shape of the wrapped dataset object, as reported by the backend.

    Delegates to the identically-named ``chunks`` attribute of either an
    ``h5py.Dataset`` or a ``zarr.Array``; any other wrapped type is an error.
    """
    obj = self._dataset_object
    # Both supported backends expose the same ``chunks`` attribute, so the
    # two cases collapse into a single isinstance check.
    if isinstance(obj, (h5py.Dataset, zarr.Array)):
        return obj.chunks
    raise Exception(f'Unexpected dataset object type: {type(obj)}')

def __repr__(self): # type: ignore
return f"<{self.__class__.__name__}: {self.name}>"

Expand Down
3 changes: 2 additions & 1 deletion lindi/LindiH5pyFile/LindiH5pyFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def to_reference_file_system(self):
raise Exception(f"Unexpected type for zarr store: {type(self._zarr_store)}")
rfs = self._zarr_store.rfs
rfs_copy = json.loads(json.dumps(rfs))
LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs_copy)
return rfs_copy

@property
Expand Down Expand Up @@ -399,7 +400,7 @@ def _recursive_copy(src_item: Union[h5py.Group, h5py.Dataset], dest: h5py.File,
dst_rfs['refs'][dst_ref_key] = _deep_copy(src_rfs['refs'][src_ref_key])
return

dst_item = dest.create_dataset(name, data=src_item[()])
dst_item = dest.create_dataset(name, data=src_item[()], chunks=src_item.chunks)
for k, v in src_item.attrs.items():
dst_item.attrs[k] = v
else:
Expand Down
55 changes: 41 additions & 14 deletions lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Literal, Dict
import json
import base64
from zarr.storage import Store as ZarrStore
from .FileSegmentReader.FileSegmentReader import FileSegmentReader
Expand All @@ -21,18 +22,20 @@ class LindiReferenceFileSystemStore(ZarrStore):
segment of a file is read.
To read from a file in an embargoed DANDI dataset, you will need to set the
DANDI_API_KEY environment variable to your DANDI API token. Or, if this is a
Dandiset on the staging server, you will need to set the
DANDI_STAGING_API_KEY.
Following the fsspec convention
(https://fsspec.github.io/kerchunk/spec.html), the reference file system is
specified as a dictionary with a "refs" key. The value of "refs" is a
dictionary where the keys are the names of the files and each value is
either a string, a list, or a dict. If the value is a string, it is assumed
to be the data of the file, which may be base64 encoded (see below). If the
value is a list, it is assumed to have three elements: the URL of the file
(or path of a local file), the byte offset of the data within the file, and
the byte length of the data. If the value is a dict, it represents a json
file, and the content of the file is the json representation of the dict.
If the value for a file is a string, it may be prefixed with "base64:". If
it is, the string is assumed to be base64 encoded and is decoded before
Expand Down Expand Up @@ -62,6 +65,9 @@ def __init__(self, rfs: dict, mode: Literal["r", "r+"] = "r+"):
for k, v in rfs["refs"].items():
if isinstance(v, str):
pass
elif isinstance(v, dict):
# the content of the file is the json representation of the dict
pass
elif isinstance(v, list):
if len(v) != 3:
raise Exception(f"Problem with {k}: list must have 3 elements")
Expand All @@ -87,6 +93,8 @@ def __getitem__(self, key: str):
return base64.b64decode(x[len("base64:"):])
else:
return x.encode("utf-8")
elif isinstance(x, dict):
return json.dumps(x).encode("utf-8")
elif isinstance(x, list):
if len(x) != 3:
raise Exception("list must have 3 elements") # pragma: no cover
Expand All @@ -100,14 +108,20 @@ def __getitem__(self, key: str):
# and contains mutable lists
raise Exception(f"Problem with {key}: value {x} must be a string or a list")

def __setitem__(self, key: str, value):
def __setitem__(self, key: str, value: bytes):
# We intentionally do not allow value to be a dict here! When the rfs is
# written to a .json file elsewhere in the codebase of lindi, the value
# will automatically be converted to a json object if it is json
# serializable.
if not isinstance(value, bytes):
raise ValueError("value must be bytes")
try:
# try to ascii encode the value
value = value.decode("ascii")
value2 = value.decode("ascii")
except UnicodeDecodeError:
# if that fails, base64 encode it
value = "base64:" + base64.b64encode(value).decode("ascii")
self.rfs["refs"][key] = value
value2 = "base64:" + base64.b64encode(value).decode("ascii")
self.rfs["refs"][key] = value2

def __delitem__(self, key: str):
    # Drop the reference entry for *key*; like ``del``, ``pop`` without a
    # default raises KeyError when the key is absent.
    self.rfs["refs"].pop(key)
Expand All @@ -131,6 +145,19 @@ def is_listable(self):
def is_erasable(self):
    # Zarr store capability flag — advertises that this store does not
    # support the erase API. NOTE(review): ``__delitem__`` is implemented
    # above, so this appears intentional (erase disabled at the zarr
    # protocol level) — confirm against the zarr Store contract.
    return False

@staticmethod
def replace_meta_file_contents_with_dicts(rfs: dict) -> None:
    """
    Replace the contents of the zarr metadata files (.zattrs, .zgroup,
    .zarray, and zarr.json for zarr v3) in *rfs* with the parsed (dict)
    representation of their JSON contents. Mutates ``rfs`` in place.
    """
    # Read entries back through a store instance so that base64-encoded
    # values etc. are resolved before the JSON is parsed.
    store = LindiReferenceFileSystemStore(rfs)
    meta_suffixes = ('.zattrs', '.zgroup', '.zarray', 'zarr.json')  # zarr.json is for zarr v3
    for key in rfs['refs']:
        if key.endswith(meta_suffixes):
            rfs['refs'][key] = json.loads(store[key].decode('utf-8'))


# Keep a global cache of file segment readers that apply to all instances of
# LindiReferenceFileSystemStore. The key is the URL of the file.
Expand Down
2 changes: 1 addition & 1 deletion lindi/conversion/create_zarr_dataset_from_h5_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def create_zarr_dataset_from_h5_data(
return ds
else:
raise Exception(f'Unsupported scalar value type: {type(scalar_value)}')
elif h5_dtype.kind == 'S':
elif h5_dtype.kind == 'S' or h5_dtype.kind == 'U':
# byte string
if h5_data is None:
raise Exception(f'Data must be provided for scalar dataset {label}')
Expand Down
17 changes: 16 additions & 1 deletion tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import pytest
import numpy as np
import h5py
Expand Down Expand Up @@ -342,9 +343,23 @@ def test_lindi_reference_file_system_store():
store["a"]
with pytest.raises(Exception):
store[{}] # invalid key type # type: ignore
rfs = {"refs": {"a": {}}} # invalid value
rfs = {"refs": {"a": 83}} # invalid value
with pytest.raises(Exception):
store = LindiReferenceFileSystemStore(rfs)
rfs = {"refs": {"a": {"test": 1}}}
store = LindiReferenceFileSystemStore(rfs)
assert json.loads(store["a"]) == {"test": 1}
rfs = {"refs": {".zattrs": "{\"test\": 2}"}}
store = LindiReferenceFileSystemStore(rfs)
assert json.loads(store[".zattrs"]) == {"test": 2}
rfs = {"refs": {".zattrs": "{\"test\": 3}"}}
LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs)
assert isinstance(rfs["refs"][".zattrs"], dict)
store = LindiReferenceFileSystemStore(rfs)
assert json.loads(store[".zattrs"]) == {"test": 3}
rfs = {"refs": {".zattrs_xxx": "{\"test\": 5}"}}
LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs)
assert isinstance(rfs["refs"][".zattrs_xxx"], str)

rfs = {"refs": {"a": "abc"}}
store = LindiReferenceFileSystemStore(rfs)
Expand Down

0 comments on commit 55cce74

Please sign in to comment.