diff --git a/README.md b/README.md index 879608d..efc6821 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ :warning: Please note, LINDI is currently under development and should not yet be used in practice. +For a more up-to-date introduction to LINDI, see the [README on the dev branch](https://github.com/NeurodataWithoutBorders/lindi/tree/dev). + LINDI is a Python library that facilitates handling NWB (Neurodata Without Borders) files in an efficient, flexible manner, especially when dealing with large datasets on remote servers. The goal is to enable composition of NWB files by integrating data from multiple sources without the need to copy or move large datasets. LINDI features include: diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index 0ca71eb..9858248 100644 --- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -18,6 +18,7 @@ from ..conversion.reformat_json import reformat_json from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data +from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore @dataclass @@ -576,6 +577,8 @@ def _process_dataset(key): # Process the groups recursively starting with the root group _process_group("", self._h5f) + + LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(ret) return ret diff --git a/lindi/LindiH5pyFile/LindiH5pyDataset.py b/lindi/LindiH5pyFile/LindiH5pyDataset.py index 83c540a..4c915b7 100644 --- a/lindi/LindiH5pyFile/LindiH5pyDataset.py +++ b/lindi/LindiH5pyFile/LindiH5pyDataset.py @@ -152,6 +152,15 @@ def fletcher32(self): else: raise Exception(f'Unexpected dataset object type: {type(self._dataset_object)}') + @property + def chunks(self): + if isinstance(self._dataset_object, h5py.Dataset): + return self._dataset_object.chunks + elif isinstance(self._dataset_object, 
zarr.Array): + return self._dataset_object.chunks + else: + raise Exception(f'Unexpected dataset object type: {type(self._dataset_object)}') + def __repr__(self): # type: ignore return f"<{self.__class__.__name__}: {self.name}>" diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py index d107535..43fab45 100644 --- a/lindi/LindiH5pyFile/LindiH5pyFile.py +++ b/lindi/LindiH5pyFile/LindiH5pyFile.py @@ -136,6 +136,7 @@ def to_reference_file_system(self): raise Exception(f"Unexpected type for zarr store: {type(self._zarr_store)}") rfs = self._zarr_store.rfs rfs_copy = json.loads(json.dumps(rfs)) + LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs_copy) return rfs_copy @property @@ -399,7 +400,7 @@ def _recursive_copy(src_item: Union[h5py.Group, h5py.Dataset], dest: h5py.File, dst_rfs['refs'][dst_ref_key] = _deep_copy(src_rfs['refs'][src_ref_key]) return - dst_item = dest.create_dataset(name, data=src_item[()]) + dst_item = dest.create_dataset(name, data=src_item[()], chunks=src_item.chunks) for k, v in src_item.attrs.items(): dst_item.attrs[k] = v else: diff --git a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py index e98ce22..d9de9e2 100644 --- a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py +++ b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py @@ -1,4 +1,5 @@ from typing import Literal, Dict +import json import base64 from zarr.storage import Store as ZarrStore from .FileSegmentReader.FileSegmentReader import FileSegmentReader @@ -21,18 +22,20 @@ class LindiReferenceFileSystemStore(ZarrStore): segment of a file is read. To read from a file in an embargoed DANDI dataset, you will need to set the - DANDI_API_KEY environment variable to your DANDI API token. Or, if this is - a Dandiset on the staging server, you will need to set the + DANDI_API_KEY environment variable to your DANDI API token. 
Or, if this is a + Dandiset on the staging server, you will need to set the DANDI_STAGING_API_KEY. - Following the fsspec convention (https://fsspec.github.io/kerchunk/spec.html), - the reference file system is specified as a - dictionary with a "refs" key. The value of "refs" is a dictionary where the - keys are the names of the files and the values are either strings or lists. - If the value is a string, it is assumed to be the data of the file, which - may be base64 encoded (see below). If the value is a list, it is assumed to - have three elements: the URL of the file (or path of a local file), the byte - offset of the data within the file, and the byte length of the data. + Following the fsspec convention + (https://fsspec.github.io/kerchunk/spec.html), the reference file system is + specified as a dictionary with a "refs" key. The value of "refs" is a + dictionary where the keys are the names of the files and each value is + either a string, a list, or a dict. If the value is a string, it is assumed + to be the data of the file, which may be base64 encoded (see below). If the + value is a list, it is assumed to have three elements: the URL of the file + (or path of a local file), the byte offset of the data within the file, and + the byte length of the data. If the value is a dict, it represents a JSON + file whose content is the JSON serialization of the dict. If the value for a file is a string, it may be prefixed with "base64:". 
If it is, the string is assumed to be base64 encoded and is decoded before @@ -62,6 +65,9 @@ def __init__(self, rfs: dict, mode: Literal["r", "r+"] = "r+"): for k, v in rfs["refs"].items(): if isinstance(v, str): pass + elif isinstance(v, dict): + # the content of the file is the json representation of the dict + pass elif isinstance(v, list): if len(v) != 3: raise Exception(f"Problem with {k}: list must have 3 elements") @@ -87,6 +93,8 @@ def __getitem__(self, key: str): return base64.b64decode(x[len("base64:"):]) else: return x.encode("utf-8") + elif isinstance(x, dict): + return json.dumps(x).encode("utf-8") elif isinstance(x, list): if len(x) != 3: raise Exception("list must have 3 elements") # pragma: no cover @@ -100,14 +108,20 @@ def __getitem__(self, key: str): # and contains mutable lists raise Exception(f"Problem with {key}: value {x} must be a string or a list") - def __setitem__(self, key: str, value): + def __setitem__(self, key: str, value: bytes): + # We intentionally do not allow value to be a dict here! When the rfs is + # written to a .json file elsewhere in the codebase of lindi, the value + # will automatically be converted to a json object if it is json + # serializable. 
+ if not isinstance(value, bytes): + raise ValueError("value must be bytes") try: # try to ascii encode the value - value = value.decode("ascii") + value2 = value.decode("ascii") except UnicodeDecodeError: # if that fails, base64 encode it - value = "base64:" + base64.b64encode(value).decode("ascii") - self.rfs["refs"][key] = value + value2 = "base64:" + base64.b64encode(value).decode("ascii") + self.rfs["refs"][key] = value2 def __delitem__(self, key: str): del self.rfs["refs"][key] @@ -131,6 +145,19 @@ def is_listable(self): def is_erasable(self): return False + @staticmethod + def replace_meta_file_contents_with_dicts(rfs: dict) -> None: + """ + Replace, in place, the JSON text of the .zattrs, .zgroup, .zarray, and + zarr.json (zarr v3) entries of an rfs with the parsed dicts. + """ + # important to use the LindiReferenceFileSystemStore here because then we + # can resolve any base64 encoded values, etc when converting them to dicts + store = LindiReferenceFileSystemStore(rfs) + for k, v in rfs['refs'].items(): + if k.endswith('.zattrs') or k.endswith('.zgroup') or k.endswith('.zarray') or k.endswith('zarr.json'): # note: zarr.json is for zarr v3 + rfs['refs'][k] = json.loads(store[k].decode('utf-8')) + # Keep a global cache of file segment readers that apply to all instances of # LindiReferenceFileSystemStore. The key is the URL of the file. 
diff --git a/lindi/conversion/create_zarr_dataset_from_h5_data.py b/lindi/conversion/create_zarr_dataset_from_h5_data.py index 5a91233..fa2fe45 100644 --- a/lindi/conversion/create_zarr_dataset_from_h5_data.py +++ b/lindi/conversion/create_zarr_dataset_from_h5_data.py @@ -88,7 +88,7 @@ def create_zarr_dataset_from_h5_data( return ds else: raise Exception(f'Unsupported scalar value type: {type(scalar_value)}') - elif h5_dtype.kind == 'S': + elif h5_dtype.kind == 'S' or h5_dtype.kind == 'U': # byte string if h5_data is None: raise Exception(f'Data must be provided for scalar dataset {label}') diff --git a/tests/test_core.py b/tests/test_core.py index 16be391..20a0d46 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,3 +1,4 @@ +import json import pytest import numpy as np import h5py @@ -342,9 +343,23 @@ def test_lindi_reference_file_system_store(): store["a"] with pytest.raises(Exception): store[{}] # invalid key type # type: ignore - rfs = {"refs": {"a": {}}} # invalid value + rfs = {"refs": {"a": 83}} # invalid value with pytest.raises(Exception): store = LindiReferenceFileSystemStore(rfs) + rfs = {"refs": {"a": {"test": 1}}} + store = LindiReferenceFileSystemStore(rfs) + assert json.loads(store["a"]) == {"test": 1} + rfs = {"refs": {".zattrs": "{\"test\": 2}"}} + store = LindiReferenceFileSystemStore(rfs) + assert json.loads(store[".zattrs"]) == {"test": 2} + rfs = {"refs": {".zattrs": "{\"test\": 3}"}} + LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs) + assert isinstance(rfs["refs"][".zattrs"], dict) + store = LindiReferenceFileSystemStore(rfs) + assert json.loads(store[".zattrs"]) == {"test": 3} + rfs = {"refs": {".zattrs_xxx": "{\"test\": 5}"}} + LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs) + assert isinstance(rfs["refs"][".zattrs_xxx"], str) rfs = {"refs": {"a": "abc"}} store = LindiReferenceFileSystemStore(rfs)