Skip to content

Commit

Permalink
Merge branch 'main' into codec
Browse files Browse the repository at this point in the history
  • Loading branch information
magland authored Apr 16, 2024
2 parents 35c6722 + bc71a9d commit 55cce74
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 17 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

:warning: Please note, LINDI is currently under development and should not yet be used in practice.

For a more up-to-date introduction to LINDI, see the [README on the dev branch](https://github.com/NeurodataWithoutBorders/lindi/tree/dev).

LINDI is a Python library that facilitates handling NWB (Neurodata Without Borders) files in an efficient, flexible manner, especially when dealing with large datasets on remote servers. The goal is to enable composition of NWB files by integrating data from multiple sources without the need to copy or move large datasets.

LINDI features include:
Expand Down
3 changes: 3 additions & 0 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from ..conversion.reformat_json import reformat_json
from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs
from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data
from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore


@dataclass
Expand Down Expand Up @@ -576,6 +577,8 @@ def _process_dataset(key):

# Process the groups recursively starting with the root group
_process_group("", self._h5f)

LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(ret)
return ret


Expand Down
9 changes: 9 additions & 0 deletions lindi/LindiH5pyFile/LindiH5pyDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,15 @@ def fletcher32(self):
else:
raise Exception(f'Unexpected dataset object type: {type(self._dataset_object)}')

@property
def chunks(self):
    """Chunk shape of the wrapped dataset object, as reported by the backend.

    Delegates to the identically-named ``chunks`` attribute of either an
    ``h5py.Dataset`` or a ``zarr.Array``; any other wrapped type is an error.
    """
    obj = self._dataset_object
    # Both supported backends expose the same ``chunks`` attribute, so the
    # two cases collapse into a single isinstance check.
    if isinstance(obj, (h5py.Dataset, zarr.Array)):
        return obj.chunks
    raise Exception(f'Unexpected dataset object type: {type(obj)}')

def __repr__(self): # type: ignore
return f"<{self.__class__.__name__}: {self.name}>"

Expand Down
3 changes: 2 additions & 1 deletion lindi/LindiH5pyFile/LindiH5pyFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def to_reference_file_system(self):
raise Exception(f"Unexpected type for zarr store: {type(self._zarr_store)}")
rfs = self._zarr_store.rfs
rfs_copy = json.loads(json.dumps(rfs))
LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs_copy)
return rfs_copy

@property
Expand Down Expand Up @@ -399,7 +400,7 @@ def _recursive_copy(src_item: Union[h5py.Group, h5py.Dataset], dest: h5py.File,
dst_rfs['refs'][dst_ref_key] = _deep_copy(src_rfs['refs'][src_ref_key])
return

dst_item = dest.create_dataset(name, data=src_item[()])
dst_item = dest.create_dataset(name, data=src_item[()], chunks=src_item.chunks)
for k, v in src_item.attrs.items():
dst_item.attrs[k] = v
else:
Expand Down
55 changes: 41 additions & 14 deletions lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Literal, Dict
import json
import base64
from zarr.storage import Store as ZarrStore
from .FileSegmentReader.FileSegmentReader import FileSegmentReader
Expand All @@ -21,18 +22,20 @@ class LindiReferenceFileSystemStore(ZarrStore):
segment of a file is read.
To read from a file in an embargoed DANDI dataset, you will need to set the
DANDI_API_KEY environment variable to your DANDI API token. Or, if this is a
Dandiset on the staging server, you will need to set the
DANDI_STAGING_API_KEY.
Following the fsspec convention
(https://fsspec.github.io/kerchunk/spec.html), the reference file system is
specified as a dictionary with a "refs" key. The value of "refs" is a
dictionary where the keys are the names of the files and each value is
either a string, a list, or a dict. If the value is a string, it is assumed
to be the data of the file, which may be base64 encoded (see below). If the
value is a list, it is assumed to have three elements: the URL of the file
(or path of a local file), the byte offset of the data within the file, and
the byte length of the data. If the value is a dict, it represents a json
file, and the content of the file is the json representation of the dict.
If the value for a file is a string, it may be prefixed with "base64:". If
it is, the string is assumed to be base64 encoded and is decoded before
Expand Down Expand Up @@ -62,6 +65,9 @@ def __init__(self, rfs: dict, mode: Literal["r", "r+"] = "r+"):
for k, v in rfs["refs"].items():
if isinstance(v, str):
pass
elif isinstance(v, dict):
# the content of the file is the json representation of the dict
pass
elif isinstance(v, list):
if len(v) != 3:
raise Exception(f"Problem with {k}: list must have 3 elements")
Expand All @@ -87,6 +93,8 @@ def __getitem__(self, key: str):
return base64.b64decode(x[len("base64:"):])
else:
return x.encode("utf-8")
elif isinstance(x, dict):
return json.dumps(x).encode("utf-8")
elif isinstance(x, list):
if len(x) != 3:
raise Exception("list must have 3 elements") # pragma: no cover
Expand All @@ -100,14 +108,20 @@ def __getitem__(self, key: str):
# and contains mutable lists
raise Exception(f"Problem with {key}: value {x} must be a string or a list")

def __setitem__(self, key: str, value):
def __setitem__(self, key: str, value: bytes):
# We intentionally do not allow value to be a dict here! When the rfs is
# written to a .json file elsewhere in the codebase of lindi, the value
# will automatically be converted to a json object if it is json
# serializable.
if not isinstance(value, bytes):
raise ValueError("value must be bytes")
try:
# try to ascii encode the value
value = value.decode("ascii")
value2 = value.decode("ascii")
except UnicodeDecodeError:
# if that fails, base64 encode it
value = "base64:" + base64.b64encode(value).decode("ascii")
self.rfs["refs"][key] = value
value2 = "base64:" + base64.b64encode(value).decode("ascii")
self.rfs["refs"][key] = value2

def __delitem__(self, key: str):
    # Drop the reference entry for *key*; like ``del``, ``pop`` without a
    # default raises KeyError when the key is absent.
    self.rfs["refs"].pop(key)
Expand All @@ -131,6 +145,19 @@ def is_listable(self):
def is_erasable(self):
    # Zarr store capability flag — advertises that this store does not
    # support the erase API. NOTE(review): ``__delitem__`` is implemented
    # above, so this appears intentional (erase disabled at the zarr
    # protocol level) — confirm against the zarr Store contract.
    return False

@staticmethod
def replace_meta_file_contents_with_dicts(rfs: dict) -> None:
    """
    Replace the contents of the zarr metadata files (.zattrs, .zgroup,
    .zarray, and zarr.json for zarr v3) in *rfs* with the parsed (dict)
    representation of their JSON contents. Mutates ``rfs`` in place.
    """
    # Read entries back through a store instance so that base64-encoded
    # values etc. are resolved before the JSON is parsed.
    store = LindiReferenceFileSystemStore(rfs)
    meta_suffixes = ('.zattrs', '.zgroup', '.zarray', 'zarr.json')  # zarr.json is for zarr v3
    for key in rfs['refs']:
        if key.endswith(meta_suffixes):
            rfs['refs'][key] = json.loads(store[key].decode('utf-8'))


# Keep a global cache of file segment readers that apply to all instances of
# LindiReferenceFileSystemStore. The key is the URL of the file.
Expand Down
2 changes: 1 addition & 1 deletion lindi/conversion/create_zarr_dataset_from_h5_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def create_zarr_dataset_from_h5_data(
return ds
else:
raise Exception(f'Unsupported scalar value type: {type(scalar_value)}')
elif h5_dtype.kind == 'S':
elif h5_dtype.kind == 'S' or h5_dtype.kind == 'U':
# byte string
if h5_data is None:
raise Exception(f'Data must be provided for scalar dataset {label}')
Expand Down
17 changes: 16 additions & 1 deletion tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import pytest
import numpy as np
import h5py
Expand Down Expand Up @@ -342,9 +343,23 @@ def test_lindi_reference_file_system_store():
store["a"]
with pytest.raises(Exception):
store[{}] # invalid key type # type: ignore
rfs = {"refs": {"a": {}}} # invalid value
rfs = {"refs": {"a": 83}} # invalid value
with pytest.raises(Exception):
store = LindiReferenceFileSystemStore(rfs)
rfs = {"refs": {"a": {"test": 1}}}
store = LindiReferenceFileSystemStore(rfs)
assert json.loads(store["a"]) == {"test": 1}
rfs = {"refs": {".zattrs": "{\"test\": 2}"}}
store = LindiReferenceFileSystemStore(rfs)
assert json.loads(store[".zattrs"]) == {"test": 2}
rfs = {"refs": {".zattrs": "{\"test\": 3}"}}
LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs)
assert isinstance(rfs["refs"][".zattrs"], dict)
store = LindiReferenceFileSystemStore(rfs)
assert json.loads(store[".zattrs"]) == {"test": 3}
rfs = {"refs": {".zattrs_xxx": "{\"test\": 5}"}}
LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs)
assert isinstance(rfs["refs"][".zattrs_xxx"], str)

rfs = {"refs": {"a": "abc"}}
store = LindiReferenceFileSystemStore(rfs)
Expand Down

0 comments on commit 55cce74

Please sign in to comment.