Merge pull request #40 from NeurodataWithoutBorders/rfs-json

support dicts in reference file system for json files
NeurodataWithoutBorders · Apr 16, 2024 · bc71a9d · bc71a9d
2 parents 47c5da9 + b7f2c4e
commit bc71a9d
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 15 deletions.
diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
@@ -18,6 +18,7 @@
 from ..conversion.reformat_json import reformat_json
 from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs
 from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data
+from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore
 
 
 @dataclass
@@ -576,6 +577,8 @@ def _process_dataset(key):
 
         # Process the groups recursively starting with the root group
         _process_group("", self._h5f)
+
+        LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(ret)
         return ret
 
 

diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py
@@ -135,6 +135,7 @@ def to_reference_file_system(self):
             raise Exception(f"Unexpected type for zarr store: {type(self._zarr_store)}")
         rfs = self._zarr_store.rfs
         rfs_copy = json.loads(json.dumps(rfs))
+        LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs_copy)
         return rfs_copy
 
     @property

diff --git a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py
@@ -1,4 +1,5 @@
 from typing import Literal, Dict
+import json
 import base64
 from zarr.storage import Store as ZarrStore
 from .FileSegmentReader.FileSegmentReader import FileSegmentReader
@@ -21,18 +22,20 @@ class LindiReferenceFileSystemStore(ZarrStore):
     segment of a file is read.
 
     To read from a file in an embargoed DANDI dataset, you will need to set the
-    DANDI_API_KEY environment variable to your DANDI API token. Or, if this is
-    a Dandiset on the staging server, you will need to set the
+    DANDI_API_KEY environment variable to your DANDI API token. Or, if this is a
+    Dandiset on the staging server, you will need to set the
     DANDI_STAGING_API_KEY.
 
-    Following the fsspec convention (https://fsspec.github.io/kerchunk/spec.html),
-    the reference file system is specified as a
-    dictionary with a "refs" key. The value of "refs" is a dictionary where the
-    keys are the names of the files and the values are either strings or lists.
-    If the value is a string, it is assumed to be the data of the file, which
-    may be base64 encoded (see below). If the value is a list, it is assumed to
-    have three elements: the URL of the file (or path of a local file), the byte
-    offset of the data within the file, and the byte length of the data.
+    Following the fsspec convention
+    (https://fsspec.github.io/kerchunk/spec.html), the reference file system is
+    specified as a dictionary with a "refs" key. The value of "refs" is a
+    dictionary where the keys are the names of the files and each value is
+    either a string, a list, or a dict. If the value is a string, it is assumed
+    to be the data of the file, which may be base64 encoded (see below). If the
+    value is a list, it is assumed to have three elements: the URL of the file
+    (or path of a local file), the byte offset of the data within the file, and
+    the byte length of the data. If the value is a dict, it represents a json
+    file, and the content of the file is the json representation of the dict.
 
     If the value for a file is a string, it may be prefixed with "base64:". If
     it is, the string is assumed to be base64 encoded and is decoded before
@@ -62,6 +65,9 @@ def __init__(self, rfs: dict, mode: Literal["r", "r+"] = "r+"):
         for k, v in rfs["refs"].items():
             if isinstance(v, str):
                 pass
+            elif isinstance(v, dict):
+                # the content of the file is the json representation of the dict
+                pass
             elif isinstance(v, list):
                 if len(v) != 3:
                     raise Exception(f"Problem with {k}: list must have 3 elements")
@@ -87,6 +93,8 @@ def __getitem__(self, key: str):
                 return base64.b64decode(x[len("base64:"):])
             else:
                 return x.encode("utf-8")
+        elif isinstance(x, dict):
+            return json.dumps(x).encode("utf-8")
         elif isinstance(x, list):
             if len(x) != 3:
                 raise Exception("list must have 3 elements")  # pragma: no cover
@@ -100,14 +108,20 @@ def __getitem__(self, key: str):
             # and contains mutable lists
             raise Exception(f"Problem with {key}: value {x} must be a string or a list")
 
-    def __setitem__(self, key: str, value):
+    def __setitem__(self, key: str, value: bytes):
+        # We intentionally do not allow value to be a dict here! When the rfs is
+        # written to a .json file elsewhere in the codebase of lindi, the value
+        # will automatically be converted to a json object if it is json
+        # serializable.
+        if not isinstance(value, bytes):
+            raise ValueError("value must be bytes")
         try:
             # try to ascii encode the value
-            value = value.decode("ascii")
+            value2 = value.decode("ascii")
         except UnicodeDecodeError:
             # if that fails, base64 encode it
-            value = "base64:" + base64.b64encode(value).decode("ascii")
-        self.rfs["refs"][key] = value
+            value2 = "base64:" + base64.b64encode(value).decode("ascii")
+        self.rfs["refs"][key] = value2
 
     def __delitem__(self, key: str):
         del self.rfs["refs"][key]
@@ -131,6 +145,19 @@ def is_listable(self):
     def is_erasable(self):
         return False
 
+    @staticmethod
+    def replace_meta_file_contents_with_dicts(rfs: dict) -> None:
+        """
+        Utility function for replacing the contents of the .zattrs, .zgroup,
+        .zarray, and zarr.json files in an rfs with dicts parsed from their json.
+        """
+        # important to use the LindiReferenceFileSystemStore here because then we
+        # can resolve any base64 encoded values, etc when converting them to dicts
+        store = LindiReferenceFileSystemStore(rfs)
+        for k, v in rfs['refs'].items():
+            if k.endswith('.zattrs') or k.endswith('.zgroup') or k.endswith('.zarray') or k.endswith('zarr.json'):  # note: zarr.json is for zarr v3
+                rfs['refs'][k] = json.loads(store[k].decode('utf-8'))
+
 
 # Keep a global cache of file segment readers that apply to all instances of
 # LindiReferenceFileSystemStore. The key is the URL of the file.

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -1,3 +1,4 @@
+import json
 import pytest
 import numpy as np
 import h5py
@@ -342,9 +343,23 @@ def test_lindi_reference_file_system_store():
         store["a"]
     with pytest.raises(Exception):
         store[{}]  # invalid key type # type: ignore
-    rfs = {"refs": {"a": {}}}  # invalid value
+    rfs = {"refs": {"a": 83}}  # invalid value
     with pytest.raises(Exception):
         store = LindiReferenceFileSystemStore(rfs)
+    rfs = {"refs": {"a": {"test": 1}}}
+    store = LindiReferenceFileSystemStore(rfs)
+    assert json.loads(store["a"]) == {"test": 1}
+    rfs = {"refs": {".zattrs": "{\"test\": 2}"}}
+    store = LindiReferenceFileSystemStore(rfs)
+    assert json.loads(store[".zattrs"]) == {"test": 2}
+    rfs = {"refs": {".zattrs": "{\"test\": 3}"}}
+    LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs)
+    assert isinstance(rfs["refs"][".zattrs"], dict)
+    store = LindiReferenceFileSystemStore(rfs)
+    assert json.loads(store[".zattrs"]) == {"test": 3}
+    rfs = {"refs": {".zattrs_xxx": "{\"test\": 5}"}}
+    LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs)
+    assert isinstance(rfs["refs"][".zattrs_xxx"], str)
 
     rfs = {"refs": {"a": "abc"}}
     store = LindiReferenceFileSystemStore(rfs)