diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index 0ca71eb..9858248 100644 --- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -18,6 +18,7 @@ from ..conversion.reformat_json import reformat_json from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data +from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore @dataclass @@ -576,6 +577,8 @@ def _process_dataset(key): # Process the groups recursively starting with the root group _process_group("", self._h5f) + + LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(ret) return ret diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py index 179eb1d..9a93d28 100644 --- a/lindi/LindiH5pyFile/LindiH5pyFile.py +++ b/lindi/LindiH5pyFile/LindiH5pyFile.py @@ -135,6 +135,7 @@ def to_reference_file_system(self): raise Exception(f"Unexpected type for zarr store: {type(self._zarr_store)}") rfs = self._zarr_store.rfs rfs_copy = json.loads(json.dumps(rfs)) + LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs_copy) return rfs_copy @property diff --git a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py index e98ce22..d9de9e2 100644 --- a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py +++ b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py @@ -1,4 +1,5 @@ from typing import Literal, Dict +import json import base64 from zarr.storage import Store as ZarrStore from .FileSegmentReader.FileSegmentReader import FileSegmentReader @@ -21,18 +22,20 @@ class LindiReferenceFileSystemStore(ZarrStore): segment of a file is read. To read from a file in an embargoed DANDI dataset, you will need to set the - DANDI_API_KEY environment variable to your DANDI API token. Or, if this is - a Dandiset on the staging server, you will need to set the + DANDI_API_KEY environment variable to your DANDI API token. Or, if this is a + Dandiset on the staging server, you will need to set the DANDI_STAGING_API_KEY. - Following the fsspec convention (https://fsspec.github.io/kerchunk/spec.html), - the reference file system is specified as a - dictionary with a "refs" key. The value of "refs" is a dictionary where the - keys are the names of the files and the values are either strings or lists. - If the value is a string, it is assumed to be the data of the file, which - may be base64 encoded (see below). If the value is a list, it is assumed to - have three elements: the URL of the file (or path of a local file), the byte - offset of the data within the file, and the byte length of the data. + Following the fsspec convention + (https://fsspec.github.io/kerchunk/spec.html), the reference file system is + specified as a dictionary with a "refs" key. The value of "refs" is a + dictionary where the keys are the names of the files and each value is + either a string, a list, or a dict. If the value is a string, it is assumed + to be the data of the file, which may be base64 encoded (see below). If the + value is a list, it is assumed to have three elements: the URL of the file + (or path of a local file), the byte offset of the data within the file, and + the byte length of the data. If the value is a dict, it represents a json + file, and the content of the file is the json representation of the dict. If the value for a file is a string, it may be prefixed with "base64:". If it is, the string is assumed to be base64 encoded and is decoded before @@ -62,6 +65,9 @@ def __init__(self, rfs: dict, mode: Literal["r", "r+"] = "r+"): for k, v in rfs["refs"].items(): if isinstance(v, str): pass + elif isinstance(v, dict): + # the content of the file is the json representation of the dict + pass elif isinstance(v, list): if len(v) != 3: raise Exception(f"Problem with {k}: list must have 3 elements") @@ -87,6 +93,8 @@ def __getitem__(self, key: str): return base64.b64decode(x[len("base64:"):]) else: return x.encode("utf-8") + elif isinstance(x, dict): + return json.dumps(x).encode("utf-8") elif isinstance(x, list): if len(x) != 3: raise Exception("list must have 3 elements") # pragma: no cover @@ -100,14 +108,20 @@ def __getitem__(self, key: str): # and contains mutable lists raise Exception(f"Problem with {key}: value {x} must be a string or a list") - def __setitem__(self, key: str, value): + def __setitem__(self, key: str, value: bytes): + # We intentionally do not allow value to be a dict here! When the rfs is + # written to a .json file elsewhere in the codebase of lindi, the value + # will automatically be converted to a json object if it is json + # serializable. + if not isinstance(value, bytes): + raise ValueError("value must be bytes") try: # try to ascii encode the value - value = value.decode("ascii") + value2 = value.decode("ascii") except UnicodeDecodeError: # if that fails, base64 encode it - value = "base64:" + base64.b64encode(value).decode("ascii") - self.rfs["refs"][key] = value + value2 = "base64:" + base64.b64encode(value).decode("ascii") + self.rfs["refs"][key] = value2 def __delitem__(self, key: str): del self.rfs["refs"][key] @@ -131,6 +145,19 @@ def is_listable(self): def is_erasable(self): return False + @staticmethod + def replace_meta_file_contents_with_dicts(rfs: dict) -> None: + """ + Utility function for replacing the contents of the .zattrs, .zgroup, and + .zarray files in an rfs with the json representation of the contents. + """ + # important to use the LindiReferenceFileSystemStore here because then we + # can resolve any base64 encoded values, etc when converting them to dicts + store = LindiReferenceFileSystemStore(rfs) + for k, v in rfs['refs'].items(): + if k.endswith('.zattrs') or k.endswith('.zgroup') or k.endswith('.zarray') or k.endswith('zarr.json'): # note: zarr.json is for zarr v3 + rfs['refs'][k] = json.loads(store[k].decode('utf-8')) + # Keep a global cache of file segment readers that apply to all instances of # LindiReferenceFileSystemStore. The key is the URL of the file. diff --git a/tests/test_core.py b/tests/test_core.py index 16be391..20a0d46 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,3 +1,4 @@ +import json import pytest import numpy as np import h5py @@ -342,9 +343,23 @@ def test_lindi_reference_file_system_store(): store["a"] with pytest.raises(Exception): store[{}] # invalid key type # type: ignore - rfs = {"refs": {"a": {}}} # invalid value + rfs = {"refs": {"a": 83}} # invalid value with pytest.raises(Exception): store = LindiReferenceFileSystemStore(rfs) + rfs = {"refs": {"a": {"test": 1}}} + store = LindiReferenceFileSystemStore(rfs) + assert json.loads(store["a"]) == {"test": 1} + rfs = {"refs": {".zattrs": "{\"test\": 2}"}} + store = LindiReferenceFileSystemStore(rfs) + assert json.loads(store[".zattrs"]) == {"test": 2} + rfs = {"refs": {".zattrs": "{\"test\": 3}"}} + LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs) + assert isinstance(rfs["refs"][".zattrs"], dict) + store = LindiReferenceFileSystemStore(rfs) + assert json.loads(store[".zattrs"]) == {"test": 3} + rfs = {"refs": {".zattrs_xxx": "{\"test\": 5}"}} + LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs) + assert isinstance(rfs["refs"][".zattrs_xxx"], str) rfs = {"refs": {"a": "abc"}} store = LindiReferenceFileSystemStore(rfs)