Skip to content

Commit

Permalink
Merge pull request #40 from NeurodataWithoutBorders/rfs-json
Browse files Browse the repository at this point in the history
support dicts in reference file system for json files
  • Loading branch information
magland authored Apr 16, 2024
2 parents 47c5da9 + b7f2c4e commit bc71a9d
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 15 deletions.
3 changes: 3 additions & 0 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from ..conversion.reformat_json import reformat_json
from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs
from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data
from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore


@dataclass
Expand Down Expand Up @@ -576,6 +577,8 @@ def _process_dataset(key):

# Process the groups recursively starting with the root group
_process_group("", self._h5f)

LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(ret)
return ret


Expand Down
1 change: 1 addition & 0 deletions lindi/LindiH5pyFile/LindiH5pyFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ def to_reference_file_system(self):
raise Exception(f"Unexpected type for zarr store: {type(self._zarr_store)}")
rfs = self._zarr_store.rfs
rfs_copy = json.loads(json.dumps(rfs))
LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs_copy)
return rfs_copy

@property
Expand Down
55 changes: 41 additions & 14 deletions lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Literal, Dict
import json
import base64
from zarr.storage import Store as ZarrStore
from .FileSegmentReader.FileSegmentReader import FileSegmentReader
Expand All @@ -21,18 +22,20 @@ class LindiReferenceFileSystemStore(ZarrStore):
segment of a file is read.
To read from a file in an embargoed DANDI dataset, you will need to set the
DANDI_API_KEY environment variable to your DANDI API token. Or, if this is
a Dandiset on the staging server, you will need to set the
DANDI_API_KEY environment variable to your DANDI API token. Or, if this is a
Dandiset on the staging server, you will need to set the
DANDI_STAGING_API_KEY.
Following the fsspec convention (https://fsspec.github.io/kerchunk/spec.html),
the reference file system is specified as a
dictionary with a "refs" key. The value of "refs" is a dictionary where the
keys are the names of the files and the values are either strings or lists.
If the value is a string, it is assumed to be the data of the file, which
may be base64 encoded (see below). If the value is a list, it is assumed to
have three elements: the URL of the file (or path of a local file), the byte
offset of the data within the file, and the byte length of the data.
Following the fsspec convention
(https://fsspec.github.io/kerchunk/spec.html), the reference file system is
specified as a dictionary with a "refs" key. The value of "refs" is a
dictionary where the keys are the names of the files and each value is
either a string, a list, or a dict. If the value is a string, it is assumed
to be the data of the file, which may be base64 encoded (see below). If the
value is a list, it is assumed to have three elements: the URL of the file
(or path of a local file), the byte offset of the data within the file, and
the byte length of the data. If the value is a dict, it represents a json
file, and the content of the file is the json representation of the dict.
If the value for a file is a string, it may be prefixed with "base64:". If
it is, the string is assumed to be base64 encoded and is decoded before
Expand Down Expand Up @@ -62,6 +65,9 @@ def __init__(self, rfs: dict, mode: Literal["r", "r+"] = "r+"):
for k, v in rfs["refs"].items():
if isinstance(v, str):
pass
elif isinstance(v, dict):
# the content of the file is the json representation of the dict
pass
elif isinstance(v, list):
if len(v) != 3:
raise Exception(f"Problem with {k}: list must have 3 elements")
Expand All @@ -87,6 +93,8 @@ def __getitem__(self, key: str):
return base64.b64decode(x[len("base64:"):])
else:
return x.encode("utf-8")
elif isinstance(x, dict):
return json.dumps(x).encode("utf-8")
elif isinstance(x, list):
if len(x) != 3:
raise Exception("list must have 3 elements") # pragma: no cover
Expand All @@ -100,14 +108,20 @@ def __getitem__(self, key: str):
# and contains mutable lists
raise Exception(f"Problem with {key}: value {x} must be a string or a list")

def __setitem__(self, key: str, value: bytes):
    """
    Store the content of a file inline in the reference file system.

    The bytes are written to ``self.rfs["refs"][key]`` as a string. Content
    that is pure ASCII is stored as-is; anything else is stored base64
    encoded with a "base64:" prefix. ASCII content that itself starts with
    "base64:" is also base64 encoded, because otherwise it would be
    indistinguishable from an encoded value when it is read back.

    Raises ValueError if value is not bytes.
    """
    # We intentionally do not allow value to be a dict here! When the rfs is
    # written to a .json file elsewhere in the codebase of lindi, the value
    # will automatically be converted to a json object if it is json
    # serializable.
    if not isinstance(value, bytes):
        raise ValueError("value must be bytes")
    prefix = "base64:"
    try:
        # try to store the value as plain ascii text
        text = value.decode("ascii")
    except UnicodeDecodeError:
        text = None
    if text is None or text.startswith(prefix):
        # not ascii, or ascii that would collide with the prefix convention:
        # base64 encode it so that reading it back yields the original bytes
        text = prefix + base64.b64encode(value).decode("ascii")
    self.rfs["refs"][key] = text

def __delitem__(self, key: str):
    # Remove the entry for this file from the "refs" mapping of the rfs.
    # Any error raised by the underlying mapping (KeyError for a plain dict
    # when the key is absent) propagates to the caller.
    del self.rfs["refs"][key]
Expand All @@ -131,6 +145,19 @@ def is_listable(self):
def is_erasable(self):
    # Always reports False.
    # NOTE(review): presumably part of the zarr Store capability interface,
    # signalling that the store cannot be erased wholesale - confirm.
    return False

@staticmethod
def replace_meta_file_contents_with_dicts(rfs: dict) -> None:
    """
    Utility function for replacing, in place, the contents of the zarr
    metadata files in an rfs (.zattrs, .zgroup, .zarray, and zarr.json for
    zarr v3) with the parsed json representation of their contents.

    Only keys of rfs['refs'] that end with one of those names are touched;
    every other entry is left unchanged.
    """
    meta_suffixes = ('.zattrs', '.zgroup', '.zarray', 'zarr.json')  # note: zarr.json is for zarr v3
    # important to use the LindiReferenceFileSystemStore here because then we
    # can resolve any base64 encoded values, etc when converting them to dicts
    store = LindiReferenceFileSystemStore(rfs)
    refs = rfs['refs']
    # iterate over a snapshot of the keys because entries of refs are
    # reassigned inside the loop
    for k in list(refs):
        if k.endswith(meta_suffixes):
            refs[k] = json.loads(store[k].decode('utf-8'))


# Keep a global cache of file segment readers that apply to all instances of
# LindiReferenceFileSystemStore. The key is the URL of the file.
Expand Down
17 changes: 16 additions & 1 deletion tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import pytest
import numpy as np
import h5py
Expand Down Expand Up @@ -342,9 +343,23 @@ def test_lindi_reference_file_system_store():
store["a"]
with pytest.raises(Exception):
store[{}] # invalid key type # type: ignore
rfs = {"refs": {"a": {}}} # invalid value
rfs = {"refs": {"a": 83}} # invalid value
with pytest.raises(Exception):
store = LindiReferenceFileSystemStore(rfs)
rfs = {"refs": {"a": {"test": 1}}}
store = LindiReferenceFileSystemStore(rfs)
assert json.loads(store["a"]) == {"test": 1}
rfs = {"refs": {".zattrs": "{\"test\": 2}"}}
store = LindiReferenceFileSystemStore(rfs)
assert json.loads(store[".zattrs"]) == {"test": 2}
rfs = {"refs": {".zattrs": "{\"test\": 3}"}}
LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs)
assert isinstance(rfs["refs"][".zattrs"], dict)
store = LindiReferenceFileSystemStore(rfs)
assert json.loads(store[".zattrs"]) == {"test": 3}
rfs = {"refs": {".zattrs_xxx": "{\"test\": 5}"}}
LindiReferenceFileSystemStore.replace_meta_file_contents_with_dicts(rfs)
assert isinstance(rfs["refs"][".zattrs_xxx"], str)

rfs = {"refs": {"a": "abc"}}
store = LindiReferenceFileSystemStore(rfs)
Expand Down

0 comments on commit bc71a9d

Please sign in to comment.