Write dimension labels to DatasetBuilder on build #1081

Merged · 6 commits · Jul 26, 2024
Changes from 3 commits
2 changes: 1 addition & 1 deletion .github/workflows/run_coverage.yml
@@ -60,7 +60,7 @@ jobs:

- name: Run tests and generate coverage report
run: |
-          pytest --cov
+          pytest  # configured to run with coverage
python -m coverage xml # codecov uploader requires xml format
python -m coverage report -m

7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,12 @@
# HDMF Changelog

## HDMF 3.14.0 (Upcoming)

### Enhancements
- Added new attribute "dimension_labels" on `DatasetBuilder` which specifies the names of the dimensions used in the
dataset based on the shape of the dataset data and the dimension names in the spec for the data type. This attribute
is available on build (during the write process), but not on read of a dataset from a file. @rly [#1081](https://github.com/hdmf-dev/hdmf/pull/1081)

## HDMF 3.13.0 (March 20, 2024)

### Enhancements
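For context on where these labels come from: a dataset spec pairs a `shape` with `dims`, positionally. The sketch below uses real spec fields but made-up names and shapes, showing a spec that allows two shapes, each with its own dimension names:

from hdmf.spec import DatasetSpec

# A minimal sketch (made-up names): `dims` labels each axis and pairs
# positionally with `shape`; None in a shape means any length is allowed.
spec = DatasetSpec(
    doc="example dataset",
    name="data",
    dtype="float",
    shape=[[None, None], [None, 3]],
    dims=[["num_rows", "num_cols"], ["num_samples", "xyz"]],
)
print(spec.shape)  # e.g., ((None, None), (None, 3))
print(spec.dims)   # e.g., (('num_rows', 'num_cols'), ('num_samples', 'xyz'))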
16 changes: 14 additions & 2 deletions src/hdmf/build/builders.py
@@ -330,18 +330,25 @@ class DatasetBuilder(BaseBuilder):
'doc': 'The datatype of this dataset.', 'default': None},
{'name': 'attributes', 'type': dict,
'doc': 'A dictionary of attributes to create in this dataset.', 'default': dict()},
{'name': 'dimension_labels', 'type': tuple,
'doc': ('A list of labels for each dimension of this dataset from the spec. Currently this is '
'supplied only on build.'),
'default': None},
{'name': 'maxshape', 'type': (int, tuple),
'doc': 'The shape of this dataset. Use None for scalars.', 'default': None},
{'name': 'chunks', 'type': bool, 'doc': 'Whether or not to chunk this dataset.', 'default': False},
{'name': 'parent', 'type': GroupBuilder, 'doc': 'The parent builder of this builder.', 'default': None},
{'name': 'source', 'type': str, 'doc': 'The source of the data in this builder.', 'default': None})
def __init__(self, **kwargs):
""" Create a Builder object for a dataset """
-        name, data, dtype, attributes, maxshape, chunks, parent, source = getargs(
-            'name', 'data', 'dtype', 'attributes', 'maxshape', 'chunks', 'parent', 'source', kwargs)
+        name, data, dtype, attributes, dimension_labels, maxshape, chunks, parent, source = getargs(
+            'name', 'data', 'dtype', 'attributes', 'dimension_labels', 'maxshape', 'chunks', 'parent', 'source',
+            kwargs
+        )
super().__init__(name, attributes, parent, source)
self['data'] = data
self['attributes'] = _copy.copy(attributes)
self.__dimension_labels = dimension_labels
self.__chunks = chunks
self.__maxshape = maxshape
if isinstance(data, BaseBuilder):
@@ -361,6 +368,11 @@ def data(self, val):
raise AttributeError("Cannot overwrite data.")
self['data'] = val

@property
def dimension_labels(self):
"""Labels for each dimension of this dataset from the spec."""
return self.__dimension_labels

@property
def chunks(self):
"""Whether or not this dataset is chunked."""
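A minimal usage sketch of the new argument and property (constructing a builder directly here for illustration; in practice the ObjectMapper supplies dimension_labels during build):

from hdmf.build import DatasetBuilder

# A minimal sketch: pass the new argument at construction and read it back.
builder = DatasetBuilder(
    name="data",
    data=[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
    dtype="float",
    dimension_labels=("num_rows", "num_cols"),
)
print(builder.dimension_labels)  # ('num_rows', 'num_cols')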
99 changes: 91 additions & 8 deletions src/hdmf/build/objectmapper.py
@@ -10,14 +10,15 @@
from .errors import (BuildError, OrphanContainerBuildError, ReferenceTargetNotBuiltError, ContainerConfigurationError,
ConstructError)
from .manager import Proxy, BuildManager
-from .warnings import MissingRequiredBuildWarning, DtypeConversionWarning, IncorrectQuantityBuildWarning
+from .warnings import (MissingRequiredBuildWarning, DtypeConversionWarning, IncorrectQuantityBuildWarning,
+                       IncorrectDatasetShapeBuildWarning)
from ..container import AbstractContainer, Data, DataRegion
from ..term_set import TermSetWrapper
from ..data_utils import DataIO, AbstractDataChunkIterator
from ..query import ReferenceResolver
from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec
from ..spec.spec import BaseStorageSpec
-from ..utils import docval, getargs, ExtenderMeta, get_docval
+from ..utils import docval, getargs, ExtenderMeta, get_docval, get_data_shape

_const_arg = '__constructor_arg'

@@ -721,19 +722,34 @@ def build(self, **kwargs):
if not isinstance(container, Data):
msg = "'container' must be of type Data with DatasetSpec"
raise ValueError(msg)
-        spec_dtype, spec_shape, spec = self.__check_dset_spec(self.spec, spec_ext)
+        spec_dtype, spec_shape, spec_dims, spec = self.__check_dset_spec(self.spec, spec_ext)
+        dimension_labels = self.__get_dimension_labels_from_spec(container.data, spec_shape, spec_dims)
if isinstance(spec_dtype, RefSpec):
self.logger.debug("Building %s '%s' as a dataset of references (source: %s)"
% (container.__class__.__name__, container.name, repr(source)))
# create dataset builder with data=None as a placeholder. fill in with refs later
-            builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype.reftype)
+            builder = DatasetBuilder(
+                name,
+                data=None,
+                parent=parent,
+                source=source,
+                dtype=spec_dtype.reftype,
+                dimension_labels=dimension_labels,
+            )
manager.queue_ref(self.__set_dataset_to_refs(builder, spec_dtype, spec_shape, container, manager))
elif isinstance(spec_dtype, list):
# a compound dataset
self.logger.debug("Building %s '%s' as a dataset of compound dtypes (source: %s)"
% (container.__class__.__name__, container.name, repr(source)))
# create dataset builder with data=None, dtype=None as a placeholder. fill in with refs later
-            builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype)
+            builder = DatasetBuilder(
+                name,
+                data=None,
+                parent=parent,
+                source=source,
+                dtype=spec_dtype,
+                dimension_labels=dimension_labels,
+            )
manager.queue_ref(self.__set_compound_dataset_to_refs(builder, spec, spec_dtype, container,
manager))
else:
@@ -744,7 +760,14 @@
% (container.__class__.__name__, container.name, repr(source)))
# an unspecified dtype and we were given references
# create dataset builder with data=None as a placeholder. fill in with refs later
-                builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype='object')
+                builder = DatasetBuilder(
+                    name,
+                    data=None,
+                    parent=parent,
+                    source=source,
+                    dtype="object",
+                    dimension_labels=dimension_labels,
+                )
manager.queue_ref(self.__set_untyped_dataset_to_refs(builder, container, manager))
else:
# a dataset that has no references, pass the conversion off to the convert_dtype method
@@ -760,7 +783,14 @@
except Exception as ex:
msg = 'could not resolve dtype for %s \'%s\'' % (type(container).__name__, container.name)
raise Exception(msg) from ex
-                builder = DatasetBuilder(name, bldr_data, parent=parent, source=source, dtype=dtype)
+                builder = DatasetBuilder(
+                    name,
+                    data=bldr_data,
+                    parent=parent,
+                    source=source,
+                    dtype=dtype,
+                    dimension_labels=dimension_labels,
+                )

# Add attributes from the specification extension to the list of attributes
all_attrs = self.__spec.attributes + getattr(spec_ext, 'attributes', tuple())
@@ -779,14 +809,67 @@ def __check_dset_spec(self, orig, ext):
"""
dtype = orig.dtype
shape = orig.shape
dims = orig.dims
spec = orig
if ext is not None:
if ext.dtype is not None:
dtype = ext.dtype
if ext.shape is not None:
shape = ext.shape
dims = ext.dims
spec = ext
-        return dtype, shape, spec
+        return dtype, shape, dims, spec

def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple:
if spec_shape is None or spec_dims is None:
return None
data_shape = get_data_shape(data)
# if shape is a list of allowed shapes, find the index of the shape that matches the data
if isinstance(spec_shape[0], list):
match_shape_inds = list()
for i, s in enumerate(spec_shape):
# skip this shape if it has a different number of dimensions from the data
if len(s) != len(data_shape):
continue
# check each dimension. None means any length is allowed
match = True
for j, d in enumerate(data_shape):
if s[j] is not None and s[j] != d:
match = False
break
if match:
match_shape_inds.append(i)
# use the most specific match -- the one with the fewest Nones
if match_shape_inds:
if len(match_shape_inds) == 1:
return tuple(spec_dims[match_shape_inds[0]])
else:
count_nones = [len([x for x in spec_shape[k] if x is None]) for k in match_shape_inds]
index_min_count = count_nones.index(min(count_nones))
best_match_ind = match_shape_inds[index_min_count]
return tuple(spec_dims[best_match_ind])
else:
# no matches found
msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None
else:
if len(data_shape) != len(spec_shape):
msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None
# check each dimension. None means any length is allowed
match = True
for j, d in enumerate(data_shape):
if spec_shape[j] is not None and spec_shape[j] != d:
match = False
break
if not match:
msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None
# shape is a single list of allowed dimension lengths
return tuple(spec_dims)

def __is_reftype(self, data):
if (isinstance(data, AbstractDataChunkIterator) or
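The matching rule in __get_dimension_labels_from_spec can be illustrated standalone. The sketch below is not the hdmf internals, just a compact restatement of the selection logic under the same assumptions (None in a spec shape is a wildcard; with multiple matches, the shape with the fewest wildcards wins):

def pick_dims(data_shape, spec_shapes, spec_dims):
    # Return the dims of the most specific allowed shape matching data_shape,
    # or None if nothing matches (where the mapper would warn instead).
    candidates = []
    for i, shape in enumerate(spec_shapes):
        if len(shape) != len(data_shape):
            continue  # a different number of dimensions cannot match
        if all(s is None or s == d for s, d in zip(shape, data_shape)):
            candidates.append(i)
    if not candidates:
        return None
    # most specific match = fewest None (wildcard) entries
    best = min(candidates, key=lambda i: sum(s is None for s in spec_shapes[i]))
    return tuple(spec_dims[best])

# (3, 2) matches both allowed shapes below; [None, 2] has fewer wildcards,
# so its dimension names are chosen.
print(pick_dims(
    (3, 2),
    [[None, None], [None, 2]],
    [["x", "y"], ["num_samples", "num_channels"]],
))  # ('num_samples', 'num_channels')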
7 changes: 7 additions & 0 deletions src/hdmf/build/warnings.py
@@ -15,6 +15,13 @@ class IncorrectQuantityBuildWarning(BuildWarning):
pass


class IncorrectDatasetShapeBuildWarning(BuildWarning):
    """
    Raised when a dataset has a shape that is not allowed by the spec.
    """
    pass

Review discussion on this new warning class:

Contributor Author: All of these warnings should really be errors because they result in an invalid file. @oruebel what do you think? Should we upgrade all of these warnings to errors?

Contributor Author: All except for DtypeConversionWarning and the generic BuildWarning.

Contributor Author: And maybe also the OrphanContainerWarning.


class MissingRequiredBuildWarning(BuildWarning):
"""
Raised when a required field is missing.
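Regarding the discussion above about upgrading these warnings to errors: until that happens, a caller can opt in via the standard warnings machinery. A sketch (assumes this branch is installed so the class is importable):

import warnings

from hdmf.build.warnings import IncorrectDatasetShapeBuildWarning

# Escalate just this warning to an exception; triggered here manually for
# demonstration, but a mismatched build would raise the same way.
warnings.simplefilter("error", IncorrectDatasetShapeBuildWarning)
try:
    warnings.warn("Shape of data does not match shape in spec 'foo'",
                  IncorrectDatasetShapeBuildWarning)
except IncorrectDatasetShapeBuildWarning as err:
    print("escalated to error:", err)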