Merge branch 'dev' into staging_expand

hdmf-dev · Aug 30, 2024 · 2921c45 · 2921c45
2 parents e488cf3 + 1fc6212
commit 2921c45
Show file tree

Hide file tree

Showing 21 changed files with 573 additions and 63 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
 #     hooks:
 #     -   id: black
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.7
+    rev: v0.6.1
     hooks:
     -   id: ruff
 # -   repo: https://github.com/econchick/interrogate

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,17 +1,32 @@
 # HDMF Changelog
 
-## HDMF 3.14.4 (Upcoming)
+## HDMF 4.0.0 (Upcoming)
+### Enhancements
+- Added support for datasets to be expandable by default for the HDF5 backend. @mavaylon1 [#1158](https://github.com/hdmf-dev/hdmf/pull/1158)
+
+## HDMF 3.14.4 (August 22, 2024)
 
 ### Enhancements
 - Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157)
-- Added support for datasets to be expandable by default for the HDF5 backend. @mavaylon1 [#1158](https://github.com/hdmf-dev/hdmf/pull/1158)
+- Adjusted stacklevel of warnings to point to user code when possible. @rly [#1166](https://github.com/hdmf-dev/hdmf/pull/1166)
+- Improved "already exists" error message when adding a container to a `MultiContainerInterface`. @rly [#1165](https://github.com/hdmf-dev/hdmf/pull/1165)
+- Added support to write multidimensional string arrays. @stephprince [#1173](https://github.com/hdmf-dev/hdmf/pull/1173)
+- Added support for appending to a dataset of references. @mavaylon1 [#1135](https://github.com/hdmf-dev/hdmf/pull/1135)
+
+### Bug fixes
+- Fixed issue where scalar datasets with a compound data type were being written as non-scalar datasets. @stephprince [#1176](https://github.com/hdmf-dev/hdmf/pull/1176)
+- Fixed H5DataIO not exposing `maxshape` on non-dci dsets. @cboulay [#1149](https://github.com/hdmf-dev/hdmf/pull/1149)
+- Fixed generation of classes in an extension that contain attributes or datasets storing references to other types defined in the extension.
+  @rly [#1183](https://github.com/hdmf-dev/hdmf/pull/1183)
 
 ## HDMF 3.14.3 (July 29, 2024)
 
 ### Enhancements
 - Added new attribute "dimension_labels" on `DatasetBuilder` which specifies the names of the dimensions used in the
 dataset based on the shape of the dataset data and the dimension names in the spec for the data type. This attribute
 is available on build (during the write process), but not on read of a dataset from a file. @rly [#1081](https://github.com/hdmf-dev/hdmf/pull/1081)
+- Speed up loading namespaces by skipping register_type when already registered. @magland [#1102](https://github.com/hdmf-dev/hdmf/pull/1102)
+- Speed up namespace loading: return a shallow copy rather than a deep copy in build_const_args. @magland [#1103](https://github.com/hdmf-dev/hdmf/pull/1103)
 
 ## HDMF 3.14.2 (July 7, 2024)
 

diff --git a/docs/source/install_developers.rst b/docs/source/install_developers.rst
@@ -73,7 +73,7 @@ environment by using the ``conda remove --name hdmf-venv --all`` command.
     For advanced users, we recommend using Mambaforge_, a faster version of the conda package manager
     that includes conda-forge as a default channel.
 
-.. _Anaconda: https://www.anaconda.com/products/distribution
+.. _Anaconda: https://www.anaconda.com/download
 .. _Mambaforge: https://github.com/conda-forge/miniforge
 
 Install from GitHub

diff --git a/docs/source/install_users.rst b/docs/source/install_users.rst
@@ -29,4 +29,4 @@ You can also install HDMF using ``conda`` by running the following command in a
 
    conda install -c conda-forge hdmf
 
-.. _Anaconda Distribution: https://www.anaconda.com/products/distribution
+.. _Anaconda Distribution: https://www.anaconda.com/download
diff --git a/src/hdmf/backends/hdf5/h5_utils.py b/src/hdmf/backends/hdf5/h5_utils.py
@@ -17,11 +17,11 @@
 import logging
 
 from ...array import Array
-from ...data_utils import DataIO, AbstractDataChunkIterator
+from ...data_utils import DataIO, AbstractDataChunkIterator, append_data
 from ...query import HDMFDataset, ReferenceResolver, ContainerResolver, BuilderResolver
 from ...region import RegionSlicer
 from ...spec import SpecWriter, SpecReader
-from ...utils import docval, getargs, popargs, get_docval
+from ...utils import docval, getargs, popargs, get_docval, get_data_shape
 
 
 class HDF5IODataChunkIteratorQueue(deque):
@@ -108,6 +108,20 @@ def ref(self):
     def shape(self):
         return self.dataset.shape
 
+    def append(self, arg):
+        # Get Builder
+        builder = self.io.manager.get_builder(arg)
+        if builder is None:
+            raise ValueError(
+                "The container being appended to the dataset has not yet been built. "
+                "Please write the container to the file, then open the modified file, and "
+                "append the read container to the dataset."
+            )
+
+        # Get HDF5 Reference
+        ref = self.io._create_ref(builder)
+        append_data(self.dataset, ref)
+
 
 class DatasetOfReferences(H5Dataset, ReferenceResolver, metaclass=ABCMeta):
     """
@@ -501,7 +515,7 @@ def __init__(self, **kwargs):
         # Check for possible collision with other parameters
         if not isinstance(getargs('data', kwargs), Dataset) and self.__link_data:
             self.__link_data = False
-            warnings.warn('link_data parameter in H5DataIO will be ignored', stacklevel=2)
+            warnings.warn('link_data parameter in H5DataIO will be ignored', stacklevel=3)
         # Call the super constructor and consume the data parameter
         super().__init__(**kwargs)
         # Construct the dict with the io args, ignoring all options that were set to None
@@ -525,7 +539,7 @@ def __init__(self, **kwargs):
                 self.__iosettings.pop('compression', None)
                 if 'compression_opts' in self.__iosettings:
                     warnings.warn('Compression disabled by compression=False setting. ' +
-                                  'compression_opts parameter will, therefore, be ignored.', stacklevel=2)
+                                  'compression_opts parameter will, therefore, be ignored.', stacklevel=3)
                     self.__iosettings.pop('compression_opts', None)
         # Validate the compression options used
         self._check_compression_options()
@@ -540,7 +554,7 @@ def __init__(self, **kwargs):
         if isinstance(self.data, Dataset):
             for k in self.__iosettings.keys():
                 warnings.warn("%s in H5DataIO will be ignored with H5DataIO.data being an HDF5 dataset" % k,
-                              stacklevel=2)
+                              stacklevel=3)
 
         self.__dataset = None
 
@@ -618,7 +632,7 @@ def _check_compression_options(self):
             if self.__iosettings['compression'] not in ['gzip', h5py_filters.h5z.FILTER_DEFLATE]:
                 warnings.warn(str(self.__iosettings['compression']) + " compression may not be available "
                               "on all installations of HDF5. Use of gzip is recommended to ensure portability of "
-                              "the generated HDF5 files.", stacklevel=3)
+                              "the generated HDF5 files.", stacklevel=4)
 
     @staticmethod
     def filter_available(filter, allow_plugin_filters):
@@ -658,3 +672,14 @@ def valid(self):
         if isinstance(self.data, Dataset) and not self.data.id.valid:
             return False
         return super().valid
+
+    @property
+    def maxshape(self):
+        if 'maxshape' in self.io_settings:
+            return self.io_settings['maxshape']
+        elif hasattr(self.data, 'maxshape'):
+            return self.data.maxshape
+        elif hasattr(self, "shape"):
+            return self.shape
+        else:
+            return get_data_shape(self.data)
diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py
@@ -344,7 +344,7 @@ def copy_file(self, **kwargs):
         warnings.warn("The copy_file class method is no longer supported and may be removed in a future version of "
                       "HDMF. Please use the export method or h5py.File.copy method instead.",
                       category=DeprecationWarning,
-                      stacklevel=2)
+                      stacklevel=3)
 
         source_filename, dest_filename, expand_external, expand_refs, expand_soft = getargs('source_filename',
                                                                                             'dest_filename',
@@ -700,6 +700,8 @@ def __read_dataset(self, h5obj, name=None):
                     d = ReferenceBuilder(target_builder)
                 kwargs['data'] = d
                 kwargs['dtype'] = d.dtype
+            elif h5obj.dtype.kind == 'V':  # scalar compound data type
+                kwargs['data'] = np.array(scalar, dtype=h5obj.dtype)
             else:
                 kwargs["data"] = scalar
         else:
@@ -1239,6 +1241,8 @@ def _filler():
 
                 return
             # If the compound data type contains only regular data (i.e., no references) then we can write it as usual
+            elif len(np.shape(data)) == 0:
+                dset = self.__scalar_fill__(parent, name, data, options)
             else:
                 dset = self.__list_fill__(parent, name, data, matched_spec_shape, expandable, options)
         # Write a dataset containing references, i.e., a region or object reference.
@@ -1481,7 +1485,7 @@ def __list_fill__(cls, parent, name, data, matched_spec_shape, expandable, optio
             data_shape = io_settings.pop('shape')
         elif hasattr(data, 'shape'):
             data_shape = data.shape
-        elif isinstance(dtype, np.dtype):
+        elif isinstance(dtype, np.dtype) and len(dtype) > 1:  # check if compound dtype
             data_shape = (len(data),)
         else:
             data_shape = get_data_shape(data)
@@ -1531,6 +1535,7 @@ def __get_ref(self, **kwargs):
             self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name))
             builder = self.manager.build(container)
         path = self.__get_path(builder)
+
         self.logger.debug("Getting reference at path '%s'" % path)
         if isinstance(container, RegionBuilder):
             region = container.region
@@ -1542,6 +1547,14 @@ def __get_ref(self, **kwargs):
         else:
             return self.__file[path].ref
 
+    @docval({'name': 'container', 'type': (Builder, Container, ReferenceBuilder), 'doc': 'the object to reference',
+             'default': None},
+            {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the region reference indexing object',
+             'default': None},
+            returns='the reference', rtype=Reference)
+    def _create_ref(self, **kwargs):
+        return self.__get_ref(**kwargs)
+
     def __is_ref(self, dtype):
         if isinstance(dtype, DtypeSpec):
             return self.__is_ref(dtype.dtype)

diff --git a/src/hdmf/build/manager.py b/src/hdmf/build/manager.py
@@ -7,7 +7,7 @@
 from .classgenerator import ClassGenerator, CustomClassGenerator, MCIClassGenerator
 from ..container import AbstractContainer, Container, Data
 from ..term_set import TypeConfigurator
-from ..spec import DatasetSpec, GroupSpec, NamespaceCatalog
+from ..spec import DatasetSpec, GroupSpec, NamespaceCatalog, RefSpec
 from ..spec.spec import BaseStorageSpec
 from ..utils import docval, getargs, ExtenderMeta, get_docval
 
@@ -480,6 +480,7 @@ def load_namespaces(self, **kwargs):
         load_namespaces here has the advantage of being able to keep track of type dependencies across namespaces.
         '''
         deps = self.__ns_catalog.load_namespaces(**kwargs)
+        # register container types for each dependent type in each dependent namespace
         for new_ns, ns_deps in deps.items():
             for src_ns, types in ns_deps.items():
                 for dt in types:
@@ -529,7 +530,7 @@ def get_dt_container_cls(self, **kwargs):
                     namespace = ns_key
                     break
         if namespace is None:
-            raise ValueError("Namespace could not be resolved.")
+            raise ValueError(f"Namespace could not be resolved for data type '{data_type}'.")
 
         cls = self.__get_container_cls(namespace, data_type)
 
@@ -549,6 +550,8 @@ def get_dt_container_cls(self, **kwargs):
 
     def __check_dependent_types(self, spec, namespace):
         """Ensure that classes for all types used by this type exist in this namespace and generate them if not.
+
+        `spec` should be a GroupSpec or DatasetSpec in the `namespace`
         """
         def __check_dependent_types_helper(spec, namespace):
             if isinstance(spec, (GroupSpec, DatasetSpec)):
@@ -564,6 +567,16 @@ def __check_dependent_types_helper(spec, namespace):
 
         if spec.data_type_inc is not None:
             self.get_dt_container_cls(spec.data_type_inc, namespace)
+
+        # handle attributes that have a reference dtype
+        for attr_spec in spec.attributes:
+            if isinstance(attr_spec.dtype, RefSpec):
+                self.get_dt_container_cls(attr_spec.dtype.target_type, namespace)
+        # handle datasets that have a reference dtype
+        if isinstance(spec, DatasetSpec):
+            if isinstance(spec.dtype, RefSpec):
+                self.get_dt_container_cls(spec.dtype.target_type, namespace)
+        # recurse into nested types
         if isinstance(spec, GroupSpec):
             for child_spec in (spec.groups + spec.datasets + spec.links):
                 __check_dependent_types_helper(child_spec, namespace)

diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py
@@ -10,8 +10,11 @@
 from .errors import (BuildError, OrphanContainerBuildError, ReferenceTargetNotBuiltError, ContainerConfigurationError,
                      ConstructError)
 from .manager import Proxy, BuildManager
+
 from .warnings import (MissingRequiredBuildWarning, DtypeConversionWarning, IncorrectQuantityBuildWarning,
                        IncorrectDatasetShapeBuildWarning)
+from hdmf.backends.hdf5.h5_utils import H5DataIO
+
 from ..container import AbstractContainer, Data, DataRegion
 from ..term_set import TermSetWrapper
 from ..data_utils import DataIO, AbstractDataChunkIterator, InvalidDataIOError
@@ -599,11 +602,17 @@ def __get_data_type(cls, spec):
 
     def __convert_string(self, value, spec):
         """Convert string types to the specified dtype."""
+        def __apply_string_type(value, string_type):
+            if isinstance(value, (list, tuple, np.ndarray, DataIO)):
+                return [__apply_string_type(item, string_type) for item in value]
+            else:
+                return string_type(value)
+
         ret = value
         if isinstance(spec, AttributeSpec):
             if 'text' in spec.dtype:
                 if spec.shape is not None or spec.dims is not None:
-                    ret = list(map(str, value))
+                    ret = __apply_string_type(value, str)
                 else:
                     ret = str(value)
         elif isinstance(spec, DatasetSpec):
@@ -619,7 +628,7 @@ def string_type(x):
                         return x.isoformat()  # method works for both date and datetime
                 if string_type is not None:
                     if spec.shape is not None or spec.dims is not None:
-                        ret = list(map(string_type, value))
+                        ret = __apply_string_type(value, string_type)
                     else:
                         ret = string_type(value)
                     # copy over any I/O parameters if they were specified
@@ -998,6 +1007,9 @@ def __get_ref_builder(self, builder, dtype, shape, container, build_manager):
                 for d in container.data:
                     target_builder = self.__get_target_builder(d, build_manager, builder)
                     bldr_data.append(ReferenceBuilder(target_builder))
+                if isinstance(container.data, H5DataIO):
+                    # This is here to support appending a dataset of references.
+                    bldr_data = H5DataIO(bldr_data, **container.data.get_io_params())
             else:
                 self.logger.debug("Setting %s '%s' data to reference builder"
                                   % (builder.__class__.__name__, builder.name))

diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py
@@ -628,7 +628,7 @@ def add_ref(self, **kwargs):
             if entity_uri is not None:
                 entity_uri = entity.entity_uri
                 msg = 'This entity already exists. Ignoring new entity uri'
-                warn(msg, stacklevel=2)
+                warn(msg, stacklevel=3)
 
         #################
         # Validate Object

diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py
@@ -717,7 +717,7 @@ def add_row(self, **kwargs):
                     warn(("Data has elements with different lengths and therefore cannot be coerced into an "
                           "N-dimensional array. Use the 'index' argument when creating a column to add rows "
                           "with different lengths."),
-                         stacklevel=2)
+                         stacklevel=3)
 
     def __eq__(self, other):
         """Compare if the two DynamicTables contain the same data.
@@ -776,7 +776,7 @@ def add_column(self, **kwargs):  # noqa: C901
 
         if isinstance(index, VectorIndex):
             warn("Passing a VectorIndex in for index may lead to unexpected behavior. This functionality will be "
-                 "deprecated in a future version of HDMF.", category=FutureWarning, stacklevel=2)
+                 "deprecated in a future version of HDMF.", category=FutureWarning, stacklevel=3)
 
         if name in self.__colids:  # column has already been added
             msg = "column '%s' already exists in %s '%s'" % (name, self.__class__.__name__, self.name)
@@ -793,7 +793,7 @@ def add_column(self, **kwargs):  # noqa: C901
                        "Please ensure the new column complies with the spec. "
                        "This will raise an error in a future version of HDMF."
                        % (name, self.__class__.__name__, spec_table))
-                warn(msg, stacklevel=2)
+                warn(msg, stacklevel=3)
 
             index_bool = index or not isinstance(index, bool)
             spec_index = self.__uninit_cols[name].get('index', False)
@@ -803,7 +803,7 @@ def add_column(self, **kwargs):  # noqa: C901
                        "Please ensure the new column complies with the spec. "
                        "This will raise an error in a future version of HDMF."
                        % (name, self.__class__.__name__, spec_index))
-                warn(msg, stacklevel=2)
+                warn(msg, stacklevel=3)
 
             spec_col_cls = self.__uninit_cols[name].get('class', VectorData)
             if col_cls != spec_col_cls:
@@ -841,7 +841,7 @@ def add_column(self, **kwargs):  # noqa: C901
                 warn(("Data has elements with different lengths and therefore cannot be coerced into an "
                       "N-dimensional array. Use the 'index' argument when adding a column of data with "
                       "different lengths."),
-                     stacklevel=2)
+                     stacklevel=3)
 
             # Check that we are asked to create an index
             if (isinstance(index, bool) or isinstance(index, int)) and index > 0 and len(data) > 0:
diff --git a/docs/source/install_users.rst b/docs/source/install_users.rst
@@ -29,4 +29,4 @@ You can also install HDMF using ``conda`` by running the following command in a
 
    conda install -c conda-forge hdmf
 
-.. _Anaconda Distribution: https://www.anaconda.com/products/distribution
+.. _Anaconda Distribution: https://www.anaconda.com/download