diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b1e3c476..6da8ae386 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ ### Enhancements - Add Data.set_data_io allows(), which allows for setting a DataIO to a data object after-the-fact. @bendichter and @CodyCBakerPhD [#1013](https://github.com/hdmf-dev/hdmf/pull/1013) +### Enhancements +- Added `add_ref_termset`, updated helper methods for `HERD`, revised `add_ref` to support validations prior to populating the tables + and added `add_ref_container`. @mavaylon1 [#968](https://github.com/hdmf-dev/hdmf/pull/968) + ### Minor Improvements - Updated `__gather_columns` to ignore the order of bases when generating columns from the super class. @mavaylon1 [#991](https://github.com/hdmf-dev/hdmf/pull/991) - Update `get_key` to return all the keys if there are multiple within a `HERD` instance. @mavaylon1 [#999](https://github.com/hdmf-dev/hdmf/pull/999) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index 3f7720d0b..5bf8dd5d8 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -91,6 +91,7 @@ # sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnail_externalresources.png' from hdmf.common import HERD from hdmf.common import DynamicTable, VectorData +from hdmf.term_set import TermSet from hdmf import Container, HERDManager from hdmf import Data import numpy as np @@ -99,6 +100,13 @@ import warnings warnings.filterwarnings("ignore", category=UserWarning, message="HERD is experimental*") +try: + dir_path = os.path.dirname(os.path.abspath(__file__)) + yaml_file = os.path.join(dir_path, 'example_term_set.yaml') +except NameError: + dir_path = os.path.dirname(os.path.abspath('.')) + yaml_file = os.path.join(dir_path, 'gallery/example_term_set.yaml') + # Class to represent a file class HERDManagerContainer(Container, HERDManager): @@ -107,7 +115,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) -er = HERD() 
+herd = HERD() file = HERDManagerContainer(name='file') @@ -123,7 +131,8 @@ def __init__(self, **kwargs): # the underlying data structures accordingly. data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) -er.add_ref( +data.parent = file +herd.add_ref( file=file, container=data, key='Homo sapiens', @@ -131,7 +140,7 @@ def __init__(self, **kwargs): entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606' ) -er.add_ref( +herd.add_ref( file=file, container=data, key='Mus musculus', @@ -156,7 +165,8 @@ def __init__(self, **kwargs): genotypes = DynamicTable(name='genotypes', description='My genotypes') genotypes.add_column(name='genotype_name', description="Name of genotypes") genotypes.add_row(id=0, genotype_name='Rorb') -er.add_ref( +genotypes.parent = file +herd.add_ref( file=file, container=genotypes, attribute='genotype_name', @@ -166,8 +176,8 @@ def __init__(self, **kwargs): ) # Note: :py:func:`~hdmf.common.resources.HERD.add_ref` internally resolves the object -# to the closest parent, so that ``er.add_ref(container=genotypes, attribute='genotype_name')`` and -# ``er.add_ref(container=genotypes.genotype_name, attribute=None)`` will ultimately both use the ``object_id`` +# to the closest parent, so that ``herd.add_ref(container=genotypes, attribute='genotype_name')`` and +# ``herd.add_ref(container=genotypes.genotype_name, attribute=None)`` will ultimately both use the ``object_id`` # of the ``genotypes.genotype_name`` :py:class:`~hdmf.common.table.VectorData` column and # not the object_id of the genotypes table. @@ -188,7 +198,7 @@ def __init__(self, **kwargs): species = DynamicTable(name='species', description='My species', columns=[col1]) species.parent = file -er.add_ref( +herd.add_ref( container=species, attribute='Species_Data', key='Ursus arctos horribilis', @@ -203,15 +213,15 @@ def __init__(self, **kwargs): # as separate tables. 
# `~hdmf.common.resources.HERD` as a flattened table -er.to_dataframe() +herd.to_dataframe() # The individual interlinked tables: -er.files.to_dataframe() -er.objects.to_dataframe() -er.entities.to_dataframe() -er.keys.to_dataframe() -er.object_keys.to_dataframe() -er.entity_keys.to_dataframe() +herd.files.to_dataframe() +herd.objects.to_dataframe() +herd.entities.to_dataframe() +herd.keys.to_dataframe() +herd.object_keys.to_dataframe() +herd.entity_keys.to_dataframe() ############################################################################### # Using the get_key method @@ -224,11 +234,11 @@ def __init__(self, **kwargs): # The :py:func:`~hdmf.common.resources.HERD.get_key` method will be able to return the # :py:class:`~hdmf.common.resources.Key` object if the :py:class:`~hdmf.common.resources.Key` object is unique. -genotype_key_object = er.get_key(key_name='Rorb') +genotype_key_object = herd.get_key(key_name='Rorb') # If the :py:class:`~hdmf.common.resources.Key` object has a duplicate name, then the user will need # to provide the unique (file, container, relative_path, field, key) combination. -species_key_object = er.get_key(file=file, +species_key_object = herd.get_key(file=file, container=species['Species_Data'], key_name='Ursus arctos horribilis') @@ -246,7 +256,7 @@ def __init__(self, **kwargs): # :py:func:`~hdmf.common.resources.HERD.add_ref` method. If a 'key_name' # is used, a new :py:class:`~hdmf.common.resources.Key` will be created. -er.add_ref( +herd.add_ref( file=file, container=genotypes, attribute='genotype_name', @@ -262,18 +272,18 @@ def __init__(self, **kwargs): # allows the user to retrieve all entities and key information associated with an `Object` in # the form of a pandas DataFrame. 
-er.get_object_entities(file=file, +herd.get_object_entities(file=file, container=genotypes['genotype_name'], relative_path='') ############################################################################### # Using the get_object_type # ------------------------------------------------------ -# The :py:class:`~hdmf.common.resources.HERD.get_object_entities` method +# The :py:func:`~hdmf.common.resources.HERD.get_object_type` method # allows the user to retrieve all entities and key information associated with an `Object` in # the form of a pandas DataFrame. -er.get_object_type(object_type='Data') +herd.get_object_type(object_type='Data') ############################################################################### # Special Case: Using add_ref with compound data @@ -286,8 +296,7 @@ def __init__(self, **kwargs): # 'x' is using the external reference. # Let's create a new instance of :py:class:`~hdmf.common.resources.HERD`. -er = HERD() -file = HERDManagerContainer(name='file') +herd = HERD() data = Data( name='data_name', @@ -296,8 +305,9 @@ def __init__(self, **kwargs): dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')] ) ) +data.parent = file -er.add_ref( +herd.add_ref( file=file, container=data, field='species', @@ -306,6 +316,45 @@ def __init__(self, **kwargs): entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090' ) +############################################################################### +# Using add_ref_termset +# ------------------------------------------------------ +# The :py:func:`~hdmf.common.resources.HERD.add_ref_termset` +# method allows users to not only validate terms, i.e., keys, but also +# add references for an entire dataset, rather than single entries as we saw +# prior with :py:func:`~hdmf.common.resources.HERD.add_ref`. + +# :py:func:`~hdmf.common.resources.HERD.add_ref_termset` has many optional fields, +# giving the user a range of control when adding references. Let's see an example. 
+herd = HERD() +terms = TermSet(term_schema_path=yaml_file) + +herd.add_ref_termset(file=file, + container=species, + attribute='Species_Data', + key='Ursus arctos horribilis', + termset=terms) + +############################################################################### +# Using add_ref_termset for an entire dataset +# ------------------------------------------------------ +# As mentioned above, :py:func:`~hdmf.common.resources.HERD.add_ref_termset` +# supports iteratively validating and populating :py:class:`~hdmf.common.resources.HERD`. + +# When populating :py:class:`~hdmf.common.resources.HERD`, users may have some terms +# that are not in the :py:class:`~hdmf.term_set.TermSet`. As a result, +# :py:func:`~hdmf.common.resources.HERD.add_ref_termset` will return all of the missing +# terms in a dictionary. It is up to the user to either add these terms to the +# :py:class:`~hdmf.term_set.TermSet` or remove them from the dataset. + +herd = HERD() +terms = TermSet(term_schema_path=yaml_file) + +herd.add_ref_termset(file=file, + container=species, + attribute='Species_Data', + termset=terms) + ############################################################################### # Write HERD # ------------------------------------------------------ @@ -313,7 +362,7 @@ def __init__(self, **kwargs): # the individual tables written to tsv. # The user provides the path, which contains the name of the file. 
-er.to_zip(path='./HERD.zip') +herd.to_zip(path='./HERD.zip') ############################################################################### # Read HERD diff --git a/docs/gallery/plot_term_set.py b/docs/gallery/plot_term_set.py index 86d53e553..71053bba5 100644 --- a/docs/gallery/plot_term_set.py +++ b/docs/gallery/plot_term_set.py @@ -190,3 +190,6 @@ # To add a column that is validated using :py:class:`~hdmf.term_set.TermSetWrapper`, # wrap the data in the :py:func:`~hdmf.common.table.DynamicTable.add_column` # method as if you were making a new instance of :py:class:`~hdmf.common.table.VectorData`. +species.add_column(name='Species_3', + description='...', + data=TermSetWrapper(value=['Ursus arctos horribilis', 'Mus musculus'], termset=terms),) diff --git a/src/hdmf/backends/io.py b/src/hdmf/backends/io.py index 3d01c388b..4cd68e078 100644 --- a/src/hdmf/backends/io.py +++ b/src/hdmf/backends/io.py @@ -89,8 +89,8 @@ def write(self, **kwargs): from hdmf.common import HERD herd = HERD(type_map=self.manager.type_map) - # add_ref_term_set to search for and resolve the TermSetWrapper - herd.add_ref_term_set(container) # container would be the NWBFile + # add_ref_container to search for and resolve the TermSetWrapper + herd.add_ref_container(container) # container would be the NWBFile # write HERD herd.to_zip(path=self.herd_path) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 8054a758f..f9738c998 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -3,6 +3,8 @@ from . import register_class, EXP_NAMESPACE from . 
import get_type_map from ..container import Table, Row, Container, Data, AbstractContainer, HERDManager +from ..term_set import TermSet +from ..data_utils import DataIO from ..utils import docval, popargs, AllowPositional from ..build import TypeMap from ..term_set import TermSetWrapper @@ -10,6 +12,7 @@ import os import zipfile from collections import namedtuple +from warnings import warn class KeyTable(Table): @@ -358,16 +361,17 @@ def _check_object_field(self, **kwargs): relative_path = kwargs['relative_path'] field = kwargs['field'] create = kwargs['create'] + file_object_id = file.object_id files_idx = self.files.which(file_object_id=file_object_id) - if len(files_idx) > 1: + if len(files_idx) > 1: # pragma: no cover + # It isn't possible for len(files_idx) > 1 without the user directly using _add_file raise ValueError("Found multiple instances of the same file.") elif len(files_idx) == 1: files_idx = files_idx[0] else: - self._add_file(file_object_id) - files_idx = self.files.which(file_object_id=file_object_id)[0] + files_idx = None objecttable_idx = self.objects.which(object_id=container.object_id) @@ -378,10 +382,16 @@ def _check_object_field(self, **kwargs): if len(objecttable_idx) == 1: return self.objects.row[objecttable_idx[0]] elif len(objecttable_idx) == 0 and create: - return self._add_object(files_idx=files_idx, container=container, relative_path=relative_path, field=field) + # Used for add_ref + return {'file_object_id': file_object_id, + 'files_idx': files_idx, + 'container': container, + 'relative_path': relative_path, + 'field': field} elif len(objecttable_idx) == 0 and not create: raise ValueError("Object not in Object Table.") - else: + else: # pragma: no cover + # It isn't possible for this to happen unless the user used _add_object. 
@docval({'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.',
         'default': None},
        {'name': 'container', 'type': (str, AbstractContainer), 'default': None,
         'doc': ('The Container/Data object that uses the key or '
                 'the object_id for the Container/Data object that uses the key.')},
        {'name': 'attribute', 'type': str,
         'doc': 'The attribute of the container for the external reference.', 'default': None},
        {'name': 'field', 'type': str, 'default': '',
         'doc': ('The field of the compound data type using an external resource.')},
        {'name': 'key', 'type': (str, Key), 'default': None,
         'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'},
        {'name': 'termset', 'type': TermSet,
         'doc': 'The TermSet to be used if the container/attribute does not have one.'}
        )
def add_ref_termset(self, **kwargs):
    """
    Validate one key, or every value of a dataset, against a TermSet and add the valid ones via add_ref.

    If ``key`` is provided, only that key is validated and added. Otherwise the values to process are
    taken from ``container`` itself (``attribute`` is None) or from ``getattr(container, attribute)``;
    that object must be a Data/DataIO (its ``.data`` is iterated) or a list, tuple, or numpy array.

    For each term, the entity id and entity uri are read from the TermSet entry and forwarded to
    add_ref together with ``file``, ``container``, ``attribute`` and ``field``. Terms the TermSet
    rejects (it raises ValueError on lookup) are skipped and collected.

    :returns: ``{"missing_terms": [...]}`` when at least one term was not found in the TermSet;
              ``None`` when every term was added.
    :raises ValueError: if ``container`` is None, or if the object to iterate is not a supported type.
    """
    file = kwargs['file']
    container = kwargs['container']
    attribute = kwargs['attribute']
    key = kwargs['key']
    field = kwargs['field']
    termset = kwargs['termset']

    # docval defaults container to None, but every path below needs a real object
    # (file lookup, attribute access, add_ref). Fail early with a clear message.
    if container is None:
        msg = "A container is required to add references using a TermSet."
        raise ValueError(msg)

    if file is None:
        file = self._get_file_from_container(container=container)

    if key is not None:
        # Single-key mode: validate and add only the provided key.
        data = [key]
    else:
        # Bulk mode: validate and add every value held by the container/attribute.
        data_object = container if attribute is None else getattr(container, attribute)
        if isinstance(data_object, (Data, DataIO)):
            data = data_object.data
        elif isinstance(data_object, (list, tuple, np.ndarray)):
            data = data_object
        else:
            msg = ("The data object being used is not supported. "
                   "Please review the documentation for supported types.")
            raise ValueError(msg)

    missing_terms = []
    for term in data:
        # The TermSet is indexed by term name. A Key object (allowed by docval) must be
        # looked up by its name, otherwise it can never match a permissible value.
        # NOTE(review): assumes Key rows expose the KeyTable 'key' column as ``.key`` -- confirm.
        term_name = term.key if isinstance(term, Key) else term
        try:
            term_info = termset[term_name]
        except ValueError:
            # Not a permissible value of the TermSet: report it instead of adding it.
            missing_terms.append(term_name)
            continue
        # Positions 0 and 2 of the TermSet entry are used as entity id and entity uri.
        self.add_ref(file=file,
                     container=container,
                     attribute=attribute,
                     key=term,
                     field=field,
                     entity_id=term_info[0],
                     entity_uri=term_info[2])

    # Only report when something is missing; callers rely on None meaning "all added".
    if missing_terms:
        return {"missing_terms": missing_terms}
If false, create a new obj/key relationship + # in the ObjectKeyTable + check_object_key = True + + ################### + # Set Entity Checks + ################### + add_entity_key = False + add_entity = False + entity = self.get_entity(entity_id=entity_id) + check_entity_key = False + if entity is None: + if entity_uri is None: + msg = 'New entities must have an entity_uri.' + raise ValueError(msg) + + add_entity = True + add_entity_key = True + else: + # The entity exists and so we need to check if an entity_key exists + # for this entity and key combination. + check_entity_key = True + if entity_uri is not None: + entity_uri = entity.entity_uri + msg = 'This entity already exists. Ignoring new entity uri' + warn(msg) + + ################# + # Validate Object + ################# if attribute is None: # Trivial Case relative_path = '' object_field = self._check_object_field(file=file, @@ -606,69 +677,142 @@ def add_ref(self, **kwargs): relative_path=relative_path, field=field) - if not isinstance(key, Key): + ####################################### + # Validate Parameters and Populate HERD + ####################################### + if isinstance(object_field, dict): + # Create the object and file + if object_field['files_idx'] is None: + self._add_file(object_field['file_object_id']) + object_field['files_idx'] = self.files.which(file_object_id=object_field['file_object_id'])[0] + object_field = self._add_object(files_idx=object_field['files_idx'], + container=object_field['container'], + relative_path=object_field['relative_path'], + field=object_field['field']) + + if add_key: + # Now that object_field is set, we need to check if + # the key has been associated with that object. + # If so, just reuse the key. 
+ key_exists = False key_idx_matches = self.keys.which(key=key) - # if same key is used multiple times, determine - # which instance based on the Container - for row_idx in self.object_keys.which(objects_idx=object_field.idx): - key_idx = self.object_keys['keys_idx', row_idx] - if key_idx in key_idx_matches: - msg = "Use Key Object when referencing an existing (container, relative_path, key)" - raise ValueError(msg) - - key = self._add_key(key) - self._add_object_key(object_field, key) - - else: - # Check to see that the existing key is being used with the object. - # If true, do nothing. If false, create a new obj/key relationship - # in the ObjectKeyTable + if len(key_idx_matches)!=0: + for row_idx in self.object_keys.which(objects_idx=object_field.idx): + key_idx = self.object_keys['keys_idx', row_idx] + if key_idx in key_idx_matches: + key_exists = True # Make sure we don't add the key. + # Automatically resolve the key for keys associated with + # the same object. + key = self.keys.row[key_idx] + + if not key_exists: + key = self._add_key(key) + + if check_object_key: + # When using a Key Object, we want to still check for whether the key + # has been used with the Object object. If not, add it to ObjectKeyTable. + # If so, do nothing and add_object_key remains False. + obj_key_exists = False key_idx = key.idx object_key_row_idx = self.object_keys.which(keys_idx=key_idx) if len(object_key_row_idx)!=0: - obj_key_check = False + # this means there exists rows where the key is in the ObjectKeyTable for row_idx in object_key_row_idx: obj_idx = self.object_keys['objects_idx', row_idx] if obj_idx == object_field.idx: - obj_key_check = True - if not obj_key_check: - self._add_object_key(object_field, key) - else: - msg = "Cannot find key object. Create new Key with string." 
@docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'},
        {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.',
         'default': None},
        {'name': 'container', 'type': (str, AbstractContainer), 'default': None,
         'doc': ('The Container/Data object that uses the key or '
                 'the object id for the Container/Data object that uses the key.')},
        {'name': 'relative_path', 'type': str,
         # Adjacent literals are concatenated; a comma here would turn 'doc' into a tuple.
         'doc': ('The relative_path of the attribute of the object that uses '
                 'an external resource reference key. Use an empty string if not applicable.'),
         'default': ''},
        {'name': 'field', 'type': str, 'default': '',
         'doc': ('The field of the compound data type using an external resource.')})
def get_key(self, **kwargs):
    """
    Return the Key (or Keys) named ``key_name``.

    If ``container`` is provided, the key is resolved for that specific object: the object is
    looked up from (file, container, relative_path, field) and the Key with the given name that
    is linked to it in the object-key table is returned. ``file`` is derived from the container
    when not given.

    If ``container`` is None, the key table is searched by name only: a single match is returned
    as a Key, and multiple matches are returned as a list of Keys.

    :raises ValueError: if no key with that name exists (no container given), or if no key with
                        that name is associated with the given container.
    """
    key_name, container, relative_path, field = popargs('key_name', 'container', 'relative_path', 'field', kwargs)
    key_idx_matches = self.keys.which(key=key_name)

    file = kwargs['file']

    if container is None:
        # Name-only lookup across the whole key table.
        if len(key_idx_matches) == 0:
            # the key has never been used before
            raise ValueError("key '%s' does not exist" % key_name)
        if len(key_idx_matches) > 1:
            return [self.keys.row[x] for x in key_idx_matches]
        return self.keys.row[key_idx_matches[0]]

    if file is None:
        file = self._get_file_from_container(container=container)
    # The same key name may be used by several objects; pick the one linked to this object.
    object_field = self._check_object_field(file=file,
                                            container=container,
                                            relative_path=relative_path,
                                            field=field)
    for row_idx in self.object_keys.which(objects_idx=object_field.idx):
        key_idx = self.object_keys['keys_idx', row_idx]
        if key_idx in key_idx_matches:
            return self.keys.row[key_idx]
    msg = "No key found with that container."
    raise ValueError(msg)


@docval({'name': 'entity_id', 'type': str, 'doc': 'The ID for the identifier at the resource.'})
def get_entity(self, **kwargs):
    """
    Return the first row of the entities table whose entity_id matches, or None if there is no match.
    """
    entity_id = kwargs['entity_id']
    matches = self.entities.which(entity_id=entity_id)
    if len(matches) > 0:
        return self.entities.row[matches[0]]
    return None
entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606') + er.add_ref(file=file_1, + container=data1, + field='species', + key='Mus musculus', + entity_id='NCBI:txid10090', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') + er.add_ref(file=file_2, + container=data2, + field='species', + key='Homo sapiens', + entity_id='NCBI:txid9606', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606') # Convert to dataframe and compare against the expected result result_df = er.to_dataframe() @@ -268,6 +268,21 @@ def test_add_ref_search_for_file_error(self): entity_id='entity_id1', entity_uri='entity1') + # TODO: Add this once you've created a HDMF_file to rework testing + # def test_add_ref_file_mismatch(self): + # file = HERDManagerContainer(name='file') + # file2 = HERDManagerContainer() + # + # nested_child = Container(name='nested_child') + # child = Container(name='child') + # nested_child.parent = child + # child.parent = file + # + # er = HERD() + # with self.assertRaises(ValueError): + # er.add_ref(file=file2, container=nested_child, key='key1', + # entity_id='entity_id1', entity_uri='entity1') + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") def test_check_termset_wrapper(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') @@ -289,7 +304,7 @@ def test_check_termset_wrapper(self): self.assertTrue(isinstance(ret[0][2], TermSetWrapper)) @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") - def test_add_ref_termset_data(self): + def test_add_ref_container_data(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') er = HERD() em = HERDManagerContainer() @@ -305,14 +320,14 @@ def test_add_ref_termset_data(self): species.parent = em - er.add_ref_term_set(root_container=em) + er.add_ref_container(root_container=em) self.assertEqual(er.keys.data, [('Homo sapiens',)]) 
self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") - def test_add_ref_termset_attr(self): + def test_add_ref_container_attr(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') er = HERD() em = HERDManagerContainer() @@ -328,12 +343,159 @@ def test_add_ref_termset_attr(self): species.parent = em - er.add_ref_term_set(root_container=em) + er.add_ref_container(root_container=em) self.assertEqual(er.keys.data, [('Homo sapiens',)]) self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', 'description', '')]) + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + em = HERDManagerContainer() + + col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + er.add_ref_termset(file=em, + container=species, + attribute='Species_Data', + key='Homo sapiens', + termset=terms + ) + self.assertEqual(er.keys.data, [('Homo sapiens',)]) + self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) + self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) + + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset_data_object_error(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + 
+ col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens']) + + with self.assertRaises(ValueError): + er.add_ref_termset( + container=col1, + attribute='description', + termset=terms + ) + + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset_attribute_none(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + em = HERDManagerContainer() + + col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + er.add_ref_termset(file=em, + container=species['Species_Data'], + termset=terms + ) + self.assertEqual(er.keys.data, [('Homo sapiens',)]) + self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) + self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) + + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset_data_object_list(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + em = HERDManagerContainer() + + col1 = VectorData(name='Homo sapiens', + description='species from NCBI and Ensemble', + data=['Homo sapiens']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + er.add_ref_termset(file=em, + container=species, + attribute='colnames', + termset=terms + ) + self.assertEqual(er.keys.data, [('Homo sapiens',)]) + self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) + self.assertEqual(er.objects.data, [(0, species.object_id, 'DynamicTable', 'colnames', '')]) + + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def 
test_add_ref_termset_bulk(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + em = HERDManagerContainer() + + col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens', 'Mus musculus']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + er.add_ref_termset(file=em, + container=species, + attribute='Species_Data', + termset=terms + ) + self.assertEqual(er.keys.data, [('Homo sapiens',), ('Mus musculus',)]) + self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606'), + ('NCBI_TAXON:10090', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=10090')]) + self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) + + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset_missing_terms(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + em = HERDManagerContainer() + + col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens', 'missing_term']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + missing_terms = er.add_ref_termset(file=em, + container=species, + attribute='Species_Data', + termset=terms + ) + self.assertEqual(er.keys.data, [('Homo sapiens',)]) + self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) + self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) + self.assertEqual(missing_terms, {'missing_terms': ['missing_term']}) + + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset_missing_file_error(self): + terms = 
TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + + col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + with self.assertRaises(ValueError): + er.add_ref_termset( + container=species, + attribute='Species_Data', + termset=terms + ) + def test_get_file_from_container(self): file = HERDManagerContainer(name='file') container = Container(name='name') @@ -826,42 +988,21 @@ def test_object_key_existing_key_new_object(self): entity_uri='entity_uri2') self.assertEqual(er.object_keys.data, [(0, 0), (1, 0)]) - def test_object_key_existing_key_new_object_error(self): + def test_reuse_key_string(self): + # With the key and entity existing, the EntityKeyTable should not have duplicates er = HERD() - data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], - dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + data_1 = Data(name='data_name', + data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0), ('mouse', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) er.add_ref(file=HERDManagerContainer(name='file'), container=data_1, key='Mus musculus', entity_id='NCBI:txid10090', entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') - key = er._add_key('key') - with self.assertRaises(ValueError): - er.add_ref(file=HERDManagerContainer(name='file'), - container=data_1, - key=key, - entity_id='entity1', - entity_uri='entity_uri1') - - def test_reuse_key_reuse_entity(self): - # With the key and entity existing, the EntityKeyTable should not have duplicates - er = HERD() - data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], - dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) - - data_2 = Data(name='data_name', data=np.array([('Mus 
musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], - dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) - er.add_ref(file=HERDManagerContainer(name='file'), container=data_1, key='Mus musculus', - entity_id='NCBI:txid10090', - entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') - existing_key = er.get_key('Mus musculus') - er.add_ref(file=HERDManagerContainer(name='file'), - container=data_2, - key=existing_key, entity_id='NCBI:txid10090') self.assertEqual(er.entity_keys.data, [(0, 0)]) @@ -922,7 +1063,7 @@ def test_entity_uri_error(self): key='Mus musculus', entity_id='NCBI:txid10090') - def test_entity_uri_reuse_error(self): + def test_entity_uri_warning(self): er = HERD() data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) @@ -936,7 +1077,7 @@ def test_entity_uri_reuse_error(self): entity_id='NCBI:txid10090', entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') existing_key = er.get_key('Mus musculus') - with self.assertRaises(ValueError): + with self.assertWarns(Warning): er.add_ref(file=HERDManagerContainer(name='file'), container=data_2, key=existing_key, @@ -963,32 +1104,32 @@ def test_key_without_entity_error(self): def test_check_object_field_add(self): er = HERD() data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) - er._check_object_field(file=HERDManagerContainer(name='file'), + file = HERDManagerContainer(name='file') + _dict = er._check_object_field(file=file, container=data, relative_path='', field='') - - self.assertEqual(er.objects.data, [(0, data.object_id, 'Data', '', '')]) + expected = {'file_object_id': file.object_id, + 'files_idx': None, + 'container': data, + 'relative_path': '', + 'field': ''} + self.assertEqual(_dict, expected) def test_check_object_field_multi_files(self): er = HERD() data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) 
file = HERDManagerContainer(name='file') - - er._check_object_field(file=file, container=data, relative_path='', field='') + er._add_file(file.object_id) er._add_file(file.object_id) - data2 = Data(name="species", data=['Homo sapiens', 'Mus musculus']) with self.assertRaises(ValueError): - er._check_object_field(file=file, container=data2, relative_path='', field='') + er._check_object_field(file=file, container=data, relative_path='', field='') def test_check_object_field_multi_error(self): er = HERD() data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) - er._check_object_field(file=HERDManagerContainer(name='file'), - container=data, - relative_path='', - field='') + er._add_object(files_idx=0, container=data, relative_path='', field='') er._add_object(files_idx=0, container=data, relative_path='', field='') with self.assertRaises(ValueError): er._check_object_field(file=HERDManagerContainer(name='file'), @@ -1063,14 +1204,6 @@ def test_add_ref_compound_data(self): self.assertEqual(er.entities.data, [('NCBI:txid10090', 'entity_0_uri')]) self.assertEqual(er.objects.data, [(0, data.object_id, 'Data', '', 'species')]) - def test_roundtrip(self): - read_container = self.roundtripContainer() - pd.testing.assert_frame_equal(read_container.to_dataframe(), self.container.to_dataframe()) - - def test_roundtrip_export(self): - read_container = self.roundtripExportContainer() - pd.testing.assert_frame_equal(read_container.to_dataframe(), self.container.to_dataframe()) - class TestHERDNestedAttributes(TestCase):