From e77132c92b88c795eac4eefddbd902dc9f7ba22b Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Wed, 18 Oct 2023 15:02:58 -0700 Subject: [PATCH 01/40] HERD Updates --- src/hdmf/backends/io.py | 4 ++-- src/hdmf/common/resources.py | 2 +- tests/unit/common/test_resources.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/hdmf/backends/io.py b/src/hdmf/backends/io.py index 3d01c388b..4cd68e078 100644 --- a/src/hdmf/backends/io.py +++ b/src/hdmf/backends/io.py @@ -89,8 +89,8 @@ def write(self, **kwargs): from hdmf.common import HERD herd = HERD(type_map=self.manager.type_map) - # add_ref_term_set to search for and resolve the TermSetWrapper - herd.add_ref_term_set(container) # container would be the NWBFile + # add_ref_container to search for and resolve the TermSetWrapper + herd.add_ref_container(container) # container would be the NWBFile # write HERD herd.to_zip(path=self.herd_path) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index faead635f..4a492428f 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -437,7 +437,7 @@ def __check_termset_wrapper(self, **kwargs): @docval({'name': 'root_container', 'type': HERDManager, 'doc': 'The root container or file containing objects with a TermSet.'}) - def add_ref_term_set(self, **kwargs): + def add_ref_container(self, **kwargs): """ Method to search through the root_container for all instances of TermSet. Currently, only datasets are supported. By using a TermSet, the data comes validated diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py index 796f75db4..1f2d2afbf 100644 --- a/tests/unit/common/test_resources.py +++ b/tests/unit/common/test_resources.py @@ -289,7 +289,7 @@ def test_check_termset_wrapper(self): self.assertTrue(isinstance(ret[0][2], TermSetWrapper)) @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") - def test_add_ref_termset_data(self): + def test_add_ref_container_data(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') er = HERD() em = HERDManagerContainer() @@ -305,14 +305,14 @@ def test_add_ref_termset_data(self): species.parent = em - er.add_ref_term_set(root_container=em) + er.add_ref_container(root_container=em) self.assertEqual(er.keys.data, [('Homo sapiens',)]) self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") - def test_add_ref_termset_attr(self): + def test_add_ref_container_attr(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') er = HERD() em = HERDManagerContainer() @@ -328,7 +328,7 @@ def test_add_ref_termset_attr(self): species.parent = em - er.add_ref_term_set(root_container=em) + er.add_ref_container(root_container=em) self.assertEqual(er.keys.data, [('Homo sapiens',)]) self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) From 43877e485836f4a68810ba6f3c6b7d22df0b4e93 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Thu, 19 Oct 2023 07:29:56 -0700 Subject: [PATCH 02/40] rework in progress --- src/hdmf/common/resources2.py | 920 ++++++++++++++++++++++++++++++++++ 1 file changed, 920 insertions(+) create mode 100644 src/hdmf/common/resources2.py diff --git a/src/hdmf/common/resources2.py b/src/hdmf/common/resources2.py new file mode 100644 index 000000000..3a6b9a0d1 --- /dev/null +++ b/src/hdmf/common/resources2.py @@ -0,0 +1,920 @@ +import pandas as pd +import numpy as np +from . import register_class, EXP_NAMESPACE +from . import get_type_map +from ..container import Table, Row, Container, Data, AbstractContainer, HERDManager +from ..utils import docval, popargs, AllowPositional +from ..build import TypeMap +from ..term_set import TermSetWrapper +from glob import glob +import os +import zipfile +from collections import namedtuple + + +class KeyTable(Table): + """ + A table for storing keys used to reference external resources. + """ + + __defaultname__ = 'keys' + + __columns__ = ( + {'name': 'key', 'type': str, + 'doc': 'The user key that maps to the resource term / registry symbol.'}, + ) + + +class Key(Row): + """ + A Row class for representing rows in the KeyTable. + """ + + __table__ = KeyTable + + +class EntityTable(Table): + """ + A table for storing the external resources a key refers to. + """ + + __defaultname__ = 'entities' + + __columns__ = ( + {'name': 'entity_id', 'type': str, + 'doc': 'The unique ID for the resource term / registry symbol.'}, + {'name': 'entity_uri', 'type': str, + 'doc': 'The URI for the resource term / registry symbol.'}, + ) + + +class Entity(Row): + """ + A Row class for representing rows in the EntityTable. + """ + + __table__ = EntityTable + + +class FileTable(Table): + """ + A table for storing file ids used in external resources. + """ + + __defaultname__ = 'files' + + __columns__ = ( + {'name': 'file_object_id', 'type': str, + 'doc': 'The file id of the file that contains the object'}, + ) + + +class File(Row): + """ + A Row class for representing rows in the FileTable. + """ + + __table__ = FileTable + + +class ObjectTable(Table): + """ + A table for storing objects (i.e. Containers) that contain keys that refer to external resources. + """ + + __defaultname__ = 'objects' + + __columns__ = ( + {'name': 'files_idx', 'type': int, + 'doc': 'The row idx for the file_object_id in FileTable containing the object.'}, + {'name': 'object_id', 'type': str, + 'doc': 'The object ID for the Container/Data.'}, + {'name': 'object_type', 'type': str, + 'doc': 'The type of the object. This is also the parent in relative_path.'}, + {'name': 'relative_path', 'type': str, + 'doc': ('The relative_path of the attribute of the object that uses ', + 'an external resource reference key. Use an empty string if not applicable.')}, + {'name': 'field', 'type': str, + 'doc': ('The field of the compound data type using an external resource. ' + 'Use an empty string if not applicable.')} + ) + + +class Object(Row): + """ + A Row class for representing rows in the ObjectTable. + """ + + __table__ = ObjectTable + + +class ObjectKeyTable(Table): + """ + A table for identifying which keys are used by which objects for referring to external resources. + """ + + __defaultname__ = 'object_keys' + + __columns__ = ( + {'name': 'objects_idx', 'type': (int, Object), + 'doc': 'The index into the objects table for the Object that uses the Key.'}, + {'name': 'keys_idx', 'type': (int, Key), + 'doc': 'The index into the keys table that is used to make an external resource reference.'} + ) + + +class EntityKeyTable(Table): + """ + A table for identifying which entities are used by which keys for referring to external resources. + """ + + __defaultname__ = 'entity_keys' + + __columns__ = ( + {'name': 'entities_idx', 'type': (int, Entity), + 'doc': 'The index into the EntityTable for the Entity that associated with the Key.'}, + {'name': 'keys_idx', 'type': (int, Key), + 'doc': 'The index into the KeyTable that is used to make an external resource reference.'} + ) + + +class EntityKey(Row): + """ + A Row class for representing rows in the EntityKeyTable. + """ + + __table__ = EntityKeyTable + + +class ObjectKey(Row): + """ + A Row class for representing rows in the ObjectKeyTable. + """ + + __table__ = ObjectKeyTable + + +@register_class('HERD', EXP_NAMESPACE) +class HERD(Container): + """ + HDMF External Resources Data Structure. + A table for mapping user terms (i.e. keys) to resource entities. + """ + + __fields__ = ( + {'name': 'keys', 'child': True}, + {'name': 'files', 'child': True}, + {'name': 'objects', 'child': True}, + {'name': 'object_keys', 'child': True}, + {'name': 'entity_keys', 'child': True}, + {'name': 'entities', 'child': True}, + ) + + @docval({'name': 'keys', 'type': KeyTable, 'default': None, + 'doc': 'The table storing user keys for referencing resources.'}, + {'name': 'files', 'type': FileTable, 'default': None, + 'doc': 'The table for storing file ids used in external resources.'}, + {'name': 'entities', 'type': EntityTable, 'default': None, + 'doc': 'The table storing entity information.'}, + {'name': 'objects', 'type': ObjectTable, 'default': None, + 'doc': 'The table storing object information.'}, + {'name': 'object_keys', 'type': ObjectKeyTable, 'default': None, + 'doc': 'The table storing object-key relationships.'}, + {'name': 'entity_keys', 'type': EntityKeyTable, 'default': None, + 'doc': 'The table storing entity-key relationships.'}, + {'name': 'type_map', 'type': TypeMap, 'default': None, + 'doc': 'The type map. If None is provided, the HDMF-common type map will be used.'}, + allow_positional=AllowPositional.WARNING) + def __init__(self, **kwargs): + name = 'external_resources' + super().__init__(name) + self.keys = kwargs['keys'] or KeyTable() + self.files = kwargs['files'] or FileTable() + self.entities = kwargs['entities'] or EntityTable() + self.objects = kwargs['objects'] or ObjectTable() + self.object_keys = kwargs['object_keys'] or ObjectKeyTable() + self.entity_keys = kwargs['entity_keys'] or EntityKeyTable() + self.type_map = kwargs['type_map'] or get_type_map() + + @staticmethod + def assert_external_resources_equal(left, right, check_dtype=True): + """ + Compare that the keys, resources, entities, objects, and object_keys tables match + + :param left: HERD object to compare with right + :param right: HERD object to compare with left + :param check_dtype: Enforce strict checking of dtypes. Dtypes may be different + for example for ids, where depending on how the data was saved + ids may change from int64 to int32. (Default: True) + :returns: The function returns True if all values match. If mismatches are found, + AssertionError will be raised. + :raises AssertionError: Raised if any differences are found. The function collects + all differences into a single error so that the assertion will indicate + all found differences. + """ + errors = [] + try: + pd.testing.assert_frame_equal(left.keys.to_dataframe(), + right.keys.to_dataframe(), + check_dtype=check_dtype) + except AssertionError as e: + errors.append(e) + try: + pd.testing.assert_frame_equal(left.files.to_dataframe(), + right.files.to_dataframe(), + check_dtype=check_dtype) + except AssertionError as e: + errors.append(e) + try: + pd.testing.assert_frame_equal(left.objects.to_dataframe(), + right.objects.to_dataframe(), + check_dtype=check_dtype) + except AssertionError as e: + errors.append(e) + try: + pd.testing.assert_frame_equal(left.entities.to_dataframe(), + right.entities.to_dataframe(), + check_dtype=check_dtype) + except AssertionError as e: + errors.append(e) + try: + pd.testing.assert_frame_equal(left.object_keys.to_dataframe(), + right.object_keys.to_dataframe(), + check_dtype=check_dtype) + except AssertionError as e: + errors.append(e) + if len(errors) > 0: + msg = ''.join(str(e)+"\n\n" for e in errors) + raise AssertionError(msg) + return True + + @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the key to be added.'}) + def _add_key(self, **kwargs): + """ + Add a key to be used for making references to external resources. + + It is possible to use the same *key_name* to refer to different resources so long as the *key_name* is not + used within the same object, relative_path, and field. To do so, this method must be called for the + two different resources. + + The returned Key objects must be managed by the caller so as to be appropriately passed to subsequent calls + to methods for storing information about the different resources. + """ + key = kwargs['key_name'] + return Key(key, table=self.keys) + + @docval({'name': 'file_object_id', 'type': str, 'doc': 'The id of the file'}) + def _add_file(self, **kwargs): + """ + Add a file to be used for making references to external resources. + + This is optional when working in HDMF. + """ + file_object_id = kwargs['file_object_id'] + return File(file_object_id, table=self.files) + + @docval({'name': 'entity_id', 'type': str, 'doc': 'The unique entity id.'}, + {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the entity.'}) + def _add_entity(self, **kwargs): + """ + Add an entity that will be referenced to using keys specified in HERD.entity_keys. + """ + entity_id = kwargs['entity_id'] + entity_uri = kwargs['entity_uri'] + entity = Entity( entity_id, entity_uri, table=self.entities) + return entity + + @docval({'name': 'container', 'type': (str, AbstractContainer), + 'doc': 'The Container/Data object to add or the object id of the Container/Data object to add.'}, + {'name': 'files_idx', 'type': int, + 'doc': 'The file_object_id row idx.'}, + {'name': 'object_type', 'type': str, 'default': None, + 'doc': ('The type of the object. This is also the parent in relative_path. If omitted, ' + 'the name of the container class is used.')}, + {'name': 'relative_path', 'type': str, + 'doc': ('The relative_path of the attribute of the object that uses ', + 'an external resource reference key. Use an empty string if not applicable.')}, + {'name': 'field', 'type': str, 'default': '', + 'doc': ('The field of the compound data type using an external resource.')}) + def _add_object(self, **kwargs): + """ + Add an object that references an external resource. + """ + files_idx, container, object_type, relative_path, field = popargs('files_idx', + 'container', + 'object_type', + 'relative_path', + 'field', kwargs) + + if object_type is None: + object_type = container.__class__.__name__ + + if isinstance(container, AbstractContainer): + container = container.object_id + obj = Object(files_idx, container, object_type, relative_path, field, table=self.objects) + return obj + + @docval({'name': 'obj', 'type': (int, Object), 'doc': 'The Object that uses the Key.'}, + {'name': 'key', 'type': (int, Key), 'doc': 'The Key that the Object uses.'}) + def _add_object_key(self, **kwargs): + """ + Specify that an object (i.e. container and relative_path) uses a key to reference + an external resource. + """ + obj, key = popargs('obj', 'key', kwargs) + return ObjectKey(obj, key, table=self.object_keys) + + @docval({'name': 'entity', 'type': (int, Entity), 'doc': 'The Entity associated with the Key.'}, + {'name': 'key', 'type': (int, Key), 'doc': 'The Key that the connected to the Entity.'}) + def _add_entity_key(self, **kwargs): + """ + Add entity-key relationship to the EntityKeyTable. + """ + entity, key = popargs('entity', 'key', kwargs) + return EntityKey(entity, key, table=self.entity_keys) + + @docval({'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.'}, + {'name': 'container', 'type': AbstractContainer, + 'doc': ('The Container/Data object that uses the key or ' + 'the object id for the Container/Data object that uses the key.')}, + {'name': 'relative_path', 'type': str, + 'doc': ('The relative_path of the attribute of the object that uses ', + 'an external resource reference key. Use an empty string if not applicable.'), + 'default': ''}, + {'name': 'field', 'type': str, 'default': '', + 'doc': ('The field of the compound data type using an external resource.')}, + {'name': 'create', 'type': bool, 'default': True}) + def _check_object_field(self, **kwargs): + """ + Check if a container, relative path, and field have been added. + + The container can be either an object_id string or an AbstractContainer. + + If the container, relative_path, and field have not been added, add them + and return the corresponding Object. Otherwise, just return the Object. + """ + file = kwargs['file'] + container = kwargs['container'] + relative_path = kwargs['relative_path'] + field = kwargs['field'] + create = kwargs['create'] + file_object_id = file.object_id + files_idx = self.files.which(file_object_id=file_object_id) + + if len(files_idx) > 1: + raise ValueError("Found multiple instances of the same file.") + elif len(files_idx) == 1: + files_idx = files_idx[0] + else: + self._add_file(file_object_id) + files_idx = self.files.which(file_object_id=file_object_id)[0] + + objecttable_idx = self.objects.which(object_id=container.object_id) + + if len(objecttable_idx) > 0: + relative_path_idx = self.objects.which(relative_path=relative_path) + field_idx = self.objects.which(field=field) + objecttable_idx = list(set(objecttable_idx) & set(relative_path_idx) & set(field_idx)) + if len(objecttable_idx) == 1: + return self.objects.row[objecttable_idx[0]] + elif len(objecttable_idx) == 0 and create: + return self._add_object(files_idx=files_idx, container=container, relative_path=relative_path, field=field) + elif len(objecttable_idx) == 0 and not create: + raise ValueError("Object not in Object Table.") + else: + raise ValueError("Found multiple instances of the same object id, relative path, " + "and field in objects table.") + + @docval({'name': 'container', 'type': (str, AbstractContainer), + 'doc': ('The Container/Data object that uses the key or ' + 'the object id for the Container/Data object that uses the key.')}) + def _get_file_from_container(self, **kwargs): + """ + Method to retrieve a file associated with the container in the case a file is not provided. + """ + container = kwargs['container'] + + if isinstance(container, HERDManager): + file = container + return file + else: + parent = container.parent + if parent is not None: + while parent is not None: + if isinstance(parent, HERDManager): + file = parent + return file + else: + parent = parent.parent + else: + msg = 'Could not find file. Add container to the file.' + raise ValueError(msg) + + @docval({'name': 'objects', 'type': list, + 'doc': 'List of objects to check for TermSetWrapper within the fields.'}) + def __check_termset_wrapper(self, **kwargs): + """ + Takes a list of objects and checks the fields for TermSetWrapper. + + wrapped_obj = namedtuple('wrapped_obj', ['object', 'attribute', 'wrapper']) + :return: [wrapped_obj(object1, attribute_name1, wrapper1), ...] + """ + objects = kwargs['objects'] + + ret = [] # list to be returned with the objects, attributes and corresponding termsets + + for obj in objects: + # Get all the fields, parse out the methods and internal variables + obj_fields = [a for a in dir(obj) if not a.startswith('_') and not callable(getattr(obj, a))] + for attribute in obj_fields: + attr = getattr(obj, attribute) + if isinstance(attr, TermSetWrapper): + # Search objects that are wrapped + wrapped_obj = namedtuple('wrapped_obj', ['object', 'attribute', 'wrapper']) + ret.append(wrapped_obj(obj, attribute, attr)) + + return ret + + @docval({'name': 'root_container', 'type': HERDManager, + 'doc': 'The root container or file containing objects with a TermSet.'}) + def add_ref_container(self, **kwargs): + """ + Method to search through the root_container for all instances of TermSet. + Currently, only datasets are supported. By using a TermSet, the data comes validated + and can use the permissible values within the set to populate HERD. + """ + root_container = kwargs['root_container'] + + all_objects = root_container.all_children() # list of child objects and the container itself + + add_ref_items = self.__check_termset_wrapper(objects=all_objects) + for ref in add_ref_items: + container, attr_name, wrapper = ref + if isinstance(wrapper.value, (list, np.ndarray, tuple)): + values = wrapper.value + else: + # create list for single values (edge-case) for a simple iteration downstream + values = [wrapper.value] + for term in values: + term_info = wrapper.termset[term] + entity_id = term_info[0] + entity_uri = term_info[2] + self.add_ref(file=root_container, + container=container, + attribute=attr_name, + key=term, + entity_id=entity_id, + entity_uri=entity_uri) + + @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'}, + {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.', + 'default': None}, + {'name': 'container', 'type': (str, AbstractContainer), 'default': None, + 'doc': ('The Container/Data object that uses the key or ' + 'the object id for the Container/Data object that uses the key.')}, + {'name': 'relative_path', 'type': str, + 'doc': ('The relative_path of the attribute of the object that uses ', + 'an external resource reference key. Use an empty string if not applicable.'), + 'default': ''}, + {'name': 'field', 'type': str, 'default': '', + 'doc': ('The field of the compound data type using an external resource.')}) + def get_key(self, **kwargs): + """ + Return a Key. + + If container, relative_path, and field are provided, the Key that corresponds to the given name of the key + for the given container, relative_path, and field is returned. + """ + key_name, container, relative_path, field = popargs('key_name', 'container', 'relative_path', 'field', kwargs) + key_idx_matches = self.keys.which(key=key_name) + + file = kwargs['file'] + + if container is not None: + if file is None: + file = self._get_file_from_container(container=container) + # if same key is used multiple times, determine + # which instance based on the Container + object_field = self._check_object_field(file=file, + container=container, + relative_path=relative_path, + field=field) + for row_idx in self.object_keys.which(objects_idx=object_field.idx): + key_idx = self.object_keys['keys_idx', row_idx] + if key_idx in key_idx_matches: + return self.keys.row[key_idx] + msg = "No key found with that container." + raise ValueError(msg) + else: + if len(key_idx_matches) == 0: + # the key has never been used before + raise ValueError("key '%s' does not exist" % key_name) + elif len(key_idx_matches) > 1: + msg = "There are more than one key with that name. Please search with additional information." + raise ValueError(msg) + else: + return self.keys.row[key_idx_matches[0]] + + @docval({'name': 'entity_id', 'type': str, 'doc': 'The ID for the identifier at the resource.'}) + def get_entity(self, **kwargs): + entity_id = kwargs['entity_id'] + entity = self.entities.which(entity_id=entity_id) + if len(entity)>0: + return self.entities.row[entity[0]] + else: + return None + + @docval({'name': 'container', 'type': (str, AbstractContainer), 'default': None, + 'doc': ('The Container/Data object that uses the key or ' + 'the object_id for the Container/Data object that uses the key.')}, + {'name': 'attribute', 'type': str, + 'doc': 'The attribute of the container for the external reference.', 'default': None}, + {'name': 'field', 'type': str, 'default': '', + 'doc': ('The field of the compound data type using an external resource.')}, + {'name': 'key', 'type': (str, Key), 'default': None, + 'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'}, + {'name': 'entity_id', 'type': str, 'doc': 'The identifier for the entity at the resource.'}, + {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the identifier at the resource.', 'default': None}, + {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.', + 'default': None}, + ) + def add_ref(self, **kwargs): + """ + Add information about an external reference used in this file. + + It is possible to use the same name of the key to refer to different resources + so long as the name of the key is not used within the same object, relative_path, and + field combination. This method does not support such functionality by default. + """ + ############################################################### + container = kwargs['container'] + attribute = kwargs['attribute'] + if isinstance(container, Data): + if attribute == 'data': + attribute = None + key = kwargs['key'] + field = kwargs['field'] + entity_id = kwargs['entity_id'] + entity_uri = kwargs['entity_uri'] + file = kwargs['file'] + + ################## + # Set File if None + ################## + if file is None: + file = self._get_file_from_container(container=container) + + ############## + # Validate Key + ############## + add_key = False + add_object_key = False + if not isinstance(key, Key): + key_idx_matches = self.keys.which(key=key) + # if same key is used multiple times, determine + # which instance based on the Container + # """ + # TODO: Resolve key if in same object + # """ + for row_idx in self.object_keys.which(objects_idx=object_field.idx): + key_idx = self.object_keys['keys_idx', row_idx] + if key_idx in key_idx_matches: + msg = "Use Key Object when referencing an existing (container, relative_path, key)" + raise ValueError(msg) + + # key = self._add_key(key) TODO + # self._add_object_key(object_field, key) TODO + add_key = True + add_object_key = True + else: + # Check to see that the existing key is being used with the object. + # If true, do nothing. If false, create a new obj/key relationship + # in the ObjectKeyTable + key_idx = key.idx + object_key_row_idx = self.object_keys.which(keys_idx=key_idx) + if len(object_key_row_idx)!=0: + obj_key_check = False + for row_idx in object_key_row_idx: + obj_idx = self.object_keys['objects_idx', row_idx] + if obj_idx == object_field.idx: + obj_key_check = True + if not obj_key_check: + # self._add_object_key(object_field, key) # TODO + add_object_key = True + else: + msg = "Cannot find Key. Create new Key with string." + raise ValueError(msg) + + ################# + # Validate Entity + ################# + add_entity_key = False + add_entity = False + + entity = self.get_entity(entity_id=entity_id) + if entity is None: + if entity_uri is None: + msg = 'New entities must have an entity_uri.' + raise ValueError(msg) + # entity = self._add_entity(entity_id, entity_uri) TODO + # self._add_entity_key(entity, key) TODO + add_entity = True + add_entity_key = True + else: + if entity_uri is not None: + msg = 'If you plan on reusing an entity, then entity_uri parameter must be None.' + raise ValueError(msg) # TODO: Change to Warn that the uri provided is being ignored + + # check for entity-key relationship in EntityKeyTable + key_idx = key.idx + entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) + if len(entity_key_row_idx)!=0: + # this means there exists rows where the key is in the EntityKeyTable + entity_key_check = False + for row_idx in entity_key_row_idx: + entity_idx = self.entity_keys['entities_idx', row_idx] + if entity_idx == entity.idx: + entity_key_check = True + # this means there is already a key-entity relationship recorded + if not entity_key_check: + # this means that though the key is there, there is not key-entity relationship + # a.k.a add it now + # self._add_entity_key(entity, key) TODO + add_entity_key = True + else: + # this means that specific key is not in the EntityKeyTable, so add it and establish + # the relationship with the entity + # self._add_entity_key(entity, key) TODO + add_entity_key = True + + ################# + # Validate Object + ################# + + + ############### + # Populate HERD + ############### + + + + @docval({'name': 'object_type', 'type': str, + 'doc': 'The type of the object. This is also the parent in relative_path.'}, + {'name': 'relative_path', 'type': str, + 'doc': ('The relative_path of the attribute of the object that uses ', + 'an external resource reference key. Use an empty string if not applicable.'), + 'default': ''}, + {'name': 'field', 'type': str, 'default': '', + 'doc': ('The field of the compound data type using an external resource.')}, + {'name': 'all_instances', 'type': bool, 'default': False, + 'doc': ('The bool to return a dataframe with all instances of the object_type.', + 'If True, relative_path and field inputs will be ignored.')}) + def get_object_type(self, **kwargs): + """ + Get all entities/resources associated with an object_type. + """ + object_type = kwargs['object_type'] + relative_path = kwargs['relative_path'] + field = kwargs['field'] + all_instances = kwargs['all_instances'] + + df = self.to_dataframe() + + if all_instances: + df = df.loc[df['object_type'] == object_type] + else: + df = df.loc[(df['object_type'] == object_type) + & (df['relative_path'] == relative_path) + & (df['field'] == field)] + return df + + @docval({'name': 'file', 'type': HERDManager, 'doc': 'The file.', + 'default': None}, + {'name': 'container', 'type': (str, AbstractContainer), + 'doc': 'The Container/data object that is linked to resources/entities.'}, + {'name': 'attribute', 'type': str, + 'doc': 'The attribute of the container for the external reference.', 'default': None}, + {'name': 'relative_path', 'type': str, + 'doc': ('The relative_path of the attribute of the object that uses ', + 'an external resource reference key. Use an empty string if not applicable.'), + 'default': ''}, + {'name': 'field', 'type': str, 'default': '', + 'doc': ('The field of the compound data type using an external resource.')}) + def get_object_entities(self, **kwargs): + """ + Get all entities/resources associated with an object. + """ + file = kwargs['file'] + container = kwargs['container'] + attribute = kwargs['attribute'] + relative_path = kwargs['relative_path'] + field = kwargs['field'] + + if file is None: + file = self._get_file_from_container(container=container) + + keys = [] + entities = [] + if attribute is None: + object_field = self._check_object_field(file=file, + container=container, + relative_path=relative_path, + field=field, + create=False) + else: + object_field = self._check_object_field(file=file, + container=container[attribute], + relative_path=relative_path, + field=field, + create=False) + # Find all keys associated with the object + for row_idx in self.object_keys.which(objects_idx=object_field.idx): + keys.append(self.object_keys['keys_idx', row_idx]) + # Find all the entities/resources for each key. + for key_idx in keys: + entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) + for row_idx in entity_key_row_idx: + entity_idx = self.entity_keys['entities_idx', row_idx] + entities.append(self.entities.__getitem__(entity_idx)) + df = pd.DataFrame(entities, columns=['entity_id', 'entity_uri']) + return df + + @docval({'name': 'use_categories', 'type': bool, 'default': False, + 'doc': 'Use a multi-index on the columns to indicate which category each column belongs to.'}, + rtype=pd.DataFrame, returns='A DataFrame with all data merged into a flat, denormalized table.') + def to_dataframe(self, **kwargs): + """ + Convert the data from the keys, resources, entities, objects, and object_keys tables + to a single joint dataframe. I.e., here data is being denormalized, e.g., keys that + are used across multiple entities or objects will duplicated across the corresponding + rows. + + Returns: :py:class:`~pandas.DataFrame` with all data merged into a single, flat, denormalized table. + + """ + use_categories = popargs('use_categories', kwargs) + # Step 1: Combine the entities, keys, and entity_keys table + ent_key_df = self.entity_keys.to_dataframe() + entities_mapped_df = self.entities.to_dataframe().iloc[ent_key_df['entities_idx']].reset_index(drop=True) + keys_mapped_df = self.keys.to_dataframe().iloc[ent_key_df['keys_idx']].reset_index(drop=True) + ent_key_df = pd.concat(objs=[ent_key_df, entities_mapped_df, keys_mapped_df], + axis=1, + verify_integrity=False) + # Step 2: Combine the the files, object_keys and objects tables + object_keys_df = self.object_keys.to_dataframe() + objects_mapped_df = self.objects.to_dataframe().iloc[object_keys_df['objects_idx']].reset_index(drop=True) + object_keys_df = pd.concat(objs=[object_keys_df, objects_mapped_df], + axis=1, + verify_integrity=False) + files_df = self.files.to_dataframe().iloc[object_keys_df['files_idx']].reset_index(drop=True) + file_object_object_key_df = pd.concat(objs=[object_keys_df, files_df], + axis=1, + verify_integrity=False) + # Step 3: merge the combined entities_df and object_keys_df DataFrames + result_df = pd.concat( + # Create for each row in the objects_keys table a DataFrame with all corresponding data from all tables + objs=[pd.merge( + # Find all entities that correspond to the row i of the object_keys_table + ent_key_df[ent_key_df['keys_idx'] == object_keys_df['keys_idx'].iloc[i]].reset_index(drop=True), + # Get a DataFrame for row i of the objects_keys_table + file_object_object_key_df.iloc[[i, ]], + # Merge the entities and object_keys on the keys_idx column so that the values from the single + # object_keys_table row are copied across all corresponding rows in the entities table + on='keys_idx') + for i in range(len(object_keys_df))], + # Concatenate the rows of the objs + axis=0, + verify_integrity=False) + # Step 4: Clean up the index and sort columns by table type and name + result_df.reset_index(inplace=True, drop=True) + # ADD files + file_id_col = [] + for idx in result_df['files_idx']: + file_id_val = self.files.to_dataframe().iloc[int(idx)]['file_object_id'] + file_id_col.append(file_id_val) + + result_df['file_object_id'] = file_id_col + column_labels = [('files', 'file_object_id'), + ('objects', 'objects_idx'), ('objects', 'object_id'), ('objects', 'files_idx'), + ('objects', 'object_type'), ('objects', 'relative_path'), ('objects', 'field'), + ('keys', 'keys_idx'), ('keys', 'key'), + ('entities', 'entities_idx'), ('entities', 'entity_id'), ('entities', 'entity_uri')] + # sort the columns based on our custom order + result_df = result_df.reindex(labels=[c[1] for c in column_labels], + axis=1) + result_df = result_df.astype({'keys_idx': 'uint32', + 'objects_idx': 'uint32', + 'files_idx': 'uint32', + 'entities_idx': 'uint32'}) + # Add the categories if requested + if use_categories: + result_df.columns = pd.MultiIndex.from_tuples(column_labels) + # return the result + return result_df + + @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'}) + def to_zip(self, **kwargs): + """ + Write the tables in HERD to zipped tsv files. + """ + zip_file = kwargs['path'] + directory = os.path.dirname(zip_file) + + files = [os.path.join(directory, child.name)+'.tsv' for child in self.children] + for i in range(len(self.children)): + df = self.children[i].to_dataframe() + df.to_csv(files[i], sep='\t', index=False) + + with zipfile.ZipFile(zip_file, 'w') as zipF: + for file in files: + zipF.write(file) + + # remove tsv files + for file in files: + os.remove(file) + + @classmethod + @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'}) + def from_zip(cls, **kwargs): + """ + Method to read in zipped tsv files to populate HERD. + """ + zip_file = kwargs['path'] + directory = os.path.dirname(zip_file) + + with zipfile.ZipFile(zip_file, 'r') as zip: + zip.extractall(directory) + tsv_paths = glob(directory+'/*') + + for file in tsv_paths: + file_name = os.path.basename(file) + if file_name == 'files.tsv': + files_df = pd.read_csv(file, sep='\t').replace(np.nan, '') + files = FileTable().from_dataframe(df=files_df, name='files', extra_ok=False) + os.remove(file) + continue + if file_name == 'keys.tsv': + keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '') + keys = KeyTable().from_dataframe(df=keys_df, name='keys', extra_ok=False) + os.remove(file) + continue + if file_name == 'entities.tsv': + entities_df = pd.read_csv(file, sep='\t').replace(np.nan, '') + entities = EntityTable().from_dataframe(df=entities_df, name='entities', extra_ok=False) + os.remove(file) + continue + if file_name == 'objects.tsv': + objects_df = pd.read_csv(file, sep='\t').replace(np.nan, '') + objects = ObjectTable().from_dataframe(df=objects_df, name='objects', extra_ok=False) + os.remove(file) + continue + if file_name == 'object_keys.tsv': + object_keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '') + object_keys = ObjectKeyTable().from_dataframe(df=object_keys_df, name='object_keys', extra_ok=False) + os.remove(file) + continue + if file_name == 'entity_keys.tsv': + ent_key_df = pd.read_csv(file, sep='\t').replace(np.nan, '') + entity_keys = EntityKeyTable().from_dataframe(df=ent_key_df, name='entity_keys', extra_ok=False) + os.remove(file) + continue + + # we need to check the idx columns in entities, objects, and object_keys + entity_idx = entity_keys['entities_idx'] + for idx in entity_idx: + if not int(idx) < len(entities): + msg = "Entity Index out of range in EntityTable. Please check for alterations." + raise ValueError(msg) + + files_idx = objects['files_idx'] + for idx in files_idx: + if not int(idx) < len(files): + msg = "File_ID Index out of range in ObjectTable. Please check for alterations." + raise ValueError(msg) + + object_idx = object_keys['objects_idx'] + for idx in object_idx: + if not int(idx) < len(objects): + msg = "Object Index out of range in ObjectKeyTable. Please check for alterations." + raise ValueError(msg) + + keys_idx = object_keys['keys_idx'] + for idx in keys_idx: + if not int(idx) < len(keys): + msg = "Key Index out of range in ObjectKeyTable. Please check for alterations." + raise ValueError(msg) + + keys_idx = entity_keys['keys_idx'] + for idx in keys_idx: + if not int(idx) < len(keys): + msg = "Key Index out of range in EntityKeyTable. Please check for alterations." + raise ValueError(msg) + + + er = HERD(files=files, + keys=keys, + entities=entities, + entity_keys=entity_keys, + objects=objects, + object_keys=object_keys) + return er From 54cbf955f295a4236cb1a4fce6a9b84f6104c607 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Sat, 21 Oct 2023 11:19:44 -0700 Subject: [PATCH 03/40] updated tests/go through for final checks --- src/hdmf/common/resources2.py | 153 ++++++++++++++++++++-------- tests/unit/common/test_resources.py | 82 +++++---------- 2 files changed, 138 insertions(+), 97 deletions(-) diff --git a/src/hdmf/common/resources2.py b/src/hdmf/common/resources2.py index 3a6b9a0d1..b26f7a8bf 100644 --- a/src/hdmf/common/resources2.py +++ b/src/hdmf/common/resources2.py @@ -10,6 +10,7 @@ import os import zipfile from collections import namedtuple +from warnings import warn class KeyTable(Table): @@ -358,16 +359,17 @@ def _check_object_field(self, **kwargs): relative_path = kwargs['relative_path'] field = kwargs['field'] create = kwargs['create'] + file_object_id = file.object_id files_idx = self.files.which(file_object_id=file_object_id) if len(files_idx) > 1: + # It isn't possible for len(files_idx) > 1 without the user directly using _add_file raise ValueError("Found multiple instances of the same file.") elif len(files_idx) == 1: files_idx = files_idx[0] else: - self._add_file(file_object_id) - files_idx = self.files.which(file_object_id=file_object_id)[0] + files_idx = None objecttable_idx = self.objects.which(object_id=container.object_id) @@ -378,10 +380,15 @@ def _check_object_field(self, **kwargs): if len(objecttable_idx) == 1: return self.objects.row[objecttable_idx[0]] elif len(objecttable_idx) == 0 and create: - return self._add_object(files_idx=files_idx, container=container, relative_path=relative_path, field=field) + return {'file_object_id': file_object_id, + 'files_idx': files_idx, + 'container': container, + 'relative_path': relative_path, + 'field': field} elif len(objecttable_idx) == 0 and not create: raise ValueError("Object not in Object Table.") else: + # It isn't possible for this to happen unless the user used _add_object. raise ValueError("Found multiple instances of the same object id, relative path, " "and field in objects table.") @@ -569,41 +576,15 @@ def add_ref(self, **kwargs): ############## add_key = False add_object_key = False + check_object_key = False if not isinstance(key, Key): - key_idx_matches = self.keys.which(key=key) - # if same key is used multiple times, determine - # which instance based on the Container - # """ - # TODO: Resolve key if in same object - # """ - for row_idx in self.object_keys.which(objects_idx=object_field.idx): - key_idx = self.object_keys['keys_idx', row_idx] - if key_idx in key_idx_matches: - msg = "Use Key Object when referencing an existing (container, relative_path, key)" - raise ValueError(msg) - - # key = self._add_key(key) TODO - # self._add_object_key(object_field, key) TODO add_key = True add_object_key = True else: # Check to see that the existing key is being used with the object. # If true, do nothing. If false, create a new obj/key relationship # in the ObjectKeyTable - key_idx = key.idx - object_key_row_idx = self.object_keys.which(keys_idx=key_idx) - if len(object_key_row_idx)!=0: - obj_key_check = False - for row_idx in object_key_row_idx: - obj_idx = self.object_keys['objects_idx', row_idx] - if obj_idx == object_field.idx: - obj_key_check = True - if not obj_key_check: - # self._add_object_key(object_field, key) # TODO - add_object_key = True - else: - msg = "Cannot find Key. Create new Key with string." - raise ValueError(msg) + check_object_key = True ################# # Validate Entity @@ -612,18 +593,108 @@ def add_ref(self, **kwargs): add_entity = False entity = self.get_entity(entity_id=entity_id) + check_entity_key = False if entity is None: if entity_uri is None: msg = 'New entities must have an entity_uri.' raise ValueError(msg) - # entity = self._add_entity(entity_id, entity_uri) TODO - # self._add_entity_key(entity, key) TODO + add_entity = True add_entity_key = True else: + check_entity_key = True + + ################# + # Validate Object + ################# + if attribute is None: # Trivial Case + relative_path = '' + object_field = self._check_object_field(file=file, + container=container, + relative_path=relative_path, + field=field) + else: # DataType Attribute Case + attribute_object = getattr(container, attribute) # returns attribute object + if isinstance(attribute_object, AbstractContainer): + relative_path = '' + object_field = self._check_object_field(file=file, + container=attribute_object, + relative_path=relative_path, + field=field) + else: # Non-DataType Attribute Case: + obj_mapper = self.type_map.get_map(container) + spec = obj_mapper.get_attr_spec(attr_name=attribute) + parent_spec = spec.parent # return the parent spec of the attribute + if parent_spec.data_type is None: + while parent_spec.data_type is None: + parent_spec = parent_spec.parent # find the closest parent with a data_type + parent_cls = self.type_map.get_dt_container_cls(data_type=parent_spec.data_type, autogen=False) + if isinstance(container, parent_cls): + parent = container + # We need to get the path of the spec for relative_path + absolute_path = spec.path + relative_path = absolute_path[absolute_path.find('/')+1:] + object_field = self._check_object_field(file=file, + container=parent, + relative_path=relative_path, + field=field) + else: + msg = 'Container not the nearest data_type' + raise ValueError(msg) + else: + parent = container # container needs to be the parent + absolute_path = spec.path + relative_path = absolute_path[absolute_path.find('/')+1:] + # this regex removes everything prior to the container on the absolute_path + object_field = self._check_object_field(file=file, + container=parent, + relative_path=relative_path, + field=field) + + ############### + # Populate HERD + ############### + if isinstance(object_field, dict): + if object_field['files_idx'] is None: + self._add_file(object_field['file_object_id']) + object_field['files_idx'] = self.files.which(file_object_id=object_field['file_object_id'])[0] + object_field = self._add_object(files_idx=object_field['files_idx'], + container=object_field['container'], + relative_path=object_field['relative_path'], + field=object_field['field']) + + # Since object_field is set, we need to check if + # the key has been associated with that object. + # If so, just reuse the key. + if add_key: + key_exists = False + key_idx_matches = self.keys.which(key=key) + for row_idx in self.object_keys.which(objects_idx=object_field.idx): + key_idx = self.object_keys['keys_idx', row_idx] + if key_idx in key_idx_matches: + key_exists = True # Make sure we don't add the key + key = self.keys.row[key_idx] + if not key_exists: + key = self._add_key(key) + + if check_object_key: + key_idx = key.idx + object_key_row_idx = self.object_keys.which(keys_idx=key_idx) + obj_key_exists = False + for row_idx in object_key_row_idx: + obj_idx = self.object_keys['objects_idx', row_idx] + if obj_idx == object_field.idx: + obj_key_exists = True + if not obj_key_exists: + add_object_key = True + + if add_object_key: + self._add_object_key(object_field, key) + + if check_entity_key: if entity_uri is not None: - msg = 'If you plan on reusing an entity, then entity_uri parameter must be None.' - raise ValueError(msg) # TODO: Change to Warn that the uri provided is being ignored + msg = 'This entity already exists. Ignoring new entity uri' + warn(msg) # TODO: Change to Warn that the uri provided is being ignored # check for entity-key relationship in EntityKeyTable key_idx = key.idx @@ -647,15 +718,11 @@ def add_ref(self, **kwargs): # self._add_entity_key(entity, key) TODO add_entity_key = True - ################# - # Validate Object - ################# - - - ############### - # Populate HERD - ############### + if add_entity: + entity = self._add_entity(entity_id, entity_uri) + if add_entity_key: + self._add_entity_key(entity, key) @docval({'name': 'object_type', 'type': str, diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py index 1f2d2afbf..19a6a95c3 100644 --- a/tests/unit/common/test_resources.py +++ b/tests/unit/common/test_resources.py @@ -2,9 +2,9 @@ import unittest from hdmf.common import DynamicTable, VectorData from hdmf import TermSet, TermSetWrapper -from hdmf.common.resources import HERD, Key +from hdmf.common.resources2 import HERD, Key from hdmf import Data, Container, HERDManager -from hdmf.testing import TestCase, H5RoundTripMixin, remove_test_file +from hdmf.testing import TestCase, remove_test_file import numpy as np from tests.unit.build_tests.test_io_map import Bar from tests.unit.helpers.utils import create_test_type_map, CORE_NAMESPACE @@ -25,7 +25,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) -class TestHERD(H5RoundTripMixin, TestCase): +class TestHERD(TestCase): def setUpContainer(self): er = HERD() @@ -88,18 +88,18 @@ def test_to_dataframe(self): file_1 = HERDManagerContainer(name='file_1') file_2 = HERDManagerContainer(name='file_2') - k1, e1 = er.add_ref(file=file_1, - container=data1, - field='species', - key='Mus musculus', - entity_id='NCBI:txid10090', - entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') - k2, e2 = er.add_ref(file=file_2, - container=data2, - field='species', - key='Homo sapiens', - entity_id='NCBI:txid9606', - entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606') + er.add_ref(file=file_1, + container=data1, + field='species', + key='Mus musculus', + entity_id='NCBI:txid10090', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') + er.add_ref(file=file_2, + container=data2, + field='species', + key='Homo sapiens', + entity_id='NCBI:txid9606', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606') # Convert to dataframe and compare against the expected result result_df = er.to_dataframe() @@ -826,24 +826,6 @@ def test_object_key_existing_key_new_object(self): entity_uri='entity_uri2') self.assertEqual(er.object_keys.data, [(0, 0), (1, 0)]) - def test_object_key_existing_key_new_object_error(self): - er = HERD() - data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], - dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) - - er.add_ref(file=HERDManagerContainer(name='file'), - container=data_1, - key='Mus musculus', - entity_id='NCBI:txid10090', - entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') - key = er._add_key('key') - with self.assertRaises(ValueError): - er.add_ref(file=HERDManagerContainer(name='file'), - container=data_1, - key=key, - entity_id='entity1', - entity_uri='entity_uri1') - def test_reuse_key_reuse_entity(self): # With the key and entity existing, the EntityKeyTable should not have duplicates er = HERD() @@ -922,7 +904,7 @@ def test_entity_uri_error(self): key='Mus musculus', entity_id='NCBI:txid10090') - def test_entity_uri_reuse_error(self): + def test_entity_uri_warning(self): er = HERD() data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) @@ -936,7 +918,7 @@ def test_entity_uri_reuse_error(self): entity_id='NCBI:txid10090', entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') existing_key = er.get_key('Mus musculus') - with self.assertRaises(ValueError): + with self.assertWarns(Warning): er.add_ref(file=HERDManagerContainer(name='file'), container=data_2, key=existing_key, @@ -963,32 +945,32 @@ def test_key_without_entity_error(self): def test_check_object_field_add(self): er = HERD() data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) - er._check_object_field(file=HERDManagerContainer(name='file'), + file = HERDManagerContainer(name='file') + _dict = er._check_object_field(file=file, container=data, relative_path='', field='') - - self.assertEqual(er.objects.data, [(0, data.object_id, 'Data', '', '')]) + expected = {'file_object_id': file.object_id, + 'files_idx': None, + 'container': data, + 'relative_path': '', + 'field': ''} + self.assertEqual(_dict, expected) def test_check_object_field_multi_files(self): er = HERD() data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) file = HERDManagerContainer(name='file') - - er._check_object_field(file=file, container=data, relative_path='', field='') + er._add_file(file.object_id) er._add_file(file.object_id) - data2 = Data(name="species", data=['Homo sapiens', 'Mus musculus']) with self.assertRaises(ValueError): - er._check_object_field(file=file, container=data2, relative_path='', field='') + er._check_object_field(file=file, container=data, relative_path='', field='') def test_check_object_field_multi_error(self): er = HERD() data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) - er._check_object_field(file=HERDManagerContainer(name='file'), - container=data, - relative_path='', - field='') + er._add_object(files_idx=0, container=data, relative_path='', field='') er._add_object(files_idx=0, container=data, relative_path='', field='') with self.assertRaises(ValueError): er._check_object_field(file=HERDManagerContainer(name='file'), @@ -1063,14 +1045,6 @@ def test_add_ref_compound_data(self): self.assertEqual(er.entities.data, [('NCBI:txid10090', 'entity_0_uri')]) self.assertEqual(er.objects.data, [(0, data.object_id, 'Data', '', 'species')]) - def test_roundtrip(self): - read_container = self.roundtripContainer() - pd.testing.assert_frame_equal(read_container.to_dataframe(), self.container.to_dataframe()) - - def test_roundtrip_export(self): - read_container = self.roundtripExportContainer() - pd.testing.assert_frame_equal(read_container.to_dataframe(), self.container.to_dataframe()) - class TestHERDNestedAttributes(TestCase): From 2f8ab37957a213c988e791494422f82e808930b6 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 30 Oct 2023 09:51:48 -0700 Subject: [PATCH 04/40] add_ref_termset and tests --- src/hdmf/common/resources.py | 310 ++++++--- src/hdmf/common/resources2.py | 987 ---------------------------- tests/unit/common/test_resources.py | 92 ++- 3 files changed, 314 insertions(+), 1075 deletions(-) delete mode 100644 src/hdmf/common/resources2.py diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 4a492428f..cda480592 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -3,6 +3,8 @@ from . import register_class, EXP_NAMESPACE from . import get_type_map from ..container import Table, Row, Container, Data, AbstractContainer, HERDManager +from ..term_set import TermSet +from ..data_utils import DataIO from ..utils import docval, popargs, AllowPositional from ..build import TypeMap from ..term_set import TermSetWrapper @@ -10,6 +12,7 @@ import os import zipfile from collections import namedtuple +from warnings import warn class KeyTable(Table): @@ -358,16 +361,17 @@ def _check_object_field(self, **kwargs): relative_path = kwargs['relative_path'] field = kwargs['field'] create = kwargs['create'] + file_object_id = file.object_id files_idx = self.files.which(file_object_id=file_object_id) if len(files_idx) > 1: + # It isn't possible for len(files_idx) > 1 without the user directly using _add_file raise ValueError("Found multiple instances of the same file.") elif len(files_idx) == 1: files_idx = files_idx[0] else: - self._add_file(file_object_id) - files_idx = self.files.which(file_object_id=file_object_id)[0] + files_idx = None objecttable_idx = self.objects.which(object_id=container.object_id) @@ -378,10 +382,15 @@ def _check_object_field(self, **kwargs): if len(objecttable_idx) == 1: return self.objects.row[objecttable_idx[0]] elif len(objecttable_idx) == 0 and create: - return self._add_object(files_idx=files_idx, container=container, relative_path=relative_path, field=field) + return {'file_object_id': file_object_id, + 'files_idx': files_idx, + 'container': container, + 'relative_path': relative_path, + 'field': field} elif len(objecttable_idx) == 0 and not create: raise ValueError("Object not in Object Table.") else: + # It isn't possible for this to happen unless the user used _add_object. raise ValueError("Found multiple instances of the same object id, relative path, " "and field in objects table.") @@ -466,63 +475,84 @@ def add_ref_container(self, **kwargs): entity_id=entity_id, entity_uri=entity_uri) - @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'}, - {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.', + @docval({'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.', 'default': None}, {'name': 'container', 'type': (str, AbstractContainer), 'default': None, 'doc': ('The Container/Data object that uses the key or ' - 'the object id for the Container/Data object that uses the key.')}, - {'name': 'relative_path', 'type': str, - 'doc': ('The relative_path of the attribute of the object that uses ', - 'an external resource reference key. Use an empty string if not applicable.'), - 'default': ''}, + 'the object_id for the Container/Data object that uses the key.')}, + {'name': 'attribute', 'type': str, + 'doc': 'The attribute of the container for the external reference.', 'default': None}, {'name': 'field', 'type': str, 'default': '', - 'doc': ('The field of the compound data type using an external resource.')}) - def get_key(self, **kwargs): + 'doc': ('The field of the compound data type using an external resource.')}, + {'name': 'key', 'type': (str, Key), 'default': None, + 'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'}, + {'name': 'termset', 'type': TermSet, 'default': None, + 'doc': 'The TermSet to be used if the container/attribute does not have one.'} + ) + def add_ref_termset(self, **kwargs): """ - Return a Key. - - If container, relative_path, and field are provided, the Key that corresponds to the given name of the key - for the given container, relative_path, and field is returned. + This method allows users to take advantage of using the TermSet class to provide the entity information + for add_ref, while also validating the data. This method supports adding a single key and an entire dataset + to the HERD tables. For both cases, the term, i.e., key, will be validated against the permissible values + in the TermSet. If valid, it will proceed to call add_ref. Otherwise, the method will return a dict of + missing terms. """ - key_name, container, relative_path, field = popargs('key_name', 'container', 'relative_path', 'field', kwargs) - key_idx_matches = self.keys.which(key=key_name) - file = kwargs['file'] - - if container is not None: - if file is None: - file = self._get_file_from_container(container=container) - # if same key is used multiple times, determine - # which instance based on the Container - object_field = self._check_object_field(file=file, - container=container, - relative_path=relative_path, - field=field) - for row_idx in self.object_keys.which(objects_idx=object_field.idx): - key_idx = self.object_keys['keys_idx', row_idx] - if key_idx in key_idx_matches: - return self.keys.row[key_idx] - msg = "No key found with that container." - raise ValueError(msg) - else: - if len(key_idx_matches) == 0: - # the key has never been used before - raise ValueError("key '%s' does not exist" % key_name) - elif len(key_idx_matches) > 1: - msg = "There are more than one key with that name. Please search with additional information." - raise ValueError(msg) + container = kwargs['container'] + attribute = kwargs['attribute'] + key = kwargs['key'] + field = kwargs['field'] + termset = kwargs['termset'] + + if termset is None: + if attribute is None: + try: + termset = container.termset + except AttributeError: + msg = "Cannot Find TermSet" + raise AttributeError(msg) else: - return self.keys.row[key_idx_matches[0]] + termset = container[attribute].termset + if termset is None: + msg = "Cannot Find TermSet" + raise ValueError(msg) - @docval({'name': 'entity_id', 'type': str, 'doc': 'The ID for the identifier at the resource.'}) - def get_entity(self, **kwargs): - entity_id = kwargs['entity_id'] - entity = self.entities.which(entity_id=entity_id) - if len(entity)>0: - return self.entities.row[entity[0]] + if file is None: + file = self._get_file_from_container(container=container) + + # if key is provided then add_ref proceeds as normal + if key is not None: + data = [key] else: - return None + # if the key is not provided, proceed to "bulk add" + if attribute is None: + data_object = container + else: + data_object = getattr(container, attribute) + if isinstance(data_object, (Data, DataIO)): + data = data_object.data + elif isinstance(data_object, (list, np.ndarray)): + data = data_object + missing_terms = [] + for term in data: + # check the data according to the permissible_values + try: + term_info = termset[term] + except ValueError: + missing_terms.append(term) + continue + # prep for add_ref + entity_id = term_info[0] + entity_uri = term_info[2] + self.add_ref(file=file, + container=container, + attribute=attribute, + key=term, + field=field, + entity_id=entity_id, + entity_uri=entity_uri) + if len(missing_terms)>0: + return {"missing_terms": missing_terms} @docval({'name': 'container', 'type': (str, AbstractContainer), 'default': None, 'doc': ('The Container/Data object that uses the key or ' @@ -558,9 +588,48 @@ def add_ref(self, **kwargs): entity_uri = kwargs['entity_uri'] file = kwargs['file'] + ################## + # Set File if None + ################## if file is None: file = self._get_file_from_container(container=container) + ############## + # Validate Key + ############## + add_key = False + add_object_key = False + check_object_key = False + if not isinstance(key, Key): + add_key = True + add_object_key = True + else: + # Check to see that the existing key is being used with the object. + # If true, do nothing. If false, create a new obj/key relationship + # in the ObjectKeyTable + check_object_key = True + + ################# + # Validate Entity + ################# + add_entity_key = False + add_entity = False + + entity = self.get_entity(entity_id=entity_id) + check_entity_key = False + if entity is None: + if entity_uri is None: + msg = 'New entities must have an entity_uri.' + raise ValueError(msg) + + add_entity = True + add_entity_key = True + else: + check_entity_key = True + + ################# + # Validate Object + ################# if attribute is None: # Trivial Case relative_path = '' object_field = self._check_object_field(file=file, @@ -605,49 +674,51 @@ def add_ref(self, **kwargs): relative_path=relative_path, field=field) - if not isinstance(key, Key): + ############### + # Populate HERD + ############### + if isinstance(object_field, dict): + if object_field['files_idx'] is None: + self._add_file(object_field['file_object_id']) + object_field['files_idx'] = self.files.which(file_object_id=object_field['file_object_id'])[0] + object_field = self._add_object(files_idx=object_field['files_idx'], + container=object_field['container'], + relative_path=object_field['relative_path'], + field=object_field['field']) + + # Since object_field is set, we need to check if + # the key has been associated with that object. + # If so, just reuse the key. + if add_key: + key_exists = False key_idx_matches = self.keys.which(key=key) - # if same key is used multiple times, determine - # which instance based on the Container for row_idx in self.object_keys.which(objects_idx=object_field.idx): key_idx = self.object_keys['keys_idx', row_idx] if key_idx in key_idx_matches: - msg = "Use Key Object when referencing an existing (container, relative_path, key)" - raise ValueError(msg) + key_exists = True # Make sure we don't add the key + key = self.keys.row[key_idx] + if not key_exists: + key = self._add_key(key) - key = self._add_key(key) - self._add_object_key(object_field, key) - - else: - # Check to see that the existing key is being used with the object. - # If true, do nothing. If false, create a new obj/key relationship - # in the ObjectKeyTable + if check_object_key: key_idx = key.idx object_key_row_idx = self.object_keys.which(keys_idx=key_idx) - if len(object_key_row_idx)!=0: - obj_key_check = False - for row_idx in object_key_row_idx: - obj_idx = self.object_keys['objects_idx', row_idx] - if obj_idx == object_field.idx: - obj_key_check = True - if not obj_key_check: - self._add_object_key(object_field, key) - else: - msg = "Cannot find key object. Create new Key with string." - raise ValueError(msg) - # check if the key and object have been related in the ObjectKeyTable + obj_key_exists = False + for row_idx in object_key_row_idx: + obj_idx = self.object_keys['objects_idx', row_idx] + if obj_idx == object_field.idx: + obj_key_exists = True + if not obj_key_exists: + add_object_key = True + + if add_object_key: + self._add_object_key(object_field, key) - entity = self.get_entity(entity_id=entity_id) - if entity is None: - if entity_uri is None: - msg = 'New entities must have an entity_uri.' - raise ValueError(msg) - entity = self._add_entity(entity_id, entity_uri) - self._add_entity_key(entity, key) - else: + if check_entity_key: if entity_uri is not None: - msg = 'If you plan on reusing an entity, then entity_uri parameter must be None.' - raise ValueError(msg) + msg = 'This entity already exists. Ignoring new entity uri' + warn(msg) # TODO: Change to Warn that the uri provided is being ignored + # check for entity-key relationship in EntityKeyTable key_idx = key.idx entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) @@ -662,12 +733,77 @@ def add_ref(self, **kwargs): if not entity_key_check: # this means that though the key is there, there is not key-entity relationship # a.k.a add it now - self._add_entity_key(entity, key) + # self._add_entity_key(entity, key) TODO + add_entity_key = True else: # this means that specific key is not in the EntityKeyTable, so add it and establish # the relationship with the entity - self._add_entity_key(entity, key) - return key, entity + # self._add_entity_key(entity, key) TODO + add_entity_key = True + + if add_entity: + entity = self._add_entity(entity_id, entity_uri) + + if add_entity_key: + self._add_entity_key(entity, key) + + @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'}, + {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.', + 'default': None}, + {'name': 'container', 'type': (str, AbstractContainer), 'default': None, + 'doc': ('The Container/Data object that uses the key or ' + 'the object id for the Container/Data object that uses the key.')}, + {'name': 'relative_path', 'type': str, + 'doc': ('The relative_path of the attribute of the object that uses ', + 'an external resource reference key. Use an empty string if not applicable.'), + 'default': ''}, + {'name': 'field', 'type': str, 'default': '', + 'doc': ('The field of the compound data type using an external resource.')}) + def get_key(self, **kwargs): + """ + Return a Key. + + If container, relative_path, and field are provided, the Key that corresponds to the given name of the key + for the given container, relative_path, and field is returned. + """ + key_name, container, relative_path, field = popargs('key_name', 'container', 'relative_path', 'field', kwargs) + key_idx_matches = self.keys.which(key=key_name) + + file = kwargs['file'] + + if container is not None: + if file is None: + file = self._get_file_from_container(container=container) + # if same key is used multiple times, determine + # which instance based on the Container + object_field = self._check_object_field(file=file, + container=container, + relative_path=relative_path, + field=field) + for row_idx in self.object_keys.which(objects_idx=object_field.idx): + key_idx = self.object_keys['keys_idx', row_idx] + if key_idx in key_idx_matches: + return self.keys.row[key_idx] + msg = "No key found with that container." + raise ValueError(msg) + else: + if len(key_idx_matches) == 0: + # the key has never been used before + raise ValueError("key '%s' does not exist" % key_name) + elif len(key_idx_matches) > 1: + msg = "There are more than one key with that name. Please search with additional information." + raise ValueError(msg) + else: + return self.keys.row[key_idx_matches[0]] + + @docval({'name': 'entity_id', 'type': str, 'doc': 'The ID for the identifier at the resource.'}) + def get_entity(self, **kwargs): + entity_id = kwargs['entity_id'] + entity = self.entities.which(entity_id=entity_id) + if len(entity)>0: + return self.entities.row[entity[0]] + else: + return None @docval({'name': 'object_type', 'type': str, 'doc': 'The type of the object. This is also the parent in relative_path.'}, diff --git a/src/hdmf/common/resources2.py b/src/hdmf/common/resources2.py deleted file mode 100644 index b26f7a8bf..000000000 --- a/src/hdmf/common/resources2.py +++ /dev/null @@ -1,987 +0,0 @@ -import pandas as pd -import numpy as np -from . import register_class, EXP_NAMESPACE -from . import get_type_map -from ..container import Table, Row, Container, Data, AbstractContainer, HERDManager -from ..utils import docval, popargs, AllowPositional -from ..build import TypeMap -from ..term_set import TermSetWrapper -from glob import glob -import os -import zipfile -from collections import namedtuple -from warnings import warn - - -class KeyTable(Table): - """ - A table for storing keys used to reference external resources. - """ - - __defaultname__ = 'keys' - - __columns__ = ( - {'name': 'key', 'type': str, - 'doc': 'The user key that maps to the resource term / registry symbol.'}, - ) - - -class Key(Row): - """ - A Row class for representing rows in the KeyTable. - """ - - __table__ = KeyTable - - -class EntityTable(Table): - """ - A table for storing the external resources a key refers to. - """ - - __defaultname__ = 'entities' - - __columns__ = ( - {'name': 'entity_id', 'type': str, - 'doc': 'The unique ID for the resource term / registry symbol.'}, - {'name': 'entity_uri', 'type': str, - 'doc': 'The URI for the resource term / registry symbol.'}, - ) - - -class Entity(Row): - """ - A Row class for representing rows in the EntityTable. - """ - - __table__ = EntityTable - - -class FileTable(Table): - """ - A table for storing file ids used in external resources. - """ - - __defaultname__ = 'files' - - __columns__ = ( - {'name': 'file_object_id', 'type': str, - 'doc': 'The file id of the file that contains the object'}, - ) - - -class File(Row): - """ - A Row class for representing rows in the FileTable. - """ - - __table__ = FileTable - - -class ObjectTable(Table): - """ - A table for storing objects (i.e. Containers) that contain keys that refer to external resources. - """ - - __defaultname__ = 'objects' - - __columns__ = ( - {'name': 'files_idx', 'type': int, - 'doc': 'The row idx for the file_object_id in FileTable containing the object.'}, - {'name': 'object_id', 'type': str, - 'doc': 'The object ID for the Container/Data.'}, - {'name': 'object_type', 'type': str, - 'doc': 'The type of the object. This is also the parent in relative_path.'}, - {'name': 'relative_path', 'type': str, - 'doc': ('The relative_path of the attribute of the object that uses ', - 'an external resource reference key. Use an empty string if not applicable.')}, - {'name': 'field', 'type': str, - 'doc': ('The field of the compound data type using an external resource. ' - 'Use an empty string if not applicable.')} - ) - - -class Object(Row): - """ - A Row class for representing rows in the ObjectTable. - """ - - __table__ = ObjectTable - - -class ObjectKeyTable(Table): - """ - A table for identifying which keys are used by which objects for referring to external resources. - """ - - __defaultname__ = 'object_keys' - - __columns__ = ( - {'name': 'objects_idx', 'type': (int, Object), - 'doc': 'The index into the objects table for the Object that uses the Key.'}, - {'name': 'keys_idx', 'type': (int, Key), - 'doc': 'The index into the keys table that is used to make an external resource reference.'} - ) - - -class EntityKeyTable(Table): - """ - A table for identifying which entities are used by which keys for referring to external resources. - """ - - __defaultname__ = 'entity_keys' - - __columns__ = ( - {'name': 'entities_idx', 'type': (int, Entity), - 'doc': 'The index into the EntityTable for the Entity that associated with the Key.'}, - {'name': 'keys_idx', 'type': (int, Key), - 'doc': 'The index into the KeyTable that is used to make an external resource reference.'} - ) - - -class EntityKey(Row): - """ - A Row class for representing rows in the EntityKeyTable. - """ - - __table__ = EntityKeyTable - - -class ObjectKey(Row): - """ - A Row class for representing rows in the ObjectKeyTable. - """ - - __table__ = ObjectKeyTable - - -@register_class('HERD', EXP_NAMESPACE) -class HERD(Container): - """ - HDMF External Resources Data Structure. - A table for mapping user terms (i.e. keys) to resource entities. - """ - - __fields__ = ( - {'name': 'keys', 'child': True}, - {'name': 'files', 'child': True}, - {'name': 'objects', 'child': True}, - {'name': 'object_keys', 'child': True}, - {'name': 'entity_keys', 'child': True}, - {'name': 'entities', 'child': True}, - ) - - @docval({'name': 'keys', 'type': KeyTable, 'default': None, - 'doc': 'The table storing user keys for referencing resources.'}, - {'name': 'files', 'type': FileTable, 'default': None, - 'doc': 'The table for storing file ids used in external resources.'}, - {'name': 'entities', 'type': EntityTable, 'default': None, - 'doc': 'The table storing entity information.'}, - {'name': 'objects', 'type': ObjectTable, 'default': None, - 'doc': 'The table storing object information.'}, - {'name': 'object_keys', 'type': ObjectKeyTable, 'default': None, - 'doc': 'The table storing object-key relationships.'}, - {'name': 'entity_keys', 'type': EntityKeyTable, 'default': None, - 'doc': 'The table storing entity-key relationships.'}, - {'name': 'type_map', 'type': TypeMap, 'default': None, - 'doc': 'The type map. If None is provided, the HDMF-common type map will be used.'}, - allow_positional=AllowPositional.WARNING) - def __init__(self, **kwargs): - name = 'external_resources' - super().__init__(name) - self.keys = kwargs['keys'] or KeyTable() - self.files = kwargs['files'] or FileTable() - self.entities = kwargs['entities'] or EntityTable() - self.objects = kwargs['objects'] or ObjectTable() - self.object_keys = kwargs['object_keys'] or ObjectKeyTable() - self.entity_keys = kwargs['entity_keys'] or EntityKeyTable() - self.type_map = kwargs['type_map'] or get_type_map() - - @staticmethod - def assert_external_resources_equal(left, right, check_dtype=True): - """ - Compare that the keys, resources, entities, objects, and object_keys tables match - - :param left: HERD object to compare with right - :param right: HERD object to compare with left - :param check_dtype: Enforce strict checking of dtypes. Dtypes may be different - for example for ids, where depending on how the data was saved - ids may change from int64 to int32. (Default: True) - :returns: The function returns True if all values match. If mismatches are found, - AssertionError will be raised. - :raises AssertionError: Raised if any differences are found. The function collects - all differences into a single error so that the assertion will indicate - all found differences. - """ - errors = [] - try: - pd.testing.assert_frame_equal(left.keys.to_dataframe(), - right.keys.to_dataframe(), - check_dtype=check_dtype) - except AssertionError as e: - errors.append(e) - try: - pd.testing.assert_frame_equal(left.files.to_dataframe(), - right.files.to_dataframe(), - check_dtype=check_dtype) - except AssertionError as e: - errors.append(e) - try: - pd.testing.assert_frame_equal(left.objects.to_dataframe(), - right.objects.to_dataframe(), - check_dtype=check_dtype) - except AssertionError as e: - errors.append(e) - try: - pd.testing.assert_frame_equal(left.entities.to_dataframe(), - right.entities.to_dataframe(), - check_dtype=check_dtype) - except AssertionError as e: - errors.append(e) - try: - pd.testing.assert_frame_equal(left.object_keys.to_dataframe(), - right.object_keys.to_dataframe(), - check_dtype=check_dtype) - except AssertionError as e: - errors.append(e) - if len(errors) > 0: - msg = ''.join(str(e)+"\n\n" for e in errors) - raise AssertionError(msg) - return True - - @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the key to be added.'}) - def _add_key(self, **kwargs): - """ - Add a key to be used for making references to external resources. - - It is possible to use the same *key_name* to refer to different resources so long as the *key_name* is not - used within the same object, relative_path, and field. To do so, this method must be called for the - two different resources. - - The returned Key objects must be managed by the caller so as to be appropriately passed to subsequent calls - to methods for storing information about the different resources. - """ - key = kwargs['key_name'] - return Key(key, table=self.keys) - - @docval({'name': 'file_object_id', 'type': str, 'doc': 'The id of the file'}) - def _add_file(self, **kwargs): - """ - Add a file to be used for making references to external resources. - - This is optional when working in HDMF. - """ - file_object_id = kwargs['file_object_id'] - return File(file_object_id, table=self.files) - - @docval({'name': 'entity_id', 'type': str, 'doc': 'The unique entity id.'}, - {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the entity.'}) - def _add_entity(self, **kwargs): - """ - Add an entity that will be referenced to using keys specified in HERD.entity_keys. - """ - entity_id = kwargs['entity_id'] - entity_uri = kwargs['entity_uri'] - entity = Entity( entity_id, entity_uri, table=self.entities) - return entity - - @docval({'name': 'container', 'type': (str, AbstractContainer), - 'doc': 'The Container/Data object to add or the object id of the Container/Data object to add.'}, - {'name': 'files_idx', 'type': int, - 'doc': 'The file_object_id row idx.'}, - {'name': 'object_type', 'type': str, 'default': None, - 'doc': ('The type of the object. This is also the parent in relative_path. If omitted, ' - 'the name of the container class is used.')}, - {'name': 'relative_path', 'type': str, - 'doc': ('The relative_path of the attribute of the object that uses ', - 'an external resource reference key. Use an empty string if not applicable.')}, - {'name': 'field', 'type': str, 'default': '', - 'doc': ('The field of the compound data type using an external resource.')}) - def _add_object(self, **kwargs): - """ - Add an object that references an external resource. - """ - files_idx, container, object_type, relative_path, field = popargs('files_idx', - 'container', - 'object_type', - 'relative_path', - 'field', kwargs) - - if object_type is None: - object_type = container.__class__.__name__ - - if isinstance(container, AbstractContainer): - container = container.object_id - obj = Object(files_idx, container, object_type, relative_path, field, table=self.objects) - return obj - - @docval({'name': 'obj', 'type': (int, Object), 'doc': 'The Object that uses the Key.'}, - {'name': 'key', 'type': (int, Key), 'doc': 'The Key that the Object uses.'}) - def _add_object_key(self, **kwargs): - """ - Specify that an object (i.e. container and relative_path) uses a key to reference - an external resource. - """ - obj, key = popargs('obj', 'key', kwargs) - return ObjectKey(obj, key, table=self.object_keys) - - @docval({'name': 'entity', 'type': (int, Entity), 'doc': 'The Entity associated with the Key.'}, - {'name': 'key', 'type': (int, Key), 'doc': 'The Key that the connected to the Entity.'}) - def _add_entity_key(self, **kwargs): - """ - Add entity-key relationship to the EntityKeyTable. - """ - entity, key = popargs('entity', 'key', kwargs) - return EntityKey(entity, key, table=self.entity_keys) - - @docval({'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.'}, - {'name': 'container', 'type': AbstractContainer, - 'doc': ('The Container/Data object that uses the key or ' - 'the object id for the Container/Data object that uses the key.')}, - {'name': 'relative_path', 'type': str, - 'doc': ('The relative_path of the attribute of the object that uses ', - 'an external resource reference key. Use an empty string if not applicable.'), - 'default': ''}, - {'name': 'field', 'type': str, 'default': '', - 'doc': ('The field of the compound data type using an external resource.')}, - {'name': 'create', 'type': bool, 'default': True}) - def _check_object_field(self, **kwargs): - """ - Check if a container, relative path, and field have been added. - - The container can be either an object_id string or an AbstractContainer. - - If the container, relative_path, and field have not been added, add them - and return the corresponding Object. Otherwise, just return the Object. - """ - file = kwargs['file'] - container = kwargs['container'] - relative_path = kwargs['relative_path'] - field = kwargs['field'] - create = kwargs['create'] - - file_object_id = file.object_id - files_idx = self.files.which(file_object_id=file_object_id) - - if len(files_idx) > 1: - # It isn't possible for len(files_idx) > 1 without the user directly using _add_file - raise ValueError("Found multiple instances of the same file.") - elif len(files_idx) == 1: - files_idx = files_idx[0] - else: - files_idx = None - - objecttable_idx = self.objects.which(object_id=container.object_id) - - if len(objecttable_idx) > 0: - relative_path_idx = self.objects.which(relative_path=relative_path) - field_idx = self.objects.which(field=field) - objecttable_idx = list(set(objecttable_idx) & set(relative_path_idx) & set(field_idx)) - if len(objecttable_idx) == 1: - return self.objects.row[objecttable_idx[0]] - elif len(objecttable_idx) == 0 and create: - return {'file_object_id': file_object_id, - 'files_idx': files_idx, - 'container': container, - 'relative_path': relative_path, - 'field': field} - elif len(objecttable_idx) == 0 and not create: - raise ValueError("Object not in Object Table.") - else: - # It isn't possible for this to happen unless the user used _add_object. - raise ValueError("Found multiple instances of the same object id, relative path, " - "and field in objects table.") - - @docval({'name': 'container', 'type': (str, AbstractContainer), - 'doc': ('The Container/Data object that uses the key or ' - 'the object id for the Container/Data object that uses the key.')}) - def _get_file_from_container(self, **kwargs): - """ - Method to retrieve a file associated with the container in the case a file is not provided. - """ - container = kwargs['container'] - - if isinstance(container, HERDManager): - file = container - return file - else: - parent = container.parent - if parent is not None: - while parent is not None: - if isinstance(parent, HERDManager): - file = parent - return file - else: - parent = parent.parent - else: - msg = 'Could not find file. Add container to the file.' - raise ValueError(msg) - - @docval({'name': 'objects', 'type': list, - 'doc': 'List of objects to check for TermSetWrapper within the fields.'}) - def __check_termset_wrapper(self, **kwargs): - """ - Takes a list of objects and checks the fields for TermSetWrapper. - - wrapped_obj = namedtuple('wrapped_obj', ['object', 'attribute', 'wrapper']) - :return: [wrapped_obj(object1, attribute_name1, wrapper1), ...] - """ - objects = kwargs['objects'] - - ret = [] # list to be returned with the objects, attributes and corresponding termsets - - for obj in objects: - # Get all the fields, parse out the methods and internal variables - obj_fields = [a for a in dir(obj) if not a.startswith('_') and not callable(getattr(obj, a))] - for attribute in obj_fields: - attr = getattr(obj, attribute) - if isinstance(attr, TermSetWrapper): - # Search objects that are wrapped - wrapped_obj = namedtuple('wrapped_obj', ['object', 'attribute', 'wrapper']) - ret.append(wrapped_obj(obj, attribute, attr)) - - return ret - - @docval({'name': 'root_container', 'type': HERDManager, - 'doc': 'The root container or file containing objects with a TermSet.'}) - def add_ref_container(self, **kwargs): - """ - Method to search through the root_container for all instances of TermSet. - Currently, only datasets are supported. By using a TermSet, the data comes validated - and can use the permissible values within the set to populate HERD. - """ - root_container = kwargs['root_container'] - - all_objects = root_container.all_children() # list of child objects and the container itself - - add_ref_items = self.__check_termset_wrapper(objects=all_objects) - for ref in add_ref_items: - container, attr_name, wrapper = ref - if isinstance(wrapper.value, (list, np.ndarray, tuple)): - values = wrapper.value - else: - # create list for single values (edge-case) for a simple iteration downstream - values = [wrapper.value] - for term in values: - term_info = wrapper.termset[term] - entity_id = term_info[0] - entity_uri = term_info[2] - self.add_ref(file=root_container, - container=container, - attribute=attr_name, - key=term, - entity_id=entity_id, - entity_uri=entity_uri) - - @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'}, - {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.', - 'default': None}, - {'name': 'container', 'type': (str, AbstractContainer), 'default': None, - 'doc': ('The Container/Data object that uses the key or ' - 'the object id for the Container/Data object that uses the key.')}, - {'name': 'relative_path', 'type': str, - 'doc': ('The relative_path of the attribute of the object that uses ', - 'an external resource reference key. Use an empty string if not applicable.'), - 'default': ''}, - {'name': 'field', 'type': str, 'default': '', - 'doc': ('The field of the compound data type using an external resource.')}) - def get_key(self, **kwargs): - """ - Return a Key. - - If container, relative_path, and field are provided, the Key that corresponds to the given name of the key - for the given container, relative_path, and field is returned. - """ - key_name, container, relative_path, field = popargs('key_name', 'container', 'relative_path', 'field', kwargs) - key_idx_matches = self.keys.which(key=key_name) - - file = kwargs['file'] - - if container is not None: - if file is None: - file = self._get_file_from_container(container=container) - # if same key is used multiple times, determine - # which instance based on the Container - object_field = self._check_object_field(file=file, - container=container, - relative_path=relative_path, - field=field) - for row_idx in self.object_keys.which(objects_idx=object_field.idx): - key_idx = self.object_keys['keys_idx', row_idx] - if key_idx in key_idx_matches: - return self.keys.row[key_idx] - msg = "No key found with that container." - raise ValueError(msg) - else: - if len(key_idx_matches) == 0: - # the key has never been used before - raise ValueError("key '%s' does not exist" % key_name) - elif len(key_idx_matches) > 1: - msg = "There are more than one key with that name. Please search with additional information." - raise ValueError(msg) - else: - return self.keys.row[key_idx_matches[0]] - - @docval({'name': 'entity_id', 'type': str, 'doc': 'The ID for the identifier at the resource.'}) - def get_entity(self, **kwargs): - entity_id = kwargs['entity_id'] - entity = self.entities.which(entity_id=entity_id) - if len(entity)>0: - return self.entities.row[entity[0]] - else: - return None - - @docval({'name': 'container', 'type': (str, AbstractContainer), 'default': None, - 'doc': ('The Container/Data object that uses the key or ' - 'the object_id for the Container/Data object that uses the key.')}, - {'name': 'attribute', 'type': str, - 'doc': 'The attribute of the container for the external reference.', 'default': None}, - {'name': 'field', 'type': str, 'default': '', - 'doc': ('The field of the compound data type using an external resource.')}, - {'name': 'key', 'type': (str, Key), 'default': None, - 'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'}, - {'name': 'entity_id', 'type': str, 'doc': 'The identifier for the entity at the resource.'}, - {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the identifier at the resource.', 'default': None}, - {'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.', - 'default': None}, - ) - def add_ref(self, **kwargs): - """ - Add information about an external reference used in this file. - - It is possible to use the same name of the key to refer to different resources - so long as the name of the key is not used within the same object, relative_path, and - field combination. This method does not support such functionality by default. - """ - ############################################################### - container = kwargs['container'] - attribute = kwargs['attribute'] - if isinstance(container, Data): - if attribute == 'data': - attribute = None - key = kwargs['key'] - field = kwargs['field'] - entity_id = kwargs['entity_id'] - entity_uri = kwargs['entity_uri'] - file = kwargs['file'] - - ################## - # Set File if None - ################## - if file is None: - file = self._get_file_from_container(container=container) - - ############## - # Validate Key - ############## - add_key = False - add_object_key = False - check_object_key = False - if not isinstance(key, Key): - add_key = True - add_object_key = True - else: - # Check to see that the existing key is being used with the object. - # If true, do nothing. If false, create a new obj/key relationship - # in the ObjectKeyTable - check_object_key = True - - ################# - # Validate Entity - ################# - add_entity_key = False - add_entity = False - - entity = self.get_entity(entity_id=entity_id) - check_entity_key = False - if entity is None: - if entity_uri is None: - msg = 'New entities must have an entity_uri.' - raise ValueError(msg) - - add_entity = True - add_entity_key = True - else: - check_entity_key = True - - ################# - # Validate Object - ################# - if attribute is None: # Trivial Case - relative_path = '' - object_field = self._check_object_field(file=file, - container=container, - relative_path=relative_path, - field=field) - else: # DataType Attribute Case - attribute_object = getattr(container, attribute) # returns attribute object - if isinstance(attribute_object, AbstractContainer): - relative_path = '' - object_field = self._check_object_field(file=file, - container=attribute_object, - relative_path=relative_path, - field=field) - else: # Non-DataType Attribute Case: - obj_mapper = self.type_map.get_map(container) - spec = obj_mapper.get_attr_spec(attr_name=attribute) - parent_spec = spec.parent # return the parent spec of the attribute - if parent_spec.data_type is None: - while parent_spec.data_type is None: - parent_spec = parent_spec.parent # find the closest parent with a data_type - parent_cls = self.type_map.get_dt_container_cls(data_type=parent_spec.data_type, autogen=False) - if isinstance(container, parent_cls): - parent = container - # We need to get the path of the spec for relative_path - absolute_path = spec.path - relative_path = absolute_path[absolute_path.find('/')+1:] - object_field = self._check_object_field(file=file, - container=parent, - relative_path=relative_path, - field=field) - else: - msg = 'Container not the nearest data_type' - raise ValueError(msg) - else: - parent = container # container needs to be the parent - absolute_path = spec.path - relative_path = absolute_path[absolute_path.find('/')+1:] - # this regex removes everything prior to the container on the absolute_path - object_field = self._check_object_field(file=file, - container=parent, - relative_path=relative_path, - field=field) - - ############### - # Populate HERD - ############### - if isinstance(object_field, dict): - if object_field['files_idx'] is None: - self._add_file(object_field['file_object_id']) - object_field['files_idx'] = self.files.which(file_object_id=object_field['file_object_id'])[0] - object_field = self._add_object(files_idx=object_field['files_idx'], - container=object_field['container'], - relative_path=object_field['relative_path'], - field=object_field['field']) - - # Since object_field is set, we need to check if - # the key has been associated with that object. - # If so, just reuse the key. - if add_key: - key_exists = False - key_idx_matches = self.keys.which(key=key) - for row_idx in self.object_keys.which(objects_idx=object_field.idx): - key_idx = self.object_keys['keys_idx', row_idx] - if key_idx in key_idx_matches: - key_exists = True # Make sure we don't add the key - key = self.keys.row[key_idx] - if not key_exists: - key = self._add_key(key) - - if check_object_key: - key_idx = key.idx - object_key_row_idx = self.object_keys.which(keys_idx=key_idx) - obj_key_exists = False - for row_idx in object_key_row_idx: - obj_idx = self.object_keys['objects_idx', row_idx] - if obj_idx == object_field.idx: - obj_key_exists = True - if not obj_key_exists: - add_object_key = True - - if add_object_key: - self._add_object_key(object_field, key) - - if check_entity_key: - if entity_uri is not None: - msg = 'This entity already exists. Ignoring new entity uri' - warn(msg) # TODO: Change to Warn that the uri provided is being ignored - - # check for entity-key relationship in EntityKeyTable - key_idx = key.idx - entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) - if len(entity_key_row_idx)!=0: - # this means there exists rows where the key is in the EntityKeyTable - entity_key_check = False - for row_idx in entity_key_row_idx: - entity_idx = self.entity_keys['entities_idx', row_idx] - if entity_idx == entity.idx: - entity_key_check = True - # this means there is already a key-entity relationship recorded - if not entity_key_check: - # this means that though the key is there, there is not key-entity relationship - # a.k.a add it now - # self._add_entity_key(entity, key) TODO - add_entity_key = True - else: - # this means that specific key is not in the EntityKeyTable, so add it and establish - # the relationship with the entity - # self._add_entity_key(entity, key) TODO - add_entity_key = True - - if add_entity: - entity = self._add_entity(entity_id, entity_uri) - - if add_entity_key: - self._add_entity_key(entity, key) - - - @docval({'name': 'object_type', 'type': str, - 'doc': 'The type of the object. This is also the parent in relative_path.'}, - {'name': 'relative_path', 'type': str, - 'doc': ('The relative_path of the attribute of the object that uses ', - 'an external resource reference key. Use an empty string if not applicable.'), - 'default': ''}, - {'name': 'field', 'type': str, 'default': '', - 'doc': ('The field of the compound data type using an external resource.')}, - {'name': 'all_instances', 'type': bool, 'default': False, - 'doc': ('The bool to return a dataframe with all instances of the object_type.', - 'If True, relative_path and field inputs will be ignored.')}) - def get_object_type(self, **kwargs): - """ - Get all entities/resources associated with an object_type. - """ - object_type = kwargs['object_type'] - relative_path = kwargs['relative_path'] - field = kwargs['field'] - all_instances = kwargs['all_instances'] - - df = self.to_dataframe() - - if all_instances: - df = df.loc[df['object_type'] == object_type] - else: - df = df.loc[(df['object_type'] == object_type) - & (df['relative_path'] == relative_path) - & (df['field'] == field)] - return df - - @docval({'name': 'file', 'type': HERDManager, 'doc': 'The file.', - 'default': None}, - {'name': 'container', 'type': (str, AbstractContainer), - 'doc': 'The Container/data object that is linked to resources/entities.'}, - {'name': 'attribute', 'type': str, - 'doc': 'The attribute of the container for the external reference.', 'default': None}, - {'name': 'relative_path', 'type': str, - 'doc': ('The relative_path of the attribute of the object that uses ', - 'an external resource reference key. Use an empty string if not applicable.'), - 'default': ''}, - {'name': 'field', 'type': str, 'default': '', - 'doc': ('The field of the compound data type using an external resource.')}) - def get_object_entities(self, **kwargs): - """ - Get all entities/resources associated with an object. - """ - file = kwargs['file'] - container = kwargs['container'] - attribute = kwargs['attribute'] - relative_path = kwargs['relative_path'] - field = kwargs['field'] - - if file is None: - file = self._get_file_from_container(container=container) - - keys = [] - entities = [] - if attribute is None: - object_field = self._check_object_field(file=file, - container=container, - relative_path=relative_path, - field=field, - create=False) - else: - object_field = self._check_object_field(file=file, - container=container[attribute], - relative_path=relative_path, - field=field, - create=False) - # Find all keys associated with the object - for row_idx in self.object_keys.which(objects_idx=object_field.idx): - keys.append(self.object_keys['keys_idx', row_idx]) - # Find all the entities/resources for each key. - for key_idx in keys: - entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) - for row_idx in entity_key_row_idx: - entity_idx = self.entity_keys['entities_idx', row_idx] - entities.append(self.entities.__getitem__(entity_idx)) - df = pd.DataFrame(entities, columns=['entity_id', 'entity_uri']) - return df - - @docval({'name': 'use_categories', 'type': bool, 'default': False, - 'doc': 'Use a multi-index on the columns to indicate which category each column belongs to.'}, - rtype=pd.DataFrame, returns='A DataFrame with all data merged into a flat, denormalized table.') - def to_dataframe(self, **kwargs): - """ - Convert the data from the keys, resources, entities, objects, and object_keys tables - to a single joint dataframe. I.e., here data is being denormalized, e.g., keys that - are used across multiple entities or objects will duplicated across the corresponding - rows. - - Returns: :py:class:`~pandas.DataFrame` with all data merged into a single, flat, denormalized table. - - """ - use_categories = popargs('use_categories', kwargs) - # Step 1: Combine the entities, keys, and entity_keys table - ent_key_df = self.entity_keys.to_dataframe() - entities_mapped_df = self.entities.to_dataframe().iloc[ent_key_df['entities_idx']].reset_index(drop=True) - keys_mapped_df = self.keys.to_dataframe().iloc[ent_key_df['keys_idx']].reset_index(drop=True) - ent_key_df = pd.concat(objs=[ent_key_df, entities_mapped_df, keys_mapped_df], - axis=1, - verify_integrity=False) - # Step 2: Combine the the files, object_keys and objects tables - object_keys_df = self.object_keys.to_dataframe() - objects_mapped_df = self.objects.to_dataframe().iloc[object_keys_df['objects_idx']].reset_index(drop=True) - object_keys_df = pd.concat(objs=[object_keys_df, objects_mapped_df], - axis=1, - verify_integrity=False) - files_df = self.files.to_dataframe().iloc[object_keys_df['files_idx']].reset_index(drop=True) - file_object_object_key_df = pd.concat(objs=[object_keys_df, files_df], - axis=1, - verify_integrity=False) - # Step 3: merge the combined entities_df and object_keys_df DataFrames - result_df = pd.concat( - # Create for each row in the objects_keys table a DataFrame with all corresponding data from all tables - objs=[pd.merge( - # Find all entities that correspond to the row i of the object_keys_table - ent_key_df[ent_key_df['keys_idx'] == object_keys_df['keys_idx'].iloc[i]].reset_index(drop=True), - # Get a DataFrame for row i of the objects_keys_table - file_object_object_key_df.iloc[[i, ]], - # Merge the entities and object_keys on the keys_idx column so that the values from the single - # object_keys_table row are copied across all corresponding rows in the entities table - on='keys_idx') - for i in range(len(object_keys_df))], - # Concatenate the rows of the objs - axis=0, - verify_integrity=False) - # Step 4: Clean up the index and sort columns by table type and name - result_df.reset_index(inplace=True, drop=True) - # ADD files - file_id_col = [] - for idx in result_df['files_idx']: - file_id_val = self.files.to_dataframe().iloc[int(idx)]['file_object_id'] - file_id_col.append(file_id_val) - - result_df['file_object_id'] = file_id_col - column_labels = [('files', 'file_object_id'), - ('objects', 'objects_idx'), ('objects', 'object_id'), ('objects', 'files_idx'), - ('objects', 'object_type'), ('objects', 'relative_path'), ('objects', 'field'), - ('keys', 'keys_idx'), ('keys', 'key'), - ('entities', 'entities_idx'), ('entities', 'entity_id'), ('entities', 'entity_uri')] - # sort the columns based on our custom order - result_df = result_df.reindex(labels=[c[1] for c in column_labels], - axis=1) - result_df = result_df.astype({'keys_idx': 'uint32', - 'objects_idx': 'uint32', - 'files_idx': 'uint32', - 'entities_idx': 'uint32'}) - # Add the categories if requested - if use_categories: - result_df.columns = pd.MultiIndex.from_tuples(column_labels) - # return the result - return result_df - - @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'}) - def to_zip(self, **kwargs): - """ - Write the tables in HERD to zipped tsv files. - """ - zip_file = kwargs['path'] - directory = os.path.dirname(zip_file) - - files = [os.path.join(directory, child.name)+'.tsv' for child in self.children] - for i in range(len(self.children)): - df = self.children[i].to_dataframe() - df.to_csv(files[i], sep='\t', index=False) - - with zipfile.ZipFile(zip_file, 'w') as zipF: - for file in files: - zipF.write(file) - - # remove tsv files - for file in files: - os.remove(file) - - @classmethod - @docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'}) - def from_zip(cls, **kwargs): - """ - Method to read in zipped tsv files to populate HERD. - """ - zip_file = kwargs['path'] - directory = os.path.dirname(zip_file) - - with zipfile.ZipFile(zip_file, 'r') as zip: - zip.extractall(directory) - tsv_paths = glob(directory+'/*') - - for file in tsv_paths: - file_name = os.path.basename(file) - if file_name == 'files.tsv': - files_df = pd.read_csv(file, sep='\t').replace(np.nan, '') - files = FileTable().from_dataframe(df=files_df, name='files', extra_ok=False) - os.remove(file) - continue - if file_name == 'keys.tsv': - keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '') - keys = KeyTable().from_dataframe(df=keys_df, name='keys', extra_ok=False) - os.remove(file) - continue - if file_name == 'entities.tsv': - entities_df = pd.read_csv(file, sep='\t').replace(np.nan, '') - entities = EntityTable().from_dataframe(df=entities_df, name='entities', extra_ok=False) - os.remove(file) - continue - if file_name == 'objects.tsv': - objects_df = pd.read_csv(file, sep='\t').replace(np.nan, '') - objects = ObjectTable().from_dataframe(df=objects_df, name='objects', extra_ok=False) - os.remove(file) - continue - if file_name == 'object_keys.tsv': - object_keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '') - object_keys = ObjectKeyTable().from_dataframe(df=object_keys_df, name='object_keys', extra_ok=False) - os.remove(file) - continue - if file_name == 'entity_keys.tsv': - ent_key_df = pd.read_csv(file, sep='\t').replace(np.nan, '') - entity_keys = EntityKeyTable().from_dataframe(df=ent_key_df, name='entity_keys', extra_ok=False) - os.remove(file) - continue - - # we need to check the idx columns in entities, objects, and object_keys - entity_idx = entity_keys['entities_idx'] - for idx in entity_idx: - if not int(idx) < len(entities): - msg = "Entity Index out of range in EntityTable. Please check for alterations." - raise ValueError(msg) - - files_idx = objects['files_idx'] - for idx in files_idx: - if not int(idx) < len(files): - msg = "File_ID Index out of range in ObjectTable. Please check for alterations." - raise ValueError(msg) - - object_idx = object_keys['objects_idx'] - for idx in object_idx: - if not int(idx) < len(objects): - msg = "Object Index out of range in ObjectKeyTable. Please check for alterations." - raise ValueError(msg) - - keys_idx = object_keys['keys_idx'] - for idx in keys_idx: - if not int(idx) < len(keys): - msg = "Key Index out of range in ObjectKeyTable. Please check for alterations." - raise ValueError(msg) - - keys_idx = entity_keys['keys_idx'] - for idx in keys_idx: - if not int(idx) < len(keys): - msg = "Key Index out of range in EntityKeyTable. Please check for alterations." - raise ValueError(msg) - - - er = HERD(files=files, - keys=keys, - entities=entities, - entity_keys=entity_keys, - objects=objects, - object_keys=object_keys) - return er diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py index 19a6a95c3..1a02be21f 100644 --- a/tests/unit/common/test_resources.py +++ b/tests/unit/common/test_resources.py @@ -2,7 +2,7 @@ import unittest from hdmf.common import DynamicTable, VectorData from hdmf import TermSet, TermSetWrapper -from hdmf.common.resources2 import HERD, Key +from hdmf.common.resources import HERD, Key from hdmf import Data, Container, HERDManager from hdmf.testing import TestCase, remove_test_file import numpy as np @@ -334,6 +334,96 @@ def test_add_ref_container_attr(self): 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', 'description', '')]) + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + em = HERDManagerContainer() + em.link_resources(er) + + col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + er.add_ref_termset(file=em, + container=species, + attribute='Species_Data', + termset=terms + ) + self.assertEqual(er.keys.data, [('Homo sapiens',)]) + self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) + self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) + + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset_bulk(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + em = HERDManagerContainer() + em.link_resources(er) + + col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens', 'Mus musculus']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + er.add_ref_termset(file=em, + container=species, + attribute='Species_Data', + termset=terms + ) + self.assertEqual(er.keys.data, [('Homo sapiens',), ('Mus musculus',)]) + self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606'), + ('NCBI_TAXON:10090', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=10090')]) + self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) + + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset_missing_terms(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + em = HERDManagerContainer() + em.link_resources(er) + + col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens', 'missing_term']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + missing_terms = er.add_ref_termset(file=em, + container=species, + attribute='Species_Data', + termset=terms + ) + self.assertEqual(er.keys.data, [('Homo sapiens',)]) + self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) + self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) + self.assertEqual(missing_terms, {'missing_terms': ['missing_term']}) + + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset_missing_file_error(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + + col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + with self.assertRaises(ValueError): + er.add_ref_termset( + container=species, + attribute='Species_Data', + termset=terms + ) + def test_get_file_from_container(self): file = HERDManagerContainer(name='file') container = Container(name='name') From e9c9728a96bc0b6ea29f7e34b3e539a7bf57a251 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 30 Oct 2023 09:57:40 -0700 Subject: [PATCH 05/40] comments --- src/hdmf/common/resources.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index cda480592..8a409dd3e 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -625,6 +625,8 @@ def add_ref(self, **kwargs): add_entity = True add_entity_key = True else: + # The entity exists and so we need to check if an entity_key exists + # for this entity and key combination. check_entity_key = True ################# @@ -717,7 +719,7 @@ def add_ref(self, **kwargs): if check_entity_key: if entity_uri is not None: msg = 'This entity already exists. Ignoring new entity uri' - warn(msg) # TODO: Change to Warn that the uri provided is being ignored + warn(msg) # check for entity-key relationship in EntityKeyTable key_idx = key.idx @@ -733,12 +735,10 @@ def add_ref(self, **kwargs): if not entity_key_check: # this means that though the key is there, there is not key-entity relationship # a.k.a add it now - # self._add_entity_key(entity, key) TODO add_entity_key = True else: # this means that specific key is not in the EntityKeyTable, so add it and establish # the relationship with the entity - # self._add_entity_key(entity, key) TODO add_entity_key = True if add_entity: From 2fb33fd95383649512af879939e63c04fef6c875 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 30 Oct 2023 10:02:15 -0700 Subject: [PATCH 06/40] comments --- src/hdmf/common/resources.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 8a409dd3e..b611dcd25 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -703,15 +703,20 @@ def add_ref(self, **kwargs): key = self._add_key(key) if check_object_key: + # check for object-key relationship in ObjectKeyTable key_idx = key.idx object_key_row_idx = self.object_keys.which(keys_idx=key_idx) - obj_key_exists = False - for row_idx in object_key_row_idx: - obj_idx = self.object_keys['objects_idx', row_idx] - if obj_idx == object_field.idx: - obj_key_exists = True - if not obj_key_exists: - add_object_key = True + if len(object_key_row_idx)!=0: + # this means there exists rows where the key is in the ObjectKeyTable + obj_key_exists = False + for row_idx in object_key_row_idx: + obj_idx = self.object_keys['objects_idx', row_idx] + if obj_idx == object_field.idx: + obj_key_exists = True + # this means there is already a object-key relationship recorded + if not obj_key_exists: + # this means that though the key is there, there is not object-key relationship + add_object_key = True if add_object_key: self._add_object_key(object_field, key) @@ -731,10 +736,9 @@ def add_ref(self, **kwargs): entity_idx = self.entity_keys['entities_idx', row_idx] if entity_idx == entity.idx: entity_key_check = True - # this means there is already a key-entity relationship recorded + # this means there is already a entity-key relationship recorded if not entity_key_check: - # this means that though the key is there, there is not key-entity relationship - # a.k.a add it now + # this means that though the key is there, there is not entity-key relationship add_entity_key = True else: # this means that specific key is not in the EntityKeyTable, so add it and establish From 1a287b1d66ca4294a90c8baf7347bc5027e6f4b1 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 30 Oct 2023 11:12:58 -0700 Subject: [PATCH 07/40] gallery --- docs/gallery/plot_external_resources.py | 49 ++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index 3f7720d0b..cc007dc44 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -91,6 +91,7 @@ # sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnail_externalresources.png' from hdmf.common import HERD from hdmf.common import DynamicTable, VectorData +from hdmf.term_set import TermSet from hdmf import Container, HERDManager from hdmf import Data import numpy as np @@ -269,7 +270,7 @@ def __init__(self, **kwargs): ############################################################################### # Using the get_object_type # ------------------------------------------------------ -# The :py:class:`~hdmf.common.resources.HERD.get_object_entities` method +# The :py:func:`~hdmf.common.resources.HERD.get_object_entities` method # allows the user to retrieve all entities and key information associated with an `Object` in # the form of a pandas DataFrame. @@ -306,6 +307,52 @@ def __init__(self, **kwargs): entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090' ) +############################################################################### +# Using add_ref_termset +# ------------------------------------------------------ +# :py:class:`~hdmf.common.resources.HERD` has multiple ways for users to add +# external references. The :py:func:`~hdmf.common.resources.HERD.add_ref_termset` +# method allows users to not only validate terms, i.e., keys, but also perform +# bulk populating of the data structure. + +# The :py:func:`~hdmf.common.resources.HERD.add_ref_container` method is directly +# used for populating :py:class:`~hdmf.common.resources.HERD` when writing a file. +# :py:func:`~hdmf.common.resources.HERD.add_ref_termset` can be used for new files; +# however, it is also the best practice when adding references for existing files. + +# :py:func:`~hdmf.common.resources.HERD.add_ref_termset` has many optional fields, +# giving the user a range of control when adding references. Let's see an example. +er = HERD() +terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') +file = HERDManagerContainer(name='file') + +er.add_ref_termset(file=file, + container=species, + attribute='Species_Data', + key='Ursus arctos horribilis', + termset=terms) + +############################################################################### +# Using add_ref_termset for an entire dataset +# ------------------------------------------------------ +# As mentioned above, :py:func:`~hdmf.common.resources.HERD.add_ref_termset` +# supports iteratively validating and populating :py:class:`~hdmf.common.resources.HERD`. + +# When populating :py:class:`~hdmf.common.resources.HERD`, users may have some terms +# that are not in the :py:class:`~hdmf.term_set.TermSet`. As a result, +# :py:func:`~hdmf.common.resources.HERD.add_ref_termset` will return an all the missing +# terms in a dictionary. It is up to the user to either add these terms to the +# :py:class:`~hdmf.term_set.TermSet` or remove them from the dataset. + +er = HERD() +terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') +file = HERDManagerContainer(name='file') + +er.add_ref_termset(file=file, + container=species, + attribute='Species_Data', + termset=terms) + ############################################################################### # Write HERD # ------------------------------------------------------ From 9cf7bc0fa16894874422339214add35ad5a679ac Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 30 Oct 2023 11:31:27 -0700 Subject: [PATCH 08/40] checks --- docs/gallery/plot_term_set.py | 3 +++ src/hdmf/common/resources.py | 15 +-------------- tests/unit/common/test_resources.py | 3 --- 3 files changed, 4 insertions(+), 17 deletions(-) diff --git a/docs/gallery/plot_term_set.py b/docs/gallery/plot_term_set.py index 86d53e553..71053bba5 100644 --- a/docs/gallery/plot_term_set.py +++ b/docs/gallery/plot_term_set.py @@ -190,3 +190,6 @@ # To add a column that is validated using :py:class:`~hdmf.term_set.TermSetWrapper`, # wrap the data in the :py:func:`~hdmf.common.table.DynamicTable.add_column` # method as if you were making a new instance of :py:class:`~hdmf.common.table.VectorData`. +species.add_column(name='Species_3', + description='...', + data=TermSetWrapper(value=['Ursus arctos horribilis', 'Mus musculus'], termset=terms),) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index b611dcd25..4e202629b 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -486,7 +486,7 @@ def add_ref_container(self, **kwargs): 'doc': ('The field of the compound data type using an external resource.')}, {'name': 'key', 'type': (str, Key), 'default': None, 'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'}, - {'name': 'termset', 'type': TermSet, 'default': None, + {'name': 'termset', 'type': TermSet, 'doc': 'The TermSet to be used if the container/attribute does not have one.'} ) def add_ref_termset(self, **kwargs): @@ -504,19 +504,6 @@ def add_ref_termset(self, **kwargs): field = kwargs['field'] termset = kwargs['termset'] - if termset is None: - if attribute is None: - try: - termset = container.termset - except AttributeError: - msg = "Cannot Find TermSet" - raise AttributeError(msg) - else: - termset = container[attribute].termset - if termset is None: - msg = "Cannot Find TermSet" - raise ValueError(msg) - if file is None: file = self._get_file_from_container(container=container) diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py index 1a02be21f..ab16ce6dd 100644 --- a/tests/unit/common/test_resources.py +++ b/tests/unit/common/test_resources.py @@ -339,7 +339,6 @@ def test_add_ref_termset(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') er = HERD() em = HERDManagerContainer() - em.link_resources(er) col1 = VectorData(name='Species_Data', description='species from NCBI and Ensemble', @@ -362,7 +361,6 @@ def test_add_ref_termset_bulk(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') er = HERD() em = HERDManagerContainer() - em.link_resources(er) col1 = VectorData(name='Species_Data', description='species from NCBI and Ensemble', @@ -387,7 +385,6 @@ def test_add_ref_termset_missing_terms(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') er = HERD() em = HERDManagerContainer() - em.link_resources(er) col1 = VectorData(name='Species_Data', description='species from NCBI and Ensemble', From a8d1ecc127ee1cc837c6c9c10894838ecb7decdb Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 31 Oct 2023 11:42:12 -0700 Subject: [PATCH 09/40] path --- docs/gallery/plot_external_resources.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index cc007dc44..1659f8174 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -322,8 +322,11 @@ def __init__(self, **kwargs): # :py:func:`~hdmf.common.resources.HERD.add_ref_termset` has many optional fields, # giving the user a range of control when adding references. Let's see an example. +dir_path = os.path.dirname(os.path.abspath(__file__)) +yaml_file = os.path.join(dir_path, 'example_term_set.yaml') + er = HERD() -terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') +terms = TermSet(term_schema_path=yaml_file) file = HERDManagerContainer(name='file') er.add_ref_termset(file=file, From d2cb2e8896331472716878b062eb0617f79d07b6 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 31 Oct 2023 11:47:20 -0700 Subject: [PATCH 10/40] path --- docs/gallery/plot_external_resources.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index 1659f8174..93d4c4448 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -322,11 +322,8 @@ def __init__(self, **kwargs): # :py:func:`~hdmf.common.resources.HERD.add_ref_termset` has many optional fields, # giving the user a range of control when adding references. Let's see an example. -dir_path = os.path.dirname(os.path.abspath(__file__)) -yaml_file = os.path.join(dir_path, 'example_term_set.yaml') - er = HERD() -terms = TermSet(term_schema_path=yaml_file) +terms = TermSet(term_schema_path='docs/gallery/example_test_term_set.yaml') file = HERDManagerContainer(name='file') er.add_ref_termset(file=file, @@ -348,7 +345,7 @@ def __init__(self, **kwargs): # :py:class:`~hdmf.term_set.TermSet` or remove them from the dataset. er = HERD() -terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') +terms = TermSet(term_schema_path='docs/gallery/example_test_term_set.yaml') file = HERDManagerContainer(name='file') er.add_ref_termset(file=file, From 18e345601988dcba3ec3d841ca7662bcadc70ad0 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 31 Oct 2023 11:53:52 -0700 Subject: [PATCH 11/40] path --- docs/gallery/plot_external_resources.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index 93d4c4448..d22ff9028 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -323,7 +323,7 @@ def __init__(self, **kwargs): # :py:func:`~hdmf.common.resources.HERD.add_ref_termset` has many optional fields, # giving the user a range of control when adding references. Let's see an example. er = HERD() -terms = TermSet(term_schema_path='docs/gallery/example_test_term_set.yaml') +terms = TermSet(term_schema_path='example_term_set.yaml') file = HERDManagerContainer(name='file') er.add_ref_termset(file=file, @@ -345,7 +345,7 @@ def __init__(self, **kwargs): # :py:class:`~hdmf.term_set.TermSet` or remove them from the dataset. er = HERD() -terms = TermSet(term_schema_path='docs/gallery/example_test_term_set.yaml') +terms = TermSet(term_schema_path='example_term_set.yaml') file = HERDManagerContainer(name='file') er.add_ref_termset(file=file, From 2c24e0484105b54f7a51d68e21ebe9349fba27a1 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 31 Oct 2023 13:35:36 -0700 Subject: [PATCH 12/40] cov --- src/hdmf/common/resources.py | 3 +- tests/unit/common/test_resources.py | 46 +++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 4e202629b..657782f64 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -506,7 +506,6 @@ def add_ref_termset(self, **kwargs): if file is None: file = self._get_file_from_container(container=container) - # if key is provided then add_ref proceeds as normal if key is not None: data = [key] @@ -518,7 +517,7 @@ def add_ref_termset(self, **kwargs): data_object = getattr(container, attribute) if isinstance(data_object, (Data, DataIO)): data = data_object.data - elif isinstance(data_object, (list, np.ndarray)): + elif isinstance(data_object, (list, tuple, np.ndarray)): data = data_object missing_terms = [] for term in data: diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py index ab16ce6dd..0eb95ccfd 100644 --- a/tests/unit/common/test_resources.py +++ b/tests/unit/common/test_resources.py @@ -23,6 +23,7 @@ class HERDManagerContainer(Container, HERDManager): def __init__(self, **kwargs): kwargs['name'] = 'HERDManagerContainer' super().__init__(**kwargs) + attr = ['Homo sapiens'] class TestHERD(TestCase): @@ -349,6 +350,7 @@ def test_add_ref_termset(self): er.add_ref_termset(file=em, container=species, attribute='Species_Data', + key='Homo sapiens', termset=terms ) self.assertEqual(er.keys.data, [('Homo sapiens',)]) @@ -356,6 +358,50 @@ def test_add_ref_termset(self): 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset_attribute_none(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + em = HERDManagerContainer() + + col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + er.add_ref_termset(file=em, + container=species['Species_Data'], + key='Homo sapiens', + termset=terms + ) + self.assertEqual(er.keys.data, [('Homo sapiens',)]) + self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) + self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) + + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset_data_object_list(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + em = HERDManagerContainer() + + col1 = VectorData(name='Homo sapiens', + description='species from NCBI and Ensemble', + data=['Homo sapiens']) + + species = DynamicTable(name='species', description='My species', columns=[col1],) + + er.add_ref_termset(file=em, + container=species, + attribute='colnames', + termset=terms + ) + self.assertEqual(er.keys.data, [('Homo sapiens',)]) + self.assertEqual(er.entities.data, [('NCBI_TAXON:9606', + 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) + self.assertEqual(er.objects.data, [(0, species.object_id, 'DynamicTable', 'colnames', '')]) + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") def test_add_ref_termset_bulk(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') From 8043d0f3198425dd34af1a35e69ddf9123090acf Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 31 Oct 2023 14:07:13 -0700 Subject: [PATCH 13/40] cov --- src/hdmf/common/resources.py | 16 +++++++++++----- tests/unit/common/test_resources.py | 17 ++++++----------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 657782f64..f15fbd210 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -674,22 +674,28 @@ def add_ref(self, **kwargs): relative_path=object_field['relative_path'], field=object_field['field']) - # Since object_field is set, we need to check if - # the key has been associated with that object. - # If so, just reuse the key. if add_key: + # Now that object_field is set, we need to check if + # the key has been associated with that object. + # If so, just reuse the key. key_exists = False key_idx_matches = self.keys.which(key=key) for row_idx in self.object_keys.which(objects_idx=object_field.idx): key_idx = self.object_keys['keys_idx', row_idx] + # breakpoint() if key_idx in key_idx_matches: - key_exists = True # Make sure we don't add the key + key_exists = True # Make sure we don't add the key. + # Automatically resolve the key for keys associated with + # the same object. key = self.keys.row[key_idx] + if not key_exists: key = self._add_key(key) if check_object_key: - # check for object-key relationship in ObjectKeyTable + # When using a Key Object, we want to still check for whether the key + # has been used with the Object object. If not, add it to ObjectKeyTable. + # If so, do nothing and add_object_key remains False. key_idx = key.idx object_key_row_idx = self.object_keys.which(keys_idx=key_idx) if len(object_key_row_idx)!=0: diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py index 0eb95ccfd..c726a90c5 100644 --- a/tests/unit/common/test_resources.py +++ b/tests/unit/common/test_resources.py @@ -23,7 +23,6 @@ class HERDManagerContainer(Container, HERDManager): def __init__(self, **kwargs): kwargs['name'] = 'HERDManagerContainer' super().__init__(**kwargs) - attr = ['Homo sapiens'] class TestHERD(TestCase): @@ -372,7 +371,6 @@ def test_add_ref_termset_attribute_none(self): er.add_ref_termset(file=em, container=species['Species_Data'], - key='Homo sapiens', termset=terms ) self.assertEqual(er.keys.data, [('Homo sapiens',)]) @@ -959,24 +957,21 @@ def test_object_key_existing_key_new_object(self): entity_uri='entity_uri2') self.assertEqual(er.object_keys.data, [(0, 0), (1, 0)]) - def test_reuse_key_reuse_entity(self): + def test_reuse_key_string(self): # With the key and entity existing, the EntityKeyTable should not have duplicates er = HERD() - data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], - dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) - - data_2 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], - dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + data_1 = Data(name='data_name', + data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0), ('mouse', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) er.add_ref(file=HERDManagerContainer(name='file'), container=data_1, key='Mus musculus', entity_id='NCBI:txid10090', entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') - existing_key = er.get_key('Mus musculus') er.add_ref(file=HERDManagerContainer(name='file'), - container=data_2, - key=existing_key, + container=data_1, + key='Mus musculus', entity_id='NCBI:txid10090') self.assertEqual(er.entity_keys.data, [(0, 0)]) From a62ad0066d2e5dd4b3b5d78e15988ce3edc694c9 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 31 Oct 2023 14:17:52 -0700 Subject: [PATCH 14/40] doc --- docs/gallery/plot_external_resources.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index d22ff9028..366fee133 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -312,8 +312,8 @@ def __init__(self, **kwargs): # ------------------------------------------------------ # :py:class:`~hdmf.common.resources.HERD` has multiple ways for users to add # external references. The :py:func:`~hdmf.common.resources.HERD.add_ref_termset` -# method allows users to not only validate terms, i.e., keys, but also perform -# bulk populating of the data structure. +# method allows users to not only validate terms, i.e., keys, but also adds the +# ability to iteratively add references for entire datasets, lists, arrays, etc. # The :py:func:`~hdmf.common.resources.HERD.add_ref_container` method is directly # used for populating :py:class:`~hdmf.common.resources.HERD` when writing a file. From 90abebe75bf98f04f196c69fc63174c279e7c179 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 31 Oct 2023 14:26:48 -0700 Subject: [PATCH 15/40] notes' --- src/hdmf/common/resources.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index f15fbd210..ecf574750 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -382,6 +382,7 @@ def _check_object_field(self, **kwargs): if len(objecttable_idx) == 1: return self.objects.row[objecttable_idx[0]] elif len(objecttable_idx) == 0 and create: + # Used for add_ref return {'file_object_id': file_object_id, 'files_idx': files_idx, 'container': container, @@ -527,7 +528,6 @@ def add_ref_termset(self, **kwargs): except ValueError: missing_terms.append(term) continue - # prep for add_ref entity_id = term_info[0] entity_uri = term_info[2] self.add_ref(file=file, @@ -566,6 +566,7 @@ def add_ref(self, **kwargs): container = kwargs['container'] attribute = kwargs['attribute'] if isinstance(container, Data): + # Used when using the TermSetWrapper if attribute == 'data': attribute = None key = kwargs['key'] @@ -580,9 +581,9 @@ def add_ref(self, **kwargs): if file is None: file = self._get_file_from_container(container=container) - ############## - # Validate Key - ############## + ################ + # Set Key Checks + ################ add_key = False add_object_key = False check_object_key = False @@ -595,9 +596,9 @@ def add_ref(self, **kwargs): # in the ObjectKeyTable check_object_key = True - ################# - # Validate Entity - ################# + ################### + # Set Entity Checks + ################### add_entity_key = False add_entity = False @@ -662,10 +663,11 @@ def add_ref(self, **kwargs): relative_path=relative_path, field=field) - ############### - # Populate HERD - ############### + ####################################### + # Validate Parameters and Populate HERD + ####################################### if isinstance(object_field, dict): + # Create the object and file if object_field['files_idx'] is None: self._add_file(object_field['file_object_id']) object_field['files_idx'] = self.files.which(file_object_id=object_field['file_object_id'])[0] From abcbcc7ad60c740e6cdcd7a0c546d7ce9ba9b113 Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Tue, 31 Oct 2023 14:31:14 -0700 Subject: [PATCH 16/40] Update CHANGELOG.md --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6d4b8591..fed60d75b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # HDMF Changelog +## HDMF 3.11.1 (Upcoming) + +### Enhancements +- Added `add_ref_termset`, updated helper methods for `HERD`, and revised `add_ref` to support validations prior to populating the tables. @mavaylon1 [#968](https://github.com/hdmf-dev/hdmf/pull/968) + ## HDMF 3.11.0 (October 30, 2023) ### Enhancements From b5787b84d0a6bd35ffaf0d36cafbb92ca2977924 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 31 Oct 2023 14:38:25 -0700 Subject: [PATCH 17/40] resuse uri --- src/hdmf/common/resources.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index ecf574750..e29f3a49d 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -615,6 +615,10 @@ def add_ref(self, **kwargs): # The entity exists and so we need to check if an entity_key exists # for this entity and key combination. check_entity_key = True + if entity_uri is not None: + entity = entity.entity_uri + msg = 'This entity already exists. Ignoring new entity uri' + warn(msg) ################# # Validate Object @@ -716,10 +720,6 @@ def add_ref(self, **kwargs): self._add_object_key(object_field, key) if check_entity_key: - if entity_uri is not None: - msg = 'This entity already exists. Ignoring new entity uri' - warn(msg) - # check for entity-key relationship in EntityKeyTable key_idx = key.idx entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) From 756f062e0ff155e8f68898a754b200713db9d7b1 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 31 Oct 2023 14:41:17 -0700 Subject: [PATCH 18/40] typo --- src/hdmf/common/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index e29f3a49d..5766c6988 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -616,7 +616,7 @@ def add_ref(self, **kwargs): # for this entity and key combination. check_entity_key = True if entity_uri is not None: - entity = entity.entity_uri + entity_uri = entity.entity_uri msg = 'This entity already exists. Ignoring new entity uri' warn(msg) From ecd01f816d3743e581a23f4f2f3cfc309027e45a Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 31 Oct 2023 14:49:18 -0700 Subject: [PATCH 19/40] edits --- src/hdmf/common/resources.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 5766c6988..cc56cca38 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -686,14 +686,14 @@ def add_ref(self, **kwargs): # If so, just reuse the key. key_exists = False key_idx_matches = self.keys.which(key=key) - for row_idx in self.object_keys.which(objects_idx=object_field.idx): - key_idx = self.object_keys['keys_idx', row_idx] - # breakpoint() - if key_idx in key_idx_matches: - key_exists = True # Make sure we don't add the key. - # Automatically resolve the key for keys associated with - # the same object. - key = self.keys.row[key_idx] + if len(key_idx_matches)!=0: + for row_idx in self.object_keys.which(objects_idx=object_field.idx): + key_idx = self.object_keys['keys_idx', row_idx] + if key_idx in key_idx_matches: + key_exists = True # Make sure we don't add the key. + # Automatically resolve the key for keys associated with + # the same object. + key = self.keys.row[key_idx] if not key_exists: key = self._add_key(key) @@ -702,11 +702,11 @@ def add_ref(self, **kwargs): # When using a Key Object, we want to still check for whether the key # has been used with the Object object. If not, add it to ObjectKeyTable. # If so, do nothing and add_object_key remains False. + obj_key_exists = False key_idx = key.idx object_key_row_idx = self.object_keys.which(keys_idx=key_idx) if len(object_key_row_idx)!=0: # this means there exists rows where the key is in the ObjectKeyTable - obj_key_exists = False for row_idx in object_key_row_idx: obj_idx = self.object_keys['objects_idx', row_idx] if obj_idx == object_field.idx: @@ -721,11 +721,11 @@ def add_ref(self, **kwargs): if check_entity_key: # check for entity-key relationship in EntityKeyTable + entity_key_check = False key_idx = key.idx entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) if len(entity_key_row_idx)!=0: # this means there exists rows where the key is in the EntityKeyTable - entity_key_check = False for row_idx in entity_key_row_idx: entity_idx = self.entity_keys['entities_idx', row_idx] if entity_idx == entity.idx: From 6e75289049fe6599bb154bbe32d372ac69bb8990 Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Tue, 7 Nov 2023 10:29:34 -0800 Subject: [PATCH 20/40] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index feb3b51b0..c16a5b952 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,6 @@ ### Minor Improvements - Updated `__gather_columns` to ignore the order of bases when generating columns from the super class. @mavaylon1 [#991](https://github.com/hdmf-dev/hdmf/pull/991) - ## HDMF 3.11.0 (October 30, 2023) ### Enhancements From 07930e97a10bf4d056ab20ca206b939c946e1692 Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Fri, 8 Dec 2023 14:28:41 -0800 Subject: [PATCH 21/40] Update docs/gallery/plot_external_resources.py Co-authored-by: Ryan Ly --- docs/gallery/plot_external_resources.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index 366fee133..1f6c99b0d 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -312,8 +312,8 @@ def __init__(self, **kwargs): # ------------------------------------------------------ # :py:class:`~hdmf.common.resources.HERD` has multiple ways for users to add # external references. The :py:func:`~hdmf.common.resources.HERD.add_ref_termset` -# method allows users to not only validate terms, i.e., keys, but also adds the -# ability to iteratively add references for entire datasets, lists, arrays, etc. +# method allows users to not only validate terms, i.e., keys, but also +# add references for entire datasets, lists, arrays, etc. # The :py:func:`~hdmf.common.resources.HERD.add_ref_container` method is directly # used for populating :py:class:`~hdmf.common.resources.HERD` when writing a file. From 5c617082f007a57243f863055e7c05fc8251f3dd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Dec 2023 22:28:47 +0000 Subject: [PATCH 22/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/gallery/plot_external_resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index 1f6c99b0d..bc306973b 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -312,7 +312,7 @@ def __init__(self, **kwargs): # ------------------------------------------------------ # :py:class:`~hdmf.common.resources.HERD` has multiple ways for users to add # external references. The :py:func:`~hdmf.common.resources.HERD.add_ref_termset` -# method allows users to not only validate terms, i.e., keys, but also +# method allows users to not only validate terms, i.e., keys, but also # add references for entire datasets, lists, arrays, etc. # The :py:func:`~hdmf.common.resources.HERD.add_ref_container` method is directly From 8a3f5b20ce378786163667f7e66b13c84175cd03 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Fri, 8 Dec 2023 14:36:23 -0800 Subject: [PATCH 23/40] name change --- docs/gallery/plot_external_resources.py | 54 ++++++++++++------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index bc306973b..6a49f4b3f 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -108,7 +108,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) -er = HERD() +herd = HERD() file = HERDManagerContainer(name='file') @@ -124,7 +124,7 @@ def __init__(self, **kwargs): # the underlying data structures accordingly. data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) -er.add_ref( +herd.add_ref( file=file, container=data, key='Homo sapiens', @@ -132,7 +132,7 @@ def __init__(self, **kwargs): entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606' ) -er.add_ref( +herd.add_ref( file=file, container=data, key='Mus musculus', @@ -157,7 +157,7 @@ def __init__(self, **kwargs): genotypes = DynamicTable(name='genotypes', description='My genotypes') genotypes.add_column(name='genotype_name', description="Name of genotypes") genotypes.add_row(id=0, genotype_name='Rorb') -er.add_ref( +herd.add_ref( file=file, container=genotypes, attribute='genotype_name', @@ -167,13 +167,13 @@ def __init__(self, **kwargs): ) # Note: :py:func:`~hdmf.common.resources.HERD.add_ref` internally resolves the object -# to the closest parent, so that ``er.add_ref(container=genotypes, attribute='genotype_name')`` and -# ``er.add_ref(container=genotypes.genotype_name, attribute=None)`` will ultimately both use the ``object_id`` +# to the closest parent, so that ``herd.add_ref(container=genotypes, attribute='genotype_name')`` and +# ``herd.add_ref(container=genotypes.genotype_name, attribute=None)`` will ultimately both use the ``object_id`` # of the ``genotypes.genotype_name`` :py:class:`~hdmf.common.table.VectorData` column and # not the object_id of the genotypes table. ############################################################################### -# Using the add_ref method without the file parameter. +# Using the add_ref method without the file parametherd. # ------------------------------------------------------ # Even though :py:class:`~hdmf.common.resources.File` is required to create/add a new reference, # the user can omit the file parameter if the :py:class:`~hdmf.common.resources.Object` has a file @@ -189,7 +189,7 @@ def __init__(self, **kwargs): species = DynamicTable(name='species', description='My species', columns=[col1]) species.parent = file -er.add_ref( +herd.add_ref( container=species, attribute='Species_Data', key='Ursus arctos horribilis', @@ -204,15 +204,15 @@ def __init__(self, **kwargs): # as separate tables. # `~hdmf.common.resources.HERD` as a flattened table -er.to_dataframe() +herd.to_dataframe() # The individual interlinked tables: -er.files.to_dataframe() -er.objects.to_dataframe() -er.entities.to_dataframe() -er.keys.to_dataframe() -er.object_keys.to_dataframe() -er.entity_keys.to_dataframe() +herd.files.to_dataframe() +herd.objects.to_dataframe() +herd.entities.to_dataframe() +herd.keys.to_dataframe() +herd.object_keys.to_dataframe() +herd.entity_keys.to_dataframe() ############################################################################### # Using the get_key method @@ -225,11 +225,11 @@ def __init__(self, **kwargs): # The :py:func:`~hdmf.common.resources.HERD.get_key` method will be able to return the # :py:class:`~hdmf.common.resources.Key` object if the :py:class:`~hdmf.common.resources.Key` object is unique. -genotype_key_object = er.get_key(key_name='Rorb') +genotype_key_object = herd.get_key(key_name='Rorb') # If the :py:class:`~hdmf.common.resources.Key` object has a duplicate name, then the user will need # to provide the unique (file, container, relative_path, field, key) combination. -species_key_object = er.get_key(file=file, +species_key_object = herd.get_key(file=file, container=species['Species_Data'], key_name='Ursus arctos horribilis') @@ -247,7 +247,7 @@ def __init__(self, **kwargs): # :py:func:`~hdmf.common.resources.HERD.add_ref` method. If a 'key_name' # is used, a new :py:class:`~hdmf.common.resources.Key` will be created. -er.add_ref( +herd.add_ref( file=file, container=genotypes, attribute='genotype_name', @@ -263,7 +263,7 @@ def __init__(self, **kwargs): # allows the user to retrieve all entities and key information associated with an `Object` in # the form of a pandas DataFrame. -er.get_object_entities(file=file, +herd.get_object_entities(file=file, container=genotypes['genotype_name'], relative_path='') @@ -274,7 +274,7 @@ def __init__(self, **kwargs): # allows the user to retrieve all entities and key information associated with an `Object` in # the form of a pandas DataFrame. -er.get_object_type(object_type='Data') +herd.get_object_type(object_type='Data') ############################################################################### # Special Case: Using add_ref with compound data @@ -287,7 +287,7 @@ def __init__(self, **kwargs): # 'x' is using the external reference. # Let's create a new instance of :py:class:`~hdmf.common.resources.HERD`. -er = HERD() +herd = HERD() file = HERDManagerContainer(name='file') data = Data( @@ -298,7 +298,7 @@ def __init__(self, **kwargs): ) ) -er.add_ref( +herd.add_ref( file=file, container=data, field='species', @@ -322,11 +322,11 @@ def __init__(self, **kwargs): # :py:func:`~hdmf.common.resources.HERD.add_ref_termset` has many optional fields, # giving the user a range of control when adding references. Let's see an example. -er = HERD() +herd = HERD() terms = TermSet(term_schema_path='example_term_set.yaml') file = HERDManagerContainer(name='file') -er.add_ref_termset(file=file, +herd.add_ref_termset(file=file, container=species, attribute='Species_Data', key='Ursus arctos horribilis', @@ -344,11 +344,11 @@ def __init__(self, **kwargs): # terms in a dictionary. It is up to the user to either add these terms to the # :py:class:`~hdmf.term_set.TermSet` or remove them from the dataset. -er = HERD() +herd = HERD() terms = TermSet(term_schema_path='example_term_set.yaml') file = HERDManagerContainer(name='file') -er.add_ref_termset(file=file, +herd.add_ref_termset(file=file, container=species, attribute='Species_Data', termset=terms) @@ -360,7 +360,7 @@ def __init__(self, **kwargs): # the individual tables written to tsv. # The user provides the path, which contains the name of the file. -er.to_zip(path='./HERD.zip') +herd.to_zip(path='./HERD.zip') ############################################################################### # Read HERD From f49cfd40932184b0fbae6c0c9ce29430d762f571 Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Fri, 8 Dec 2023 14:38:28 -0800 Subject: [PATCH 24/40] Update src/hdmf/common/resources.py Co-authored-by: Ryan Ly --- src/hdmf/common/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index cc56cca38..950e4621e 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -732,7 +732,7 @@ def add_ref(self, **kwargs): entity_key_check = True # this means there is already a entity-key relationship recorded if not entity_key_check: - # this means that though the key is there, there is not entity-key relationship + # this means that though the key is there, there is no entity-key relationship add_entity_key = True else: # this means that specific key is not in the EntityKeyTable, so add it and establish From fe97bba66a538f3194e5b99d7aea4d92fb8c7ee7 Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Fri, 8 Dec 2023 14:38:41 -0800 Subject: [PATCH 25/40] Update src/hdmf/common/resources.py Co-authored-by: Ryan Ly --- src/hdmf/common/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 950e4621e..e35bf3d61 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -365,7 +365,7 @@ def _check_object_field(self, **kwargs): file_object_id = file.object_id files_idx = self.files.which(file_object_id=file_object_id) - if len(files_idx) > 1: + if len(files_idx) > 1: # pragma: no cover # It isn't possible for len(files_idx) > 1 without the user directly using _add_file raise ValueError("Found multiple instances of the same file.") elif len(files_idx) == 1: From d92873e12451582bd1cd691c4f810180cf96ce4e Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Fri, 8 Dec 2023 14:38:49 -0800 Subject: [PATCH 26/40] Update src/hdmf/common/resources.py Co-authored-by: Ryan Ly --- src/hdmf/common/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index e35bf3d61..7e0e0d922 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -493,7 +493,7 @@ def add_ref_container(self, **kwargs): def add_ref_termset(self, **kwargs): """ This method allows users to take advantage of using the TermSet class to provide the entity information - for add_ref, while also validating the data. This method supports adding a single key and an entire dataset + for add_ref, while also validating the data. This method supports adding a single key or an entire dataset to the HERD tables. For both cases, the term, i.e., key, will be validated against the permissible values in the TermSet. If valid, it will proceed to call add_ref. Otherwise, the method will return a dict of missing terms. From b02807715dbd1eefaf028fcf8838b95297a4060d Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Fri, 8 Dec 2023 14:38:58 -0800 Subject: [PATCH 27/40] Update src/hdmf/common/resources.py Co-authored-by: Ryan Ly --- src/hdmf/common/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 7e0e0d922..147e21668 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -713,7 +713,7 @@ def add_ref(self, **kwargs): obj_key_exists = True # this means there is already a object-key relationship recorded if not obj_key_exists: - # this means that though the key is there, there is not object-key relationship + # this means that though the key is there, there is no object-key relationship add_object_key = True if add_object_key: From b5b36ac6657a9b543a6f66ed982cd0c7ecfb16ec Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Fri, 8 Dec 2023 14:39:10 -0800 Subject: [PATCH 28/40] Update src/hdmf/common/resources.py Co-authored-by: Ryan Ly --- src/hdmf/common/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 147e21668..e0632254a 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -496,7 +496,7 @@ def add_ref_termset(self, **kwargs): for add_ref, while also validating the data. This method supports adding a single key or an entire dataset to the HERD tables. For both cases, the term, i.e., key, will be validated against the permissible values in the TermSet. If valid, it will proceed to call add_ref. Otherwise, the method will return a dict of - missing terms. + missing terms (terms not found in the TermSet). """ file = kwargs['file'] container = kwargs['container'] From 7a636de589f3c1b0d048423ed57d32e3107ce0ea Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 11 Dec 2023 17:29:50 -0800 Subject: [PATCH 29/40] doc --- docs/gallery/plot_external_resources.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index 6a49f4b3f..c8ef9ea93 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -310,15 +310,10 @@ def __init__(self, **kwargs): ############################################################################### # Using add_ref_termset # ------------------------------------------------------ -# :py:class:`~hdmf.common.resources.HERD` has multiple ways for users to add -# external references. The :py:func:`~hdmf.common.resources.HERD.add_ref_termset` +# The :py:func:`~hdmf.common.resources.HERD.add_ref_termset` # method allows users to not only validate terms, i.e., keys, but also -# add references for entire datasets, lists, arrays, etc. - -# The :py:func:`~hdmf.common.resources.HERD.add_ref_container` method is directly -# used for populating :py:class:`~hdmf.common.resources.HERD` when writing a file. -# :py:func:`~hdmf.common.resources.HERD.add_ref_termset` can be used for new files; -# however, it is also the best practice when adding references for existing files. +# add references for an entire datasets, rather than single entries as we saw +# prior with :py:func:`~hdmf.common.resources.HERD.add_ref`. # :py:func:`~hdmf.common.resources.HERD.add_ref_termset` has many optional fields, # giving the user a range of control when adding references. Let's see an example. From a1ffa34c19088582d4c613d76b6b8c6e80af0380 Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Mon, 11 Dec 2023 17:30:08 -0800 Subject: [PATCH 30/40] Update src/hdmf/common/resources.py Co-authored-by: Ryan Ly --- src/hdmf/common/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index e0632254a..f7e7303b1 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -390,7 +390,7 @@ def _check_object_field(self, **kwargs): 'field': field} elif len(objecttable_idx) == 0 and not create: raise ValueError("Object not in Object Table.") - else: + else: # pragma: no cover # It isn't possible for this to happen unless the user used _add_object. raise ValueError("Found multiple instances of the same object id, relative path, " "and field in objects table.") From d78f335a58330a6d5d7492a45c30098ebf615c00 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 11 Dec 2023 19:24:06 -0800 Subject: [PATCH 31/40] checks --- CHANGELOG.md | 3 ++- src/hdmf/common/resources.py | 8 ++++++++ tests/unit/common/test_resources.py | 32 +++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b665fbee2..a29562da1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,8 @@ ## HDMF 3.11.1 (Upcoming) ### Enhancements -- Added `add_ref_termset`, updated helper methods for `HERD`, and revised `add_ref` to support validations prior to populating the tables. @mavaylon1 [#968](https://github.com/hdmf-dev/hdmf/pull/968) +- Added `add_ref_termset`, updated helper methods for `HERD`, revised `add_ref` to support validations prior to populating the tables + and added `add_ref_container`. @mavaylon1 [#968](https://github.com/hdmf-dev/hdmf/pull/968) ### Minor Improvements - Updated `__gather_columns` to ignore the order of bases when generating columns from the super class. @mavaylon1 [#991](https://github.com/hdmf-dev/hdmf/pull/991) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index e0632254a..5345b10d4 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -520,6 +520,9 @@ def add_ref_termset(self, **kwargs): data = data_object.data elif isinstance(data_object, (list, tuple, np.ndarray)): data = data_object + else: + msg = "The data object being used is not supported. Please review the documentation for support types." + raise ValueError(msg) missing_terms = [] for term in data: # check the data according to the permissible_values @@ -580,6 +583,11 @@ def add_ref(self, **kwargs): ################## if file is None: file = self._get_file_from_container(container=container) + else: + file_from_container = self._get_file_from_container(container=container) + if file.object_id != file_from_container.object_id: + msg = "The file given does not match the file in which the container is stored." + raise ValueError(msg) ################ # Set Key Checks diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py index c726a90c5..d9475f4ff 100644 --- a/tests/unit/common/test_resources.py +++ b/tests/unit/common/test_resources.py @@ -268,6 +268,21 @@ def test_add_ref_search_for_file_error(self): entity_id='entity_id1', entity_uri='entity1') + def test_add_ref_file_mismatch(self): + file = HERDManagerContainer(name='file') + file2 = HERDManagerContainer() + + + nested_child = Container(name='nested_child') + child = Container(name='child') + nested_child.parent = child + child.parent = file + + er = HERD() + with self.assertRaises(ValueError): + er.add_ref(file=file2, container=nested_child, key='key1', + entity_id='entity_id1', entity_uri='entity1') + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") def test_check_termset_wrapper(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') @@ -357,6 +372,23 @@ def test_add_ref_termset(self): 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')]) self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')]) + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + def test_add_ref_termset_data_object_error(self): + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + er = HERD() + em = HERDManagerContainer() + + col1 = VectorData(name='Species_Data', + description='species from NCBI and Ensemble', + data=['Homo sapiens']) + + with self.assertRaises(ValueError): + er.add_ref_termset( + container=col1, + attribute='description', + termset=terms + ) + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") def test_add_ref_termset_attribute_none(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') From ae6494b317d7ba94ecc50b5c85dd7dce0b899cb0 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 11 Dec 2023 19:25:24 -0800 Subject: [PATCH 32/40] ruff --- tests/unit/common/test_resources.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py index 5141e3d35..330168556 100644 --- a/tests/unit/common/test_resources.py +++ b/tests/unit/common/test_resources.py @@ -376,7 +376,6 @@ def test_add_ref_termset(self): def test_add_ref_termset_data_object_error(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') er = HERD() - em = HERDManagerContainer() col1 = VectorData(name='Species_Data', description='species from NCBI and Ensemble', From 12132dcaad8dc9eda90d44b89c7e7fe167427d3d Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 11 Dec 2023 19:36:10 -0800 Subject: [PATCH 33/40] gallery --- docs/gallery/plot_external_resources.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index c8ef9ea93..9b6a2ca8b 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -124,6 +124,7 @@ def __init__(self, **kwargs): # the underlying data structures accordingly. data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) +data.parent = file herd.add_ref( file=file, container=data, @@ -157,6 +158,7 @@ def __init__(self, **kwargs): genotypes = DynamicTable(name='genotypes', description='My genotypes') genotypes.add_column(name='genotype_name', description="Name of genotypes") genotypes.add_row(id=0, genotype_name='Rorb') +genotypes.parent = file herd.add_ref( file=file, container=genotypes, @@ -288,7 +290,6 @@ def __init__(self, **kwargs): # Let's create a new instance of :py:class:`~hdmf.common.resources.HERD`. herd = HERD() -file = HERDManagerContainer(name='file') data = Data( name='data_name', @@ -297,6 +298,7 @@ def __init__(self, **kwargs): dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')] ) ) +data.parent = file herd.add_ref( file=file, @@ -318,8 +320,7 @@ def __init__(self, **kwargs): # :py:func:`~hdmf.common.resources.HERD.add_ref_termset` has many optional fields, # giving the user a range of control when adding references. Let's see an example. herd = HERD() -terms = TermSet(term_schema_path='example_term_set.yaml') -file = HERDManagerContainer(name='file') +terms = TermSet(term_schema_path='docs/gallery/example_term_set.yaml') herd.add_ref_termset(file=file, container=species, @@ -340,8 +341,7 @@ def __init__(self, **kwargs): # :py:class:`~hdmf.term_set.TermSet` or remove them from the dataset. herd = HERD() -terms = TermSet(term_schema_path='example_term_set.yaml') -file = HERDManagerContainer(name='file') +terms = TermSet(term_schema_path='docs/gallery/example_term_set.yaml') herd.add_ref_termset(file=file, container=species, From 82d6ed40189e4bd7e8a29a8dab08afe899534f81 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Mon, 11 Dec 2023 19:48:26 -0800 Subject: [PATCH 34/40] path --- docs/gallery/plot_external_resources.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index 9b6a2ca8b..55eb49a99 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -100,6 +100,13 @@ import warnings warnings.filterwarnings("ignore", category=UserWarning, message="HERD is experimental*") +try: + dir_path = os.path.dirname(os.path.abspath(__file__)) + yaml_file = os.path.join(dir_path, 'example_term_set.yaml') +except NameError: + dir_path = os.path.dirname(os.path.abspath('.')) + yaml_file = os.path.join(dir_path, 'gallery/example_term_set.yaml') + # Class to represent a file class HERDManagerContainer(Container, HERDManager): @@ -320,7 +327,7 @@ def __init__(self, **kwargs): # :py:func:`~hdmf.common.resources.HERD.add_ref_termset` has many optional fields, # giving the user a range of control when adding references. Let's see an example. herd = HERD() -terms = TermSet(term_schema_path='docs/gallery/example_term_set.yaml') +terms = TermSet(term_schema_path=yaml_file) herd.add_ref_termset(file=file, container=species, @@ -341,7 +348,7 @@ def __init__(self, **kwargs): # :py:class:`~hdmf.term_set.TermSet` or remove them from the dataset. herd = HERD() -terms = TermSet(term_schema_path='docs/gallery/example_term_set.yaml') +terms = TermSet(term_schema_path=yaml_file) herd.add_ref_termset(file=file, container=species, From fd72d73daba3e55a94619304410985f44fb53e46 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 12 Dec 2023 10:35:55 -0800 Subject: [PATCH 35/40] todo --- src/hdmf/common/resources.py | 11 ++++++----- tests/unit/common/test_resources.py | 28 ++++++++++++++-------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 94d67c8da..4ac863fc2 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -583,11 +583,12 @@ def add_ref(self, **kwargs): ################## if file is None: file = self._get_file_from_container(container=container) - else: - file_from_container = self._get_file_from_container(container=container) - if file.object_id != file_from_container.object_id: - msg = "The file given does not match the file in which the container is stored." - raise ValueError(msg) + # TODO: Add this once you've created a HDMF_file to rework testing + # else: + # file_from_container = self._get_file_from_container(container=container) + # if file.object_id != file_from_container.object_id: + # msg = "The file given does not match the file in which the container is stored." + # raise ValueError(msg) ################ # Set Key Checks diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py index 330168556..627db3ee0 100644 --- a/tests/unit/common/test_resources.py +++ b/tests/unit/common/test_resources.py @@ -268,20 +268,20 @@ def test_add_ref_search_for_file_error(self): entity_id='entity_id1', entity_uri='entity1') - def test_add_ref_file_mismatch(self): - file = HERDManagerContainer(name='file') - file2 = HERDManagerContainer() - - - nested_child = Container(name='nested_child') - child = Container(name='child') - nested_child.parent = child - child.parent = file - - er = HERD() - with self.assertRaises(ValueError): - er.add_ref(file=file2, container=nested_child, key='key1', - entity_id='entity_id1', entity_uri='entity1') + # TODO: Add this once you've created a HDMF_file to rework testing + # def test_add_ref_file_mismatch(self): + # file = HERDManagerContainer(name='file') + # file2 = HERDManagerContainer() + # + # nested_child = Container(name='nested_child') + # child = Container(name='child') + # nested_child.parent = child + # child.parent = file + # + # er = HERD() + # with self.assertRaises(ValueError): + # er.add_ref(file=file2, container=nested_child, key='key1', + # entity_id='entity_id1', entity_uri='entity1') @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") def test_check_termset_wrapper(self): From e688be0f5fcede5a1246a74a0e7f37e62ef4e21f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 12 Dec 2023 18:37:54 +0000 Subject: [PATCH 36/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/unit/common/test_resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py index 627db3ee0..8cbd8291e 100644 --- a/tests/unit/common/test_resources.py +++ b/tests/unit/common/test_resources.py @@ -268,7 +268,7 @@ def test_add_ref_search_for_file_error(self): entity_id='entity_id1', entity_uri='entity1') - # TODO: Add this once you've created a HDMF_file to rework testing + # TODO: Add this once you've created a HDMF_file to rework testing # def test_add_ref_file_mismatch(self): # file = HERDManagerContainer(name='file') # file2 = HERDManagerContainer() From 9042dfa74e6bcc5b8b1617e366f6955fb4e7cad5 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Wed, 13 Dec 2023 13:35:34 -0800 Subject: [PATCH 37/40] Update docs/gallery/plot_external_resources.py --- docs/gallery/plot_external_resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index 55eb49a99..21131cc92 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -182,7 +182,7 @@ def __init__(self, **kwargs): # not the object_id of the genotypes table. ############################################################################### -# Using the add_ref method without the file parametherd. +# Using the add_ref method without the file parameter. # ------------------------------------------------------ # Even though :py:class:`~hdmf.common.resources.File` is required to create/add a new reference, # the user can omit the file parameter if the :py:class:`~hdmf.common.resources.Object` has a file From 4477bea8dc2c49ea21fb2bf4a0aab8df21d3ad64 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Wed, 13 Dec 2023 13:37:11 -0800 Subject: [PATCH 38/40] Update src/hdmf/common/resources.py --- src/hdmf/common/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 4ac863fc2..a4c8fca78 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -521,7 +521,7 @@ def add_ref_termset(self, **kwargs): elif isinstance(data_object, (list, tuple, np.ndarray)): data = data_object else: - msg = "The data object being used is not supported. Please review the documentation for support types." + msg = "The data object being used is not supported. Please review the documentation for supported types." raise ValueError(msg) missing_terms = [] for term in data: From d441b1f4f0f78ff2b75a91b200689c4f19e9590e Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Wed, 13 Dec 2023 13:41:05 -0800 Subject: [PATCH 39/40] Update docs/gallery/plot_external_resources.py --- docs/gallery/plot_external_resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index 21131cc92..5bf8dd5d8 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -343,7 +343,7 @@ def __init__(self, **kwargs): # When populating :py:class:`~hdmf.common.resources.HERD`, users may have some terms # that are not in the :py:class:`~hdmf.term_set.TermSet`. As a result, -# :py:func:`~hdmf.common.resources.HERD.add_ref_termset` will return an all the missing +# :py:func:`~hdmf.common.resources.HERD.add_ref_termset` will return all of the missing # terms in a dictionary. It is up to the user to either add these terms to the # :py:class:`~hdmf.term_set.TermSet` or remove them from the dataset. From 25f5afb93968abc82ec613a27e5752638be382dc Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Wed, 13 Dec 2023 13:43:07 -0800 Subject: [PATCH 40/40] Update resources.py --- src/hdmf/common/resources.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index a4c8fca78..f9738c998 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -521,7 +521,8 @@ def add_ref_termset(self, **kwargs): elif isinstance(data_object, (list, tuple, np.ndarray)): data = data_object else: - msg = "The data object being used is not supported. Please review the documentation for supported types." + msg = ("The data object being used is not supported. " + "Please review the documentation for supported types.") raise ValueError(msg) missing_terms = [] for term in data: