Merge pull request #788 from hed-standard/develop

Merge develop with master in preparation for release
hed-standard · Oct 27, 2023 · 6557bdf · 6557bdf
2 parents 4eaca61 + c6bfcf1
commit 6557bdf
Show file tree

Hide file tree

Showing 13 changed files with 307 additions and 109 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,19 @@
+Release 0.4.0 October 27, 2023
+- Refactored the model classes to be based on DataFrame.
+- Added additional command line options for remodeling tools.
+- Restructured summaries for better reporting.
+- Minor refactoring to reduce code complexity.
+- Finalized and automated SPEC tests.
+- Improvements to GitHub automation -- including adding CodeSpell.
+- Improvements to API-Docs.
+
 Release 0.3.1 July 3, 2023
 - Pinned the versions of the pydantic and inflect libraries due to a conflict.
 - Reorganized JSON output of remodeling summaries so that all are of consistent form.
 - Fixed summarize_hed_tags_op so that tags were correctly categorized for output.
 - Minor refactoring to reduce code complexity.
 - BaseInput and Sidecar now raise HedFileError if input could not be read.
 
-
 Release 0.3.0 June 20, 2023
 - Introduction of partnered schema.
 - Improved error handling for schema validation.

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -24,8 +24,8 @@
 author = 'HED Working Group'
 
 # The full version, including alpha/beta/rc tags
-version = '0.3.1'
-release = '0.3.1'
+version = '0.4.0'
+release = '0.4.0'
 
 currentdir = os.path.realpath(os.path.dirname(__file__))
 
@@ -89,7 +89,7 @@
     # Toc options
     'collapse_navigation': False,
     'sticky_navigation': True,
-    'navigation_depth': 4,
+    'navigation_depth': 7,
     'includehidden': True,
     'titles_only': False
 }

diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py
@@ -321,7 +321,7 @@ def def_error_no_takes_value(def_name, placeholder_tag):
 
 @hed_tag_error(DefinitionErrors.BAD_PROP_IN_DEFINITION, actual_code=ValidationErrors.DEFINITION_INVALID)
 def def_error_no_takes_value(tag, def_name):
-    return f"Tag '{str(tag)}' in Definition '{def_name}' has has a tag with the unique or required attribute."
+    return f"Tag '{str(tag)}' in Definition '{def_name}' has has a the unique or required attribute."
 
 
 @hed_tag_error(DefinitionErrors.BAD_DEFINITION_LOCATION, actual_code=ValidationErrors.DEFINITION_INVALID)

diff --git a/hed/models/column_metadata.py b/hed/models/column_metadata.py
@@ -1,6 +1,7 @@
 from enum import Enum
 from hed.errors.error_types import SidecarErrors
 import pandas as pd
+import copy
 
 
 class ColumnType(Enum):
@@ -102,13 +103,15 @@ def set_hed_strings(self, new_strings):
         return True
 
     @staticmethod
-    def _detect_column_type(dict_for_entry):
+    def _detect_column_type(dict_for_entry, basic_validation=True):
         """ Determine the ColumnType of a given json entry.
 
         Parameters:
             dict_for_entry (dict): The loaded json entry a specific column.
                 Generally has a "HED" entry among other optional ones.
-
+            basic_validation (bool): If False, only verifies that "HED" exists and has a supported type.
+                                     This is used to issue more precise errors for entries that are normally
+                                     silently ignored, without crashing.
         Returns:
             ColumnType: The determined type of given column.  Returns None if unknown.
 
@@ -122,14 +125,14 @@ def _detect_column_type(dict_for_entry):
 
         hed_entry = dict_for_entry["HED"]
         if isinstance(hed_entry, dict):
-            if not all(isinstance(entry, str) for entry in hed_entry.values()):
+            if basic_validation and not all(isinstance(entry, str) for entry in hed_entry.values()):
                 return None
             return ColumnType.Categorical
 
         if not isinstance(hed_entry, str):
             return None
 
-        if "#" not in dict_for_entry["HED"]:
+        if basic_validation and "#" not in dict_for_entry["HED"]:
             return None
 
         return ColumnType.Value
@@ -155,3 +158,10 @@ def expected_pound_sign_count(column_type):
         else:
             return 0, None
         return expected_count, error_type
+
+    def _get_unvalidated_data(self):
+        """Returns a copy with less preliminary validation done (such as verifying all data types)."""
+        return_copy = copy.deepcopy(self)
+        return_copy.column_type = ColumnMetadata._detect_column_type(dict_for_entry=return_copy.source_dict,
+                                                                     basic_validation=False)
+        return return_copy
diff --git a/hed/models/df_util.py b/hed/models/df_util.py
@@ -120,26 +120,6 @@ def expand_defs(df, hed_schema, def_dict, columns=None):
             df.loc[mask, column] = df.loc[mask, column].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict))
 
 
-def sort_strings(df, hed_schema, tag_form="short_tag", columns=None):
-    """ Expands any def tags found in the dataframe.
-
-        Converts in place
-
-    Parameters:
-        df (pd.Dataframe or pd.Series): The dataframe or series to modify
-        hed_schema (HedSchema or None): The schema to use to identify defs
-        columns (list or None): The columns to modify on the dataframe
-    """
-    if isinstance(df, pd.Series):
-        df[:] = df.apply(partial(_sort, hed_schema=hed_schema, tag_form=tag_form))
-    else:
-        if columns is None:
-            columns = df.columns
-
-        for column in columns:
-            df.loc[column] = df.loc[column].apply(partial(_sort, hed_schema=hed_schema, tag_form=tag_form))
-
-
 def _convert_to_form(hed_string, hed_schema, tag_form):
     return str(HedString(hed_string, hed_schema).get_as_form(tag_form))
 
@@ -152,12 +132,6 @@ def _expand_defs(hed_string, hed_schema, def_dict):
     return str(HedString(hed_string, hed_schema, def_dict).expand_defs())
 
 
-def _sort(hed_string, hed_schema, tag_form):
-    sorted_string = HedString(hed_string, hed_schema)
-    sorted_string.sort()
-    return sorted_string.get_as_form(tag_form)
-
-
 def process_def_expands(hed_strings, hed_schema, known_defs=None, ambiguous_defs=None):
     """ Gather def-expand tags in the strings/compare with known definitions to find any differences
 

diff --git a/hed/models/indexed_df.py b/hed/models/indexed_df.py
diff --git a/hed/schema/schema_attribute_validators.py b/hed/schema/schema_attribute_validators.py
@@ -209,7 +209,7 @@ def in_library_check(hed_schema, tag_entry, attribute_name):
 
     library = tag_entry.attributes.get(attribute_name, "")
     if hed_schema.library != library:
-        issues += ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_ALLOWED_CHARACTERS_INVALID,
+        issues += ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_IN_LIBRARY_INVALID,
                                             tag_entry.name,
                                             library)
     return issues
diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py
@@ -59,6 +59,7 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None)
         definition_checks = {}
         for column_data in sidecar:
             column_name = column_data.column_name
+            column_data = column_data._get_unvalidated_data()
             hed_strings = column_data.get_hed_strings()
             error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name)
             for key_name, hed_string in hed_strings.items():
@@ -180,20 +181,28 @@ def _find_non_matching_braces(hed_string):
 
     @staticmethod
     def _check_for_key(key, data):
+        # Probably can be cleaned up more -> Return True if any data or subdata is key
         if isinstance(data, dict):
-            if key in data:
-                return bool(data[key])
-            else:
-                for sub_data in data.values():
-                    result = SidecarValidator._check_for_key(key, sub_data)
-                    if result is not None:
-                        return result
+            return SidecarValidator._check_dict(key, data)
         elif isinstance(data, list):
-            for sub_data in data:
-                result = SidecarValidator._check_for_key(key, sub_data)
-                if result is not None:
-                    return result
-        return None
+            return SidecarValidator._check_list(key, data)
+        return False
+
+    @staticmethod
+    def _check_dict(key, data_dict):
+        if key in data_dict:
+            return True
+        for sub_data in data_dict.values():
+            if SidecarValidator._check_for_key(key, sub_data):
+                return True
+        return False
+
+    @staticmethod
+    def _check_list(key, data_list):
+        for sub_data in data_list:
+            if SidecarValidator._check_for_key(key, sub_data):
+                return True
+        return False
 
     def _validate_column_structure(self, column_name, dict_for_entry, error_handler):
         """ Checks primarily for type errors such as expecting a string and getting a list in a json sidecar.
@@ -210,7 +219,7 @@ def _validate_column_structure(self, column_name, dict_for_entry, error_handler)
             val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED_COLUMN)
             return val_issues
 
-        column_type = ColumnMetadata._detect_column_type(dict_for_entry=dict_for_entry)
+        column_type = ColumnMetadata._detect_column_type(dict_for_entry=dict_for_entry, basic_validation=False)
         if column_type is None:
             val_issues += error_handler.format_error_with_context(SidecarErrors.UNKNOWN_COLUMN_TYPE,
                                                                   column_name=column_name)
@@ -219,25 +228,27 @@ def _validate_column_structure(self, column_name, dict_for_entry, error_handler)
             if found_hed:
                 val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED)
         elif column_type == ColumnType.Categorical:
-            raw_hed_dict = dict_for_entry["HED"]
-            if not raw_hed_dict:
+            val_issues += self._validate_categorical_column(column_name, dict_for_entry, error_handler)
+
+        return val_issues
+
+    def _validate_categorical_column(self, column_name, dict_for_entry, error_handler):
+        """Validates a categorical column in a json sidecar."""
+        val_issues = []
+        raw_hed_dict = dict_for_entry["HED"]
+        if not raw_hed_dict:
+            val_issues += error_handler.format_error_with_context(SidecarErrors.BLANK_HED_STRING)
+        for key_name, hed_string in raw_hed_dict.items():
+            error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name)
+            if not hed_string:
                 val_issues += error_handler.format_error_with_context(SidecarErrors.BLANK_HED_STRING)
-            if not isinstance(raw_hed_dict, dict):
+            elif not isinstance(hed_string, str):
                 val_issues += error_handler.format_error_with_context(SidecarErrors.WRONG_HED_DATA_TYPE,
-                                                                      given_type=type(raw_hed_dict),
-                                                                      expected_type="dict")
-            for key_name, hed_string in raw_hed_dict.items():
-                error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name)
-                if not isinstance(hed_string, str):
-                    val_issues += error_handler.format_error_with_context(SidecarErrors.WRONG_HED_DATA_TYPE,
-                                                                          given_type=type(hed_string),
-                                                                          expected_type="str")
-                if not hed_string:
-                    val_issues += error_handler.format_error_with_context(SidecarErrors.BLANK_HED_STRING)
-                if key_name in self.reserved_category_values:
-                    val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_NA_USED, column_name)
-                error_handler.pop_error_context()
-
+                                                                      given_type=type(hed_string),
+                                                                      expected_type="str")
+            elif key_name in self.reserved_category_values:
+                val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_NA_USED, column_name)
+            error_handler.pop_error_context()
         return val_issues
 
     def _validate_pound_sign_count(self, hed_string, column_type):

diff --git a/tests/models/test_sidecar.py b/tests/models/test_sidecar.py
@@ -94,7 +94,7 @@ def test__iter__(self):
 
     def test_validate_column_group(self):
         validation_issues = self.errors_sidecar.validate(self.hed_schema)
-        self.assertEqual(len(validation_issues), 5)
+        self.assertEqual(len(validation_issues), 4)
 
         validation_issues2 = self.errors_sidecar_minor.validate(self.hed_schema)
         self.assertEqual(len(validation_issues2), 1)