Merge pull request #855 from IanCa/develop

Allow validation of files with out of order onsets.
hed-standard · Feb 7, 2024 · d6f3b73 · d6f3b73
2 parents f1e4749 + 3df2dab
commit d6f3b73
Show file tree

Hide file tree

Showing 21 changed files with 687 additions and 992 deletions.
diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py
@@ -60,6 +60,12 @@ def val_error_CURLY_BRACE_UNSUPPORTED_HERE(tag, problem_tag):
     return (f"Curly braces are only permitted in sidecars, fully wrapping text in place of a tag.  "
             f"Invalid character '{problem_tag}' in tag '{tag}'")
 
+
+@hed_error(ValidationErrors.ONSETS_OUT_OF_ORDER, default_severity=ErrorSeverity.WARNING)
+def val_error_ONSETS_OUT_OF_ORDER():
+    return "Onsets need to be temporally increasing for most downstream tools to work."
+
+
 @hed_error(ValidationErrors.COMMA_MISSING)
 def val_error_comma_missing(tag):
     return f"Comma missing after - '{tag}'"

diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py
@@ -91,7 +91,7 @@ class ValidationErrors:
     INVALID_TAG_CHARACTER = 'invalidTagCharacter'
 
     CURLY_BRACE_UNSUPPORTED_HERE = "CURLY_BRACE_UNSUPPORTED_HERE"
-
+    ONSETS_OUT_OF_ORDER = "ONSETS_OUT_OF_ORDER"
 
 
 class SidecarErrors:

diff --git a/hed/models/__init__.py b/hed/models/__init__.py
@@ -5,7 +5,7 @@
 from .column_metadata import ColumnMetadata, ColumnType
 from .definition_dict import DefinitionDict
 from .definition_entry import DefinitionEntry
-from .expression_parser import QueryParser
+from .query_handler import QueryHandler
 from .hed_group import HedGroup
 from .spreadsheet_input import SpreadsheetInput
 from .hed_string import HedString

diff --git a/hed/models/base_input.py b/hed/models/base_input.py
@@ -157,6 +157,14 @@ def onsets(self):
         if "onset" in self.columns:
             return self._dataframe["onset"]
 
+    @property
+    def needs_sorting(self):
+        """Return True if the onset column is not in increasing order (None if there is no onset column)."""
+        onsets = self.onsets
+        if onsets is not None:
+            onsets = onsets.astype(float)
+            return not onsets.is_monotonic_increasing
+
     @property
     def name(self):
         """ Name of the data. """

diff --git a/hed/models/df_util.py b/hed/models/df_util.py
@@ -7,8 +7,7 @@
 from hed.models.definition_dict import DefinitionDict
 
 
-def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_columns=True,
-                  shrink_defs=False, expand_defs=True):
+def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, shrink_defs=False, expand_defs=True):
     """ Create an array of assembled HedString objects (or list of these) of the same length as tabular file with.
 
     Args:
@@ -20,8 +19,6 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_
             If str, will attempt to load as a version if it doesn't have a valid extension.
         extra_def_dicts: list of DefinitionDict, optional
             Any extra DefinitionDict objects to use when parsing the HED tags.
-        join_columns: bool
-            If True, join all HED columns into one.
         shrink_defs: bool
             Shrink any def-expand tags found
         expand_defs: bool
@@ -41,19 +38,12 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_
     if sidecar:
         def_dict = sidecar.get_def_dict(hed_schema=hed_schema, extra_def_dicts=extra_def_dicts)
 
-    if join_columns:
-        if expand_defs:
-            return [HedString(x, hed_schema, def_dict).expand_defs() for x in tabular_file.series_a], def_dict
-        elif shrink_defs:
-            return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict
-        else:
-            return [HedString(x, hed_schema, def_dict) for x in tabular_file.series_a], def_dict
+    if expand_defs:
+        return [HedString(x, hed_schema, def_dict).expand_defs() for x in tabular_file.series_a], def_dict
+    elif shrink_defs:
+        return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict
     else:
-        return [[HedString(x, hed_schema, def_dict).expand_defs() if expand_defs
-                 else HedString(x, hed_schema, def_dict).shrink_defs() if shrink_defs
-                 else HedString(x, hed_schema, def_dict)
-                 for x in text_file_row] for text_file_row in tabular_file.dataframe_a.itertuples(index=False)], \
-               def_dict
+        return [HedString(x, hed_schema, def_dict) for x in tabular_file.series_a], def_dict
 
 
 def convert_to_form(df, hed_schema, tag_form, columns=None):
@@ -151,3 +141,22 @@ def process_def_expands(hed_strings, hed_schema, known_defs=None, ambiguous_defs
     from hed.models.def_expand_gather import DefExpandGatherer
     def_gatherer = DefExpandGatherer(hed_schema, known_defs, ambiguous_defs)
     return def_gatherer.process_def_expands(hed_strings)
+
+
+def sort_dataframe_by_onsets(df):
+    """ Sort a dataframe by its onset column, comparing the onsets as floats.
+
+    Parameters:
+        df(pd.DataFrame): Dataframe to sort
+    Returns:
+        The sorted dataframe, or the original dataframe if it didn't have an onset column.
+    """
+    if "onset" in df.columns:
+        # Sort a copy using the onsets converted to floats, leaving the original onset values unchanged.
+        df_copy = df.copy()
+        df_copy['_temp_onset_sort'] = df_copy['onset'].astype(float)
+        df_copy.sort_values(by='_temp_onset_sort', inplace=True)
+        df_copy.drop(columns=['_temp_onset_sort'], inplace=True)
+
+        return df_copy
+    return df