forked from hed-standard/hed-python
Commit
Add annotationExternal to omn generation
Change how schemas are sorted (so library nodes always go to the top). Move a bunch of ontology_util to df_util.
Showing 21 changed files with 19,036 additions and 18,888 deletions.
@@ -1 +1 @@
-from hed.schema.schema_io.ontology_util import save_dataframes, load_dataframes
+from hed.schema.schema_io.df_util import save_dataframes, load_dataframes
@@ -0,0 +1,185 @@
import csv
import os

import pandas as pd

from hed.errors import HedFileError, HedExceptions
from hed.schema import hed_schema_df_constants as constants
from hed.schema.hed_schema_constants import HedKey
from hed.schema.hed_cache import get_library_data
from hed.schema.schema_io.text_util import parse_attribute_string, _parse_header_attributes_line

UNKNOWN_LIBRARY_VALUE = 0

def save_dataframes(base_filename, dataframe_dict):
    """ Write out the dataframes using the provided suffixes.

    Does not validate contents or suffixes.

    If base_filename has a .tsv suffix, save directly to the indicated location.
    If base_filename is a directory (does NOT have a .tsv suffix), save the contents into a directory of that name.
    The subfiles are named the same, e.g. HED8.3.0/HED8.3.0_Tag.tsv.

    Parameters:
        base_filename (str): The base filename to use.  Output is {base_filename}_{suffix}.tsv.
            See DF_SUFFIXES for all expected names.
        dataframe_dict (dict of str: pd.DataFrame): The dataframes to save out.  No validation is done.
    """
    if base_filename.lower().endswith(".tsv"):
        base, base_ext = os.path.splitext(base_filename)
        base_dir, base_name = os.path.split(base)
    else:
        # Assumed to be a directory name
        base_dir = base_filename
        base_filename = os.path.split(base_dir)[1]
        base = os.path.join(base_dir, base_filename)
        os.makedirs(base_dir, exist_ok=True)
    for suffix, dataframe in dataframe_dict.items():
        filename = f"{base}_{suffix}.tsv"
        with open(filename, mode='w', encoding='utf-8') as opened_file:
            dataframe.to_csv(opened_file, sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE,
                             lineterminator="\n")

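# Usage sketch (hypothetical paths; "dfs" stands for a suffix -> DataFrame
# dict such as the one returned by load_dataframes below):
#   save_dataframes("schemas/HED8.3.0.tsv", dfs)  # -> schemas/HED8.3.0_Tag.tsv, ...
#   save_dataframes("schemas/HED8.3.0", dfs)      # -> schemas/HED8.3.0/HED8.3.0_Tag.tsv, ...
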
def convert_filenames_to_dict(filenames, include_prefix_dfs=False):
    """ Infer filename meaning based on suffix, e.g. _Tag for the tags sheet.

    Parameters:
        filenames (str or None or list or dict): The filenames to convert to a dict.
            If a string with a .tsv suffix: use that location, adding the suffix to each .tsv file.
            If a string with no .tsv suffix: use that folder, with the contents being the separate .tsv files.
        include_prefix_dfs (bool): If True, include the prefixes and external annotation dataframes.

    Returns:
        filename_dict (str: str): The required suffix to filename mapping.
    """
    result_filenames = {}
    dataframe_names = constants.DF_SUFFIXES_OMN if include_prefix_dfs else constants.DF_SUFFIXES
    if isinstance(filenames, str):
        if filenames.endswith(".tsv"):
            base, base_ext = os.path.splitext(filenames)
        else:
            # Load as foldername/foldername_suffix.tsv
            base_dir = filenames
            base_filename = os.path.split(base_dir)[1]
            base = os.path.join(base_dir, base_filename)
        for suffix in dataframe_names:
            filename = f"{base}_{suffix}.tsv"
            result_filenames[suffix] = filename
        filenames = result_filenames
    elif isinstance(filenames, list):
        for filename in filenames:
            remainder, suffix = filename.replace("_", "-").rsplit("-", 1)
            for needed_suffix in dataframe_names:
                if needed_suffix in suffix:
                    result_filenames[needed_suffix] = filename
        filenames = result_filenames

    return filenames

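# Usage sketch (hypothetical folder; keys come from constants.DF_SUFFIXES):
#   convert_filenames_to_dict("schemas/HED8.3.0")
#   -> {"Tag": "schemas/HED8.3.0/HED8.3.0_Tag.tsv", ...}
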
def create_empty_dataframes():
    """ Returns the default empty dataframes. """
    base_dfs = {constants.STRUCT_KEY: pd.DataFrame(columns=constants.struct_columns, dtype=str),
                constants.TAG_KEY: pd.DataFrame(columns=constants.tag_columns, dtype=str),
                constants.UNIT_KEY: pd.DataFrame(columns=constants.unit_columns, dtype=str),
                constants.UNIT_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
                constants.UNIT_MODIFIER_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
                constants.VALUE_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
                constants.ANNOTATION_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
                constants.DATA_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
                constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
                constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str)}
    return base_dfs

def load_dataframes(filenames, include_prefix_dfs=False):
    """ Load the dataframes from the source folder or series of files.

    Parameters:
        filenames (str or None or list or dict): The input filenames.
            If a string with a .tsv suffix: load from that location, adding the suffix to each .tsv file.
            If a string with no .tsv suffix: load from that folder, with the contents being the separate .tsv files.
        include_prefix_dfs (bool): If True, include the prefixes and external annotation dataframes.

    Returns:
        dataframes_dict (str: pd.DataFrame): The suffix:dataframe dict.
    """
    dict_filenames = convert_filenames_to_dict(filenames, include_prefix_dfs=include_prefix_dfs)
    dataframes = create_empty_dataframes()
    for key, filename in dict_filenames.items():
        try:
            loaded_dataframe = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
            if key in dataframes:
                columns_not_in_loaded = dataframes[key].columns[~dataframes[key].columns.isin(loaded_dataframe.columns)]
                if columns_not_in_loaded.any():
                    raise HedFileError(HedExceptions.SCHEMA_LOAD_FAILED,
                                       f"Required column(s) {list(columns_not_in_loaded)} missing from {filename}.  "
                                       f"The required columns are {list(dataframes[key].columns)}", filename=filename)
            dataframes[key] = loaded_dataframe
        except OSError:
            # todo: consider if we want to report this error (we probably do)
            pass  # We will use a blank one for this
    return dataframes

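# Usage sketch (hypothetical folder; a missing or unreadable file silently
# falls back to the matching empty dataframe from create_empty_dataframes):
#   dfs = load_dataframes("schemas/HED8.3.0", include_prefix_dfs=True)
#   tag_df = dfs[constants.TAG_KEY]
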
def get_library_name_and_id(schema):
    """ Get the library name ("Standard" for the standard schema) and the first id for a schema range.

    Parameters:
        schema (HedSchema): The schema to check.

    Returns:
        library_name (str): The capitalized library name.
        first_id (int): The first id for a given library.
    """
    name = schema.library

    library_data = get_library_data(name)
    starting_id, _ = library_data.get("id_range", (UNKNOWN_LIBRARY_VALUE, UNKNOWN_LIBRARY_VALUE))
    if not name:
        name = "standard"
    return name.capitalize(), starting_id

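# Note: the standard schema has an empty library name, so the lookup runs on
# "" and the function returns ("Standard", <its starting id>); a library
# without registered id_range data falls back to UNKNOWN_LIBRARY_VALUE.
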
# todo: Replace this with str.removeprefix once we no longer support < python 3.9
def remove_prefix(text, prefix):
    if text and text.startswith(prefix):
        return text[len(prefix):]
    return text

def calculate_attribute_type(attribute_entry):
    """ Returns the type of this attribute (annotation, object, or data).

    Returns:
        attribute_type (str): "annotation", "object", or "data".
    """
    attributes = attribute_entry.attributes
    object_ranges = {HedKey.TagRange, HedKey.UnitRange, HedKey.UnitClassRange, HedKey.ValueClassRange}
    if HedKey.AnnotationProperty in attributes:
        return "annotation"
    elif any(attribute in object_ranges for attribute in attributes):
        return "object"
    return "data"

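# Classification in brief: an attribute carrying AnnotationProperty maps to an
# OWL annotation property; one whose range is a schema entity (tag, unit,
# unit class, or value class) maps to an object property; anything else is
# treated as a data property.
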
def get_attributes_from_row(row):
    """ Get the tag attributes from a line.

    Parameters:
        row (pd.Series): A tag line.

    Returns:
        dict: Dictionary of attributes.
    """
    if constants.properties in row.index:
        attr_string = row[constants.properties]
    elif constants.attributes in row.index:
        attr_string = row[constants.attributes]
    else:
        attr_string = ""

    if constants.subclass_of in row.index and row[constants.subclass_of] == "HedHeader":
        header_attributes, _ = _parse_header_attributes_line(attr_string)
        return header_attributes
    return parse_attribute_string(attr_string)
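
Taken together, a round trip with the relocated helpers might look like this (a minimal sketch, assuming a local schemas/HED8.3.0 folder of suffixed .tsv files exists):

    from hed.schema.schema_io.df_util import load_dataframes, save_dataframes

    dfs = load_dataframes("schemas/HED8.3.0")      # missing sheets load as empty dataframes
    save_dataframes("schemas/HED8.3.0_copy", dfs)  # -> schemas/HED8.3.0_copy/HED8.3.0_copy_Tag.tsv, ...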