Improve .tsv loading to support children referenced before parent

hed-standard · Jul 11, 2024 · a7dcd0f · a7dcd0f
1 parent 54a5de4
commit a7dcd0f
Show file tree

Hide file tree

Showing 6 changed files with 246 additions and 135 deletions.
diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py
@@ -49,6 +49,8 @@ class HedExceptions:
     CANNOT_PARSE_RDF = "CANNOT_PARSE_RDF"
     SCHEMA_LOAD_FAILED = "SCHEMA_LOAD_FAILED"
 
+    SCHEMA_TAG_TSV_BAD_PARENT = "SCHEMA_TAG_TSV_BAD_PARENT"
+
 
 class HedFileError(Exception):
     """Exception raised when a file cannot be parsed due to being malformed, file IO, etc."""

diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py
@@ -65,6 +65,7 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
         hed_path (str): A filepath or url to open a schema from.
             If loading a TSV file, this should be a single filename where:
             Template: basename.tsv, where files are named basename_Struct.tsv, basename_Tag.tsv, etc.
+            Alternatively, you can point to a directory containing the .tsv files.
         schema_namespace (str or None): The name_prefix all tags in this schema will accept.
         schema(HedSchema or None): A hed schema to merge this new file into
                                    It must be a with-standard schema with the same value.

diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py
@@ -7,14 +7,14 @@
 from hed.schema.schema_io import ontology_util
 from hed.schema.hed_schema_constants import HedSectionKey, HedKey
 from hed.errors.exceptions import HedFileError, HedExceptions
-from hed.schema.schema_io.text2schema import SchemaLoaderText
+from hed.schema.schema_io.base2schema import SchemaLoader
 import pandas as pd
 import hed.schema.hed_schema_df_constants as constants
 from hed.errors import error_reporter
 from hed.schema.schema_io import text_util
 
 
-class SchemaLoaderDF(SchemaLoaderText):
+class SchemaLoaderDF(SchemaLoader):
     """ Load dataframe schemas from filenames
 
         Expected usage is SchemaLoaderDF.load(filenames)
@@ -139,17 +139,82 @@ def _read_schema(self, dataframe):
         """
         self._schema._initialize_attributes(HedSectionKey.Tags)
         known_parent_tags =  {"HedTag": []}
-        level_adj = 0
-        for row_number, row in dataframe[constants.TAG_KEY].iterrows():
-            # skip blank rows, though there shouldn't be any
-            if not any(row):
-                continue
-            parent_tag = row[constants.subclass_of]
-            org_parent_tags = known_parent_tags.get(parent_tag, []).copy()
+        iterations = 0
+        # Handle this over multiple iterations in case tags have parent tags listed later in the file.
+        # A properly formatted .tsv file will never have parents after the child.
+        current_rows = list(dataframe[constants.TAG_KEY].iterrows())
+        while current_rows:
+            iterations += 1
+            next_round_rows = []
+            for row_number, row in current_rows:
+                # skip blank rows, though there shouldn't be any
+                if not any(row):
+                    continue
+
+                parent_tag = row[constants.subclass_of]
+                org_parent_tags = known_parent_tags.get(parent_tag)
+                tag_entry = self._create_tag_entry(org_parent_tags, row_number, row)
+                if not tag_entry:
+                    # _create_tag_entry has already reported a fatal error for this row
+                    continue
+
+                # If this is NOT a rooted tag and we have no parent, try it in another round.
+                if org_parent_tags is None and not tag_entry.has_attribute(HedKey.Rooted):
+                    next_round_rows.append((row_number, row))
+                    continue
+
+                tag_entry = self._add_tag_entry(tag_entry, row_number, row)
+                if tag_entry:
+                    known_parent_tags[tag_entry.short_tag_name] = tag_entry.name.split("/")
+
+            if len(next_round_rows) == len(current_rows):
+                for row_number, row in current_rows:
+                    tag_name = self._get_tag_name(row)
+                    msg = (f"Cannot resolve parent tag.  "
+                           f"There is probably an issue with circular parent tags of {tag_name} on row {row_number}.")
+                    self._add_fatal_error(row_number, row, msg, HedExceptions.SCHEMA_TAG_TSV_BAD_PARENT)
+                break
+            current_rows = next_round_rows
+
+    def _add_tag_entry(self, tag_entry, row_number, row):
+        try:
+            rooted_entry = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged)
+            if rooted_entry:
+                parent_tags = rooted_entry.long_tag_name.split("/")
+                # Create the entry again for rooted tags, to get the full name.
+                tag_entry = self._create_tag_entry(parent_tags, row_number, row)
+        except HedFileError as e:
+            self._add_fatal_error(row_number, row, e.message, e.code)
+            return None
+
+        tag_entry = self._add_to_dict(row_number, row, tag_entry, HedSectionKey.Tags)
+
+        return tag_entry
+
+    def _create_tag_entry(self, parent_tags, row_number, row):
+        """ Create a tag entry(does not add to dict)
 
-            tag_entry, parent_tags, _ = self._add_tag_meta(org_parent_tags, row_number, row, level_adj)
-            if tag_entry:
-                known_parent_tags[tag_entry.short_tag_name] = parent_tags.copy()
+        Parameters:
+            parent_tags (list): A list of parent tags in order.
+            row_number (int): The row number to report errors as
+            row (str or pd.Series): A tag row or pandas series(depends on format)
+
+        Returns:
+            HedSchemaEntry or None: The newly created (not yet added) tag entry, or None if no tag name is found.
+
+        Notes:
+            Includes attributes and description.
+        """
+        tag_name = self._get_tag_name(row)
+        if tag_name:
+            if parent_tags:
+                long_tag_name = "/".join(parent_tags) + "/" + tag_name
+            else:
+                long_tag_name = tag_name
+            return self._create_entry(row_number, row, HedSectionKey.Tags, long_tag_name)
+
+        self._add_fatal_error(row_number, row, f"No tag name found in row.",
+                              error_code=HedExceptions.GENERIC_ERROR)
 
     def _read_section(self, df, section_key):
         self._schema._initialize_attributes(section_key)
@@ -185,11 +250,11 @@ def _read_attribute_section(self, df, annotation_property=False, section_key=Hed
     def _get_tag_name(self, row):
         base_tag_name = row[constants.name]
         if base_tag_name.endswith("-#"):
-            return "#", 0
-        return base_tag_name, 0
+            return "#"
+        return base_tag_name
 
     def _create_entry(self, row_number, row, key_class, full_tag_name=None):
-        element_name, _ = self._get_tag_name(row)
+        element_name = self._get_tag_name(row)
         if full_tag_name:
             element_name = full_tag_name
 
@@ -224,6 +289,14 @@ def _get_tag_attributes(self, row_number, row):
         except ValueError as e:
             self._add_fatal_error(row_number, str(row), str(e))
 
+    def _add_to_dict(self, row_number, row, entry, key_class):
+        if entry.has_attribute(HedKey.InLibrary) and not self._loading_merged and not self.appending_to_schema:
+            self._add_fatal_error(row_number, row,
+                                  "Library tag in unmerged schema has InLibrary attribute",
+                                  HedExceptions.IN_LIBRARY_IN_UNMERGED)
+
+        return self._add_to_dict_base(entry, key_class)
+
 
 def load_dataframes(filenames):
     dict_filenames = SchemaLoaderDF.convert_filenames_to_dict(filenames)

diff --git a/hed/schema/schema_io/text2schema.py b/hed/schema/schema_io/text2schema.py
diff --git a/hed/schema/schema_io/wiki2schema.py b/hed/schema/schema_io/wiki2schema.py
@@ -3,11 +3,11 @@
 """
 import re
 
-from hed.schema.hed_schema_constants import HedSectionKey
+from hed.schema.hed_schema_constants import HedSectionKey, HedKey
 from hed.errors.exceptions import HedFileError, HedExceptions
 from hed.errors import error_reporter
 from hed.schema.schema_io import wiki_constants
-from hed.schema.schema_io.text2schema import SchemaLoaderText
+from hed.schema.schema_io.base2schema import SchemaLoader
 from hed.schema.schema_io.wiki_constants import HedWikiSection, SectionStarts, SectionNames
 from hed.schema.schema_io import text_util
 
@@ -34,7 +34,7 @@
 ]
 
 
-class SchemaLoaderWiki(SchemaLoaderText):
+class SchemaLoaderWiki(SchemaLoader):
     """ Load MediaWiki schemas from filenames or strings.
 
         Expected usage is SchemaLoaderWiki.load(filename)
@@ -45,8 +45,6 @@ class SchemaLoaderWiki(SchemaLoaderText):
     def __init__(self, filename, schema_as_string=None, schema=None, file_format=None, name=""):
         super().__init__(filename, schema_as_string, schema, file_format, name)
         self._schema.source_format = ".mediawiki"
-        self._no_name_msg = "Schema term is empty or the line is malformed",
-        self._no_name_error = HedExceptions.WIKI_DELIMITERS_INVALID
 
     def _open_file(self):
         if self.filename:
@@ -151,22 +149,29 @@ def _read_schema(self, lines):
         self._schema._initialize_attributes(HedSectionKey.Tags)
         parent_tags = []
         level_adj = 0
-        for line_number, line in lines:
-            if line.startswith(wiki_constants.ROOT_TAG):
+        for row_number, row in lines:
+            if row.startswith(wiki_constants.ROOT_TAG):
                 parent_tags = []
                 level_adj = 0
             else:
-                level = self._get_tag_level(line) + level_adj
+                level = self._get_tag_level(row) + level_adj
                 if level < len(parent_tags):
                     parent_tags = parent_tags[:level]
                 elif level > len(parent_tags):
-                    self._add_fatal_error(line_number, line,
+                    self._add_fatal_error(row_number, row,
                                           "Line has too many *'s at front.  You cannot skip a level.",
                                           HedExceptions.WIKI_LINE_START_INVALID)
                     continue
 
             # Create the entry
-            tag_entry, parent_tags, level_adj = self._add_tag_meta(parent_tags, line_number, line, level_adj)
+            tag_entry = self._create_tag_entry(parent_tags, row_number, row)
+            if not tag_entry:
+                # _create_tag_entry has already reported a fatal error for this row
+                continue
+
+            tag_entry, level_adj = self._add_tag_entry(tag_entry, row_number, row, level_adj)
+            if tag_entry:
+                parent_tags = tag_entry.name.split("/")
 
     def _read_unit_classes(self, lines):
         """Add the unit classes section.
@@ -468,3 +473,52 @@ def _split_lines_into_sections(self, wiki_lines):
                     strings_for_section[current_section].append((line_number + 1, line))
 
         return strings_for_section
+
+    def _add_tag_entry(self, tag_entry, row_number, row, level_adj):
+        try:
+            rooted_entry = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged)
+            if rooted_entry:
+                parent_tags = rooted_entry.long_tag_name.split("/")
+                level_adj = len(parent_tags)
+                # Create the entry again for rooted tags, to get the full name.
+                tag_entry = self._create_tag_entry(parent_tags, row_number, row)
+        except HedFileError as e:
+            self._add_fatal_error(row_number, row, e.message, e.code)
+            return None, level_adj
+
+        tag_entry = self._add_to_dict(row_number, row, tag_entry, HedSectionKey.Tags)
+
+        return tag_entry, level_adj
+
+    def _create_tag_entry(self, parent_tags, row_number, row):
+        """ Create a tag entry(does not add to schema)
+
+        Parameters:
+            parent_tags (list): A list of parent tags in order.
+            row_number (int): The row number to report errors as
+            row (str or pd.Series): A tag row or pandas series(depends on format)
+
+        Returns:
+            HedSchemaEntry or None: The newly created (not yet added) tag entry, or None if no tag name is found.
+
+        Notes:
+            Includes attributes and description.
+        """
+        tag_name, _ = self._get_tag_name(row)
+        if tag_name:
+            if parent_tags:
+                long_tag_name = "/".join(parent_tags) + "/" + tag_name
+            else:
+                long_tag_name = tag_name
+            return self._create_entry(row_number, row, HedSectionKey.Tags, long_tag_name)
+
+        self._add_fatal_error(row_number, row, "Schema term is empty or the line is malformed"
+                              , error_code=HedExceptions.WIKI_DELIMITERS_INVALID)
+
+    def _add_to_dict(self, row_number, row, entry, key_class):
+        if entry.has_attribute(HedKey.InLibrary) and not self._loading_merged and not self.appending_to_schema:
+            self._add_fatal_error(row_number, row,
+                                  "Library tag in unmerged schema has InLibrary attribute",
+                                  HedExceptions.IN_LIBRARY_IN_UNMERGED)
+
+        return self._add_to_dict_base(entry, key_class)