Skip to content

Commit

Permalink
Improve .tsv loading to support children referenced before parent
Browse files Browse the repository at this point in the history
  • Loading branch information
IanCa committed Jul 11, 2024
1 parent 54a5de4 commit a7dcd0f
Show file tree
Hide file tree
Showing 6 changed files with 246 additions and 135 deletions.
2 changes: 2 additions & 0 deletions hed/errors/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class HedExceptions:
CANNOT_PARSE_RDF = "CANNOT_PARSE_RDF"
SCHEMA_LOAD_FAILED = "SCHEMA_LOAD_FAILED"

SCHEMA_TAG_TSV_BAD_PARENT = "SCHEMA_TAG_TSV_BAD_PARENT"


class HedFileError(Exception):
"""Exception raised when a file cannot be parsed due to being malformed, file IO, etc."""
Expand Down
1 change: 1 addition & 0 deletions hed/schema/hed_schema_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
hed_path (str): A filepath or url to open a schema from.
If loading a TSV file, this should be a single filename where:
Template: basename.tsv, where files are named basename_Struct.tsv, basename_Tag.tsv, etc.
Alternatively, you can point to a directory containing the .tsv files.
schema_namespace (str or None): The name_prefix all tags in this schema will accept.
schema(HedSchema or None): A hed schema to merge this new file into
It must be a with-standard schema with the same value.
Expand Down
103 changes: 88 additions & 15 deletions hed/schema/schema_io/df2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
from hed.schema.schema_io import ontology_util
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.schema.schema_io.text2schema import SchemaLoaderText
from hed.schema.schema_io.base2schema import SchemaLoader
import pandas as pd
import hed.schema.hed_schema_df_constants as constants
from hed.errors import error_reporter
from hed.schema.schema_io import text_util


class SchemaLoaderDF(SchemaLoaderText):
class SchemaLoaderDF(SchemaLoader):
""" Load dataframe schemas from filenames
Expected usage is SchemaLoaderDF.load(filenames)
Expand Down Expand Up @@ -139,17 +139,82 @@ def _read_schema(self, dataframe):
"""
self._schema._initialize_attributes(HedSectionKey.Tags)
known_parent_tags = {"HedTag": []}
level_adj = 0
for row_number, row in dataframe[constants.TAG_KEY].iterrows():
# skip blank rows, though there shouldn't be any
if not any(row):
continue
parent_tag = row[constants.subclass_of]
org_parent_tags = known_parent_tags.get(parent_tag, []).copy()
iterations = 0
# Handle this over multiple iterations in case tags have parent tags listed later in the file.

Check failure on line 143 in hed/schema/schema_io/df2schema.py

View workflow job for this annotation

GitHub Actions / Check for spelling errors

incase ==> in case
# A properly formatted .tsv file will never have parents after the child.
current_rows = list(dataframe[constants.TAG_KEY].iterrows())
while current_rows:
iterations += 1
next_round_rows = []
for row_number, row in current_rows:
# skip blank rows, though there shouldn't be any
if not any(row):
continue

parent_tag = row[constants.subclass_of]
org_parent_tags = known_parent_tags.get(parent_tag)
tag_entry = self._create_tag_entry(org_parent_tags, row_number, row)
if not tag_entry:
# This will have already raised an error
continue

# If this is NOT a rooted tag and we have no parent, try it in another round.
if org_parent_tags is None and not tag_entry.has_attribute(HedKey.Rooted):
next_round_rows.append((row_number, row))
continue

tag_entry = self._add_tag_entry(tag_entry, row_number, row)
if tag_entry:
known_parent_tags[tag_entry.short_tag_name] = tag_entry.name.split("/")

if len(next_round_rows) == len(current_rows):
for row_number, row in current_rows:
tag_name = self._get_tag_name(row)
msg = (f"Cannot resolve parent tag. "
f"There is probably an issue with circular parent tags of {tag_name} on row {row_number}.")
self._add_fatal_error(row_number, row, msg, HedExceptions.SCHEMA_TAG_TSV_BAD_PARENT)
break
current_rows = next_round_rows

def _add_tag_entry(self, tag_entry, row_number, row):
    """ Resolve any rooted placement for a tag entry, then add it to the Tags section.

    Parameters:
        tag_entry (HedSchemaEntry): The entry previously built for this row.
        row_number (int): The row number to report errors as.
        row (str or pd.Series): The tag row the entry was built from.

    Returns:
        HedSchemaEntry or None: Whatever _add_to_dict returns for the entry,
            or None if a HedFileError was caught and recorded as a fatal error.
    """
    try:
        root = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged)
        if root:
            # Rebuild the entry under the rooted entry's long name so it gets the full path.
            tag_entry = self._create_tag_entry(root.long_tag_name.split("/"), row_number, row)
    except HedFileError as error:
        self._add_fatal_error(row_number, row, error.message, error.code)
        return None

    return self._add_to_dict(row_number, row, tag_entry, HedSectionKey.Tags)

def _create_tag_entry(self, parent_tags, row_number, row):
    """ Create a tag entry(does not add to dict)

    Parameters:
        parent_tags (list or None): A list of parent tags in order. A falsy value means
            the tag name is used as the long name without any parent path.
        row_number (int): The row number to report errors as
        row (str or pd.Series): A tag row or pandas series(depends on format)

    Returns:
        HedSchemaEntry or None: The entry for the tag, or None when no tag name is found
            in the row (a fatal error is recorded in that case).

    Notes:
        Includes attributes and description.
    """
    tag_name = self._get_tag_name(row)
    if not tag_name:
        # Plain literal: the message has no placeholders, so it needs no f-prefix.
        self._add_fatal_error(row_number, row, "No tag name found in row.",
                              error_code=HedExceptions.GENERIC_ERROR)
        return None

    if parent_tags:
        long_tag_name = "/".join(parent_tags) + "/" + tag_name
    else:
        long_tag_name = tag_name
    return self._create_entry(row_number, row, HedSectionKey.Tags, long_tag_name)

def _read_section(self, df, section_key):
self._schema._initialize_attributes(section_key)
Expand Down Expand Up @@ -185,11 +250,11 @@ def _read_attribute_section(self, df, annotation_property=False, section_key=Hed
def _get_tag_name(self, row):
base_tag_name = row[constants.name]
if base_tag_name.endswith("-#"):
return "#", 0
return base_tag_name, 0
return "#"
return base_tag_name

def _create_entry(self, row_number, row, key_class, full_tag_name=None):
element_name, _ = self._get_tag_name(row)
element_name = self._get_tag_name(row)
if full_tag_name:
element_name = full_tag_name

Expand Down Expand Up @@ -224,6 +289,14 @@ def _get_tag_attributes(self, row_number, row):
except ValueError as e:
self._add_fatal_error(row_number, str(row), str(e))

def _add_to_dict(self, row_number, row, entry, key_class):
    """ Add an entry to the schema, flagging InLibrary attributes in unmerged schemas.

    Parameters:
        row_number (int): The row number to report errors as.
        row (str or pd.Series): The row the entry was built from.
        entry (HedSchemaEntry): The entry to add.
        key_class (HedSectionKey): The section the entry is added to.

    Returns:
        The result of _add_to_dict_base for this entry and section.

    Notes:
        The entry is still handed to _add_to_dict_base after the fatal error is recorded.
    """
    library_tagged = entry.has_attribute(HedKey.InLibrary)
    if library_tagged and not (self._loading_merged or self.appending_to_schema):
        self._add_fatal_error(row_number, row,
                              "Library tag in unmerged schema has InLibrary attribute",
                              HedExceptions.IN_LIBRARY_IN_UNMERGED)

    added_entry = self._add_to_dict_base(entry, key_class)
    return added_entry


def load_dataframes(filenames):
dict_filenames = SchemaLoaderDF.convert_filenames_to_dict(filenames)
Expand Down
107 changes: 0 additions & 107 deletions hed/schema/schema_io/text2schema.py

This file was deleted.

74 changes: 64 additions & 10 deletions hed/schema/schema_io/wiki2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
"""
import re

from hed.schema.hed_schema_constants import HedSectionKey
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.errors import error_reporter
from hed.schema.schema_io import wiki_constants
from hed.schema.schema_io.text2schema import SchemaLoaderText
from hed.schema.schema_io.base2schema import SchemaLoader
from hed.schema.schema_io.wiki_constants import HedWikiSection, SectionStarts, SectionNames
from hed.schema.schema_io import text_util

Expand All @@ -34,7 +34,7 @@
]


class SchemaLoaderWiki(SchemaLoaderText):
class SchemaLoaderWiki(SchemaLoader):
""" Load MediaWiki schemas from filenames or strings.
Expected usage is SchemaLoaderWiki.load(filename)
Expand All @@ -45,8 +45,6 @@ class SchemaLoaderWiki(SchemaLoaderText):
def __init__(self, filename, schema_as_string=None, schema=None, file_format=None, name=""):
super().__init__(filename, schema_as_string, schema, file_format, name)
self._schema.source_format = ".mediawiki"
self._no_name_msg = "Schema term is empty or the line is malformed",
self._no_name_error = HedExceptions.WIKI_DELIMITERS_INVALID

def _open_file(self):
if self.filename:
Expand Down Expand Up @@ -151,22 +149,29 @@ def _read_schema(self, lines):
self._schema._initialize_attributes(HedSectionKey.Tags)
parent_tags = []
level_adj = 0
for line_number, line in lines:
if line.startswith(wiki_constants.ROOT_TAG):
for row_number, row in lines:
if row.startswith(wiki_constants.ROOT_TAG):
parent_tags = []
level_adj = 0
else:
level = self._get_tag_level(line) + level_adj
level = self._get_tag_level(row) + level_adj
if level < len(parent_tags):
parent_tags = parent_tags[:level]
elif level > len(parent_tags):
self._add_fatal_error(line_number, line,
self._add_fatal_error(row_number, row,
"Line has too many *'s at front. You cannot skip a level.",
HedExceptions.WIKI_LINE_START_INVALID)
continue

# Create the entry
tag_entry, parent_tags, level_adj = self._add_tag_meta(parent_tags, line_number, line, level_adj)
tag_entry = self._create_tag_entry(parent_tags, row_number, row)
if not tag_entry:
# This will have already raised an error
continue

tag_entry, level_adj = self._add_tag_entry(tag_entry, row_number, row, level_adj)
if tag_entry:
parent_tags = tag_entry.name.split("/")

def _read_unit_classes(self, lines):
"""Add the unit classes section.
Expand Down Expand Up @@ -468,3 +473,52 @@ def _split_lines_into_sections(self, wiki_lines):
strings_for_section[current_section].append((line_number + 1, line))

return strings_for_section

def _add_tag_entry(self, tag_entry, row_number, row, level_adj):
    """ Resolve any rooted placement for a tag entry, then add it to the Tags section.

    Parameters:
        tag_entry (HedSchemaEntry): The entry previously built for this line.
        row_number (int): The row number to report errors as.
        row (str): The wiki line the entry was built from.
        level_adj (int): The current level adjustment; replaced by the depth of the
            rooted entry's long name when a rooted entry is found.

    Returns:
        tuple: (entry, level_adj) where entry is whatever _add_to_dict returns,
            or None if a HedFileError was caught and recorded as a fatal error.
    """
    try:
        root = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged)
        if root:
            root_path = root.long_tag_name.split("/")
            level_adj = len(root_path)
            # Rebuild the entry under the rooted entry's long name so it gets the full path.
            tag_entry = self._create_tag_entry(root_path, row_number, row)
    except HedFileError as error:
        self._add_fatal_error(row_number, row, error.message, error.code)
        return None, level_adj

    added_entry = self._add_to_dict(row_number, row, tag_entry, HedSectionKey.Tags)
    return added_entry, level_adj

def _create_tag_entry(self, parent_tags, row_number, row):
    """ Build the entry for one tag line without adding it to the schema.

    Parameters:
        parent_tags (list): A list of parent tags in order.
        row_number (int): The row number to report errors as
        row (str or pd.Series): A tag row or pandas series(depends on format)

    Returns:
        HedSchemaEntry or None: The entry created for the tag, or None when no tag name
            could be read from the row (a fatal error is recorded in that case).

    Notes:
        Includes attributes and description.
    """
    tag_name, _ = self._get_tag_name(row)
    if not tag_name:
        self._add_fatal_error(row_number, row, "Schema term is empty or the line is malformed",
                              error_code=HedExceptions.WIKI_DELIMITERS_INVALID)
        return None

    # Prefix the tag with its parent path only when there is one.
    long_tag_name = ("/".join(parent_tags) + "/" + tag_name) if parent_tags else tag_name
    return self._create_entry(row_number, row, HedSectionKey.Tags, long_tag_name)

def _add_to_dict(self, row_number, row, entry, key_class):
    """ Add an entry to the schema, flagging InLibrary attributes in unmerged schemas.

    Parameters:
        row_number (int): The row number to report errors as.
        row (str): The wiki line the entry was built from.
        entry (HedSchemaEntry): The entry to add.
        key_class (HedSectionKey): The section the entry is added to.

    Returns:
        The result of _add_to_dict_base for this entry and section.

    Notes:
        The entry is still handed to _add_to_dict_base after the fatal error is recorded.
    """
    library_tagged = entry.has_attribute(HedKey.InLibrary)
    if library_tagged and not (self._loading_merged or self.appending_to_schema):
        self._add_fatal_error(row_number, row,
                              "Library tag in unmerged schema has InLibrary attribute",
                              HedExceptions.IN_LIBRARY_IN_UNMERGED)

    added_entry = self._add_to_dict_base(entry, key_class)
    return added_entry
Loading

0 comments on commit a7dcd0f

Please sign in to comment.