
Commit

Merge pull request #1000 from IanCa/develop
Add annotationExternal to omn generation
IanCa authored Jul 31, 2024
2 parents d9196f6 + bc007ba commit a1a54ba
Showing 21 changed files with 19,036 additions and 18,888 deletions.
7 changes: 3 additions & 4 deletions hed/schema/hed_schema.py
@@ -1,13 +1,12 @@
import json


from hed.schema.hed_schema_constants import HedKey, HedSectionKey, HedKeyOld
from hed.schema import hed_schema_constants as constants
from hed.schema.schema_io import schema_util
from hed.schema.schema_io import schema_util, df_util
from hed.schema.schema_io.schema2xml import Schema2XML
from hed.schema.schema_io.schema2wiki import Schema2Wiki
from hed.schema.schema_io.schema2df import Schema2DF
from hed.schema.schema_io import ontology_util


from hed.schema.hed_schema_section import (HedSchemaSection, HedSchemaTagSection, HedSchemaUnitClassSection,
HedSchemaUnitSection)
@@ -329,7 +328,7 @@ def save_as_dataframes(self, base_filename, save_merged=False):
- File cannot be saved for some reason.
"""
output_dfs = Schema2DF().process_schema(self, save_merged)
ontology_util.save_dataframes(base_filename, output_dfs)
df_util.save_dataframes(base_filename, output_dfs)

def set_schema_prefix(self, schema_namespace):
""" Set library namespace associated for this schema.
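For orientation, a minimal usage sketch of the export path this hunk touches; the schema version and output name are illustrative:

from hed.schema.hed_schema_io import load_schema_version

schema = load_schema_version("8.3.0")  # illustrative version
# Writes one {base}_{suffix}.tsv per dataframe, e.g. HED8.3.0/HED8.3.0_Tag.tsv
schema.save_as_dataframes("HED8.3.0")
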
17 changes: 16 additions & 1 deletion hed/schema/hed_schema_df_constants.py
@@ -16,12 +16,20 @@

ATTRIBUTE_PROPERTY_KEY = "AttributeProperty"

PREFIXES_KEY = "Prefixes"
EXTERNAL_ANNOTATION_KEY = "AnnotationPropertyExternal"

PROPERTY_KEYS = [ANNOTATION_KEY, DATA_KEY, OBJECT_KEY]
DF_SUFFIXES = {TAG_KEY, STRUCT_KEY, VALUE_CLASS_KEY,
UNIT_CLASS_KEY, UNIT_KEY, UNIT_MODIFIER_KEY,
*PROPERTY_KEYS, ATTRIBUTE_PROPERTY_KEY}

section_mapping = {

DF_EXTRA_SUFFIXES = {PREFIXES_KEY, EXTERNAL_ANNOTATION_KEY}
DF_SUFFIXES_OMN = {*DF_SUFFIXES, *DF_EXTRA_SUFFIXES}


section_mapping_hed_id = {
STRUCT_KEY: None,
TAG_KEY: HedSectionKey.Tags,
VALUE_CLASS_KEY: HedSectionKey.ValueClasses,
@@ -43,6 +51,8 @@
description = "dc:description"
equivalent_to = "omn:EquivalentTo"
has_unit_class = "hasUnitClass"
annotations = "Annotations"


struct_columns = [hed_id, name, attributes, subclass_of, description, equivalent_to]
tag_columns = [hed_id, name, level, subclass_of, attributes, description, equivalent_to]
@@ -84,3 +94,8 @@
hed_schema_constants.WITH_STANDARD_ATTRIBUTE: "HED_0000302",
hed_schema_constants.UNMERGED_ATTRIBUTE: "HED_0000303"
}

# Extra spreadsheet column ideas
Prefix = "Prefix"
ID = "ID"
NamespaceIRI = "Namespace IRI"
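A short sketch of what the new OMN suffix set implies for on-disk names; the base path is illustrative:

from hed.schema import hed_schema_df_constants as constants

base = "HED8.3.0/HED8.3.0"  # illustrative base path
filenames = [f"{base}_{suffix}.tsv" for suffix in sorted(constants.DF_SUFFIXES_OMN)]
# Adds HED8.3.0_Prefixes.tsv and HED8.3.0_AnnotationPropertyExternal.tsv
# on top of the standard DF_SUFFIXES files.
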
12 changes: 4 additions & 8 deletions hed/schema/hed_schema_section.py
@@ -278,21 +278,17 @@ def _finalize_section(self, hed_schema):

split_list = self._group_by_top_level_tag(self.all_entries)
# Sort the extension allowed lists
extension_allowed_node = 0
for values in split_list:
node = values[0]
if node.has_attribute(HedKey.ExtensionAllowed):
# Make sure we sort / characters to the front.
values.sort(key=lambda x: x.long_tag_name.replace("/", "\0"))
extension_allowed_node += 1

# sort the top level nodes so extension allowed is at the bottom
split_list.sort(key=lambda x: x[0].has_attribute(HedKey.ExtensionAllowed))
        # Entries without inLibrary sort to the end; library entries come first, ordered by library value.
split_list.sort(key=lambda x: (x[0].has_attribute(HedKey.InLibrary, return_value=True) is None,
x[0].has_attribute(HedKey.InLibrary, return_value=True)))

# sort the extension allowed top level nodes
if extension_allowed_node:
split_list[extension_allowed_node:] = sorted(split_list[extension_allowed_node:],
key=lambda x: x[0].long_tag_name)
# split_list.sort(key=lambda x: x[0].has_attribute(HedKey.ExtensionAllowed))
self.all_entries = [subitem for tag_list in split_list for subitem in tag_list]

super()._finalize_section(hed_schema)
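
The reordering above relies on Python's stable sort; here is a standalone toy sketch of the same key pattern, with invented data:

entries = [("Event", None), ("Lib/A", "lib1"), ("Action", None), ("Lib/B", "lib1")]
# Entries without a library value sort to the end; "x[1] or ''" avoids comparing None.
entries.sort(key=lambda x: (x[1] is None, x[1] or ""))
# -> [("Lib/A", "lib1"), ("Lib/B", "lib1"), ("Event", None), ("Action", None)]
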
3 changes: 2 additions & 1 deletion hed/schema/schema_attribute_validator_hed_id.py
@@ -1,4 +1,5 @@
from hed.schema.schema_io.ontology_util import get_library_data, remove_prefix
from hed.schema.schema_io.ontology_util import get_library_data
from hed.schema.schema_io.df_util import remove_prefix
from semantic_version import Version
from hed.schema.hed_schema_io import load_schema_version
from hed.schema.hed_cache import get_hed_versions
2 changes: 1 addition & 1 deletion hed/schema/schema_io/__init__.py
@@ -1 +1 @@
from hed.schema.schema_io.ontology_util import save_dataframes, load_dataframes
from hed.schema.schema_io.df_util import save_dataframes, load_dataframes
6 changes: 3 additions & 3 deletions hed/schema/schema_io/df2schema.py
@@ -3,7 +3,7 @@
"""
import io

from hed.schema.schema_io import ontology_util, load_dataframes
from hed.schema.schema_io import df_util, load_dataframes
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.schema.schema_io.base2schema import SchemaLoader
@@ -22,7 +22,7 @@ class SchemaLoaderDF(SchemaLoader):
"""

def __init__(self, filenames, schema_as_strings_or_df, name=""):
self.filenames = ontology_util.convert_filenames_to_dict(filenames)
self.filenames = df_util.convert_filenames_to_dict(filenames)
self.schema_as_strings_or_df = schema_as_strings_or_df
if self.filenames:
reported_filename = self.filenames.get(constants.STRUCT_KEY)
@@ -251,7 +251,7 @@ def _get_tag_attributes(self, row_number, row):
dict: Dictionary of attributes.
"""
try:
return ontology_util.get_attributes_from_row(row)
return df_util.get_attributes_from_row(row)
except ValueError as e:
self._add_fatal_error(row_number, str(row), str(e))

185 changes: 185 additions & 0 deletions hed/schema/schema_io/df_util.py
@@ -0,0 +1,185 @@
import csv
import os

import pandas as pd

from hed.errors import HedFileError, HedExceptions
from hed.schema import hed_schema_df_constants as constants
from hed.schema.hed_schema_constants import HedKey
from hed.schema.hed_cache import get_library_data
from hed.schema.schema_io.text_util import parse_attribute_string, _parse_header_attributes_line

UNKNOWN_LIBRARY_VALUE = 0


def save_dataframes(base_filename, dataframe_dict):
""" Writes out the dataframes using the provided suffixes.
Does not validate contents or suffixes.
If base_filename has a .tsv suffix, save directly to the indicated location.
If base_filename is a directory(does NOT have a .tsv suffix), save the contents into a directory named that.
The subfiles are named the same. e.g. HED8.3.0/HED8.3.0_Tag.tsv
Parameters:
base_filename(str): The base filename to use. Output is {base_filename}_{suffix}.tsv
See DF_SUFFIXES for all expected names.
dataframe_dict(dict of str: df.DataFrame): The list of files to save out. No validation is done.
"""
if base_filename.lower().endswith(".tsv"):
base, base_ext = os.path.splitext(base_filename)
base_dir, base_name = os.path.split(base)
else:
# Assumed as a directory name
base_dir = base_filename
base_filename = os.path.split(base_dir)[1]
base = os.path.join(base_dir, base_filename)
os.makedirs(base_dir, exist_ok=True)
for suffix, dataframe in dataframe_dict.items():
filename = f"{base}_{suffix}.tsv"
with open(filename, mode='w', encoding='utf-8') as opened_file:
dataframe.to_csv(opened_file, sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE,
lineterminator="\n")
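
A standalone usage sketch for the writer above; the output directory is illustrative:

from hed.schema.schema_io.df_util import create_empty_dataframes, save_dataframes

dfs = create_empty_dataframes()
save_dataframes("HED8.3.0", dfs)  # writes HED8.3.0/HED8.3.0_Tag.tsv, etc.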


def convert_filenames_to_dict(filenames, include_prefix_dfs=False):
"""Infers filename meaning based on suffix, e.g. _Tag for the tags sheet
Parameters:
filenames(str or None or list or dict): The list to convert to a dict
If a string with a .tsv suffix: Save to that location, adding the suffix to each .tsv file
If a string with no .tsv suffix: Save to that folder, with the contents being the separate .tsv files.
include_prefix_dfs(bool): If True, include the prefixes and external annotation dataframes.
Returns:
filename_dict(str: str): The required suffix to filename mapping"""
result_filenames = {}
dataframe_names = constants.DF_SUFFIXES_OMN if include_prefix_dfs else constants.DF_SUFFIXES
if isinstance(filenames, str):
if filenames.endswith(".tsv"):
base, base_ext = os.path.splitext(filenames)
else:
# Load as foldername/foldername_suffix.tsv
base_dir = filenames
base_filename = os.path.split(base_dir)[1]
base = os.path.join(base_dir, base_filename)
for suffix in dataframe_names:
filename = f"{base}_{suffix}.tsv"
result_filenames[suffix] = filename
filenames = result_filenames
elif isinstance(filenames, list):
for filename in filenames:
            # maxsplit=1 keeps the unpack safe when directory names contain "_" or "-".
            remainder, suffix = filename.replace("_", "-").rsplit("-", maxsplit=1)
for needed_suffix in dataframe_names:
if needed_suffix in suffix:
result_filenames[needed_suffix] = filename
filenames = result_filenames

return filenames
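
A usage sketch, assuming the tag suffix constant is the string "Tag" as the _Tag.tsv names above suggest:

from hed.schema.schema_io.df_util import convert_filenames_to_dict

filenames = convert_filenames_to_dict("HED8.3.0", include_prefix_dfs=True)
# filenames["Tag"] -> "HED8.3.0/HED8.3.0_Tag.tsv" (path separator per OS)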


def create_empty_dataframes():
"""Returns the default empty dataframes"""
base_dfs = {constants.STRUCT_KEY: pd.DataFrame(columns=constants.struct_columns, dtype=str),
constants.TAG_KEY: pd.DataFrame(columns=constants.tag_columns, dtype=str),
constants.UNIT_KEY: pd.DataFrame(columns=constants.unit_columns, dtype=str),
constants.UNIT_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
constants.UNIT_MODIFIER_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
constants.VALUE_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
constants.ANNOTATION_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
constants.DATA_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str), }
return base_dfs


def load_dataframes(filenames, include_prefix_dfs=False):
"""Load the dataframes from the source folder or series of files.
Parameters:
filenames(str or None or list or dict): The input filenames
            If a string with a .tsv suffix: load from that location, adding the suffix to each .tsv file.
            If a string with no .tsv suffix: load from that folder, with the contents being the separate .tsv files.
include_prefix_dfs(bool): If True, include the prefixes and external annotation dataframes.
Returns:
dataframes_dict(str: dataframes): The suffix:dataframe dict
"""
dict_filenames = convert_filenames_to_dict(filenames, include_prefix_dfs=include_prefix_dfs)
dataframes = create_empty_dataframes()
for key, filename in dict_filenames.items():
try:
loaded_dataframe = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
if key in dataframes:
                # Verify all default (required) columns are present in the loaded file.
                columns_not_in_loaded = dataframes[key].columns[~dataframes[key].columns.isin(loaded_dataframe.columns)]
if columns_not_in_loaded.any():
raise HedFileError(HedExceptions.SCHEMA_LOAD_FAILED,
f"Required column(s) {list(columns_not_in_loaded)} missing from {filename}. "
f"The required columns are {list(dataframes[key].columns)}", filename=filename)
dataframes[key] = loaded_dataframe
        except OSError:
            # todo: consider if we want to report this error (we probably do)
            pass  # We will use a blank dataframe for this file
return dataframes
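
A matching load sketch (again assuming the "Tag" suffix key); missing files silently fall back to the empty defaults per the OSError handler above:

from hed.schema.schema_io.df_util import load_dataframes

dataframes = load_dataframes("HED8.3.0", include_prefix_dfs=True)
print(dataframes["Tag"].columns)  # the tag sheet, or the empty default if absent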


def get_library_name_and_id(schema):
    """ Get the library name ("Standard" for the standard schema) and the first ID in its range.

    Parameters:
        schema(HedSchema): The schema to check.
    Returns:
        library_name(str): The capitalized library name.
        first_id(int): The first ID for the given library.
    """

name = schema.library

library_data = get_library_data(name)
starting_id, _ = library_data.get("id_range", (UNKNOWN_LIBRARY_VALUE, UNKNOWN_LIBRARY_VALUE))
if not name:
name = "standard"
return name.capitalize(), starting_id
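
An illustrative call; the returned ID depends on the cached library data:

from hed.schema.hed_schema_io import load_schema_version
from hed.schema.schema_io.df_util import get_library_name_and_id

schema = load_schema_version("8.3.0")  # illustrative version
name, first_id = get_library_name_and_id(schema)  # ("Standard", <first id in range>)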


# todo: Replace this once we no longer support < python 3.9
def remove_prefix(text, prefix):
if text and text.startswith(prefix):
return text[len(prefix):]
return text
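
For reference, the Python 3.9+ built-in this todo alludes to behaves the same for strings:

from hed.schema.schema_io.df_util import remove_prefix

print("HED_0000301".removeprefix("HED_"))    # "0000301"
print(remove_prefix("HED_0000301", "HED_"))  # "0000301"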


def calculate_attribute_type(attribute_entry):
    """Return the type of this attribute (annotation, object, or data).

    Parameters:
        attribute_entry: The schema entry for the attribute to classify.
    Returns:
        attribute_type(str): "annotation", "object", or "data".
    """
attributes = attribute_entry.attributes
object_ranges = {HedKey.TagRange, HedKey.UnitRange, HedKey.UnitClassRange, HedKey.ValueClassRange}
if HedKey.AnnotationProperty in attributes:
return "annotation"
elif any(attribute in object_ranges for attribute in attributes):
return "object"
return "data"


def get_attributes_from_row(row):
""" Get the tag attributes from a line.
Parameters:
row (pd.Series): A tag line.
Returns:
dict: Dictionary of attributes.
"""
if constants.properties in row.index:
attr_string = row[constants.properties]
elif constants.attributes in row.index:
attr_string = row[constants.attributes]
else:
attr_string = ""

if constants.subclass_of in row.index and row[constants.subclass_of] == "HedHeader":
header_attributes, _ = _parse_header_attributes_line(attr_string)
return header_attributes
return parse_attribute_string(attr_string)
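
A final sketch of row parsing; the "Attributes" column name and the attribute syntax are assumed from the schema spreadsheet format, and the exact output shape may differ:

import pandas as pd
from hed.schema.schema_io.df_util import get_attributes_from_row

row = pd.Series({"Attributes": "takesValue, valueClass=textClass"})
print(get_attributes_from_row(row))  # e.g. {"takesValue": True, "valueClass": "textClass"}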