Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add annotationExternal to omn generation #1000

Merged
merged 1 commit into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions hed/schema/hed_schema.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import json


from hed.schema.hed_schema_constants import HedKey, HedSectionKey, HedKeyOld
from hed.schema import hed_schema_constants as constants
from hed.schema.schema_io import schema_util
from hed.schema.schema_io import schema_util, df_util
from hed.schema.schema_io.schema2xml import Schema2XML
from hed.schema.schema_io.schema2wiki import Schema2Wiki
from hed.schema.schema_io.schema2df import Schema2DF
from hed.schema.schema_io import ontology_util


from hed.schema.hed_schema_section import (HedSchemaSection, HedSchemaTagSection, HedSchemaUnitClassSection,
HedSchemaUnitSection)
Expand Down Expand Up @@ -329,7 +328,7 @@ def save_as_dataframes(self, base_filename, save_merged=False):
- File cannot be saved for some reason.
"""
output_dfs = Schema2DF().process_schema(self, save_merged)
ontology_util.save_dataframes(base_filename, output_dfs)
df_util.save_dataframes(base_filename, output_dfs)

def set_schema_prefix(self, schema_namespace):
""" Set library namespace associated for this schema.
Expand Down
17 changes: 16 additions & 1 deletion hed/schema/hed_schema_df_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,20 @@

ATTRIBUTE_PROPERTY_KEY = "AttributeProperty"

PREFIXES_KEY = "Prefixes"
EXTERNAL_ANNOTATION_KEY = "AnnotationPropertyExternal"

PROPERTY_KEYS = [ANNOTATION_KEY, DATA_KEY, OBJECT_KEY]
DF_SUFFIXES = {TAG_KEY, STRUCT_KEY, VALUE_CLASS_KEY,
UNIT_CLASS_KEY, UNIT_KEY, UNIT_MODIFIER_KEY,
*PROPERTY_KEYS, ATTRIBUTE_PROPERTY_KEY}

section_mapping = {

DF_EXTRA_SUFFIXES = {PREFIXES_KEY, EXTERNAL_ANNOTATION_KEY}
DF_SUFFIXES_OMN = { *DF_SUFFIXES, *DF_EXTRA_SUFFIXES}


section_mapping_hed_id = {
STRUCT_KEY: None,
TAG_KEY: HedSectionKey.Tags,
VALUE_CLASS_KEY: HedSectionKey.ValueClasses,
Expand All @@ -43,6 +51,8 @@
description = "dc:description"
equivalent_to = "omn:EquivalentTo"
has_unit_class = "hasUnitClass"
annotations = "Annotations"


struct_columns = [hed_id, name, attributes, subclass_of, description, equivalent_to]
tag_columns = [hed_id, name, level, subclass_of, attributes, description, equivalent_to]
Expand Down Expand Up @@ -84,3 +94,8 @@
hed_schema_constants.WITH_STANDARD_ATTRIBUTE: "HED_0000302",
hed_schema_constants.UNMERGED_ATTRIBUTE: "HED_0000303"
}

# Extra spreadsheet column ideas
Prefix = "Prefix"
ID = "ID"
NamespaceIRI = "Namespace IRI"
12 changes: 4 additions & 8 deletions hed/schema/hed_schema_section.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,21 +278,17 @@ def _finalize_section(self, hed_schema):

split_list = self._group_by_top_level_tag(self.all_entries)
# Sort the extension allowed lists
extension_allowed_node = 0
for values in split_list:
node = values[0]
if node.has_attribute(HedKey.ExtensionAllowed):
# Make sure we sort / characters to the front.
values.sort(key=lambda x: x.long_tag_name.replace("/", "\0"))
extension_allowed_node += 1

# sort the top level nodes so extension allowed is at the bottom
split_list.sort(key=lambda x: x[0].has_attribute(HedKey.ExtensionAllowed))
# Sort ones without inLibrary to the end, and then sort library ones at the top.
split_list.sort(key=lambda x: (x[0].has_attribute(HedKey.InLibrary, return_value=True) is None,
x[0].has_attribute(HedKey.InLibrary, return_value=True)))

# sort the extension allowed top level nodes
if extension_allowed_node:
split_list[extension_allowed_node:] = sorted(split_list[extension_allowed_node:],
key=lambda x: x[0].long_tag_name)
# split_list.sort(key=lambda x: x[0].has_attribute(HedKey.ExtensionAllowed))
self.all_entries = [subitem for tag_list in split_list for subitem in tag_list]

super()._finalize_section(hed_schema)
Expand Down
3 changes: 2 additions & 1 deletion hed/schema/schema_attribute_validator_hed_id.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from hed.schema.schema_io.ontology_util import get_library_data, remove_prefix
from hed.schema.schema_io.ontology_util import get_library_data
from hed.schema.schema_io.df_util import remove_prefix
from semantic_version import Version
from hed.schema.hed_schema_io import load_schema_version
from hed.schema.hed_cache import get_hed_versions
Expand Down
2 changes: 1 addition & 1 deletion hed/schema/schema_io/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from hed.schema.schema_io.ontology_util import save_dataframes, load_dataframes
from hed.schema.schema_io.df_util import save_dataframes, load_dataframes
6 changes: 3 additions & 3 deletions hed/schema/schema_io/df2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
import io

from hed.schema.schema_io import ontology_util, load_dataframes
from hed.schema.schema_io import df_util, load_dataframes
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.schema.schema_io.base2schema import SchemaLoader
Expand All @@ -22,7 +22,7 @@ class SchemaLoaderDF(SchemaLoader):
"""

def __init__(self, filenames, schema_as_strings_or_df, name=""):
self.filenames = ontology_util.convert_filenames_to_dict(filenames)
self.filenames = df_util.convert_filenames_to_dict(filenames)
self.schema_as_strings_or_df = schema_as_strings_or_df
if self.filenames:
reported_filename = self.filenames.get(constants.STRUCT_KEY)
Expand Down Expand Up @@ -251,7 +251,7 @@ def _get_tag_attributes(self, row_number, row):
dict: Dictionary of attributes.
"""
try:
return ontology_util.get_attributes_from_row(row)
return df_util.get_attributes_from_row(row)
except ValueError as e:
self._add_fatal_error(row_number, str(row), str(e))

Expand Down
185 changes: 185 additions & 0 deletions hed/schema/schema_io/df_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
import csv
import os

import pandas as pd

from hed.errors import HedFileError, HedExceptions
from hed.schema import hed_schema_df_constants as constants
from hed.schema.hed_schema_constants import HedKey
from hed.schema.hed_cache import get_library_data
from hed.schema.schema_io.text_util import parse_attribute_string, _parse_header_attributes_line

UNKNOWN_LIBRARY_VALUE = 0


def save_dataframes(base_filename, dataframe_dict):
    """ Write each dataframe to its own tab-separated file, named by its suffix.

    Does not validate contents or suffixes.

    If base_filename has a .tsv suffix (case-insensitive), the suffix is stripped and the files are
        written alongside it. e.g. out/HED8.3.0.tsv -> out/HED8.3.0_Tag.tsv
    If base_filename is a directory (does NOT have a .tsv suffix), the files are saved inside it and
        named after it. e.g. HED8.3.0 -> HED8.3.0/HED8.3.0_Tag.tsv
    The target directory is created if it does not already exist.

    Parameters:
        base_filename(str): The base filename to use.  Output is {base_filename}_{suffix}.tsv
                            See DF_SUFFIXES for all expected names.
        dataframe_dict(dict of str: pd.DataFrame): Maps each suffix to the dataframe to save.
                                                   No validation is done.

    Raises:
        OSError: The directory could not be created or a file could not be written.
    """
    if base_filename.lower().endswith(".tsv"):
        base = os.path.splitext(base_filename)[0]
        base_dir = os.path.dirname(base)
    else:
        # Assumed as a directory name
        base_dir = base_filename
        base = os.path.join(base_dir, os.path.basename(base_dir))

    # A bare "name.tsv" has no directory component, and os.makedirs("") raises FileNotFoundError.
    if base_dir:
        os.makedirs(base_dir, exist_ok=True)

    for suffix, dataframe in dataframe_dict.items():
        filename = f"{base}_{suffix}.tsv"
        # newline="" keeps the explicit "\n" terminator from being translated by the text layer.
        with open(filename, mode='w', encoding='utf-8', newline="") as opened_file:
            dataframe.to_csv(opened_file, sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE,
                             lineterminator="\n")


def convert_filenames_to_dict(filenames, include_prefix_dfs=False):
    """Infers filename meaning based on suffix, e.g. _Tag for the tags sheet

    Parameters:
        filenames(str or None or list or dict): The filename(s) to convert to a dict
            If a string with a .tsv suffix (case-insensitive): Strip the suffix and use
                {base}_{suffix}.tsv for every known suffix.
            If a string with no .tsv suffix: Treat it as a folder holding foldername_{suffix}.tsv files.
            If a list: Map each file to the known suffix that follows its last "_" or "-".
                Files whose name does not end in a known suffix are ignored.
            Anything else (e.g. None or an existing dict) is returned unchanged.
        include_prefix_dfs(bool): If True, include the prefixes and external annotation dataframes.
    Returns:
        filename_dict(str: str): The required suffix to filename mapping"""
    dataframe_names = constants.DF_SUFFIXES_OMN if include_prefix_dfs else constants.DF_SUFFIXES

    if isinstance(filenames, str):
        # Case-insensitive so that this agrees with how save_dataframes detects a .tsv name.
        if filenames.lower().endswith(".tsv"):
            base = os.path.splitext(filenames)[0]
        else:
            # Load as foldername/foldername_suffix.tsv
            base = os.path.join(filenames, os.path.basename(filenames))
        return {suffix: f"{base}_{suffix}.tsv" for suffix in dataframe_names}

    if isinstance(filenames, list):
        result_filenames = {}
        for filename in filenames:
            # Only the file's own name carries the suffix; directory names may contain "_" or "-" too.
            name = os.path.basename(filename)
            if name.lower().endswith(".tsv"):
                name = name[:-len(".tsv")]
            # Split once from the right so extra "_"/"-" earlier in the name cannot break the unpacking.
            _, separator, suffix = name.replace("_", "-").rpartition("-")
            # Exact match: a substring test would also file "UnitClass" under "Unit".
            if separator and suffix in dataframe_names:
                result_filenames[suffix] = filename
        return result_filenames

    return filenames


def create_empty_dataframes():
    """Build the default set of empty, string-typed dataframes.

    Returns:
        dict(str: pd.DataFrame): One empty dataframe per base suffix, with that sheet's columns.
    """
    # (suffix key, column list) pairs, in the order the resulting dict should hold them.
    column_layout = (
        (constants.STRUCT_KEY, constants.struct_columns),
        (constants.TAG_KEY, constants.tag_columns),
        (constants.UNIT_KEY, constants.unit_columns),
        (constants.UNIT_CLASS_KEY, constants.other_columns),
        (constants.UNIT_MODIFIER_KEY, constants.other_columns),
        (constants.VALUE_CLASS_KEY, constants.other_columns),
        (constants.ANNOTATION_KEY, constants.property_columns),
        (constants.DATA_KEY, constants.property_columns),
        (constants.OBJECT_KEY, constants.property_columns),
        (constants.ATTRIBUTE_PROPERTY_KEY, constants.property_columns_reduced),
    )
    return {key: pd.DataFrame(columns=columns, dtype=str) for key, columns in column_layout}


def load_dataframes(filenames, include_prefix_dfs=False):
    """Load the dataframes from the source folder or series of files.

    Any file that cannot be opened is skipped, leaving the empty default dataframe in place
    for that suffix (or no entry at all for suffixes that have no default).

    Parameters:
        filenames(str or None or list or dict): The input filenames
            If a string with a .tsv suffix: Load from that location, adding the suffix to each .tsv file
            If a string with no .tsv suffix: Load from that folder, with the contents being the separate .tsv files.
            If None or empty: Nothing is loaded and the empty default dataframes are returned.
        include_prefix_dfs(bool): If True, include the prefixes and external annotation dataframes.
    Returns:
        dataframes_dict(str: dataframes): The suffix:dataframe dict

    Raises:
        HedFileError: A loaded file is missing one or more of the columns required for its suffix.
    """
    dict_filenames = convert_filenames_to_dict(filenames, include_prefix_dfs=include_prefix_dfs)
    dataframes = create_empty_dataframes()
    # convert_filenames_to_dict passes None through untouched; there is nothing to read in that case.
    if not dict_filenames:
        return dataframes

    for key, filename in dict_filenames.items():
        try:
            loaded_dataframe = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
        except OSError:
            # todo: consider if we want to report this error(we probably do)
            continue  # We will use a blank one for this

        if key in dataframes:
            required_columns = dataframes[key].columns
            # Explicit list rather than Index.any(), which tests label truthiness, not emptiness.
            columns_not_in_loaded = [column for column in required_columns
                                     if column not in loaded_dataframe.columns]
            if columns_not_in_loaded:
                raise HedFileError(HedExceptions.SCHEMA_LOAD_FAILED,
                                   f"Required column(s) {list(columns_not_in_loaded)} missing from {filename}.  "
                                   f"The required columns are {list(dataframes[key].columns)}", filename=filename)
        dataframes[key] = loaded_dataframe
    return dataframes


def get_library_name_and_id(schema):
    """ Look up the display name and the first id of the id range for a schema's library.

    Parameters:
        schema(HedSchema): The schema to check

    Returns:
        library_name(str): The capitalized library name ("Standard" when the schema has no library name)
        first_id(int): The first id of the library's "id_range", or UNKNOWN_LIBRARY_VALUE if it has none
    """
    library = schema.library

    # The lookup uses the raw library value; the "standard" fallback only affects the returned name.
    unknown_range = (UNKNOWN_LIBRARY_VALUE, UNKNOWN_LIBRARY_VALUE)
    first_id, _last_id = get_library_data(library).get("id_range", unknown_range)

    display_name = library if library else "standard"
    return display_name.capitalize(), first_id


# todo: Replace this once we no longer support < python 3.9
def remove_prefix(text, prefix):
    """Return text without a leading prefix, or text unchanged if it is empty/None or lacks the prefix.

    Parameters:
        text(str or None): The string to strip.
        prefix(str): The prefix to remove from the start of text.

    Returns:
        str or None: The stripped text.
    """
    if not text or not text.startswith(prefix):
        return text
    return text[len(prefix):]


def calculate_attribute_type(attribute_entry):
    """Classify an attribute entry as an annotation, object, or data property.

    Parameters:
        attribute_entry: An entry whose .attributes collection is checked for the deciding keys.

    Returns:
        attribute_type(str): "annotation", "object", or "data".
    """
    entry_attributes = attribute_entry.attributes
    # The annotation marker takes precedence over any range attribute.
    if HedKey.AnnotationProperty in entry_attributes:
        return "annotation"

    range_keys = {HedKey.TagRange, HedKey.UnitRange, HedKey.UnitClassRange, HedKey.ValueClassRange}
    if not range_keys.isdisjoint(entry_attributes):
        return "object"
    return "data"


def get_attributes_from_row(row):
    """ Parse the attribute string held in a dataframe row.

    The properties column is used when present, otherwise the attributes column;
    a row with neither is parsed as an empty attribute string.

    Parameters:
        row (pd.Series): A tag line.
    Returns:
        dict: Dictionary of attributes.
    """
    available_columns = row.index

    attr_string = ""
    # First column found wins, in this priority order.
    for column in (constants.properties, constants.attributes):
        if column in available_columns:
            attr_string = row[column]
            break

    is_header_row = constants.subclass_of in available_columns and row[constants.subclass_of] == "HedHeader"
    if not is_header_row:
        return parse_attribute_string(attr_string)

    # Header rows use the header-line parser; only its first return value is needed here.
    header_attributes, _ = _parse_header_attributes_line(attr_string)
    return header_attributes
Loading