Skip to content

Commit

Permalink
Don't escape double quotes in .tsv schemas
Browse files Browse the repository at this point in the history
Slightly improve importing of load_dataframes/save_dataframes by adding to init
  • Loading branch information
IanCa committed Jul 25, 2024
1 parent e4f4c67 commit 7e0f66b
Show file tree
Hide file tree
Showing 7 changed files with 58 additions and 56 deletions.
1 change: 1 addition & 0 deletions hed/schema/schema_io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from hed.schema.schema_io.ontology_util import save_dataframes, load_dataframes
52 changes: 4 additions & 48 deletions hed/schema/schema_io/df2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
This module is used to create a HedSchema object from a set of .tsv files.
"""
import io
import os

from hed.schema.schema_io import ontology_util
from hed.schema.schema_io import ontology_util, load_dataframes
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.schema.schema_io.base2schema import SchemaLoader
Expand All @@ -23,7 +22,7 @@ class SchemaLoaderDF(SchemaLoader):
"""

def __init__(self, filenames, schema_as_strings_or_df, name=""):
self.filenames = self.convert_filenames_to_dict(filenames)
self.filenames = ontology_util.convert_filenames_to_dict(filenames)
self.schema_as_strings_or_df = schema_as_strings_or_df
if self.filenames:
reported_filename = self.filenames.get(constants.STRUCT_KEY)
Expand All @@ -47,39 +46,6 @@ def load_spreadsheet(cls, filenames=None, schema_as_strings_or_df=None, name="")
loader = cls(filenames, schema_as_strings_or_df=schema_as_strings_or_df, name=name)
return loader._load()

@staticmethod
def convert_filenames_to_dict(filenames):
    """Infer which schema sheet each file holds from its name suffix, e.g. _Tag for the tags sheet.

    Parameters:
        filenames(str or None or list or dict): The file name(s) to convert to a dict.
            If a string with a .tsv suffix: The extension is stripped and "_<suffix>.tsv" is appended
                for each entry of constants.DF_SUFFIXES.
            If a string with no .tsv suffix: Treated as a folder, giving folder/folder_<suffix>.tsv.
            If a list: Each name is assigned to every suffix contained in the text after its last
                "_" or "-" (the whole name if it has neither).  Names matching no suffix are dropped.
            Anything else (None, dict, ...): Returned unchanged.

    Returns:
        filename_dict(dict or None): The suffix to filename mapping, or the input unchanged
            if it was neither a string nor a list.
    """
    if isinstance(filenames, str):
        if filenames.endswith(".tsv"):
            base = os.path.splitext(filenames)[0]
        else:
            # Load as foldername/foldername_suffix.tsv
            base_dir = filenames
            base_filename = os.path.split(base_dir)[1]
            base = os.path.join(base_dir, base_filename)
        return {suffix: f"{base}_{suffix}.tsv" for suffix in constants.DF_SUFFIXES}

    if isinstance(filenames, list):
        result_filenames = {}
        for filename in filenames:
            # Only the text after the LAST separator identifies the sheet.  Limiting the split to
            # one keeps names or paths containing several "_"/"-" from failing with ValueError.
            suffix = filename.replace("_", "-").rsplit("-", 1)[-1]
            for needed_suffix in constants.DF_SUFFIXES:
                if needed_suffix in suffix:
                    result_filenames[needed_suffix] = filename
        return result_filenames

    return filenames

def _open_file(self):
if self.filenames:
dataframes = load_dataframes(self.filenames)
Expand Down Expand Up @@ -298,18 +264,6 @@ def _add_to_dict(self, row_number, row, entry, key_class):
return self._add_to_dict_base(entry, key_class)


def load_dataframes(filenames):
    """Read the schema .tsv files into dataframes, one per sheet.

    Parameters:
        filenames(str or None or list or dict): Passed to SchemaLoaderDF.convert_filenames_to_dict
            to obtain the suffix to filename mapping.

    Returns:
        dataframes(dict): The result of ontology_util.create_empty_dataframes(), with an entry
            replaced by the file contents for every file that could be read.
    """
    suffix_to_path = SchemaLoaderDF.convert_filenames_to_dict(filenames)
    loaded = ontology_util.create_empty_dataframes()
    for suffix, path in suffix_to_path.items():
        try:
            frame = pd.read_csv(path, sep="\t", dtype=str, na_filter=False)
        except OSError:
            # todo: consider if we want to report this error(we probably do)
            continue  # Keep the blank dataframe for this sheet
        loaded[suffix] = frame
    return loaded


def load_dataframes_from_strings(schema_data):
""" Load the given strings/dataframes as dataframes.
Expand All @@ -322,3 +276,5 @@ def load_dataframes_from_strings(schema_data):
return {key: value if isinstance(value, pd.DataFrame) else pd.read_csv(io.StringIO(value), sep="\t",
dtype=str, na_filter=False)
for key, value in schema_data.items()}


48 changes: 47 additions & 1 deletion hed/schema/schema_io/ontology_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os

import pandas as pd
import csv

from hed.schema.schema_io import schema_util
from hed.errors.exceptions import HedFileError
Expand Down Expand Up @@ -391,7 +392,40 @@ def save_dataframes(base_filename, dataframe_dict):
for suffix, dataframe in dataframe_dict.items():
filename = f"{base}_{suffix}.tsv"
with open(filename, mode='w', encoding='utf-8') as opened_file:
dataframe.to_csv(opened_file, sep='\t', index=False, header=True)
dataframe.to_csv(opened_file, sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE)


def convert_filenames_to_dict(filenames):
    """Infer which schema sheet each file holds from its name suffix, e.g. _Tag for the tags sheet.

    Parameters:
        filenames(str or None or list or dict): The file name(s) to convert to a dict.
            If a string with a .tsv suffix: The extension is stripped and "_<suffix>.tsv" is appended
                for each entry of constants.DF_SUFFIXES.
            If a string with no .tsv suffix: Treated as a folder, giving folder/folder_<suffix>.tsv.
            If a list: Each name is assigned to every suffix contained in the text after its last
                "_" or "-" (the whole name if it has neither).  Names matching no suffix are dropped.
            Anything else (None, dict, ...): Returned unchanged.

    Returns:
        filename_dict(dict or None): The suffix to filename mapping, or the input unchanged
            if it was neither a string nor a list.
    """
    if isinstance(filenames, str):
        if filenames.endswith(".tsv"):
            base = os.path.splitext(filenames)[0]
        else:
            # Load as foldername/foldername_suffix.tsv
            base_dir = filenames
            base_filename = os.path.split(base_dir)[1]
            base = os.path.join(base_dir, base_filename)
        return {suffix: f"{base}_{suffix}.tsv" for suffix in constants.DF_SUFFIXES}

    if isinstance(filenames, list):
        result_filenames = {}
        for filename in filenames:
            # Only the text after the LAST separator identifies the sheet.  Limiting the split to
            # one keeps names or paths containing several "_"/"-" from failing with ValueError.
            suffix = filename.replace("_", "-").rsplit("-", 1)[-1]
            for needed_suffix in constants.DF_SUFFIXES:
                if needed_suffix in suffix:
                    result_filenames[needed_suffix] = filename
        return result_filenames

    return filenames


def get_attributes_from_row(row):
Expand Down Expand Up @@ -429,3 +463,15 @@ def create_empty_dataframes():
constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str),
}


def load_dataframes(filenames):
    """Read the schema .tsv files into dataframes, one per sheet.

    Parameters:
        filenames(str or None or list or dict): Passed to convert_filenames_to_dict to obtain
            the suffix to filename mapping.

    Returns:
        dataframes(dict): The result of create_empty_dataframes(), with an entry replaced by
            the file contents for every file that could be read.
    """
    suffix_to_path = convert_filenames_to_dict(filenames)
    loaded = create_empty_dataframes()
    for suffix, path in suffix_to_path.items():
        try:
            frame = pd.read_csv(path, sep="\t", dtype=str, na_filter=False)
        except OSError:
            # todo: consider if we want to report this error(we probably do)
            continue  # Keep the blank dataframe for this sheet
        loaded[suffix] = frame
    return loaded
4 changes: 2 additions & 2 deletions hed/scripts/add_hed_ids.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from hed.scripts.script_util import get_prerelease_path
from hed.scripts.convert_and_update_schema import convert_and_update
import argparse
from hed.schema.schema_io.df2schema import SchemaLoaderDF
from hed.schema.schema_io.ontology_util import convert_filenames_to_dict

# Slightly tweaked version of convert_and_update_schema.py with a new main function to allow different parameters.
def main():
Expand All @@ -13,7 +13,7 @@ def main():
args = parser.parse_args()

basepath = get_prerelease_path(args.repo_path, schema_name=args.schema_name, schema_version=args.schema_version)
filenames = list(SchemaLoaderDF.convert_filenames_to_dict(basepath).values())
filenames = list(convert_filenames_to_dict(basepath).values())
set_ids = True

return convert_and_update(filenames, set_ids)
Expand Down
2 changes: 1 addition & 1 deletion hed/scripts/convert_and_update_schema.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from hed.scripts.script_util import sort_base_schemas, validate_all_schemas, add_extension
from hed.schema.schema_io.df2schema import load_dataframes
from hed.schema.schema_io import load_dataframes
from hed.schema.schema_io.ontology_util import update_dataframes_from_schema, save_dataframes
from hed.schema.hed_schema_io import load_schema, from_dataframes
from hed.errors import get_printable_issue_string, HedFileError
Expand Down
2 changes: 1 addition & 1 deletion hed/scripts/create_ontology.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from hed.errors import HedFileError, get_printable_issue_string
from hed.schema.schema_io.df2schema import load_dataframes
from hed.schema.schema_io import load_dataframes
from hed.schema.schema_io.ontology_util import convert_df_to_omn
from hed.scripts.script_util import get_prerelease_path, get_schema_filename
import argparse
Expand Down
5 changes: 2 additions & 3 deletions tests/schema/test_hed_schema_io_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@
import pandas as pd
from hed.errors import HedExceptions, HedFileError
from hed.schema.hed_schema_io import load_schema, load_schema_version, from_dataframes
from hed.schema.schema_io.df2schema import SchemaLoaderDF
from hed.schema import hed_schema_df_constants as df_constants
from hed.schema.schema_io.ontology_util import create_empty_dataframes
from hed.schema.schema_io.ontology_util import create_empty_dataframes, convert_filenames_to_dict


class TestHedSchemaDF(unittest.TestCase):
Expand Down Expand Up @@ -49,7 +48,7 @@ def test_from_dataframes(self):
filename = self.output_folder + "test_8_string.tsv"
schema.save_as_dataframes(self.output_folder + "test_8_string.tsv")

filenames = SchemaLoaderDF.convert_filenames_to_dict(filename)
filenames = convert_filenames_to_dict(filename)
new_file_strings = {}
for key, value in filenames.items():
with open(value, "r") as f:
Expand Down

0 comments on commit 7e0f66b

Please sign in to comment.