From 7e0f66be4066ae8b7ef43da4bd65a53ddfe1da53 Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 25 Jul 2024 12:53:32 -0500 Subject: [PATCH] Don't escape double quotes in .tsv schemas Slightly improve importing of load_dataframes/save_dataframes by adding to init --- hed/schema/schema_io/__init__.py | 1 + hed/schema/schema_io/df2schema.py | 52 ++---------------------- hed/schema/schema_io/ontology_util.py | 48 +++++++++++++++++++++- hed/scripts/add_hed_ids.py | 4 +- hed/scripts/convert_and_update_schema.py | 2 +- hed/scripts/create_ontology.py | 2 +- tests/schema/test_hed_schema_io_df.py | 5 +-- 7 files changed, 58 insertions(+), 56 deletions(-) diff --git a/hed/schema/schema_io/__init__.py b/hed/schema/schema_io/__init__.py index e69de29bb..99418f69c 100644 --- a/hed/schema/schema_io/__init__.py +++ b/hed/schema/schema_io/__init__.py @@ -0,0 +1 @@ +from hed.schema.schema_io.ontology_util import save_dataframes, load_dataframes diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py index 13ffb4731..ae44167b1 100644 --- a/hed/schema/schema_io/df2schema.py +++ b/hed/schema/schema_io/df2schema.py @@ -2,9 +2,8 @@ This module is used to create a HedSchema object from a set of .tsv files. """ import io -import os -from hed.schema.schema_io import ontology_util +from hed.schema.schema_io import ontology_util, load_dataframes from hed.schema.hed_schema_constants import HedSectionKey, HedKey from hed.errors.exceptions import HedFileError, HedExceptions from hed.schema.schema_io.base2schema import SchemaLoader @@ -23,7 +22,7 @@ class SchemaLoaderDF(SchemaLoader): """ def __init__(self, filenames, schema_as_strings_or_df, name=""): - self.filenames = self.convert_filenames_to_dict(filenames) + self.filenames = ontology_util.convert_filenames_to_dict(filenames) self.schema_as_strings_or_df = schema_as_strings_or_df if self.filenames: reported_filename = self.filenames.get(constants.STRUCT_KEY) @@ -47,39 +46,6 @@ def load_spreadsheet(cls, filenames=None, schema_as_strings_or_df=None, name="") loader = cls(filenames, schema_as_strings_or_df=schema_as_strings_or_df, name=name) return loader._load() - @staticmethod - def convert_filenames_to_dict(filenames): - """Infers filename meaning based on suffix, e.g. _Tag for the tags sheet - - Parameters: - filenames(str or None or list or dict): The list to convert to a dict - If a string with a .tsv suffix: Save to that location, adding the suffix to each .tsv file - If a string with no .tsv suffix: Save to that folder, with the contents being the separate .tsv files. 
- Returns: - filename_dict(str: str): The required suffix to filename mapping""" - result_filenames = {} - if isinstance(filenames, str): - if filenames.endswith(".tsv"): - base, base_ext = os.path.splitext(filenames) - else: - # Load as foldername/foldername_suffix.tsv - base_dir = filenames - base_filename = os.path.split(base_dir)[1] - base = os.path.join(base_dir, base_filename) - for suffix in constants.DF_SUFFIXES: - filename = f"{base}_{suffix}.tsv" - result_filenames[suffix] = filename - filenames = result_filenames - elif isinstance(filenames, list): - for filename in filenames: - remainder, suffix = filename.replace("_", "-").rsplit("-") - for needed_suffix in constants.DF_SUFFIXES: - if needed_suffix in suffix: - result_filenames[needed_suffix] = filename - filenames = result_filenames - - return filenames - def _open_file(self): if self.filenames: dataframes = load_dataframes(self.filenames) @@ -298,18 +264,6 @@ def _add_to_dict(self, row_number, row, entry, key_class): return self._add_to_dict_base(entry, key_class) -def load_dataframes(filenames): - dict_filenames = SchemaLoaderDF.convert_filenames_to_dict(filenames) - dataframes = ontology_util.create_empty_dataframes() - for key, filename in dict_filenames.items(): - try: - dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False) - except OSError: - # todo: consider if we want to report this error(we probably do) - pass # We will use a blank one for this - return dataframes - - def load_dataframes_from_strings(schema_data): """ Load the given strings/dataframes as dataframes. @@ -322,3 +276,5 @@ def load_dataframes_from_strings(schema_data): return {key: value if isinstance(value, pd.DataFrame) else pd.read_csv(io.StringIO(value), sep="\t", dtype=str, na_filter=False) for key, value in schema_data.items()} + + diff --git a/hed/schema/schema_io/ontology_util.py b/hed/schema/schema_io/ontology_util.py index 59cd34c6b..917653e1b 100644 --- a/hed/schema/schema_io/ontology_util.py +++ b/hed/schema/schema_io/ontology_util.py @@ -2,6 +2,7 @@ import os import pandas as pd +import csv from hed.schema.schema_io import schema_util from hed.errors.exceptions import HedFileError @@ -391,7 +392,40 @@ def save_dataframes(base_filename, dataframe_dict): for suffix, dataframe in dataframe_dict.items(): filename = f"{base}_{suffix}.tsv" with open(filename, mode='w', encoding='utf-8') as opened_file: - dataframe.to_csv(opened_file, sep='\t', index=False, header=True) + dataframe.to_csv(opened_file, sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE) + + +def convert_filenames_to_dict(filenames): + """Infers filename meaning based on suffix, e.g. _Tag for the tags sheet + + Parameters: + filenames(str or None or list or dict): The list to convert to a dict + If a string with a .tsv suffix: Save to that location, adding the suffix to each .tsv file + If a string with no .tsv suffix: Save to that folder, with the contents being the separate .tsv files. 
+ Returns: + filename_dict(str: str): The required suffix to filename mapping""" + result_filenames = {} + if isinstance(filenames, str): + if filenames.endswith(".tsv"): + base, base_ext = os.path.splitext(filenames) + else: + # Load as foldername/foldername_suffix.tsv + base_dir = filenames + base_filename = os.path.split(base_dir)[1] + base = os.path.join(base_dir, base_filename) + for suffix in constants.DF_SUFFIXES: + filename = f"{base}_{suffix}.tsv" + result_filenames[suffix] = filename + filenames = result_filenames + elif isinstance(filenames, list): + for filename in filenames: + remainder, suffix = filename.replace("_", "-").rsplit("-") + for needed_suffix in constants.DF_SUFFIXES: + if needed_suffix in suffix: + result_filenames[needed_suffix] = filename + filenames = result_filenames + + return filenames def get_attributes_from_row(row): @@ -429,3 +463,15 @@ def create_empty_dataframes(): constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str), constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str), } + + +def load_dataframes(filenames): + dict_filenames = convert_filenames_to_dict(filenames) + dataframes = create_empty_dataframes() + for key, filename in dict_filenames.items(): + try: + dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False) + except OSError: + # todo: consider if we want to report this error(we probably do) + pass # We will use a blank one for this + return dataframes diff --git a/hed/scripts/add_hed_ids.py b/hed/scripts/add_hed_ids.py index da05bc83d..b12ca2a60 100644 --- a/hed/scripts/add_hed_ids.py +++ b/hed/scripts/add_hed_ids.py @@ -1,7 +1,7 @@ from hed.scripts.script_util import get_prerelease_path from hed.scripts.convert_and_update_schema import convert_and_update import argparse -from hed.schema.schema_io.df2schema import SchemaLoaderDF +from hed.schema.schema_io.ontology_util import convert_filenames_to_dict # Slightly tweaked version of convert_and_update_schema.py with a new main function to allow different parameters. 
@@ -13,7 +13,7 @@ def main():
     args = parser.parse_args()
 
     basepath = get_prerelease_path(args.repo_path, schema_name=args.schema_name, schema_version=args.schema_version)
-    filenames = list(SchemaLoaderDF.convert_filenames_to_dict(basepath).values())
+    filenames = list(convert_filenames_to_dict(basepath).values())
     set_ids = True
 
     return convert_and_update(filenames, set_ids)
diff --git a/hed/scripts/convert_and_update_schema.py b/hed/scripts/convert_and_update_schema.py
index a6ff92fc1..716cff000 100644
--- a/hed/scripts/convert_and_update_schema.py
+++ b/hed/scripts/convert_and_update_schema.py
@@ -1,5 +1,5 @@
 from hed.scripts.script_util import sort_base_schemas, validate_all_schemas, add_extension
-from hed.schema.schema_io.df2schema import load_dataframes
+from hed.schema.schema_io import load_dataframes
 from hed.schema.schema_io.ontology_util import update_dataframes_from_schema, save_dataframes
 from hed.schema.hed_schema_io import load_schema, from_dataframes
 from hed.errors import get_printable_issue_string, HedFileError
diff --git a/hed/scripts/create_ontology.py b/hed/scripts/create_ontology.py
index 1f6623eb4..24b108c11 100644
--- a/hed/scripts/create_ontology.py
+++ b/hed/scripts/create_ontology.py
@@ -1,5 +1,5 @@
 from hed.errors import HedFileError, get_printable_issue_string
-from hed.schema.schema_io.df2schema import load_dataframes
+from hed.schema.schema_io import load_dataframes
 from hed.schema.schema_io.ontology_util import convert_df_to_omn
 from hed.scripts.script_util import get_prerelease_path, get_schema_filename
 import argparse
diff --git a/tests/schema/test_hed_schema_io_df.py b/tests/schema/test_hed_schema_io_df.py
index a73dafc09..9fde076b9 100644
--- a/tests/schema/test_hed_schema_io_df.py
+++ b/tests/schema/test_hed_schema_io_df.py
@@ -4,9 +4,8 @@
 import pandas as pd
 from hed.errors import HedExceptions, HedFileError
 from hed.schema.hed_schema_io import load_schema, load_schema_version, from_dataframes
-from hed.schema.schema_io.df2schema import SchemaLoaderDF
 from hed.schema import hed_schema_df_constants as df_constants
-from hed.schema.schema_io.ontology_util import create_empty_dataframes
+from hed.schema.schema_io.ontology_util import create_empty_dataframes, convert_filenames_to_dict
 
 
 class TestHedSchemaDF(unittest.TestCase):
@@ -49,7 +48,7 @@ def test_from_dataframes(self):
         filename = self.output_folder + "test_8_string.tsv"
         schema.save_as_dataframes(self.output_folder + "test_8_string.tsv")
 
-        filenames = SchemaLoaderDF.convert_filenames_to_dict(filename)
+        filenames = convert_filenames_to_dict(filename)
         new_file_strings = {}
         for key, value in filenames.items():
             with open(value, "r") as f:
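
Note on the quoting change: the behavioral part of this patch is the quoting=csv.QUOTE_NONE argument now passed to DataFrame.to_csv in save_dataframes. With the pandas default (csv.QUOTE_MINIMAL), any cell that contains a double quote is wrapped in quotes and the embedded quotes are doubled when the .tsv is written; with csv.QUOTE_NONE the cell text goes out verbatim. The snippet below is only an illustrative sketch with made-up column values, not code from the repository.

# Illustration of the quoting difference with made-up data.
import csv

import pandas as pd

df = pd.DataFrame({"Tag": ["Event"], "Description": ['An "event" marker']})

default_tsv = df.to_csv(sep="\t", index=False)
unquoted_tsv = df.to_csv(sep="\t", index=False, quoting=csv.QUOTE_NONE)

print(default_tsv)    # Description cell is escaped: "An ""event"" marker"
print(unquoted_tsv)   # Description cell is written verbatim: An "event" marker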
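
Note on the import reshuffle: convert_filenames_to_dict and load_dataframes now live in hed.schema.schema_io.ontology_util, and the new hed/schema/schema_io/__init__.py re-exports load_dataframes and save_dataframes, so the scripts and tests no longer need SchemaLoaderDF or df2schema to work with the .tsv sheets. A rough usage sketch under the patched layout follows; the base path is a hypothetical example value.

# Usage sketch against the patched layout; the base path is hypothetical.
from hed.schema.schema_io import load_dataframes, save_dataframes
from hed.schema.schema_io.ontology_util import convert_filenames_to_dict

base_path = "some_folder/my_schema.tsv"  # hypothetical example path

# Suffix -> filename mapping inferred from the base path
# (e.g. the Tag sheet maps to some_folder/my_schema_Tag.tsv).
filenames = convert_filenames_to_dict(base_path)

# Read every sheet; a sheet whose file is missing keeps the empty
# dataframe from create_empty_dataframes().
dataframes = load_dataframes(base_path)

# Write the sheets back out; with this patch, embedded double quotes
# are no longer escaped (quoting=csv.QUOTE_NONE).
save_dataframes(base_path, dataframes)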