From 7e0f66be4066ae8b7ef43da4bd65a53ddfe1da53 Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 25 Jul 2024 12:53:32 -0500 Subject: [PATCH] Don't escape double quotes in .tsv schemas Slightly improve importing of load_dataframes/save_dataframes by adding to init --- hed/schema/schema_io/__init__.py | 1 + hed/schema/schema_io/df2schema.py | 52 ++---------------------- hed/schema/schema_io/ontology_util.py | 48 +++++++++++++++++++++- hed/scripts/add_hed_ids.py | 4 +- hed/scripts/convert_and_update_schema.py | 2 +- hed/scripts/create_ontology.py | 2 +- tests/schema/test_hed_schema_io_df.py | 5 +-- 7 files changed, 58 insertions(+), 56 deletions(-) diff --git a/hed/schema/schema_io/__init__.py b/hed/schema/schema_io/__init__.py index e69de29bb..99418f69c 100644 --- a/hed/schema/schema_io/__init__.py +++ b/hed/schema/schema_io/__init__.py @@ -0,0 +1 @@ +from hed.schema.schema_io.ontology_util import save_dataframes, load_dataframes diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py index 13ffb4731..ae44167b1 100644 --- a/hed/schema/schema_io/df2schema.py +++ b/hed/schema/schema_io/df2schema.py @@ -2,9 +2,8 @@ This module is used to create a HedSchema object from a set of .tsv files. """ import io -import os -from hed.schema.schema_io import ontology_util +from hed.schema.schema_io import ontology_util, load_dataframes from hed.schema.hed_schema_constants import HedSectionKey, HedKey from hed.errors.exceptions import HedFileError, HedExceptions from hed.schema.schema_io.base2schema import SchemaLoader @@ -23,7 +22,7 @@ class SchemaLoaderDF(SchemaLoader): """ def __init__(self, filenames, schema_as_strings_or_df, name=""): - self.filenames = self.convert_filenames_to_dict(filenames) + self.filenames = ontology_util.convert_filenames_to_dict(filenames) self.schema_as_strings_or_df = schema_as_strings_or_df if self.filenames: reported_filename = self.filenames.get(constants.STRUCT_KEY) @@ -47,39 +46,6 @@ def load_spreadsheet(cls, filenames=None, schema_as_strings_or_df=None, name="") loader = cls(filenames, schema_as_strings_or_df=schema_as_strings_or_df, name=name) return loader._load() - @staticmethod - def convert_filenames_to_dict(filenames): - """Infers filename meaning based on suffix, e.g. _Tag for the tags sheet - - Parameters: - filenames(str or None or list or dict): The list to convert to a dict - If a string with a .tsv suffix: Save to that location, adding the suffix to each .tsv file - If a string with no .tsv suffix: Save to that folder, with the contents being the separate .tsv files. 
- Returns: - filename_dict(str: str): The required suffix to filename mapping""" - result_filenames = {} - if isinstance(filenames, str): - if filenames.endswith(".tsv"): - base, base_ext = os.path.splitext(filenames) - else: - # Load as foldername/foldername_suffix.tsv - base_dir = filenames - base_filename = os.path.split(base_dir)[1] - base = os.path.join(base_dir, base_filename) - for suffix in constants.DF_SUFFIXES: - filename = f"{base}_{suffix}.tsv" - result_filenames[suffix] = filename - filenames = result_filenames - elif isinstance(filenames, list): - for filename in filenames: - remainder, suffix = filename.replace("_", "-").rsplit("-") - for needed_suffix in constants.DF_SUFFIXES: - if needed_suffix in suffix: - result_filenames[needed_suffix] = filename - filenames = result_filenames - - return filenames - def _open_file(self): if self.filenames: dataframes = load_dataframes(self.filenames) @@ -298,18 +264,6 @@ def _add_to_dict(self, row_number, row, entry, key_class): return self._add_to_dict_base(entry, key_class) -def load_dataframes(filenames): - dict_filenames = SchemaLoaderDF.convert_filenames_to_dict(filenames) - dataframes = ontology_util.create_empty_dataframes() - for key, filename in dict_filenames.items(): - try: - dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False) - except OSError: - # todo: consider if we want to report this error(we probably do) - pass # We will use a blank one for this - return dataframes - - def load_dataframes_from_strings(schema_data): """ Load the given strings/dataframes as dataframes. @@ -322,3 +276,5 @@ def load_dataframes_from_strings(schema_data): return {key: value if isinstance(value, pd.DataFrame) else pd.read_csv(io.StringIO(value), sep="\t", dtype=str, na_filter=False) for key, value in schema_data.items()} + + diff --git a/hed/schema/schema_io/ontology_util.py b/hed/schema/schema_io/ontology_util.py index 59cd34c6b..917653e1b 100644 --- a/hed/schema/schema_io/ontology_util.py +++ b/hed/schema/schema_io/ontology_util.py @@ -2,6 +2,7 @@ import os import pandas as pd +import csv from hed.schema.schema_io import schema_util from hed.errors.exceptions import HedFileError @@ -391,7 +392,40 @@ def save_dataframes(base_filename, dataframe_dict): for suffix, dataframe in dataframe_dict.items(): filename = f"{base}_{suffix}.tsv" with open(filename, mode='w', encoding='utf-8') as opened_file: - dataframe.to_csv(opened_file, sep='\t', index=False, header=True) + dataframe.to_csv(opened_file, sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE) + + +def convert_filenames_to_dict(filenames): + """Infers filename meaning based on suffix, e.g. _Tag for the tags sheet + + Parameters: + filenames(str or None or list or dict): The list to convert to a dict + If a string with a .tsv suffix: Save to that location, adding the suffix to each .tsv file + If a string with no .tsv suffix: Save to that folder, with the contents being the separate .tsv files. 
+ Returns: + filename_dict(str: str): The required suffix to filename mapping""" + result_filenames = {} + if isinstance(filenames, str): + if filenames.endswith(".tsv"): + base, base_ext = os.path.splitext(filenames) + else: + # Load as foldername/foldername_suffix.tsv + base_dir = filenames + base_filename = os.path.split(base_dir)[1] + base = os.path.join(base_dir, base_filename) + for suffix in constants.DF_SUFFIXES: + filename = f"{base}_{suffix}.tsv" + result_filenames[suffix] = filename + filenames = result_filenames + elif isinstance(filenames, list): + for filename in filenames: + remainder, suffix = filename.replace("_", "-").rsplit("-") + for needed_suffix in constants.DF_SUFFIXES: + if needed_suffix in suffix: + result_filenames[needed_suffix] = filename + filenames = result_filenames + + return filenames def get_attributes_from_row(row): @@ -429,3 +463,15 @@ def create_empty_dataframes(): constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str), constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str), } + + +def load_dataframes(filenames): + dict_filenames = convert_filenames_to_dict(filenames) + dataframes = create_empty_dataframes() + for key, filename in dict_filenames.items(): + try: + dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False) + except OSError: + # todo: consider if we want to report this error(we probably do) + pass # We will use a blank one for this + return dataframes diff --git a/hed/scripts/add_hed_ids.py b/hed/scripts/add_hed_ids.py index da05bc83d..b12ca2a60 100644 --- a/hed/scripts/add_hed_ids.py +++ b/hed/scripts/add_hed_ids.py @@ -1,7 +1,7 @@ from hed.scripts.script_util import get_prerelease_path from hed.scripts.convert_and_update_schema import convert_and_update import argparse -from hed.schema.schema_io.df2schema import SchemaLoaderDF +from hed.schema.schema_io.ontology_util import convert_filenames_to_dict # Slightly tweaked version of convert_and_update_schema.py with a new main function to allow different parameters. 
@@ -13,7 +13,7 @@ def main():
     args = parser.parse_args()
 
     basepath = get_prerelease_path(args.repo_path, schema_name=args.schema_name, schema_version=args.schema_version)
-    filenames = list(SchemaLoaderDF.convert_filenames_to_dict(basepath).values())
+    filenames = list(convert_filenames_to_dict(basepath).values())
     set_ids = True
 
     return convert_and_update(filenames, set_ids)
diff --git a/hed/scripts/convert_and_update_schema.py b/hed/scripts/convert_and_update_schema.py
index a6ff92fc1..716cff000 100644
--- a/hed/scripts/convert_and_update_schema.py
+++ b/hed/scripts/convert_and_update_schema.py
@@ -1,5 +1,5 @@
 from hed.scripts.script_util import sort_base_schemas, validate_all_schemas, add_extension
-from hed.schema.schema_io.df2schema import load_dataframes
+from hed.schema.schema_io import load_dataframes
 from hed.schema.schema_io.ontology_util import update_dataframes_from_schema, save_dataframes
 from hed.schema.hed_schema_io import load_schema, from_dataframes
 from hed.errors import get_printable_issue_string, HedFileError
diff --git a/hed/scripts/create_ontology.py b/hed/scripts/create_ontology.py
index 1f6623eb4..24b108c11 100644
--- a/hed/scripts/create_ontology.py
+++ b/hed/scripts/create_ontology.py
@@ -1,5 +1,5 @@
 from hed.errors import HedFileError, get_printable_issue_string
-from hed.schema.schema_io.df2schema import load_dataframes
+from hed.schema.schema_io import load_dataframes
 from hed.schema.schema_io.ontology_util import convert_df_to_omn
 from hed.scripts.script_util import get_prerelease_path, get_schema_filename
 import argparse
diff --git a/tests/schema/test_hed_schema_io_df.py b/tests/schema/test_hed_schema_io_df.py
index a73dafc09..9fde076b9 100644
--- a/tests/schema/test_hed_schema_io_df.py
+++ b/tests/schema/test_hed_schema_io_df.py
@@ -4,9 +4,8 @@
 import pandas as pd
 from hed.errors import HedExceptions, HedFileError
 from hed.schema.hed_schema_io import load_schema, load_schema_version, from_dataframes
-from hed.schema.schema_io.df2schema import SchemaLoaderDF
 from hed.schema import hed_schema_df_constants as df_constants
-from hed.schema.schema_io.ontology_util import create_empty_dataframes
+from hed.schema.schema_io.ontology_util import create_empty_dataframes, convert_filenames_to_dict
 
 
 class TestHedSchemaDF(unittest.TestCase):
@@ -49,7 +48,7 @@ def test_from_dataframes(self):
         filename = self.output_folder + "test_8_string.tsv"
         schema.save_as_dataframes(self.output_folder + "test_8_string.tsv")
 
-        filenames = SchemaLoaderDF.convert_filenames_to_dict(filename)
+        filenames = convert_filenames_to_dict(filename)
         new_file_strings = {}
         for key, value in filenames.items():
             with open(value, "r") as f:
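
Note on the quoting change: the behavioral part of this patch is the quoting=csv.QUOTE_NONE argument now passed to DataFrame.to_csv in save_dataframes. With the pandas default (csv.QUOTE_MINIMAL), any cell that contains a double quote is wrapped in quotes and the embedded quotes are doubled when the .tsv is written; with csv.QUOTE_NONE the cell text goes out verbatim. The snippet below is only an illustrative sketch with made-up column values, not code from the repository.

# Illustration of the quoting difference with made-up data.
import csv

import pandas as pd

df = pd.DataFrame({"Tag": ["Event"], "Description": ['An "event" marker']})

default_tsv = df.to_csv(sep="\t", index=False)
unquoted_tsv = df.to_csv(sep="\t", index=False, quoting=csv.QUOTE_NONE)

print(default_tsv)    # Description cell is escaped: "An ""event"" marker"
print(unquoted_tsv)   # Description cell is written verbatim: An "event" marker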
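
Note on the import reshuffle: convert_filenames_to_dict and load_dataframes now live in hed.schema.schema_io.ontology_util, and the new hed/schema/schema_io/__init__.py re-exports load_dataframes and save_dataframes, so the scripts and tests no longer need SchemaLoaderDF or df2schema to work with the .tsv sheets. A rough usage sketch under the patched layout follows; the base path is a hypothetical example value.

# Usage sketch against the patched layout; the base path is hypothetical.
from hed.schema.schema_io import load_dataframes, save_dataframes
from hed.schema.schema_io.ontology_util import convert_filenames_to_dict

base_path = "some_folder/my_schema.tsv"  # hypothetical example path

# Suffix -> filename mapping inferred from the base path
# (e.g. the Tag sheet maps to some_folder/my_schema_Tag.tsv).
filenames = convert_filenames_to_dict(base_path)

# Read every sheet; a sheet whose file is missing keeps the empty
# dataframe from create_empty_dataframes().
dataframes = load_dataframes(base_path)

# Write the sheets back out; with this patch, embedded double quotes
# are no longer escaped (quoting=csv.QUOTE_NONE).
save_dataframes(base_path, dataframes)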