Skip to content

Commit

Permalink
Update dataframe loading/saving to allow passing a folder name
Browse files Browse the repository at this point in the history
Also make saving much more efficient
  • Loading branch information
IanCa committed May 16, 2024
1 parent c61c7af commit b7ab42f
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 7 deletions.
6 changes: 5 additions & 1 deletion hed/schema/hed_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,11 @@ def save_as_xml(self, filename, save_merged=True):
opened_file.write(xml_string)

def save_as_dataframes(self, base_filename, save_merged=False):
""" Save as mediawiki to a file.
""" Save as dataframes to a folder of files.
If base_filename has a .tsv suffix, save directly to the indicated location.
If base_filename is a directory (i.e. it does NOT have a .tsv suffix), save the contents into a directory of that name.
The individual files keep the same base name, e.g. HED8.3.0/HED8.3.0_Tag.tsv
base_filename: str
save filename. A suffix will be added to most, e.g. _Tag
Expand Down
2 changes: 1 addition & 1 deletion hed/schema/hed_schema_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
hed_schema = SchemaLoaderXML.load(hed_path, schema=schema, name=name)
elif hed_path.lower().endswith(".mediawiki"):
hed_schema = SchemaLoaderWiki.load(hed_path, schema=schema, name=name)
elif hed_path.lower().endswith(".tsv"):
elif hed_path.lower().endswith(".tsv") or os.path.isdir(hed_path):
if schema is not None:
raise HedFileError(HedExceptions.INVALID_HED_FORMAT,
"Cannot pass a schema to merge into spreadsheet loading currently.", filename=name)
Expand Down
11 changes: 9 additions & 2 deletions hed/schema/schema_io/df2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,19 @@ def convert_filenames_to_dict(filenames):
Parameters:
filenames(str or None or list or dict): The list to convert to a dict
If a string with a .tsv suffix: Save to that location, adding the suffix to each .tsv file
If a string with no .tsv suffix: Save into that folder, with each section written as a separate .tsv file.
Returns:
filename_dict(str: str): The required suffix to filename mapping"""
result_filenames = {}
if isinstance(filenames, str):
base, base_ext = os.path.splitext(filenames)
if filenames.endswith(".tsv"):
base, base_ext = os.path.splitext(filenames)
else:
# Load as foldername/foldername_suffix.tsv
base_dir = filenames
base_filename = os.path.split(base_dir)[1]
base = os.path.join(base_dir, base_filename)
for suffix in constants.DF_SUFFIXES:
filename = f"{base}_{suffix}.tsv"
result_filenames[suffix] = filename
Expand Down
14 changes: 13 additions & 1 deletion hed/schema/schema_io/ontology_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,12 +360,24 @@ def save_dataframes(base_filename, dataframe_dict):
Does not validate contents or suffixes.
If base_filename has a .tsv suffix, save directly to the indicated location.
If base_filename is a directory (i.e. it does NOT have a .tsv suffix), save the contents into a directory of that name.
The individual files keep the same base name, e.g. HED8.3.0/HED8.3.0_Tag.tsv
Parameters:
base_filename(str): The base filename to use. Output is {base_filename}_{suffix}.tsv
See DF_SUFFIXES for all expected names.
dataframe_dict(dict of str: df.DataFrame): The list of files to save out. No validation is done.
"""
base, base_ext = os.path.splitext(base_filename)
if base_filename.lower().endswith(".tsv"):
base, base_ext = os.path.splitext(base_filename)
base_dir, base_name = os.path.split(base)
else:
# Assume base_filename is a directory name
base_dir = base_filename
base_filename = os.path.split(base_dir)[1]
base = os.path.join(base_dir, base_filename)
os.makedirs(base_dir, exist_ok=True)
for suffix, dataframe in dataframe_dict.items():
filename = f"{base}_{suffix}.tsv"
with open(filename, mode='w', encoding='utf-8') as opened_file:
Expand Down
7 changes: 5 additions & 2 deletions hed/schema/schema_io/schema2df.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(self, get_as_ids=False):
"""
super().__init__()
self._get_as_ids = get_as_ids
self._tag_rows = []

def _get_object_name_and_id(self, object_name, include_prefix=False):
""" Get the adjusted name and ID for the given object type.
Expand Down Expand Up @@ -67,6 +68,7 @@ def _initialize_output(self):
constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str),
}
self._tag_rows = []

def _create_and_add_object_row(self, base_object, attributes="", description=""):
name, full_hed_id = self._get_object_name_and_id(base_object)
Expand Down Expand Up @@ -95,7 +97,7 @@ def _start_section(self, key_class):
pass

def _end_tag_section(self):
    """Finalize the tag section of the output.

    Materializes all rows accumulated in self._tag_rows into a single
    DataFrame in one constructor call, which is far cheaper than
    appending rows to the DataFrame one at a time.
    """
    rows = self._tag_rows
    self.output[constants.TAG_KEY] = pd.DataFrame(
        rows, columns=constants.tag_columns, dtype=str
    )

def _write_tag_entry(self, tag_entry, parent_node=None, level=0):
tag_id = tag_entry.attributes.get(HedKey.HedID, "")
Expand All @@ -108,7 +110,8 @@ def _write_tag_entry(self, tag_entry, parent_node=None, level=0):
constants.description: tag_entry.description,
constants.equivalent_to: self._get_tag_equivalent_to(tag_entry),
}
self.output[constants.TAG_KEY].loc[len(self.output[constants.TAG_KEY])] = new_row
# Todo: do other sections like this as well for efficiency
self._tag_rows.append(new_row)

def _write_entry(self, entry, parent_node, include_props=True):
df_key = section_key_to_df.get(entry.section_key)
Expand Down

0 comments on commit b7ab42f

Please sign in to comment.