From b7ab42f705681f969456f00a0366c2cfb1442861 Mon Sep 17 00:00:00 2001
From: IanCa
Date: Thu, 16 May 2024 15:26:28 -0500
Subject: [PATCH 1/3] Update dataframe loading/saving to allow passing a folder name

Also make saving much more efficient
---
 hed/schema/hed_schema.py              |  6 +++++-
 hed/schema/hed_schema_io.py           |  2 +-
 hed/schema/schema_io/df2schema.py     | 11 +++++++++--
 hed/schema/schema_io/ontology_util.py | 14 +++++++++++++-
 hed/schema/schema_io/schema2df.py     |  7 +++++--
 5 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py
index b82b87bc..0c47538a 100644
--- a/hed/schema/hed_schema.py
+++ b/hed/schema/hed_schema.py
@@ -310,7 +310,11 @@ def save_as_xml(self, filename, save_merged=True):
             opened_file.write(xml_string)
 
     def save_as_dataframes(self, base_filename, save_merged=False):
-        """ Save as mediawiki to a file.
+        """ Save as dataframes to a folder of files.
+
+        If base_filename has a .tsv suffix, save directly to the indicated location.
+        If base_filename is a directory (does NOT have a .tsv suffix), save the contents into a directory of that name.
+        The individual files use the directory name as a prefix, e.g. HED8.3.0/HED8.3.0_Tag.tsv.
 
         base_filename: str save filename. A suffix will be added to most, e.g. _Tag
diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py
index 04661a80..7f134d45 100644
--- a/hed/schema/hed_schema_io.py
+++ b/hed/schema/hed_schema_io.py
@@ -94,7 +94,7 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None):
         hed_schema = SchemaLoaderXML.load(hed_path, schema=schema, name=name)
     elif hed_path.lower().endswith(".mediawiki"):
         hed_schema = SchemaLoaderWiki.load(hed_path, schema=schema, name=name)
-    elif hed_path.lower().endswith(".tsv"):
+    elif hed_path.lower().endswith(".tsv") or os.path.isdir(hed_path):
         if schema is not None:
             raise HedFileError(HedExceptions.INVALID_HED_FORMAT,
                                "Cannot pass a schema to merge into spreadsheet loading currently.", filename=name)
diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py
index fe69c82b..71de1a09 100644
--- a/hed/schema/schema_io/df2schema.py
+++ b/hed/schema/schema_io/df2schema.py
@@ -53,12 +53,19 @@ def convert_filenames_to_dict(filenames):
 
     Parameters:
         filenames(str or None or list or dict): The list to convert to a dict
-
+            If a string with a .tsv suffix: use it as the base path, adding each suffix to form the .tsv filenames.
+            If a string with no .tsv suffix: treat it as a folder, with the individual .tsv files inside it.
     Returns:
         filename_dict(str: str): The required suffix to filename mapping"""
     result_filenames = {}
     if isinstance(filenames, str):
-        base, base_ext = os.path.splitext(filenames)
+        if filenames.endswith(".tsv"):
+            base, base_ext = os.path.splitext(filenames)
+        else:
+            # Load as foldername/foldername_suffix.tsv
+            base_dir = filenames
+            base_filename = os.path.split(base_dir)[1]
+            base = os.path.join(base_dir, base_filename)
         for suffix in constants.DF_SUFFIXES:
             filename = f"{base}_{suffix}.tsv"
             result_filenames[suffix] = filename
diff --git a/hed/schema/schema_io/ontology_util.py b/hed/schema/schema_io/ontology_util.py
index 7461bf56..25d53c49 100644
--- a/hed/schema/schema_io/ontology_util.py
+++ b/hed/schema/schema_io/ontology_util.py
@@ -360,12 +360,24 @@ def save_dataframes(base_filename, dataframe_dict):
     Does not validate contents or suffixes.
 
+    If base_filename has a .tsv suffix, save directly to the indicated location.
+    If base_filename is a directory (does NOT have a .tsv suffix), save the contents into a directory of that name.
+    The individual files use the directory name as a prefix, e.g. HED8.3.0/HED8.3.0_Tag.tsv.
+
     Parameters:
         base_filename(str): The base filename to use.
             Output is {base_filename}_{suffix}.tsv
             See DF_SUFFIXES for all expected names.
         dataframe_dict(dict of str: df.DataFrame): The list of files to save out.  No validation is done.
     """
-    base, base_ext = os.path.splitext(base_filename)
+    if base_filename.lower().endswith(".tsv"):
+        base, base_ext = os.path.splitext(base_filename)
+        base_dir, base_name = os.path.split(base)
+    else:
+        # Assumed to be a directory name
+        base_dir = base_filename
+        base_filename = os.path.split(base_dir)[1]
+        base = os.path.join(base_dir, base_filename)
+    os.makedirs(base_dir, exist_ok=True)
     for suffix, dataframe in dataframe_dict.items():
         filename = f"{base}_{suffix}.tsv"
         with open(filename, mode='w', encoding='utf-8') as opened_file:
diff --git a/hed/schema/schema_io/schema2df.py b/hed/schema/schema_io/schema2df.py
index 9c2b6c79..1728a8dc 100644
--- a/hed/schema/schema_io/schema2df.py
+++ b/hed/schema/schema_io/schema2df.py
@@ -28,6 +28,7 @@ def __init__(self, get_as_ids=False):
         """
         super().__init__()
         self._get_as_ids = get_as_ids
+        self._tag_rows = []
 
     def _get_object_name_and_id(self, object_name, include_prefix=False):
         """ Get the adjusted name and ID for the given object type.
@@ -67,6 +68,7 @@ def _initialize_output(self):
             constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
             constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str),
         }
+        self._tag_rows = []
 
     def _create_and_add_object_row(self, base_object, attributes="", description=""):
         name, full_hed_id = self._get_object_name_and_id(base_object)
@@ -95,7 +97,7 @@ def _start_section(self, key_class):
         pass
 
     def _end_tag_section(self):
-        pass
+        self.output[constants.TAG_KEY] = pd.DataFrame(self._tag_rows, columns=constants.tag_columns, dtype=str)
 
     def _write_tag_entry(self, tag_entry, parent_node=None, level=0):
         tag_id = tag_entry.attributes.get(HedKey.HedID, "")
@@ -108,7 +110,8 @@ def _write_tag_entry(self, tag_entry, parent_node=None, level=0):
             constants.description: tag_entry.description,
             constants.equivalent_to: self._get_tag_equivalent_to(tag_entry),
         }
-        self.output[constants.TAG_KEY].loc[len(self.output[constants.TAG_KEY])] = new_row
+        # Todo: do other sections like this as well for efficiency
+        self._tag_rows.append(new_row)
 
     def _write_entry(self, entry, parent_node, include_props=True):
         df_key = section_key_to_df.get(entry.section_key)

From b768dc9b5ecd9c7cab1b2b2ea4ce79a2c3d2c8b1 Mon Sep 17 00:00:00 2001
From: IanCa
Date: Thu, 16 May 2024 16:11:41 -0500
Subject: [PATCH 2/3] add tests

---
 tests/schema/test_hed_schema_io_df.py | 23 +++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/schema/test_hed_schema_io_df.py b/tests/schema/test_hed_schema_io_df.py
index a82ad2e7..bf59dbc1 100644
--- a/tests/schema/test_hed_schema_io_df.py
+++ b/tests/schema/test_hed_schema_io_df.py
@@ -62,3 +62,26 @@ def test_from_dataframes(self):
         reloaded_schema = from_dataframes(dfs)
 
         self.assertEqual(schema, reloaded_schema)
+    def test_save_load_location(self):
+        schema = load_schema_version("8.3.0")
+        schema_name = "test_output"
+        output_location = self.output_folder + schema_name
+        schema.save_as_dataframes(output_location)
+        expected_location = os.path.join(output_location, f"{schema_name}_Tag.tsv")
+        self.assertTrue(os.path.exists(expected_location))
+
+        reloaded_schema = load_schema(output_location)
+
+        self.assertEqual(schema, reloaded_schema)
+
+    def test_save_load_location2(self):
+        schema = load_schema_version("8.3.0")
+        schema_name = "test_output"
+        output_location = self.output_folder + schema_name + ".tsv"
+        schema.save_as_dataframes(output_location)
+        expected_location = self.output_folder + schema_name + "_Tag.tsv"
+        self.assertTrue(os.path.exists(expected_location))
+
+        reloaded_schema = load_schema(output_location)
+
+        self.assertEqual(schema, reloaded_schema)

From 40964c23d26674db394099174e26bb1d5e98f337 Mon Sep 17 00:00:00 2001
From: IanCa
Date: Thu, 16 May 2024 16:40:58 -0500
Subject: [PATCH 3/3] Access matplotlib color map in a different way

---
 hed/tools/remodeling/operations/summarize_hed_tags_op.py | 2 +-
 hed/tools/visualization/word_cloud_util.py               | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py
index a45e7b86..cb1b6296 100644
--- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py
+++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py
@@ -335,7 +335,7 @@ def save_visualizations(self, save_dir, file_formats=['.svg'], individual_summar
         specifics = overall_summary.get("Specifics", {})
         word_dict = self.summary_to_dict(specifics, scale_adjustment=wc["scale_adjustment"])
 
-        tag_wc = tag_word_cloud.tag_word_cloud.create_wordcloud(word_dict, mask_path=wc["mask_path"],
+        tag_wc = tag_word_cloud.create_wordcloud(word_dict, mask_path=wc["mask_path"],
                                                  width=wc["width"], height=wc["height"],
                                                  prefer_horizontal=wc["prefer_horizontal"], background_color=wc["background_color"],
                                                  min_font_size=wc["min_font_size"], max_font_size=wc["max_font_size"],
diff --git a/hed/tools/visualization/word_cloud_util.py b/hed/tools/visualization/word_cloud_util.py
index 8026f0d4..00d8e07a 100644
--- a/hed/tools/visualization/word_cloud_util.py
+++ b/hed/tools/visualization/word_cloud_util.py
@@ -4,7 +4,7 @@
 import numpy as np
 from PIL import Image, ImageFilter
 
-from matplotlib import cm
+import matplotlib as mpl
 import wordcloud as wcloud
 
 
@@ -139,7 +139,7 @@ def __init__(self, colormap='nipy_spectral', color_range=(0.0, 0.5), color_step_
             This is the speed at which it goes through the range chosen.
             .25 means it will go through 1/4 of the range each pick.
         """
-        self.colormap = cm.get_cmap(colormap)
+        self.colormap = mpl.colormaps[colormap]
         self.color_range = color_range
         self.color_step_range = color_step_range
         self.current_fraction = random.uniform(0, 1)  # Start at a random point
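
A minimal usage sketch of the save/load round trip these patches enable, mirroring the tests added in PATCH 2/3. The top-level "from hed import ..." entry points and the my_schema paths are assumptions for illustration, not taken from the diffs above:

    from hed import load_schema, load_schema_version

    schema = load_schema_version("8.3.0")

    # Directory form: creates my_schema/ and writes my_schema/my_schema_Tag.tsv
    # (plus the files for the other suffixes).  Path names here are illustrative only.
    schema.save_as_dataframes("my_schema")
    reloaded = load_schema("my_schema")  # load_schema now also accepts a directory
    assert schema == reloaded

    # .tsv form: writes my_schema_Tag.tsv (and the other suffixes) next to the given path.
    schema.save_as_dataframes("my_schema.tsv")
    reloaded = load_schema("my_schema.tsv")
    assert schema == reloaded

As described in the save_as_dataframes docstring, the directory form names each file after the directory itself, e.g. HED8.3.0/HED8.3.0_Tag.tsv.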