Updating word cloud visualization in HED tag summary to have parameters

hed-standard · Feb 19, 2024 · a91a4dd · a91a4dd
1 parent 9766bea
commit a91a4dd
Show file tree

Hide file tree

Showing 5 changed files with 95 additions and 59 deletions.
diff --git a/hed/tools/remodeling/dispatcher.py b/hed/tools/remodeling/dispatcher.py
@@ -37,8 +37,8 @@ def __init__(self, operation_list, data_root=None,
 
         :raises ValueError:
             - If any of the operations cannot be parsed correctly.
-
         """
+
         self.data_root = data_root
         self.backup_name = backup_name
         self.backup_man = None
@@ -60,7 +60,6 @@ def get_summaries(self, file_formats=['.txt', '.json']):
 
         Returns:
             list: A list of dictionaries of summaries keyed to filenames.
-
         """
 
         summary_list = []
@@ -101,9 +100,9 @@ def get_data_file(self, file_designator):
               In this case, the corresponding backup file is read and returned.    
             - If a string is passed and there is no backup manager,
               the data file corresponding to the file_designator is read and returned.    
-            - If a Pandas DataFrame, return a copy.   
-
+            - If a Pandas DataFrame, return a copy.
         """
+
         if isinstance(file_designator, pd.DataFrame):
             return file_designator.copy()
         if self.backup_man:
@@ -126,7 +125,6 @@ def get_summary_save_dir(self):
 
         :raises HedFileError:
             - If this dispatcher does not have a data_root.
-
         """
 
         if self.data_root:
@@ -143,7 +141,6 @@ def run_operations(self, file_path, sidecar=None, verbose=False):
 
         Returns:
             DataFrame:  The processed dataframe.
-
         """
 
         # string to functions
@@ -173,8 +170,8 @@ def save_summaries(self, save_formats=['.json', '.txt'], individual_summaries="s
             - "consolidated" means that the overall summary and summaries of individual files are in one summary file.  
             - "individual" means that the summaries of individual files are in separate files.
             - "none" means that only the overall summary is produced.
-
         """
+
         if not save_formats:
             return
         if not summary_dir:
@@ -185,6 +182,15 @@ def save_summaries(self, save_formats=['.json', '.txt'], individual_summaries="s
 
     @staticmethod
     def parse_operations(operation_list):
+        """ Return a parsed list of remodeler operations.
+
+        Parameters:
+            operation_list (list): List of JSON remodeler operations.
+
+        Returns:
+            list: List of Python objects containing parsed remodeler operations.
+        """
+
         operations = []
         for index, item in enumerate(operation_list):
             new_operation = valid_operations[item["operation"]](item["parameters"])
@@ -197,24 +203,24 @@ def prep_data(df):
 
         Parameters:
             df (DataFrame) - The DataFrame to be processed.
-
         """
+
         result = df.replace('n/a', np.nan)
         # Comment in the next line if this behavior was actually needed, but I don't think it is.
         # result = result.infer_objects(copy=False)
         return result
 
     @staticmethod
     def post_proc_data(df):
-        """ Replace all nan entries with 'n/a' for BIDS compliance
+        """ Replace all nan entries with 'n/a' for BIDS compliance.
 
         Parameters:
             df (DataFrame): The DataFrame to be processed.
 
         Returns:
-            DataFrame: DataFrame with the 'np.NAN replaced by 'n/a'
-
+            DataFrame: DataFrame with np.NAN replaced by 'n/a'.
         """
+
         dtypes = df.dtypes.to_dict()
         for col_name, typ in dtypes.items():
             if typ == 'category':
@@ -232,10 +238,9 @@ def errors_to_str(messages, title="", sep='\n'):
 
         Returns:
             str:  Single string representing the messages.
-
-
         """
-        error_list = [0]*len(messages)
+
+        error_list = [0] * len(messages)
         for index, message in enumerate(messages):
             error_list[index] = f"Operation[{message.get('index', None)}] " + \
                                 f"has error:{message.get('error_type', None)}" + \
@@ -255,8 +260,8 @@ def get_schema(hed_versions):
 
         Returns:
              HedSchema or HedSchemaGroup: Objects loaded from the hed_versions specification.
-
         """
+
         if not hed_versions:
             return None
         elif isinstance(hed_versions, str) or isinstance(hed_versions, list):

diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py
@@ -74,7 +74,43 @@ class SummarizeHedTagsOp(BaseOp):
                 "type": "boolean"
             },
             "word_cloud": {
-                "type": "boolean"
+                "type": "object",
+                "properties": {
+                    "height": {
+                        "type": "integer"
+                    },
+                    "width": {
+                        "type": "integer"
+                    },
+                    "prefer_horizontal": {
+                        "type": "number"
+                    },
+                    "min_font_size": {
+                        "type": "number"
+                    },
+                    "max_font_size": {
+                        "type": "number"
+                    },
+                    "scale_adjustment": {
+                        "type": "number"
+                    },
+                    "contour_width": {
+                        "type": "number"
+                    },
+                    "contour_color": {
+                        "type": "string"
+                    },
+                    "background_color": {
+                        "type": "string"
+                    },
+                    "use_mask": {
+                        "type": "boolean"
+                    },
+                    "mask_path": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": False
             },
         },
         "required": [
@@ -102,7 +138,26 @@ def __init__(self, parameters):
         self.include_context = parameters.get('include_context', True)
         self.replace_defs = parameters.get("replace_defs", True)
         self.remove_types = parameters.get("remove_types", [])
-        self.word_cloud = parameters.get("word_cloud", False)
+        if "word_cloud" not in parameters:
+            self.word_cloud = None
+        else:
+            wc_params = parameters["word_cloud"]
+            self.word_cloud = {
+                "height": wc_params.get("height", 300),
+                "width": wc_params.get("width", 400),
+                "prefer_horizontal": wc_params.get("prefer_horizontal", 0.75),
+                "min_font_size": wc_params.get("min_font_size", 8),
+                "max_font_size": wc_params.get("max_font_size", 15),
+                "scale_adjustment": wc_params.get("scale_adjustment", 7),
+                "contour_width": wc_params.get("contour_width", 3),
+                "contour_color": wc_params.get("contour_color", 'black'),
+                "background_color": wc_params.get("background_color", None),
+                "use_mask": wc_params.get("use_mask", False),
+                "mask_path": wc_params.get("mask_path", None)
+            }
+            if self.word_cloud["use_mask"] and not self.word_cloud["mask_path"]:
+                self.word_cloud["mask_path"] = os.path.realpath(os.path.join(os.path.dirname(__file__),
+                                                                '../../../resources/word_cloud_brain_mask.png'))
 
     def do_op(self, dispatcher, df, name, sidecar=None):
         """ Summarize the HED tags present in the dataset.
@@ -144,6 +199,7 @@ def __init__(self, sum_op):
             sum_op (BaseOp): Operation associated with this summary.
 
         """
+
         super().__init__(sum_op)
         self.sum_op = sum_op
 
@@ -237,31 +293,34 @@ def save_visualizations(self, save_dir, file_formats=['.svg'], individual_summar
         """
         if not self.sum_op.word_cloud:
             return
+        else:
+            wc = self.sum_op.word_cloud
         # summary = self.get_summary(individual_summaries='none')
         summary = self.get_summary(individual_summaries='none')
         overall_summary = summary.get("Dataset", {})
         overall_summary = overall_summary.get("Overall summary", {})
         specifics = overall_summary.get("Specifics", {})
-        word_dict = self.summary_to_dict(specifics)
-        width = 400
-        height = 300
-        mask_path = os.path.realpath(os.path.join(os.path.dirname(__file__),
-                                                  '../../../resources/word_cloud_brain_mask.png'))
-        tag_wc = create_wordcloud(word_dict, mask_path=mask_path, width=width, height=height)
+        word_dict = self.summary_to_dict(specifics, scale_adjustment=wc["scale_adjustment"])
+
+        tag_wc = create_wordcloud(word_dict, mask_path=wc["mask_path"], width=wc["width"], height=wc["height"],
+                                  prefer_horizontal=wc["prefer_horizontal"], background_color=wc["background_color"],
+                                  min_font_size=wc["min_font_size"], max_font_size=wc["max_font_size"],
+                                  contour_width=wc["contour_width"], contour_color=wc["contour_color"])
         svg_data = word_cloud_to_svg(tag_wc)
         cloud_filename = os.path.realpath(os.path.join(save_dir, self.op.summary_name, '_word_cloud.svg'))
         with open(cloud_filename, "w") as outfile:
             outfile.writelines(svg_data)
 
     @staticmethod
-    def summary_to_dict(specifics, transform=np.log10, adjustment=7):
+    def summary_to_dict(specifics, transform=np.log10, scale_adjustment=7):
         """Convert a HedTagSummary json specifics dict into the word cloud input format.
 
         Parameters:
             specifics(dict): Dictionary with keys "Main tags" and "Other tags".
             transform(func): The function to transform the number of found tags.
                              Default log10
-            adjustment(int): Value added after transform.
+            scale_adjustment(int): Value added after transform.
+
         Returns:
             word_dict(dict): a dict of the words and their occurrence count.
 
@@ -278,10 +337,10 @@ def transform(x):
             if tag == "Exclude tags":
                 continue
             for tag_sub_dict in tag_sub_list:
-                word_dict[tag_sub_dict['tag']] = transform(tag_sub_dict['events']) + adjustment
+                word_dict[tag_sub_dict['tag']] = transform(tag_sub_dict['events']) + scale_adjustment
         other_dict = specifics.get("Other tags", [])
         for tag_sub_list in other_dict:
-            word_dict[tag_sub_list['tag']] = transform(tag_sub_list['events']) + adjustment
+            word_dict[tag_sub_list['tag']] = transform(tag_sub_list['events']) + scale_adjustment
         return word_dict
 
     @staticmethod

diff --git a/hed/tools/visualization/__init__.py b/hed/tools/visualization/__init__.py
@@ -1,4 +1,4 @@
 """ Visualization tools for HED. """
 
-from .tag_word_cloud import create_wordcloud, summary_to_dict, word_cloud_to_svg
+from .tag_word_cloud import create_wordcloud, word_cloud_to_svg
 
diff --git a/hed/tools/visualization/tag_word_cloud.py b/hed/tools/visualization/tag_word_cloud.py
@@ -41,7 +41,8 @@ def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400
     kwargs.setdefault('color_func', default_color_func)
     kwargs.setdefault('relative_scaling', 1)
     kwargs.setdefault('max_font_size', height / 20)
-    kwargs.setdefault('min_font_size', 8)
+    kwargs.setdefault('min_font_size', 8),
+
 
     wc = WordCloud(background_color=background_color, mask=mask_image,
                    width=width, height=height, mode="RGBA", **kwargs)
@@ -66,35 +67,6 @@ def word_cloud_to_svg(wc):
     return svg_string
 
 
-def summary_to_dict(summary, transform=np.log10, adjustment=5):
-    """Convert a HedTagSummary JSON dict into the word cloud input format.
-
-    Parameters:
-        summary(dict): The summary from a SummarizeHedTagsOp.
-        transform(func): The function to transform the number of found tags (Default log10).
-        adjustment(int): Value added after transform.
-
-    Returns:
-        word_dict(dict): A dict of the words and their occurrence count.
-
-    :raises KeyError:
-        A malformed dictionary was passed.
-
-    """
-    if transform is None:
-        def transform(x):
-            return x
-    overall_summary = summary.get("Overall summary", {})
-    specifics = overall_summary.get("Specifics", {})
-    tag_dict = specifics.get("Main tags", {})
-    word_dict = {}
-    for tag_sub_list in tag_dict.values():
-        for tag_sub_dict in tag_sub_list:
-            word_dict[tag_sub_dict['tag']] = transform(tag_sub_dict['events']) + adjustment
-
-    return word_dict
-
-
 def load_and_resize_mask(mask_path, width=None, height=None):
     """ Load a mask image and resize it according to given dimensions.
 

diff --git a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py
@@ -274,7 +274,7 @@ def test_convert_summary_to_word_dict(self):
         }
         expected_output = {'tag1': 5, 'tag2': 3, 'tag3': 7}
 
-        word_dict = HedTagSummary.summary_to_dict(summary_json, transform=None, adjustment=0)
+        word_dict = HedTagSummary.summary_to_dict(summary_json, transform=None, scale_adjustment=0)
         self.assertEqual(word_dict, expected_output)