From 836105534646c385d1c609e35afbc424c75ee262 Mon Sep 17 00:00:00 2001
From: WiemKhlifi
Date: Mon, 13 Jan 2025 14:58:57 +0100
Subject: [PATCH] feat: enhance plotting functions with customizable colors and improved layout

---
 examples/simple_example.py             | 203 ++++++++++---------
 marl_eval/plotting_tools/plot_utils.py |  12 +-
 marl_eval/plotting_tools/plotting.py   |  45 +++++-
 3 files changed, 123 insertions(+), 137 deletions(-)

diff --git a/examples/simple_example.py b/examples/simple_example.py
index 0d59807c..5f826caf 100644
--- a/examples/simple_example.py
+++ b/examples/simple_example.py
@@ -27,20 +27,26 @@
     create_matrices_for_rliable,
     data_process_pipeline,
 )
-
-##############################
-# Read in and process data
-##############################
-METRICS_TO_NORMALIZE = ["return"]
-LEGEND_MAP = {
-    "algo_1": "Algorithm 1",
-    "algo_2": "Algorithm 2",
-    "algo_3": "Algorithm 3",
-    "algo_4": "Algorithm 4",
-    "algo_5": "Algorithm 5",
+from marl_eval.json_tools.json_utils import concatenate_json_files, pull_neptune_data
+import matplotlib.pyplot as plt
+import colorcet as cc
+import seaborn as sns
+import numpy as np
+
+METRICS_TO_NORMALIZE = []
+metric_name="episode_return"
+legend_map = {
+    "rec_mappo": "MAPPO",
+    "ff_ippo": "IPPO",
+    "rec_qmix": "QMIX",
+    "rec_iql": "IQL"
 }
-with open("examples/example_results.json") as f:
+algorithms = list(legend_map.values())
+colors = dict(zip(algorithms, sns.color_palette(cc.glasbey_category10)))
+
+env_name = "LevelBasedForaging"
+with open("concatenated_json_files/metrics.json") as f:
     raw_data = json.load(f)
 
 processed_data = data_process_pipeline(
@@ -49,7 +55,7 @@
 
 environment_comparison_matrix, sample_effeciency_matrix = create_matrices_for_rliable(
     data_dictionary=processed_data,
-    environment_name="env_1",
+    environment_name=env_name,
     metrics_to_normalize=METRICS_TO_NORMALIZE,
 )
 
@@ -60,70 +66,58 @@
 ##############################
 # Plot success rate data
 ##############################
-
-# Aggregate data over a single task.
-
-task = "task_1"
-fig = plot_single_task(
-    processed_data=processed_data,
-    environment_name="env_1",
-    task_name=task,
-    metric_name="success_rate",
-    metrics_to_normalize=METRICS_TO_NORMALIZE,
-    legend_map=LEGEND_MAP,
-)
-
-fig.figure.savefig(
-    f"examples/plots/env_1_{task}_agg_success_rate.png", bbox_inches="tight"
-)
-
-# Aggregate data over all environment tasks.
-
-fig = performance_profiles(
-    environment_comparison_matrix,
-    metric_name="success_rate",
-    metrics_to_normalize=METRICS_TO_NORMALIZE,
-    legend_map=LEGEND_MAP,
-)
-fig.figure.savefig(
-    "examples/plots/success_rate_performance_profile.png", bbox_inches="tight"
-)
-
-fig, _, _ = aggregate_scores( # type: ignore
-    environment_comparison_matrix,
-    metric_name="success_rate",
-    metrics_to_normalize=METRICS_TO_NORMALIZE,
-    save_tabular_as_latex=True,
-    legend_map=LEGEND_MAP,
-)
-fig.figure.savefig(
-    "examples/plots/success_rate_aggregate_scores.png", bbox_inches="tight"
-)
-
-fig = probability_of_improvement(
-    environment_comparison_matrix,
-    metric_name="success_rate",
-    metrics_to_normalize=METRICS_TO_NORMALIZE,
-    algorithms_to_compare=[
-        ["algo_1", "algo_2"],
-        ["algo_1", "algo_3"],
-        ["algo_2", "algo_4"],
-    ],
-    legend_map=LEGEND_MAP,
-)
-fig.figure.savefig(
-    "examples/plots/success_rate_prob_of_improvement.png", bbox_inches="tight"
-)
+# # Aggregate data over all environment tasks.
+# fig = performance_profiles( +# environment_comparison_matrix, +# metric_name=metric_name, +# metrics_to_normalize=METRICS_TO_NORMALIZE, +# legend_map=legend_map, +# colors=colors, +# ) +# # plt.legend(loc='lower center', bbox_to_anchor=(0.5, 0.9), prop={'size': 17}, ncol=5, bbox_transform=plt.gcf().transFigure, borderaxespad=0.2, frameon=True) +# fig.figure.savefig("examples/plots/return_performance_profile.pdf", bbox_inches="tight") + +# fig, _, _ = aggregate_scores( # type: ignore +# environment_comparison_matrix, +# metric_name=metric_name, +# metrics_to_normalize=METRICS_TO_NORMALIZE, +# save_tabular_as_latex=True, +# legend_map=legend_map, +# ) +# fig.figure.savefig( "examples/plots/return_aggregate_scores.pdf", bbox_inches="tight") + +# fig = probability_of_improvement( +# environment_comparison_matrix, +# metric_name=metric_name, +# metrics_to_normalize=METRICS_TO_NORMALIZE, +# algorithms_to_compare=[ +# ["ff_mappo", "ff_ippo"], +# ["rec_mappo", "rec_ippo"], +# ["ff_mappo", "rec_mappo"], +# ["ff_ippo", "rec_ippo"], +# ], +# legend_map=legend_map, +# ) +# fig.figure.savefig("examples/plots/return_prob_of_improvement.pdf", bbox_inches="tight") fig, _, _ = sample_efficiency_curves( # type: ignore sample_effeciency_matrix, - metric_name="success_rate", + metric_name=metric_name, metrics_to_normalize=METRICS_TO_NORMALIZE, - legend_map=LEGEND_MAP, -) -fig.figure.savefig( - "examples/plots/success_rate_sample_effeciency_curve.png", bbox_inches="tight" + legend_map=legend_map, + colors=colors, ) +# legend = plt.legend(loc='lower center', bbox_to_anchor=(0.5, 0.9), prop={'size': 15}, ncol=7, bbox_transform=plt.gcf().transFigure, borderaxespad=0.2, frameon=True) +# fig = legend.figure +# fig.canvas.draw() +# bbox = legend.get_window_extent() +# bbox = bbox.from_extents(*(bbox.extents + np.array([-4,-4,4,4]))) +# bbox = bbox.transformed(fig.dpi_scale_trans.inverted()) +# fig.savefig('legend.png', dpi=1200, bbox_inches=bbox) +# plt.hlines(y=2, xmin=0, xmax=4e7, colors='gray', linestyles='--', label='Threshold') +# plt.legend() +fig.figure.savefig(f"examples/plots/{env_name}_sample_effeciency_curve.pdf", bbox_inches="tight") + ############################## # Plot episode return data @@ -131,56 +125,15 @@ # Aggregate data over a single task -task = "task_1" -fig = plot_single_task( - processed_data=processed_data, - environment_name="env_1", - task_name=task, - metric_name="return", - metrics_to_normalize=METRICS_TO_NORMALIZE, - legend_map=LEGEND_MAP, -) - -fig.figure.savefig(f"examples/plots/env_1_{task}_agg_return.png", bbox_inches="tight") - -# Aggregate data over all environment tasks. 
- -fig = performance_profiles( - environment_comparison_matrix, - metric_name="return", - metrics_to_normalize=METRICS_TO_NORMALIZE, - legend_map=LEGEND_MAP, -) -fig.figure.savefig("examples/plots/return_performance_profile.png", bbox_inches="tight") - -fig, _, _ = aggregate_scores( # type: ignore - environment_comparison_matrix, - metric_name="return", - metrics_to_normalize=METRICS_TO_NORMALIZE, - save_tabular_as_latex=True, - legend_map=LEGEND_MAP, -) -fig.figure.savefig("examples/plots/return_aggregate_scores.png", bbox_inches="tight") - -fig = probability_of_improvement( - environment_comparison_matrix, - metric_name="return", - metrics_to_normalize=METRICS_TO_NORMALIZE, - algorithms_to_compare=[ - ["algo_1", "algo_2"], - ["algo_1", "algo_3"], - ["algo_2", "algo_4"], - ], - legend_map=LEGEND_MAP, -) -fig.figure.savefig("examples/plots/return_prob_of_improvement.png", bbox_inches="tight") - -fig, _, _ = sample_efficiency_curves( # type: ignore - sample_effeciency_matrix, - metric_name="return", - metrics_to_normalize=METRICS_TO_NORMALIZE, - legend_map=LEGEND_MAP, -) -fig.figure.savefig( - "examples/plots/return_sample_effeciency_curve.png", bbox_inches="tight" -) +for task in processed_data[env_name.lower()].keys(): + fig = plot_single_task( + processed_data=processed_data, + environment_name=env_name, + task_name=task, + metric_name=metric_name, + metrics_to_normalize=METRICS_TO_NORMALIZE, + legend_map=legend_map, + colors=colors, + ) + + fig.figure.savefig(f"examples/plots/{env_name}_{task}_agg_return.pdf", bbox_inches="tight") diff --git a/marl_eval/plotting_tools/plot_utils.py b/marl_eval/plotting_tools/plot_utils.py index 02b34948..651b66d7 100644 --- a/marl_eval/plotting_tools/plot_utils.py +++ b/marl_eval/plotting_tools/plot_utils.py @@ -82,7 +82,9 @@ def plot_single_task_curve( x_axis_len = len(aggregated_data[algorithm]["mean"]) # Set x-axis values to match evaluation interval steps. - x_axis_values = np.arange(x_axis_len) * extra_info["evaluation_interval"] + # x_axis_values = np.arange(x_axis_len) * extra_info["evaluation_interval"] + # Note: This is hardcoded for now as well. + x_axis_values = np.linspace(0,20, x_axis_len) if run_times is not None: x_axis_values = np.linspace(0, run_times[algorithm] / 60, x_axis_len) @@ -102,15 +104,17 @@ def plot_single_task_curve( ax.plot( x_axis_values, metric_values, - color=colors[algorithm], + color=colors[algorithm_name], marker=marker, linewidth=linewidth, label=algorithm_name, ) ax.fill_between( - x_axis_values, y1=lower, y2=upper, color=colors[algorithm], alpha=0.2 + x_axis_values, y1=lower, y2=upper, color=colors[algorithm_name], alpha=0.2 ) - + # plt.hlines(y=0.15, xmin=0, xmax=2e7, colors=colors["MAPPO"], linestyles='--', label='JaxMARL MAPPO') + # plt.hlines(y=0.1, xmin=0, xmax=2e7, colors=colors["IPPO"], linestyles='--', label='JaxMARL IPPO') + # plt.legend() return _annotate_and_decorate_axis( ax, xlabel=xlabel, diff --git a/marl_eval/plotting_tools/plotting.py b/marl_eval/plotting_tools/plotting.py index 49b33e5f..d24320de 100644 --- a/marl_eval/plotting_tools/plotting.py +++ b/marl_eval/plotting_tools/plotting.py @@ -38,6 +38,7 @@ def performance_profiles( metric_name: str, metrics_to_normalize: List[str], legend_map: Optional[Dict[str, str]] = None, + colors: Optional[Dict[str, str]] = None, ) -> Figure: """Produces performance profile plots. 
@@ -65,6 +66,7 @@ def performance_profiles( upper_algo_dict = {algo.upper(): value for algo, value in data_dictionary.items()} data_dictionary = upper_algo_dict algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) if legend_map is not None: legend_map = {algo.upper(): value for algo, value in legend_map.items()} @@ -73,6 +75,7 @@ def performance_profiles( legend_map[algo]: value for algo, value in data_dictionary.items() } algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) if metric_name in metrics_to_normalize: xlabel = "Normalized " + " ".join(metric_name.split("_")) @@ -85,16 +88,20 @@ def performance_profiles( ) # Plot score distributions - fig, ax = plt.subplots(ncols=1, figsize=(7, 5)) + fig, ax = plt.subplots(ncols=1, figsize=(12, 12)) # Note: change this based on needs plot_utils.plot_performance_profiles( score_distributions, np.linspace(0, 1, 100), performance_profile_cis=score_distributions_cis, - colors=dict(zip(algorithms, sns.color_palette(cc.glasbey_category10))), + colors=dict(zip(algorithms, sns.color_palette(cc.glasbey_category10))) if colors is None else colors, xlabel=f"{xlabel} " + r"$(\tau)$", ax=ax, - legend=algorithms, + legend=algorithms, # Note: legend=algorithms or legend=[] to remove the legend. ) + plt.ylabel(r"Fraction of runs with score > $\tau$",fontsize=40) + plt.xlabel("Mean episode return " + r"$(\tau)$",fontsize=40) + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) return fig @@ -140,6 +147,7 @@ def aggregate_scores( upper_algo_dict = {algo.upper(): value for algo, value in data_dictionary.items()} data_dictionary = upper_algo_dict algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) if legend_map is not None: legend_map = {algo.upper(): value for algo, value in legend_map.items()} @@ -148,6 +156,7 @@ def aggregate_scores( legend_map[algo]: value for algo, value in data_dictionary.items() } algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) aggregate_func = lambda x: np.array( # noqa: E731 [ @@ -311,6 +320,7 @@ def sample_efficiency_curves( metrics_to_normalize: List[str], legend_map: Optional[Dict[str, str]] = None, xlabel: str = "Timesteps", + colors: Optional[Dict[str, str]] = None, ) -> Tuple[Figure, Dict[str, np.ndarray], Dict[str, np.ndarray]]: """Produces sample efficiency curve plots. @@ -346,6 +356,7 @@ def sample_efficiency_curves( upper_algo_dict = {algo.upper(): value for algo, value in data_dictionary.items()} data_dictionary = upper_algo_dict algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) if legend_map is not None: legend_map = {algo.upper(): value for algo, value in legend_map.items()} @@ -354,6 +365,7 @@ def sample_efficiency_curves( legend_map[algo]: value for algo, value in data_dictionary.items() } algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) # Find lowest values from amount of runs that have completed # across all algorithms @@ -375,6 +387,9 @@ def sample_efficiency_curves( iqm_scores, iqm_cis = rly.get_interval_estimates(scores_dict, iqm, reps=5000) + xticks = np.linspace(0, 20, len(x_axis_values)) # Note: assuming we do 20M timesteps + x_axis_values = xticks + fig = plot_utils.plot_sample_efficiency_curve( x_axis_values, iqm_scores, @@ -382,12 +397,19 @@ def sample_efficiency_curves( algorithms=algorithms, xlabel=xlabel, ylabel=ylabel, - legend=algorithms, + # xticks=xticks, + # xticklabels=xticklabels, + legend=[], # Note: legend=algorithms or legend=[] to remove the legend. 
figsize=(15, 8), - color_palette=cc.glasbey_category10, + colors=dict(zip(algorithms, sns.color_palette(cc.glasbey_category10))) if colors is None else colors, ) dictionary["extra"] = extra + plt.xlabel("Timesteps [Millions]",fontsize=40) + plt.ylabel("Mean episode return",fontsize=40) + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) + # plt.legend(prop={'size': 25}) # Note: Comment when NOT inserting the legend. return fig, iqm_scores, iqm_cis @@ -401,6 +423,7 @@ def plot_single_task( xlabel: str = "Timesteps", legend_map: Optional[Dict[str, str]] = None, run_times: Optional[Dict[str, float]] = None, + colors: Optional[Dict[str, str]] = None, ) -> Figure: """Produces aggregated plot for a single task in an environment. @@ -441,6 +464,7 @@ def plot_single_task( task_mean_ci_data = upper_algo_dict algorithms = list(task_mean_ci_data.keys()) algorithms.remove("extra") + algorithms.sort(reverse=True) if legend_map is not None: legend_map = {algo.upper(): value for algo, value in legend_map.items()} @@ -454,12 +478,17 @@ def plot_single_task( algorithms=algorithms, xlabel=xlabel, ylabel=ylabel, - legend=algorithms, + legend=algorithms, # Note: legend=algorithms or legend=[] to remove the legend. figsize=(15, 8), - color_palette=cc.glasbey_category10, + colors=dict(zip(algorithms, sns.color_palette(cc.glasbey_category10))) if colors is None else colors, legend_map=legend_map, run_times=run_times, marker="", ) - + plt.title(f"{task_name}", fontsize=40) + plt.xlabel("Timesteps [Millions]",fontsize=40) + plt.ylabel("Mean episode return",fontsize=40) + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) + plt.legend(prop={'size': 25}) # Note: Uncomment when inserting the legend. return fig
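
Example usage of the new `colors` argument (not part of the patch): a minimal sketch assuming the import paths already used in examples/simple_example.py. The metrics JSON path, environment name and algorithm keys below are placeholders and should match whatever was actually logged.

import json

import colorcet as cc
import seaborn as sns

from marl_eval.plotting_tools.plotting import (
    performance_profiles,
    sample_efficiency_curves,
)
from marl_eval.utils.data_processing_utils import (
    create_matrices_for_rliable,
    data_process_pipeline,
)

# Map raw algorithm keys (as they appear in the logged JSON) to display names.
legend_map = {"rec_mappo": "MAPPO", "ff_ippo": "IPPO"}  # placeholder keys

# One fixed colour per display name so that every figure in the evaluation uses
# the same palette. Omitting `colors` keeps the previous behaviour of drawing a
# fresh glasbey_category10 palette for each plot.
colors = dict(zip(legend_map.values(), sns.color_palette(cc.glasbey_category10)))

with open("concatenated_json_files/metrics.json") as f:  # placeholder path
    raw_data = json.load(f)

processed_data = data_process_pipeline(raw_data=raw_data, metrics_to_normalize=[])
env_matrix, sample_efficiency_matrix = create_matrices_for_rliable(
    data_dictionary=processed_data,
    environment_name="LevelBasedForaging",  # placeholder environment name
    metrics_to_normalize=[],
)

# Both plotting functions accept the optional `colors` mapping after this patch.
fig = performance_profiles(
    env_matrix,
    metric_name="episode_return",
    metrics_to_normalize=[],
    legend_map=legend_map,
    colors=colors,
)
fig.figure.savefig("performance_profile.pdf", bbox_inches="tight")

fig, _, _ = sample_efficiency_curves(
    sample_efficiency_matrix,
    metric_name="episode_return",
    metrics_to_normalize=[],
    legend_map=legend_map,
    colors=colors,
)
fig.figure.savefig("sample_efficiency_curve.pdf", bbox_inches="tight")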