From 836105534646c385d1c609e35afbc424c75ee262 Mon Sep 17 00:00:00 2001
From: WiemKhlifi
Date: Mon, 13 Jan 2025 14:58:57 +0100
Subject: [PATCH] feat: enhance plotting functions with customizable colors and improved layout

---
 examples/simple_example.py             | 203 ++++++++++---------
 marl_eval/plotting_tools/plot_utils.py |  12 +-
 marl_eval/plotting_tools/plotting.py   |  45 +++++-
 3 files changed, 123 insertions(+), 137 deletions(-)

diff --git a/examples/simple_example.py b/examples/simple_example.py
index 0d59807c..5f826caf 100644
--- a/examples/simple_example.py
+++ b/examples/simple_example.py
@@ -27,20 +27,26 @@
     create_matrices_for_rliable,
     data_process_pipeline,
 )
-
-##############################
-# Read in and process data
-##############################
-METRICS_TO_NORMALIZE = ["return"]
-LEGEND_MAP = {
-    "algo_1": "Algorithm 1",
-    "algo_2": "Algorithm 2",
-    "algo_3": "Algorithm 3",
-    "algo_4": "Algorithm 4",
-    "algo_5": "Algorithm 5",
+from marl_eval.json_tools.json_utils import concatenate_json_files, pull_neptune_data
+import matplotlib.pyplot as plt
+import colorcet as cc
+import seaborn as sns
+import numpy as np
+
+METRICS_TO_NORMALIZE = []
+metric_name="episode_return"
+legend_map = {
+    "rec_mappo": "MAPPO",
+    "ff_ippo": "IPPO",
+    "rec_qmix": "QMIX",
+    "rec_iql": "IQL"
 }
-with open("examples/example_results.json") as f:
+algorithms = list(legend_map.values())
+colors = dict(zip(algorithms, sns.color_palette(cc.glasbey_category10)))
+
+env_name = "LevelBasedForaging"
+with open("concatenated_json_files/metrics.json") as f:
     raw_data = json.load(f)
 
 processed_data = data_process_pipeline(
@@ -49,7 +55,7 @@
 
 environment_comparison_matrix, sample_effeciency_matrix = create_matrices_for_rliable(
     data_dictionary=processed_data,
-    environment_name="env_1",
+    environment_name=env_name,
     metrics_to_normalize=METRICS_TO_NORMALIZE,
 )
 
@@ -60,70 +66,58 @@
 ##############################
 # Plot success rate data
 ##############################
-
-# Aggregate data over a single task.
-
-task = "task_1"
-fig = plot_single_task(
-    processed_data=processed_data,
-    environment_name="env_1",
-    task_name=task,
-    metric_name="success_rate",
-    metrics_to_normalize=METRICS_TO_NORMALIZE,
-    legend_map=LEGEND_MAP,
-)
-
-fig.figure.savefig(
-    f"examples/plots/env_1_{task}_agg_success_rate.png", bbox_inches="tight"
-)
-
-# Aggregate data over all environment tasks.
-
-fig = performance_profiles(
-    environment_comparison_matrix,
-    metric_name="success_rate",
-    metrics_to_normalize=METRICS_TO_NORMALIZE,
-    legend_map=LEGEND_MAP,
-)
-fig.figure.savefig(
-    "examples/plots/success_rate_performance_profile.png", bbox_inches="tight"
-)
-
-fig, _, _ = aggregate_scores( # type: ignore
-    environment_comparison_matrix,
-    metric_name="success_rate",
-    metrics_to_normalize=METRICS_TO_NORMALIZE,
-    save_tabular_as_latex=True,
-    legend_map=LEGEND_MAP,
-)
-fig.figure.savefig(
-    "examples/plots/success_rate_aggregate_scores.png", bbox_inches="tight"
-)
-
-fig = probability_of_improvement(
-    environment_comparison_matrix,
-    metric_name="success_rate",
-    metrics_to_normalize=METRICS_TO_NORMALIZE,
-    algorithms_to_compare=[
-        ["algo_1", "algo_2"],
-        ["algo_1", "algo_3"],
-        ["algo_2", "algo_4"],
-    ],
-    legend_map=LEGEND_MAP,
-)
-fig.figure.savefig(
-    "examples/plots/success_rate_prob_of_improvement.png", bbox_inches="tight"
-)
+# # Aggregate data over all environment tasks.
+# fig = performance_profiles( +# environment_comparison_matrix, +# metric_name=metric_name, +# metrics_to_normalize=METRICS_TO_NORMALIZE, +# legend_map=legend_map, +# colors=colors, +# ) +# # plt.legend(loc='lower center', bbox_to_anchor=(0.5, 0.9), prop={'size': 17}, ncol=5, bbox_transform=plt.gcf().transFigure, borderaxespad=0.2, frameon=True) +# fig.figure.savefig("examples/plots/return_performance_profile.pdf", bbox_inches="tight") + +# fig, _, _ = aggregate_scores( # type: ignore +# environment_comparison_matrix, +# metric_name=metric_name, +# metrics_to_normalize=METRICS_TO_NORMALIZE, +# save_tabular_as_latex=True, +# legend_map=legend_map, +# ) +# fig.figure.savefig( "examples/plots/return_aggregate_scores.pdf", bbox_inches="tight") + +# fig = probability_of_improvement( +# environment_comparison_matrix, +# metric_name=metric_name, +# metrics_to_normalize=METRICS_TO_NORMALIZE, +# algorithms_to_compare=[ +# ["ff_mappo", "ff_ippo"], +# ["rec_mappo", "rec_ippo"], +# ["ff_mappo", "rec_mappo"], +# ["ff_ippo", "rec_ippo"], +# ], +# legend_map=legend_map, +# ) +# fig.figure.savefig("examples/plots/return_prob_of_improvement.pdf", bbox_inches="tight") fig, _, _ = sample_efficiency_curves( # type: ignore sample_effeciency_matrix, - metric_name="success_rate", + metric_name=metric_name, metrics_to_normalize=METRICS_TO_NORMALIZE, - legend_map=LEGEND_MAP, -) -fig.figure.savefig( - "examples/plots/success_rate_sample_effeciency_curve.png", bbox_inches="tight" + legend_map=legend_map, + colors=colors, ) +# legend = plt.legend(loc='lower center', bbox_to_anchor=(0.5, 0.9), prop={'size': 15}, ncol=7, bbox_transform=plt.gcf().transFigure, borderaxespad=0.2, frameon=True) +# fig = legend.figure +# fig.canvas.draw() +# bbox = legend.get_window_extent() +# bbox = bbox.from_extents(*(bbox.extents + np.array([-4,-4,4,4]))) +# bbox = bbox.transformed(fig.dpi_scale_trans.inverted()) +# fig.savefig('legend.png', dpi=1200, bbox_inches=bbox) +# plt.hlines(y=2, xmin=0, xmax=4e7, colors='gray', linestyles='--', label='Threshold') +# plt.legend() +fig.figure.savefig(f"examples/plots/{env_name}_sample_effeciency_curve.pdf", bbox_inches="tight") + ############################## # Plot episode return data @@ -131,56 +125,15 @@ # Aggregate data over a single task -task = "task_1" -fig = plot_single_task( - processed_data=processed_data, - environment_name="env_1", - task_name=task, - metric_name="return", - metrics_to_normalize=METRICS_TO_NORMALIZE, - legend_map=LEGEND_MAP, -) - -fig.figure.savefig(f"examples/plots/env_1_{task}_agg_return.png", bbox_inches="tight") - -# Aggregate data over all environment tasks. 
- -fig = performance_profiles( - environment_comparison_matrix, - metric_name="return", - metrics_to_normalize=METRICS_TO_NORMALIZE, - legend_map=LEGEND_MAP, -) -fig.figure.savefig("examples/plots/return_performance_profile.png", bbox_inches="tight") - -fig, _, _ = aggregate_scores( # type: ignore - environment_comparison_matrix, - metric_name="return", - metrics_to_normalize=METRICS_TO_NORMALIZE, - save_tabular_as_latex=True, - legend_map=LEGEND_MAP, -) -fig.figure.savefig("examples/plots/return_aggregate_scores.png", bbox_inches="tight") - -fig = probability_of_improvement( - environment_comparison_matrix, - metric_name="return", - metrics_to_normalize=METRICS_TO_NORMALIZE, - algorithms_to_compare=[ - ["algo_1", "algo_2"], - ["algo_1", "algo_3"], - ["algo_2", "algo_4"], - ], - legend_map=LEGEND_MAP, -) -fig.figure.savefig("examples/plots/return_prob_of_improvement.png", bbox_inches="tight") - -fig, _, _ = sample_efficiency_curves( # type: ignore - sample_effeciency_matrix, - metric_name="return", - metrics_to_normalize=METRICS_TO_NORMALIZE, - legend_map=LEGEND_MAP, -) -fig.figure.savefig( - "examples/plots/return_sample_effeciency_curve.png", bbox_inches="tight" -) +for task in processed_data[env_name.lower()].keys(): + fig = plot_single_task( + processed_data=processed_data, + environment_name=env_name, + task_name=task, + metric_name=metric_name, + metrics_to_normalize=METRICS_TO_NORMALIZE, + legend_map=legend_map, + colors=colors, + ) + + fig.figure.savefig(f"examples/plots/{env_name}_{task}_agg_return.pdf", bbox_inches="tight") diff --git a/marl_eval/plotting_tools/plot_utils.py b/marl_eval/plotting_tools/plot_utils.py index 02b34948..651b66d7 100644 --- a/marl_eval/plotting_tools/plot_utils.py +++ b/marl_eval/plotting_tools/plot_utils.py @@ -82,7 +82,9 @@ def plot_single_task_curve( x_axis_len = len(aggregated_data[algorithm]["mean"]) # Set x-axis values to match evaluation interval steps. - x_axis_values = np.arange(x_axis_len) * extra_info["evaluation_interval"] + # x_axis_values = np.arange(x_axis_len) * extra_info["evaluation_interval"] + # Note: This is hardcoded for now as well. + x_axis_values = np.linspace(0,20, x_axis_len) if run_times is not None: x_axis_values = np.linspace(0, run_times[algorithm] / 60, x_axis_len) @@ -102,15 +104,17 @@ def plot_single_task_curve( ax.plot( x_axis_values, metric_values, - color=colors[algorithm], + color=colors[algorithm_name], marker=marker, linewidth=linewidth, label=algorithm_name, ) ax.fill_between( - x_axis_values, y1=lower, y2=upper, color=colors[algorithm], alpha=0.2 + x_axis_values, y1=lower, y2=upper, color=colors[algorithm_name], alpha=0.2 ) - + # plt.hlines(y=0.15, xmin=0, xmax=2e7, colors=colors["MAPPO"], linestyles='--', label='JaxMARL MAPPO') + # plt.hlines(y=0.1, xmin=0, xmax=2e7, colors=colors["IPPO"], linestyles='--', label='JaxMARL IPPO') + # plt.legend() return _annotate_and_decorate_axis( ax, xlabel=xlabel, diff --git a/marl_eval/plotting_tools/plotting.py b/marl_eval/plotting_tools/plotting.py index 49b33e5f..d24320de 100644 --- a/marl_eval/plotting_tools/plotting.py +++ b/marl_eval/plotting_tools/plotting.py @@ -38,6 +38,7 @@ def performance_profiles( metric_name: str, metrics_to_normalize: List[str], legend_map: Optional[Dict[str, str]] = None, + colors: Optional[Dict[str, str]] = None, ) -> Figure: """Produces performance profile plots. 
@@ -65,6 +66,7 @@ def performance_profiles( upper_algo_dict = {algo.upper(): value for algo, value in data_dictionary.items()} data_dictionary = upper_algo_dict algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) if legend_map is not None: legend_map = {algo.upper(): value for algo, value in legend_map.items()} @@ -73,6 +75,7 @@ def performance_profiles( legend_map[algo]: value for algo, value in data_dictionary.items() } algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) if metric_name in metrics_to_normalize: xlabel = "Normalized " + " ".join(metric_name.split("_")) @@ -85,16 +88,20 @@ def performance_profiles( ) # Plot score distributions - fig, ax = plt.subplots(ncols=1, figsize=(7, 5)) + fig, ax = plt.subplots(ncols=1, figsize=(12, 12)) # Note: change this based on needs plot_utils.plot_performance_profiles( score_distributions, np.linspace(0, 1, 100), performance_profile_cis=score_distributions_cis, - colors=dict(zip(algorithms, sns.color_palette(cc.glasbey_category10))), + colors=dict(zip(algorithms, sns.color_palette(cc.glasbey_category10))) if colors is None else colors, xlabel=f"{xlabel} " + r"$(\tau)$", ax=ax, - legend=algorithms, + legend=algorithms, # Note: legend=algorithms or legend=[] to remove the legend. ) + plt.ylabel(r"Fraction of runs with score > $\tau$",fontsize=40) + plt.xlabel("Mean episode return " + r"$(\tau)$",fontsize=40) + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) return fig @@ -140,6 +147,7 @@ def aggregate_scores( upper_algo_dict = {algo.upper(): value for algo, value in data_dictionary.items()} data_dictionary = upper_algo_dict algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) if legend_map is not None: legend_map = {algo.upper(): value for algo, value in legend_map.items()} @@ -148,6 +156,7 @@ def aggregate_scores( legend_map[algo]: value for algo, value in data_dictionary.items() } algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) aggregate_func = lambda x: np.array( # noqa: E731 [ @@ -311,6 +320,7 @@ def sample_efficiency_curves( metrics_to_normalize: List[str], legend_map: Optional[Dict[str, str]] = None, xlabel: str = "Timesteps", + colors: Optional[Dict[str, str]] = None, ) -> Tuple[Figure, Dict[str, np.ndarray], Dict[str, np.ndarray]]: """Produces sample efficiency curve plots. @@ -346,6 +356,7 @@ def sample_efficiency_curves( upper_algo_dict = {algo.upper(): value for algo, value in data_dictionary.items()} data_dictionary = upper_algo_dict algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) if legend_map is not None: legend_map = {algo.upper(): value for algo, value in legend_map.items()} @@ -354,6 +365,7 @@ def sample_efficiency_curves( legend_map[algo]: value for algo, value in data_dictionary.items() } algorithms = list(data_dictionary.keys()) + algorithms.sort(reverse=True) # Find lowest values from amount of runs that have completed # across all algorithms @@ -375,6 +387,9 @@ def sample_efficiency_curves( iqm_scores, iqm_cis = rly.get_interval_estimates(scores_dict, iqm, reps=5000) + xticks = np.linspace(0, 20, len(x_axis_values)) # Note: assuming we do 20M timesteps + x_axis_values = xticks + fig = plot_utils.plot_sample_efficiency_curve( x_axis_values, iqm_scores, @@ -382,12 +397,19 @@ def sample_efficiency_curves( algorithms=algorithms, xlabel=xlabel, ylabel=ylabel, - legend=algorithms, + # xticks=xticks, + # xticklabels=xticklabels, + legend=[], # Note: legend=algorithms or legend=[] to remove the legend. 
figsize=(15, 8), - color_palette=cc.glasbey_category10, + colors=dict(zip(algorithms, sns.color_palette(cc.glasbey_category10))) if colors is None else colors, ) dictionary["extra"] = extra + plt.xlabel("Timesteps [Millions]",fontsize=40) + plt.ylabel("Mean episode return",fontsize=40) + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) + # plt.legend(prop={'size': 25}) # Note: Comment when NOT inserting the legend. return fig, iqm_scores, iqm_cis @@ -401,6 +423,7 @@ def plot_single_task( xlabel: str = "Timesteps", legend_map: Optional[Dict[str, str]] = None, run_times: Optional[Dict[str, float]] = None, + colors: Optional[Dict[str, str]] = None, ) -> Figure: """Produces aggregated plot for a single task in an environment. @@ -441,6 +464,7 @@ def plot_single_task( task_mean_ci_data = upper_algo_dict algorithms = list(task_mean_ci_data.keys()) algorithms.remove("extra") + algorithms.sort(reverse=True) if legend_map is not None: legend_map = {algo.upper(): value for algo, value in legend_map.items()} @@ -454,12 +478,17 @@ def plot_single_task( algorithms=algorithms, xlabel=xlabel, ylabel=ylabel, - legend=algorithms, + legend=algorithms, # Note: legend=algorithms or legend=[] to remove the legend. figsize=(15, 8), - color_palette=cc.glasbey_category10, + colors=dict(zip(algorithms, sns.color_palette(cc.glasbey_category10))) if colors is None else colors, legend_map=legend_map, run_times=run_times, marker="", ) - + plt.title(f"{task_name}", fontsize=40) + plt.xlabel("Timesteps [Millions]",fontsize=40) + plt.ylabel("Mean episode return",fontsize=40) + plt.xticks(fontsize=30) + plt.yticks(fontsize=30) + plt.legend(prop={'size': 25}) # Note: Uncomment when inserting the legend. return fig
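
Example usage of the new `colors` argument (not part of the patch): a minimal sketch assuming the import paths already used in examples/simple_example.py. The metrics JSON path, environment name and algorithm keys below are placeholders and should match whatever was actually logged.

import json

import colorcet as cc
import seaborn as sns

from marl_eval.plotting_tools.plotting import (
    performance_profiles,
    sample_efficiency_curves,
)
from marl_eval.utils.data_processing_utils import (
    create_matrices_for_rliable,
    data_process_pipeline,
)

# Map raw algorithm keys (as they appear in the logged JSON) to display names.
legend_map = {"rec_mappo": "MAPPO", "ff_ippo": "IPPO"}  # placeholder keys

# One fixed colour per display name so that every figure in the evaluation uses
# the same palette. Omitting `colors` keeps the previous behaviour of drawing a
# fresh glasbey_category10 palette for each plot.
colors = dict(zip(legend_map.values(), sns.color_palette(cc.glasbey_category10)))

with open("concatenated_json_files/metrics.json") as f:  # placeholder path
    raw_data = json.load(f)

processed_data = data_process_pipeline(raw_data=raw_data, metrics_to_normalize=[])
env_matrix, sample_efficiency_matrix = create_matrices_for_rliable(
    data_dictionary=processed_data,
    environment_name="LevelBasedForaging",  # placeholder environment name
    metrics_to_normalize=[],
)

# Both plotting functions accept the optional `colors` mapping after this patch.
fig = performance_profiles(
    env_matrix,
    metric_name="episode_return",
    metrics_to_normalize=[],
    legend_map=legend_map,
    colors=colors,
)
fig.figure.savefig("performance_profile.pdf", bbox_inches="tight")

fig, _, _ = sample_efficiency_curves(
    sample_efficiency_matrix,
    metric_name="episode_return",
    metrics_to_normalize=[],
    legend_map=legend_map,
    colors=colors,
)
fig.figure.savefig("sample_efficiency_curve.pdf", bbox_inches="tight")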