for-ai · ljvmiranda921 · Oct 13, 2024 · Oct 12, 2024 · Oct 12, 2024 · Oct 12, 2024
diff --git a/analysis/avg_agreement_final.py b/analysis/avg_agreement_final.py
@@ -2,91 +2,76 @@
 import matplotlib.pyplot as plt
 import numpy as np
 
+FONT_SIZES = {"small": 12, "medium": 16, "large": 18}
+COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"}
+
+PLOT_PARAMS = {
+    "font.family": "serif",
+    "font.serif": ["Times", "Times New Roman", "STIX"],  # same fallback order as analysis/plot_results.py
+    "font.size": FONT_SIZES.get("medium"),
+    "axes.titlesize": FONT_SIZES.get("large"),
+    "axes.labelsize": FONT_SIZES.get("large"),
+    "xtick.labelsize": FONT_SIZES.get("large"),
+    "ytick.labelsize": FONT_SIZES.get("large"),
+    "legend.fontsize": FONT_SIZES.get("medium"),
+    "figure.titlesize": FONT_SIZES.get("medium"),
+    "text.usetex": False,
+}
+
+plt.rcParams.update(PLOT_PARAMS)
+
+
 data = {
-  "meta-llama/Meta-Llama-3.1-8B-Instruct": [
-    0.3533086666014079,
-    0.052422082615756406
-  ],
-  "cohere/c4ai-aya-23-35b": [
-    0.43767196047824003,
-    0.026040919354464294
-  ],
-  "cohere/c4ai-aya-23-8b": [
-    0.013483014909052663,
-    0.03363706833599835
-  ],
-  "cohere/command-r-08-2024": [
-    0.374457668650282,
-    0.02926089754079793
-  ],
-  "cohere/command-r-plus-08-2024": [
-    0.3830841816733316,
-    0.020185255968455686
-  ],
-  "google/gemma-1.1-7b-it": [
-    0.5190375637539242,
-    0.027757722654111305
-  ],
-  "google/gemma-2-9b-it": [
-    0.5181663123111222,
-    0.031090119385244894
-  ],
-  "meta-llama/Meta-Llama-3-70B-Instruct": [
-    0.5685224105896568,
-    0.04853344616275034
-  ],
-  "meta-llama/Meta-Llama-3-8B-Instruct": [
-    0.37936948540837095,
-    0.032172769265151994
-  ],
-  "meta-llama/Meta-Llama-3.1-70B-Instruct": [
-    0.603536768244583,
-    0.027191895488989915
-  ],
-  "mistralai/Mistral-7B-Instruct-v0.2": [
-    0.4071166722276529,
-    0.04577594028555328
-  ],
-  "mistralai/Mistral-7B-Instruct-v0.3": [
-    0.41195018984687265,
-    0.056184679972755454
-  ],
-  "openai/gpt-4-turbo-2024-04-09": [
-    0.6106943361444249,
-    0.02932446842558468
-  ],
-  "openai/gpt-4o-2024-05-13": [
-    0.5833874065757011,
-    0.023695391445384514
-  ]
+    "Llama 3.1 8B": [0.3533086666014079, 0.052422082615756406],
+    "Aya 23 35B": [0.43767196047824003, 0.026040919354464294],
+    # "Aya 23 8B": [0.013483014909052663, 0.03363706833599835],
+    "Command R": [0.374457668650282, 0.02926089754079793],
+    "Command R+": [0.3830841816733316, 0.020185255968455686],
+    "Gemma 1.1 7B": [0.5190375637539242, 0.027757722654111305],
+    "Gemma 2 9B": [0.5181663123111222, 0.031090119385244894],
+    "Llama 3 70B": [0.5685224105896568, 0.04853344616275034],
+    "Llama 3 8B": [0.37936948540837095, 0.032172769265151994],
+    "Llama 3.1 70B": [0.603536768244583, 0.027191895488989915],
+    "Mistral 7B v0.2": [0.4071166722276529, 0.04577594028555328],
+    "Mistral 7B v0.3": [0.41195018984687265, 0.056184679972755454],
+    "GPT-4 Turbo": [0.6106943361444249, 0.02932446842558468],
+    "GPT-4o": [0.5833874065757011, 0.023695391445384514],
 }
 
 sorted_data = dict(sorted(data.items(), key=lambda item: item[1][0]))
 labels_sorted = list(sorted_data.keys())
 means_sorted = [v[0] for v in sorted_data.values()]
 std_devs_sorted = [v[1] for v in sorted_data.values()]
 
-sns.set(style="whitegrid")
-palette = sns.color_palette("coolwarm", len(labels_sorted))
+# sns.set(style="whitegrid")
+# palette = sns.color_palette("coolwarm", len(labels_sorted))
 
-plt.figure(figsize=(10, 6))
+plt.figure(figsize=(7, 7))
 x_pos_sorted = np.arange(len(labels_sorted))
 
-ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, palette=palette, errorbar=None)
-plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt='none', c='black', capsize=5)
+ax1 = sns.barplot(
+    x=x_pos_sorted,
+    y=means_sorted,
+    errorbar=None,
+    color=COLORS.get("orange"),
+    edgecolor=COLORS.get("green"),
+)
+plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt="none", c="black", capsize=5)
 
-ax1.spines['top'].set_color('black')
-ax1.spines['right'].set_color('black')
-ax1.spines['left'].set_color('black')
-ax1.spines['bottom'].set_color('black')
-for spine in ax1.spines.values():
-    spine.set_linewidth(2)  # Make the border thicker
+# ax1.spines["top"].set_color("black")
+# ax1.spines["right"].set_color("black")
+# ax1.spines["left"].set_color("black")
+# ax1.spines["bottom"].set_color("black")
+# for spine in ax1.spines.values():
+#     spine.set_linewidth(2)  # Make the border thicker
+plt.grid(color="gray", axis="y", alpha=0.2)
 
 plt.ylim(0, 0.8)
+plt.gca().set_axisbelow(True)
 
-plt.xticks(x_pos_sorted, labels_sorted, rotation=90)
+plt.xticks(x_pos_sorted, labels_sorted, rotation=45, ha="right")
 plt.ylabel("Cohen's Kappa")
-plt.title('Average Inner-Model Agreement Across Languages')
+plt.title("Average Inner-Model Agreement Across Languages")
 
 plt.tight_layout()
-plt.savefig(f"./innermodel_agreement.pdf", bbox_inches='tight')
+plt.savefig("plots/innermodel_agreement_green_oracle.pdf", bbox_inches="tight")
diff --git a/analysis/plot_results.py b/analysis/plot_results.py
@@ -13,7 +13,7 @@
 
 PLOT_PARAMS = {
     "font.family": "serif",
-    "font.serif": ["Times New Roman", "STIX"],
+    "font.serif": ["Times", "Times New Roman", "STIX"],
     "font.size": FONT_SIZES.get("medium"),
     "axes.titlesize": FONT_SIZES.get("large"),
     "axes.labelsize": FONT_SIZES.get("large"),
@@ -66,6 +66,8 @@
     "zho": "zh",
 }
 
+COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"}
+
 
 def get_args():
     # fmt: off
@@ -122,6 +124,7 @@ def plot_main_heatmap(
     df = pd.read_csv(input_path)
     # Remove unnecessary column
     df.pop("eng_Latn")
+    df.pop("Family")
 
     df = df.sort_values(by="Avg_Multilingual", ascending=False).head(10).reset_index(drop=True)
     data = df[[col for col in df.columns if col not in ["Model_Type"]]].rename(columns={"Avg_Multilingual": "Avg"})
@@ -133,14 +136,39 @@ def plot_main_heatmap(
     data.pop("zho_Hant")
     data = data[sorted(data.columns)]
     data.columns = [col.split("_")[0] for col in data.columns]
+    data["Var"] = data[list(LANG_STANDARDIZATION.keys())].var(axis=1)
     data = data.rename(columns=LANG_STANDARDIZATION)
 
-    fig, ax = plt.subplots(1, 1, figsize=figsize)
-    sns.heatmap(data, ax=ax, cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".2f", cbar=False)
-    ax.xaxis.set_ticks_position("top")
-    ax.tick_params(axis="x")
-    ax.set_ylabel("")
-    ax.set_yticklabels([f"{model}     " for model in data.index])
+    lang_results = data[list(LANG_STANDARDIZATION.values())]
+    avg = data[["Avg"]]
+    var = data[["Var"]]
+
+    fig, axs = plt.subplots(ncols=3, figsize=figsize, gridspec_kw={"width_ratios": [0.5, 0.5, 9]}, sharey=True)
+    cmap = "Greys"
+    fmt = ".1f"
+
+    sns.heatmap(avg, ax=axs[0], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False)
+    axs[0].xaxis.set_ticks_position("top")
+    axs[0].set_xticklabels(avg.columns, fontsize=20)
+    axs[0].tick_params(axis="x")
+    axs[0].set_ylabel("")
+    axs[0].set_yticklabels([f"{model}     " for model in avg.index], fontsize=20)
+
+    sns.heatmap(var, ax=axs[1], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False)
+    axs[1].xaxis.set_ticks_position("top")
+    axs[1].set_xticklabels(var.columns, fontsize=20)
+    axs[1].tick_params(axis="x")
+    axs[1].set_ylabel("")
+    axs[1].tick_params(axis="y", length=0)
+    axs[1].set_yticklabels([f"{model}     " for model in var.index], fontsize=20)
+
+    sns.heatmap(lang_results, ax=axs[2], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False)
+    axs[2].xaxis.set_ticks_position("top")
+    axs[2].set_xticklabels(lang_results.columns, fontsize=20)
+    axs[2].tick_params(axis="x")
+    axs[2].tick_params(axis="y", length=0)
+    axs[2].set_ylabel("")
+    axs[2].set_yticklabels([f"{model}     " for model in lang_results.index], fontsize=20)
 
     plt.tight_layout()
     fig.savefig(output_path, bbox_inches="tight")
@@ -155,7 +183,7 @@ def plot_eng_drop_line(
     from scipy.stats import pearsonr, spearmanr
 
     df = pd.read_csv(input_path)
-    df = df[["Model", "Model_Type", "eng_Latn", "Avg_Multilingual"]]
+    df = df[["Model", "Model_Type", "Family", "eng_Latn", "Avg_Multilingual"]]
     df = df.sort_values(by="Avg_Multilingual", ascending=False).reset_index(drop=True)
     data = df.set_index("Model").dropna()
     data[data.select_dtypes(include="number").columns] = data.select_dtypes(include="number") * 100
@@ -166,11 +194,19 @@ def plot_eng_drop_line(
 
     fig, ax = plt.subplots(figsize=figsize)
 
-    colors = ["red", "green", "blue"]
+    colors = [COLORS.get("green"), COLORS.get("purple"), COLORS.get("orange")]
+    markers = ["o", "*", "D"]  # one marker per Model_Type group, paired positionally with colors
-    for (label, group), color in zip(data.groupby("Model_Type"), colors):
+    for (label, group), color, marker in zip(data.groupby("Model_Type"), colors, markers):
         mrewardbench_scores = group["Avg_Multilingual"]
         rewardbench_scores = group["eng_Latn"]
-        ax.scatter(rewardbench_scores, mrewardbench_scores, marker="o", s=40, label=label, color=color)
+        ax.scatter(
+            rewardbench_scores,
+            mrewardbench_scores,
+            marker=marker,
+            s=60,
+            label=label,
+            color=color,
+        )
 
     mrewardbench_scores = data["Avg_Multilingual"]
     rewardbench_scores = data["eng_Latn"]
@@ -188,22 +224,23 @@ def plot_eng_drop_line(
     ax.set_aspect("equal")
     ax.legend(frameon=False, handletextpad=0.2, fontsize=12)
 
-    model_names = [MODEL_STANDARDIZATION[model] for model in data.index]
-    texts = [
-        ax.text(
-            rewardbench_scores[idx],
-            mrewardbench_scores[idx],
-            model_names[idx],
-            fontsize=14,
+    if top_n:
+        model_names = [MODEL_STANDARDIZATION[model] for model in data.index]
+        texts = [
+            ax.text(
+                rewardbench_scores[idx],
+                mrewardbench_scores[idx],
+                model_names[idx],
+                fontsize=14,
+            )
+            for idx in range(len(data))
+        ]
+        adjust_text(
+            texts,
+            ax=ax,
+            # force_static=0.15,
+            arrowprops=dict(arrowstyle="->", color="gray"),
         )
-        for idx in range(len(data))
-    ]
-    adjust_text(
-        texts,
-        ax=ax,
-        # force_static=0.15,
-        arrowprops=dict(arrowstyle="->", color="gray"),
-    )
 
     # ax.text(
     #     0.6,
@@ -270,7 +307,8 @@ def plot_ling_dims(
             y=dim,
             data=lingdf,
             ax=ax,
-            color="green",
+            color=COLORS.get("orange"),
+            edgecolor=COLORS.get("green"),
             width=0.4 if dim == "Resource Availability" else 0.7,
         )
         ax.set_title(dim)