Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update all charts #47

Merged
merged 10 commits into from
Oct 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 55 additions & 70 deletions analysis/avg_agreement_final.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,91 +2,76 @@
import matplotlib.pyplot as plt
import numpy as np

FONT_SIZES = {"small": 12, "medium": 16, "large": 18}
COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"}

PLOT_PARAMS = {
"font.family": "serif",
"font.serif": ["Times New Roman", "STIX"],
"font.size": FONT_SIZES.get("medium"),
"axes.titlesize": FONT_SIZES.get("large"),
"axes.labelsize": FONT_SIZES.get("large"),
"xtick.labelsize": FONT_SIZES.get("large"),
"ytick.labelsize": FONT_SIZES.get("large"),
"legend.fontsize": FONT_SIZES.get("medium"),
"figure.titlesize": FONT_SIZES.get("medium"),
"text.usetex": False,
}

plt.rcParams.update(PLOT_PARAMS)


data = {
"meta-llama/Meta-Llama-3.1-8B-Instruct": [
0.3533086666014079,
0.052422082615756406
],
"cohere/c4ai-aya-23-35b": [
0.43767196047824003,
0.026040919354464294
],
"cohere/c4ai-aya-23-8b": [
0.013483014909052663,
0.03363706833599835
],
"cohere/command-r-08-2024": [
0.374457668650282,
0.02926089754079793
],
"cohere/command-r-plus-08-2024": [
0.3830841816733316,
0.020185255968455686
],
"google/gemma-1.1-7b-it": [
0.5190375637539242,
0.027757722654111305
],
"google/gemma-2-9b-it": [
0.5181663123111222,
0.031090119385244894
],
"meta-llama/Meta-Llama-3-70B-Instruct": [
0.5685224105896568,
0.04853344616275034
],
"meta-llama/Meta-Llama-3-8B-Instruct": [
0.37936948540837095,
0.032172769265151994
],
"meta-llama/Meta-Llama-3.1-70B-Instruct": [
0.603536768244583,
0.027191895488989915
],
"mistralai/Mistral-7B-Instruct-v0.2": [
0.4071166722276529,
0.04577594028555328
],
"mistralai/Mistral-7B-Instruct-v0.3": [
0.41195018984687265,
0.056184679972755454
],
"openai/gpt-4-turbo-2024-04-09": [
0.6106943361444249,
0.02932446842558468
],
"openai/gpt-4o-2024-05-13": [
0.5833874065757011,
0.023695391445384514
]
"LlaMa 3.1 8B": [0.3533086666014079, 0.052422082615756406],
"Aya 23 35B": [0.43767196047824003, 0.026040919354464294],
# "Aya 23 8B": [0.013483014909052663, 0.03363706833599835],
"Command R": [0.374457668650282, 0.02926089754079793],
"Command R+": [0.3830841816733316, 0.020185255968455686],
"Gemma 1.1 7B": [0.5190375637539242, 0.027757722654111305],
"Gemma 2 9B": [0.5181663123111222, 0.031090119385244894],
"LlaMa 3 70B": [0.5685224105896568, 0.04853344616275034],
"LlaMa 3 8B": [0.37936948540837095, 0.032172769265151994],
"LlaMa 3.1 70B": [0.603536768244583, 0.027191895488989915],
"Mistral 7B v0.2": [0.4071166722276529, 0.04577594028555328],
"Mistral 7B v0.3": [0.41195018984687265, 0.056184679972755454],
"GPT-4 Turbo": [0.6106943361444249, 0.02932446842558468],
"GPT-4o": [0.5833874065757011, 0.023695391445384514],
}

sorted_data = dict(sorted(data.items(), key=lambda item: item[1][0]))
labels_sorted = list(sorted_data.keys())
means_sorted = [v[0] for v in sorted_data.values()]
std_devs_sorted = [v[1] for v in sorted_data.values()]

sns.set(style="whitegrid")
palette = sns.color_palette("coolwarm", len(labels_sorted))
# sns.set(style="whitegrid")
# palette = sns.color_palette("coolwarm", len(labels_sorted))

plt.figure(figsize=(10, 6))
plt.figure(figsize=(7, 7))
x_pos_sorted = np.arange(len(labels_sorted))

ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, palette=palette, errorbar=None)
plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt='none', c='black', capsize=5)
ax1 = sns.barplot(
x=x_pos_sorted,
y=means_sorted,
errorbar=None,
color=COLORS.get("orange"),
edgecolor=COLORS.get("green"),
)
plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt="none", c="black", capsize=5)

ax1.spines['top'].set_color('black')
ax1.spines['right'].set_color('black')
ax1.spines['left'].set_color('black')
ax1.spines['bottom'].set_color('black')
for spine in ax1.spines.values():
spine.set_linewidth(2) # Make the border thicker
# ax1.spines["top"].set_color("black")
# ax1.spines["right"].set_color("black")
# ax1.spines["left"].set_color("black")
# ax1.spines["bottom"].set_color("black")
# for spine in ax1.spines.values():
# spine.set_linewidth(2) # Make the border thicker
plt.grid(color="gray", axis="y", alpha=0.2)

plt.ylim(0, 0.8)
plt.gca().set_axisbelow(True)

plt.xticks(x_pos_sorted, labels_sorted, rotation=90)
plt.xticks(x_pos_sorted, labels_sorted, rotation=45, ha="right")
plt.ylabel("Cohen's Kappa")
plt.title('Average Inner-Model Agreement Across Languages')
plt.title("Average Inner-Model Agreement Across Languages")

plt.tight_layout()
plt.savefig(f"./innermodel_agreement.pdf", bbox_inches='tight')
plt.savefig("plots/innermodel_agreement_green_oracle.pdf", bbox_inches="tight")
90 changes: 64 additions & 26 deletions analysis/plot_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

PLOT_PARAMS = {
"font.family": "serif",
"font.serif": ["Times New Roman", "STIX"],
"font.serif": ["Times", "Times New Roman", "STIX"],
"font.size": FONT_SIZES.get("medium"),
"axes.titlesize": FONT_SIZES.get("large"),
"axes.labelsize": FONT_SIZES.get("large"),
Expand Down Expand Up @@ -66,6 +66,8 @@
"zho": "zh",
}

COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"}


def get_args():
# fmt: off
Expand Down Expand Up @@ -122,6 +124,7 @@ def plot_main_heatmap(
df = pd.read_csv(input_path)
# Remove unnecessary column
df.pop("eng_Latn")
df.pop("Family")

df = df.sort_values(by="Avg_Multilingual", ascending=False).head(10).reset_index(drop=True)
data = df[[col for col in df.columns if col not in ["Model_Type"]]].rename(columns={"Avg_Multilingual": "Avg"})
Expand All @@ -133,14 +136,39 @@ def plot_main_heatmap(
data.pop("zho_Hant")
data = data[sorted(data.columns)]
data.columns = [col.split("_")[0] for col in data.columns]
data["Var"] = data[list(LANG_STANDARDIZATION.keys())].var(axis=1)
data = data.rename(columns=LANG_STANDARDIZATION)

fig, ax = plt.subplots(1, 1, figsize=figsize)
sns.heatmap(data, ax=ax, cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".2f", cbar=False)
ax.xaxis.set_ticks_position("top")
ax.tick_params(axis="x")
ax.set_ylabel("")
ax.set_yticklabels([f"{model} " for model in data.index])
lang_results = data[list(LANG_STANDARDIZATION.values())]
avg = data[["Avg"]]
var = data[["Var"]]

fig, axs = plt.subplots(ncols=3, figsize=figsize, gridspec_kw={"width_ratios": [0.5, 0.5, 9]}, sharey=True)
cmap = "Greys"
fmt = ".1f"

sns.heatmap(avg, ax=axs[0], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False)
axs[0].xaxis.set_ticks_position("top")
axs[0].set_xticklabels(avg.columns, fontsize=20)
axs[0].tick_params(axis="x")
axs[0].set_ylabel("")
axs[0].set_yticklabels([f"{model} " for model in avg.index], fontsize=20)

sns.heatmap(var, ax=axs[1], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False)
axs[1].xaxis.set_ticks_position("top")
axs[1].set_xticklabels(var.columns, fontsize=20)
axs[1].tick_params(axis="x")
axs[1].set_ylabel("")
axs[1].tick_params(axis="y", length=0)
axs[1].set_yticklabels([f"{model} " for model in var.index], fontsize=20)

sns.heatmap(lang_results, ax=axs[2], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False)
axs[2].xaxis.set_ticks_position("top")
axs[2].set_xticklabels(lang_results.columns, fontsize=20)
axs[2].tick_params(axis="x")
axs[2].tick_params(axis="y", length=0)
axs[2].set_ylabel("")
axs[2].set_yticklabels([f"{model} " for model in lang_results.index], fontsize=20)

plt.tight_layout()
fig.savefig(output_path, bbox_inches="tight")
Expand All @@ -155,7 +183,7 @@ def plot_eng_drop_line(
from scipy.stats import pearsonr, spearmanr

df = pd.read_csv(input_path)
df = df[["Model", "Model_Type", "eng_Latn", "Avg_Multilingual"]]
df = df[["Model", "Model_Type", "Family", "eng_Latn", "Avg_Multilingual"]]
df = df.sort_values(by="Avg_Multilingual", ascending=False).reset_index(drop=True)
data = df.set_index("Model").dropna()
data[data.select_dtypes(include="number").columns] = data.select_dtypes(include="number") * 100
Expand All @@ -166,11 +194,19 @@ def plot_eng_drop_line(

fig, ax = plt.subplots(figsize=figsize)

colors = ["red", "green", "blue"]
colors = [COLORS.get("green"), COLORS.get("purple"), COLORS.get("orange")]
markers = ["o", "*", "D"]
for (label, group), color in zip(data.groupby("Model_Type"), colors):
mrewardbench_scores = group["Avg_Multilingual"]
rewardbench_scores = group["eng_Latn"]
ax.scatter(rewardbench_scores, mrewardbench_scores, marker="o", s=40, label=label, color=color)
ax.scatter(
rewardbench_scores,
mrewardbench_scores,
marker="o",
s=60,
label=label,
color=color,
)

mrewardbench_scores = data["Avg_Multilingual"]
rewardbench_scores = data["eng_Latn"]
Expand All @@ -188,22 +224,23 @@ def plot_eng_drop_line(
ax.set_aspect("equal")
ax.legend(frameon=False, handletextpad=0.2, fontsize=12)

model_names = [MODEL_STANDARDIZATION[model] for model in data.index]
texts = [
ax.text(
rewardbench_scores[idx],
mrewardbench_scores[idx],
model_names[idx],
fontsize=14,
if top_n:
model_names = [MODEL_STANDARDIZATION[model] for model in data.index]
texts = [
ax.text(
rewardbench_scores[idx],
mrewardbench_scores[idx],
model_names[idx],
fontsize=14,
)
for idx in range(len(data))
]
adjust_text(
texts,
ax=ax,
# force_static=0.15,
arrowprops=dict(arrowstyle="->", color="gray"),
)
for idx in range(len(data))
]
adjust_text(
texts,
ax=ax,
# force_static=0.15,
arrowprops=dict(arrowstyle="->", color="gray"),
)

# ax.text(
# 0.6,
Expand Down Expand Up @@ -270,7 +307,8 @@ def plot_ling_dims(
y=dim,
data=lingdf,
ax=ax,
color="green",
color=COLORS.get("orange"),
edgecolor=COLORS.get("green"),
width=0.4 if dim == "Resource Availability" else 0.7,
)
ax.set_title(dim)
Expand Down