Skip to content

Commit

Permalink
Add coling2025 experiments
Browse files Browse the repository at this point in the history
  • Loading branch information
dustalov committed Dec 30, 2024
1 parent a1e3f54 commit 66111a3
Show file tree
Hide file tree
Showing 10 changed files with 960 additions and 0 deletions.
3 changes: 3 additions & 0 deletions coling2025/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
clean_battle_20240814_public.json
llmfao.csv
scale/*.parquet
28 changes: 28 additions & 0 deletions coling2025/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Code to Reproduce Experiments from the COLING 2025 Paper

- Ustalov, D. [Reliable, Reproducible, and Really Fast Leaderboards with Evalica](https://arxiv.org/abs/2412.11314). 2024. arXiv: [2412.11314 [cs.CL]](https://arxiv.org/abs/2412.11314).

## Prerequisites

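- Python packages pinned in [`requirements.txt`](requirements.txt) (`pip install -r requirements.txt`)
- Chatbot Arena's Dump (August 2024), saved in the working directory as `clean_battle_20240814_public.json`: <https://storage.googleapis.com/arena_external_data/public/clean_battle_20240814_public.json>
- LLMFAO Dataset, saved in the working directory as `llmfao.csv`: <https://raw.githubusercontent.com/dustalov/llmfao/refs/heads/master/crowd-comparisons.csv>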

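## Table 1: [chatbot_arena.csv](chatbot_arena.csv)

Before running, paste the reference implementations into the `chatbot_arena_elo` and `arena_hard_bradley_terry` stubs in `chatbot_arena.py` (the links are in their error messages); as committed, both stubs raise `NotImplementedError`.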

```shell
python3 -m chatbot_arena
```

## Table 2: [rust_python.csv](rust_python.csv)

```shell
python3 -m rust_python
```

## Table 3: [scale.csv](scale.csv)

```shell
python3 -m scale_data
python3 -m scale_compute
```
41 changes: 41 additions & 0 deletions coling2025/chatbot_arena.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
algorithm,solver,time
elo,arena,4.505625984999824
elo,arena,4.30741854200005
elo,arena,3.8287284730004103
elo,arena,3.2124255979997542
elo,arena,3.1871768069995596
elo,arena,4.54556304200014
elo,arena,3.89093991000027
elo,arena,3.2158020300003045
elo,arena,3.2279247070000565
elo,arena,3.690373288999581
bradley_terry,arena,53.84085044400035
bradley_terry,arena,49.05527460100075
bradley_terry,arena,49.824193399999785
bradley_terry,arena,49.06932971599963
bradley_terry,arena,48.84145686500051
bradley_terry,arena,48.852593298999636
bradley_terry,arena,51.96913476999998
bradley_terry,arena,53.00518341099996
bradley_terry,arena,55.14430098199955
bradley_terry,arena,57.280526522999935
elo,evalica,1.2934383190004155
elo,evalica,1.2451738849995309
elo,evalica,1.263170829000046
elo,evalica,1.3015334930005338
elo,evalica,1.2956993719999446
elo,evalica,1.2331900440003665
elo,evalica,1.2465266949993747
elo,evalica,1.240900351000164
elo,evalica,1.2116083800001434
elo,evalica,1.218696920000184
bradley_terry,evalica,1.1849060429995006
bradley_terry,evalica,1.164167107999674
bradley_terry,evalica,1.1925056350000887
bradley_terry,evalica,1.1563715420006702
bradley_terry,evalica,1.196678212999359
bradley_terry,evalica,1.167977401999451
bradley_terry,evalica,1.1835675629999969
bradley_terry,evalica,1.1618928819998473
bradley_terry,evalica,1.1576560439998502
bradley_terry,evalica,1.1638413099999525
97 changes: 97 additions & 0 deletions coling2025/chatbot_arena.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3

# ruff: noqa: E501, EM101, F401, N803

from __future__ import annotations

import math
from collections import defaultdict # noqa: TC003
from functools import partial
from timeit import repeat

import evalica
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from tqdm.auto import tqdm

# Number of timing samples collected for each benchmarked function in main().
REPETITIONS = 10


def chatbot_arena_elo(
    battles: pd.DataFrame,
    K: float = 4,
    SCALE: float = 400,
    BASE: float = 10,
    INIT_RATING: float = 1000,
) -> defaultdict[str, float]:
    """Placeholder for the Chatbot Arena Elo baseline.

    The reference implementation is not bundled with this file: it has to be
    pasted in by hand from the notebook named in the error message. Until that
    is done, every call fails and none of the arguments are used.

    Raises:
        NotImplementedError: Always, with instructions on where to find the
            code that belongs here.
    """
    instructions = "".join((
        "Please copy the code from the official Chatbot Arena notebook and paste it here: ",
        "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH ",
        "(compute_online_elo function)",
    ))
    raise NotImplementedError(instructions)


def arena_hard_bradley_terry(
    df: pd.DataFrame,
    SCALE: float = 400,
    BASE: float = 10,
    INIT_RATING: float = 1000,
) -> pd.Series[str]:
    """Placeholder for the Arena-Hard Bradley-Terry baseline.

    The reference implementation is not bundled with this file: it has to be
    pasted in by hand from the repository location named in the error message.
    Until that is done, every call fails and none of the arguments are used.

    Raises:
        NotImplementedError: Always, with instructions on where to find the
            code that belongs here.
    """
    instructions = "".join((
        "Please copy the code from the official Arena-Hard repository and paste it here: ",
        "https://github.com/lmarena/arena-hard-auto/blob/2971e34d066f986c09bc5a463fa286fa93bcca3c/utils_math.py#L38-L69",
    ))
    raise NotImplementedError(instructions)


def main() -> None:
    """Time the Arena reference code against Evalica and save the raw samples.

    Reads ``clean_battle_20240814_public.json`` from the working directory,
    keeps the rows whose ``anony`` flag is set and whose ``dedup_tag`` marks
    them as sampled, and maps each ``winner`` label to an ``evalica.Winner``
    value. Four rating computations are then timed ``REPETITIONS`` times each
    (one call per sample) and the timings are written to ``chatbot_arena.csv``
    with one row per sample.
    """
    outcome_codes = {
        "model_a": evalica.Winner.X,
        "model_b": evalica.Winner.Y,
        "tie": evalica.Winner.Draw,
        "tie (bothbad)": evalica.Winner.Draw,
    }

    df_arena = pd.read_json("clean_battle_20240814_public.json")
    df_arena = df_arena[df_arena["anony"]]
    df_arena = df_arena[df_arena["dedup_tag"].apply(lambda x: x.get("sampled", False))]
    # assign() returns a new frame instead of writing a column into a filtered
    # slice, which would raise pandas' SettingWithCopyWarning.
    df_arena = df_arena.assign(evalica=df_arena["winner"].map(outcome_codes))
    # Labels missing from the mapping come out of map() as NaN; drop those rows.
    df_arena = df_arena[df_arena["evalica"].notna()]

    xs, ys, winners = df_arena["model_a"], df_arena["model_b"], df_arena["evalica"]

    # (algorithm, solver, zero-argument callable) in the order the rows are emitted.
    benchmarks = [
        ("elo", "arena", partial(chatbot_arena_elo, df_arena)),
        ("bradley_terry", "arena", partial(arena_hard_bradley_terry, df_arena)),
        ("elo", "evalica", partial(evalica.elo, xs, ys, winners)),
        ("bradley_terry", "evalica", partial(evalica.bradley_terry, xs, ys, winners)),
    ]

    results = []

    with tqdm(total=len(benchmarks)) as pbar:
        for algorithm, solver, stmt in benchmarks:
            # number=1: every sample is the duration of exactly one call.
            timings = repeat(stmt, repeat=REPETITIONS, number=1)
            results.append((algorithm, solver, timings))
            pbar.update()

    df_results = pd.DataFrame(results, columns=["algorithm", "solver", "time"])
    # Each row holds a list of samples; explode() turns it into one row per sample.
    df_results = df_results.explode("time")
    df_results = df_results.reset_index(drop=True)
    df_results.to_csv("chatbot_arena.csv", index=False)


# Run the benchmark only when executed as a script or via ``python3 -m``.
if __name__ == "__main__":
    main()
5 changes: 5 additions & 0 deletions coling2025/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
evalica==0.3.2
pandas==2.2.3
pyarrow==18.1.0
scikit-learn==1.6.0
tqdm==4.67.1
141 changes: 141 additions & 0 deletions coling2025/rust_python.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
algorithm,solver,time
counting,pyo3,0.006875361001220881
counting,pyo3,0.005204043000048841
counting,pyo3,0.005150118999154074
counting,pyo3,0.004930110999339377
counting,pyo3,0.005055014999015839
counting,pyo3,0.004834370000025956
counting,pyo3,0.0048147329998755595
counting,pyo3,0.005039478999606217
counting,pyo3,0.0049270449999312405
counting,pyo3,0.005079647999082226
counting,naive,0.009094708000702667
counting,naive,0.009216813999955775
counting,naive,0.009107648000281188
counting,naive,0.009295828000176698
counting,naive,0.009225302001141245
counting,naive,0.009263153999199858
counting,naive,0.009250584000255913
counting,naive,0.009232936999978847
counting,naive,0.00893497099968954
counting,naive,0.008893333000742132
average_win_rate,pyo3,0.005162987999938196
average_win_rate,pyo3,0.005016049999539973
average_win_rate,pyo3,0.0049611690010351595
average_win_rate,pyo3,0.004952190000039991
average_win_rate,pyo3,0.004995280000002822
average_win_rate,pyo3,0.004976274000000558
average_win_rate,pyo3,0.00487582499954442
average_win_rate,pyo3,0.004842217000259552
average_win_rate,pyo3,0.004915758001516224
average_win_rate,pyo3,0.004940922999594477
average_win_rate,naive,0.0056375940002908465
average_win_rate,naive,0.0056304649988305755
average_win_rate,naive,0.0067451510003593285
average_win_rate,naive,0.005464813999424223
average_win_rate,naive,0.0059818110003106995
average_win_rate,naive,0.005634520999592496
average_win_rate,naive,0.0056934169988380745
average_win_rate,naive,0.006093824000345194
average_win_rate,naive,0.005781127998488955
average_win_rate,naive,0.0062054570007603616
bradley_terry,pyo3,0.0053178769994701724
bradley_terry,pyo3,0.005525047999981325
bradley_terry,pyo3,0.005011375000322005
bradley_terry,pyo3,0.005122900998685509
bradley_terry,pyo3,0.005099248999613337
bradley_terry,pyo3,0.0050138889982918045
bradley_terry,pyo3,0.005214843999056029
bradley_terry,pyo3,0.005149094000444165
bradley_terry,pyo3,0.005218072999923606
bradley_terry,pyo3,0.005254742998658912
bradley_terry,naive,0.012066170998878079
bradley_terry,naive,0.011944162999498076
bradley_terry,naive,0.011667112999930396
bradley_terry,naive,0.011669860999973025
bradley_terry,naive,0.011628184000073816
bradley_terry,naive,0.011669400000755559
bradley_terry,naive,0.01161658199998783
bradley_terry,naive,0.011653039000520948
bradley_terry,naive,0.011644427000646829
bradley_terry,naive,0.011589874000492273
elo,pyo3,0.005369069000153104
elo,pyo3,0.00532382100027462
elo,pyo3,0.005319439000231796
elo,pyo3,0.005307326000547619
elo,pyo3,0.005343168000763399
elo,pyo3,0.005356769001082284
elo,pyo3,0.005366054001569864
elo,pyo3,0.005641824000122142
elo,pyo3,0.005391536000388442
elo,pyo3,0.005369290000089677
elo,naive,0.49616283500108693
elo,naive,0.4852133749991481
elo,naive,0.47851063500093005
elo,naive,0.48006601499946555
elo,naive,0.4753923959997337
elo,naive,0.4769150800002535
elo,naive,0.4766232599995419
elo,naive,0.47964533800040954
elo,naive,0.49262491800072894
elo,naive,0.48891441200066765
eigen,pyo3,0.005105121999804396
eigen,pyo3,0.004977573998985463
eigen,pyo3,0.005370251999920583
eigen,pyo3,0.005091636001452571
eigen,pyo3,0.004964488000041456
eigen,pyo3,0.005006197001421242
eigen,pyo3,0.005002247999073006
eigen,pyo3,0.004940893999446416
eigen,pyo3,0.004896967999229673
eigen,pyo3,0.004950393000399345
eigen,naive,0.007578472999739461
eigen,naive,0.0068903650007996475
eigen,naive,0.006166182000015397
eigen,naive,0.005998622998959036
eigen,naive,0.006027541001458303
eigen,naive,0.006044929999916349
eigen,naive,0.006003292999594123
eigen,naive,0.006016929000907112
eigen,naive,0.006057766000594711
eigen,naive,0.005994141001792741
pagerank,pyo3,0.005109638999783783
pagerank,pyo3,0.004911364998406498
pagerank,pyo3,0.005008294001527247
pagerank,pyo3,0.004950368998834165
pagerank,pyo3,0.005036065000240342
pagerank,pyo3,0.004928320999169955
pagerank,pyo3,0.004861629000515677
pagerank,pyo3,0.004890345000603702
pagerank,pyo3,0.004856256000493886
pagerank,pyo3,0.004860412998823449
pagerank,naive,0.005966113998510991
pagerank,naive,0.005886898999960977
pagerank,naive,0.006147760001113056
pagerank,naive,0.005819226000312483
pagerank,naive,0.0057333940003445605
pagerank,naive,0.005826475999128888
pagerank,naive,0.006016974999511149
pagerank,naive,0.006921724998392165
pagerank,naive,0.006082464000428445
pagerank,naive,0.006042460001481231
newman,pyo3,0.0063594679995730985
newman,pyo3,0.00596360400049889
newman,pyo3,0.005977647999316105
newman,pyo3,0.0058701870002551
newman,pyo3,0.00590245500097808
newman,pyo3,0.006189169000208494
newman,pyo3,0.005855299999893759
newman,pyo3,0.0060658649999822956
newman,pyo3,0.006033386998751666
newman,pyo3,0.006011262999891187
newman,naive,0.009793019000426284
newman,naive,0.009593479999239207
newman,naive,0.009580083999026101
newman,naive,0.009858966999672703
newman,naive,0.009588980999978958
newman,naive,0.009542887999486993
newman,naive,0.009545767001327476
newman,naive,0.00950388599994767
newman,naive,0.009521482999844011
newman,naive,0.009310036999522708
57 changes: 57 additions & 0 deletions coling2025/rust_python.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python3

from functools import partial
from timeit import repeat

import evalica
import pandas as pd
from tqdm.auto import tqdm

# Evalica rating functions to benchmark; main() times each one under every solver.
ALGORITHMS = [
evalica.counting,
evalica.average_win_rate,
evalica.bradley_terry,
evalica.elo,
evalica.eigen,
evalica.pagerank,
evalica.newman,
]

# Number of timing samples collected per (algorithm, solver) pair.
REPETITIONS = 10

def main() -> None:
    """Time every Evalica algorithm under both solvers and save the raw samples.

    Reads ``llmfao.csv`` from the working directory, keeps the ``left``,
    ``right`` and ``winner`` columns, and maps each ``winner`` label to an
    ``evalica.Winner`` value. Every function in ``ALGORITHMS`` is then timed
    ``REPETITIONS`` times (one call per sample) with the ``"pyo3"`` and
    ``"naive"`` solvers, and the timings are written to ``rust_python.csv``
    with one row per sample.
    """
    df_llmfao = pd.read_csv("llmfao.csv", dtype=str)
    # copy() detaches the column subset from the parent frame so the assignment
    # below does not raise pandas' SettingWithCopyWarning.
    df_llmfao = df_llmfao[["left", "right", "winner"]].copy()
    df_llmfao["winner"] = df_llmfao["winner"].map({
        "left": evalica.Winner.X,
        "right": evalica.Winner.Y,
        "tie": evalica.Winner.Draw,
    })

    # The index is computed once up front and handed to every timed call.
    _, _, index = evalica.indexing(df_llmfao["left"], df_llmfao["right"])

    results = []

    for algorithm in tqdm(ALGORITHMS):
        for solver in ("pyo3", "naive"):
            stmt = partial(
                algorithm,
                xs=df_llmfao["left"],
                ys=df_llmfao["right"],
                winners=df_llmfao["winner"],
                index=index,
                solver=solver,
            )

            # number=1: every sample is the duration of exactly one call.
            timings = repeat(stmt, repeat=REPETITIONS, number=1)

            results.append((algorithm.__name__, solver, timings))

    df_results = pd.DataFrame(results, columns=["algorithm", "solver", "time"])
    # Each row holds a list of samples; explode() turns it into one row per sample.
    df_results = df_results.explode("time")
    df_results = df_results.reset_index(drop=True)
    df_results.to_csv("rust_python.csv", index=False)


# Run the benchmark only when executed as a script or via ``python3 -m``.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 66111a3

Please sign in to comment.