Commit

benchmarking
HashemAlsaket committed Aug 14, 2023
1 parent fd7cc76 commit 4ae216c
Showing 6 changed files with 286 additions and 100,454 deletions.
82 changes: 64 additions & 18 deletions prompttools/benchmarks/benchmark.py
@@ -4,8 +4,9 @@
 # This source code's license can be found in the
 # LICENSE file in the root directory of this source tree.
 
-import logging
-from typing import Any, List
+from typing import Any, Callable, List, Optional
+import pandas as pd
+import warnings
 
 
 class Benchmark:
@@ -15,7 +16,7 @@ class Benchmark:
     Args:
     ----
-        experiments (list(experiment types)): list of experiments
+        experiment (experiment type): experiment to use
         eval_methods (list(eval methods)): list of evaluation methods to measure response similarity
         prompts (list(str)): list of queries, questions, prompts for LLMs to respond to
         response_options (list(str)): possible responses to measure against
@@ -24,28 +25,73 @@
 
     def __init__(
         self,
-        experiments: List[Any],
-        eval_methods: List[Any],
+        experiment: Any,
+        eval_method: Callable,
         prompts: List[str],
-        response_options: List[str],
-        correct_response_index: List[int]
+        response_options: List[Any],
+        correct_response_indices: Optional[List[int]] = None,
     ):
-        self.experiments = experiments
-        self.eval_methods = eval_methods
+        self.experiment = experiment
+        self.eval_method = eval_method
         self.prompts = prompts
         self.response_options = response_options
-        self.correct_response_index = correct_response_index
+        self.correct_response_indices = correct_response_indices

+    def _get_precision(
+        self,
+        dataframe: pd.DataFrame,
+        pred_col: str,
+        label_col: str,
+    ) -> float:
+        r"""
+        Calculate precision.
+        """
+        # TODO: coming soon
+        pass

-    def run(
-        self,
-        early_stopping
-    ):
-        r"""
-        Run model experiments to measure response quality.
-
-        Args:
-        ----
-            early_stopping: maximum time to allow benchmark to run
-        """
-        pass
+    def multiple_choice_accuracy(
+        self,
+        dataframe: pd.DataFrame,
+        col1: str,
+        col2: str,
+    ) -> float:
+        r"""
+        Benchmark LLM accuracy on multiple choice
+        prompt endings.
+        """
+        correct = 0
+        for _, row in dataframe.iterrows():
+            if row[col1] == row[col2]:
+                correct += 1
+        return correct / len(dataframe)
+
+    def multiple_choice_benchmark(
+        self,
+    ) -> Any:
+        r"""
+        Run model experiments to measure response quality.
+        """
+        self.experiment.run()
+
+        if "prompt" not in self.experiment.full_df.columns:
+            # Assume messages column is in place of prompt
+            self.experiment.full_df["prompt"] = self.experiment.full_df["messages"].map(lambda x: str(x))
+            warnings.warn("Column 'prompt' does not exist. Using column 'messages' instead.", UserWarning, stacklevel=2)
+        # Get option with highest similarity to LLM response
+        self.benchmark_df = self.experiment.full_df[["prompt", "response"]]
+        self.benchmark_df["response_options"] = self.response_options
+        self.benchmark_df = self.benchmark_df.explode(column="response_options").reset_index()
+        scores = []
+        for _, row in self.benchmark_df.iterrows():
+            scores.append(self.eval_method(row=row, expected=row["response_options"]))
+        self.benchmark_df["scores"] = scores
+        self.benchmark_df["max_value"] = self.benchmark_df.groupby("prompt")["scores"].transform("max")
+        self.benchmark_df = self.benchmark_df[self.benchmark_df["scores"] == self.benchmark_df["max_value"]]
+        self.benchmark_df = self.benchmark_df.sort_index()
+        # Collect model choices
+        model_choice = []
+        for i, choice in enumerate(self.benchmark_df["response_options"].values):
+            model_choice.append(self.response_options[i].index(choice))
+        self.benchmark_df["model_choice"] = model_choice
+        self.benchmark_df["labels"] = self.correct_response_indices
+        return self.multiple_choice_accuracy(self.benchmark_df, "model_choice", "labels")
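For context, the sketch below shows how the API introduced in this commit might be driven end to end. It is a minimal, hypothetical example and not part of the commit: ToyExperiment and exact_match are stand-ins for a real prompttools experiment and evaluation method, included only to illustrate the interface multiple_choice_benchmark relies on (an experiment exposing run() and a full_df with "prompt" and "response" columns, and an eval method called as eval_method(row=row, expected=option)). The import path is assumed from the file location shown above.

# Hypothetical usage sketch -- not part of this commit.
import pandas as pd

from prompttools.benchmarks.benchmark import Benchmark


class ToyExperiment:
    # Stand-in experiment: run() must populate full_df with "prompt" and "response" columns.
    def __init__(self, prompts, responses):
        self.prompts = prompts
        self.responses = responses
        self.full_df = None

    def run(self):
        self.full_df = pd.DataFrame({"prompt": self.prompts, "response": self.responses})


def exact_match(row, expected):
    # Toy eval method: 1.0 if the candidate option appears in the model's response, else 0.0.
    return float(expected.lower() in row["response"].lower())


prompts = ["The capital of France is", "2 + 2 equals"]
options = [["Paris", "Rome"], ["4", "5"]]  # one list of answer options per prompt
correct = [0, 0]                           # index of the correct option for each prompt

benchmark = Benchmark(
    experiment=ToyExperiment(prompts, ["Paris.", "It equals 4."]),
    eval_method=exact_match,
    prompts=prompts,
    response_options=options,
    correct_response_indices=correct,
)
print(benchmark.multiple_choice_benchmark())  # expected to print 1.0 for these toy inputs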