Commit

benchmarking
HashemAlsaket committed Aug 14, 2023
1 parent fd7cc76 commit 4ae216c
Showing 6 changed files with 286 additions and 100,454 deletions.
82 changes: 64 additions & 18 deletions prompttools/benchmarks/benchmark.py
@@ -4,8 +4,9 @@
 # This source code's license can be found in the
 # LICENSE file in the root directory of this source tree.
 
-import logging
-from typing import Any, List
+from typing import Any, Callable, List, Optional
+import pandas as pd
+import warnings
 
 
 class Benchmark:
@@ -15,7 +16,7 @@ class Benchmark:
     Args:
     ----
-        experiments (list(experiment types)): list of experiments
+        experiment (experiment type): experiment to use
         eval_methods (list(eval methods)): list of evaluation methods to measure response similarity
         prompts (list(str)): list of queries, questions, prompts for LLMs to respond to
         response_options (list(str)): possible responses to measure against
@@ -24,28 +25,73 @@
 
     def __init__(
         self,
-        experiments: List[Any],
-        eval_methods: List[Any],
+        experiment: Any,
+        eval_method: Callable,
         prompts: List[str],
-        response_options: List[str],
-        correct_response_index: List[int]
+        response_options: List[Any],
+        correct_response_indices: Optional[List[int]] = None,
     ):
-        self.experiments = experiments
-        self.eval_methods = eval_methods
+        self.experiment = experiment
+        self.eval_method = eval_method
         self.prompts = prompts
         self.response_options = response_options
-        self.correct_response_index = correct_response_index
+        self.correct_response_indices = correct_response_indices

+    def _get_precision(
+        self,
+        dataframe: pd.DataFrame,
+        pred_col: str,
+        label_col: str,
+    ) -> float:
+        r"""
+        Calculate precision.
+        """
+        # TODO: coming soon
+        pass

-    def run(
-        self,
-        early_stopping
-    ):
-        r"""
-        Run model experiments to measure response quality.
-
-        Args:
-        ----
-            early_stopping: maximum time to allow benchmark to run
-        """
-        pass
+    def multiple_choice_accuracy(
+        self,
+        dataframe: pd.DataFrame,
+        col1: str,
+        col2: str,
+    ) -> float:
+        r"""
+        Benchmark LLM accuracy on multiple choice
+        prompt endings.
+        """
+        correct = 0
+        for _, row in dataframe.iterrows():
+            if row[col1] == row[col2]:
+                correct += 1
+        return correct / len(dataframe)
+
+    def multiple_choice_benchmark(
+        self,
+    ) -> Any:
+        r"""
+        Run model experiments to measure response quality.
+        """
+        self.experiment.run()
+
+        if "prompt" not in self.experiment.full_df.columns:
+            # Assume messages column is in place of prompt
+            self.experiment.full_df["prompt"] = self.experiment.full_df["messages"].map(lambda x: str(x))
+            warnings.warn("Column 'prompt' does not exist. Using column 'messages' instead.", UserWarning, stacklevel=2)
+        # Get option with highest similarity to LLM response
+        self.benchmark_df = self.experiment.full_df[["prompt", "response"]]
+        self.benchmark_df["response_options"] = self.response_options
+        self.benchmark_df = self.benchmark_df.explode(column="response_options").reset_index()
+        scores = []
+        for _, row in self.benchmark_df.iterrows():
+            scores.append(self.eval_method(row=row, expected=row["response_options"]))
+        self.benchmark_df["scores"] = scores
+        self.benchmark_df["max_value"] = self.benchmark_df.groupby("prompt")["scores"].transform("max")
+        self.benchmark_df = self.benchmark_df[self.benchmark_df["scores"] == self.benchmark_df["max_value"]]
+        self.benchmark_df = self.benchmark_df.sort_index()
+        # Collect model choices
+        model_choice = []
+        for i, choice in enumerate(self.benchmark_df["response_options"].values):
+            model_choice.append(self.response_options[i].index(choice))
+        self.benchmark_df["model_choice"] = model_choice
+        self.benchmark_df["labels"] = self.correct_response_indices
+        return self.multiple_choice_accuracy(self.benchmark_df, "model_choice", "labels")
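For context, the sketch below shows how the API introduced in this commit might be driven end to end. It is a minimal, hypothetical example and not part of the commit: ToyExperiment and exact_match are stand-ins for a real prompttools experiment and evaluation method, included only to illustrate the interface multiple_choice_benchmark relies on (an experiment exposing run() and a full_df with "prompt" and "response" columns, and an eval method called as eval_method(row=row, expected=option)). The import path is assumed from the file location shown above.

# Hypothetical usage sketch -- not part of this commit.
import pandas as pd

from prompttools.benchmarks.benchmark import Benchmark


class ToyExperiment:
    # Stand-in experiment: run() must populate full_df with "prompt" and "response" columns.
    def __init__(self, prompts, responses):
        self.prompts = prompts
        self.responses = responses
        self.full_df = None

    def run(self):
        self.full_df = pd.DataFrame({"prompt": self.prompts, "response": self.responses})


def exact_match(row, expected):
    # Toy eval method: 1.0 if the candidate option appears in the model's response, else 0.0.
    return float(expected.lower() in row["response"].lower())


prompts = ["The capital of France is", "2 + 2 equals"]
options = [["Paris", "Rome"], ["4", "5"]]  # one list of answer options per prompt
correct = [0, 0]                           # index of the correct option for each prompt

benchmark = Benchmark(
    experiment=ToyExperiment(prompts, ["Paris.", "It equals 4."]),
    eval_method=exact_match,
    prompts=prompts,
    response_options=options,
    correct_response_indices=correct,
)
print(benchmark.multiple_choice_benchmark())  # expected to print 1.0 for these toy inputs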