Add TODOs in evaluate for re-factor soon
NivekT committed Jul 25, 2023
1 parent e9ac38c commit da7cfd4
Showing 2 changed files with 12 additions and 6 deletions.
6 changes: 3 additions & 3 deletions prompttools/experiment/experiments/experiment.py
@@ -126,14 +126,15 @@ def run(
             self.queue.enqueue(
                 self.completion_fn,
                 # We need to filter out defaults that are invalid JSON from the request
-                {k: v for k, v in combo.items() if (v != None) and (v != float("inf"))},
+                {k: v for k, v in combo.items() if (v is not None) and (v != float("inf"))},
             )
         self.results = self.queue.results()
         self.scores["latency"] = self.queue.latencies()
         if len(self.results) == 0:
             logging.error("No results. Something went wrong.")
             raise PromptExperimentException
 
+    # TODO: Ideally, `eval_fn` should accept one row at a time, compute the metric, and add that to the row.
     def evaluate(
         self,
         metric_name: str,
@@ -298,11 +299,10 @@ def aggregate(self, metric_name, column_name, is_average=False):
         table = self.get_table(pivot_data=None, pivot_columns=None, pivot=False)
         sorted_scores = self._aggregate_metric(table, metric_name, column_name, is_average)
         if is_interactive():
-            plt.bar(range(len(sorted_scores)), list(sorted_scores.values()), align='center')
+            plt.bar(range(len(sorted_scores)), list(sorted_scores.values()), align="center")
             plt.xticks(range(len(sorted_scores)), list(sorted_scores.keys()))
             plt.show()
 
-
     def rank(
         self,
         pivot_data: Dict[str, object],
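The new TODO in experiment.py points toward a per-row evaluation pattern. A minimal sketch of that pattern is below for illustration only; evaluate_rows, eval_fn, and the row structure are hypothetical stand-ins, not the library's actual API.

    # Hypothetical sketch of the per-row pattern described in the TODO; not prompttools' actual API.
    from typing import Callable, Dict, List

    def evaluate_rows(rows: List[Dict], metric_name: str, eval_fn: Callable[[Dict], float]) -> None:
        # Score each result row independently and attach the metric to that row.
        for row in rows:
            row[metric_name] = eval_fn(row)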
12 changes: 9 additions & 3 deletions prompttools/utils/autoeval.py
@@ -6,7 +6,7 @@
 
 
 import os
-from typing import Dict, List
+from typing import Dict
 import openai
 import jinja2
 from .error import PromptToolsUtilityError
@@ -51,8 +51,14 @@ def compute(prompt: str, response: str, model: str = "gpt-4") -> float:
     return 1.0 if "RIGHT" in evaluation["choices"][0]["message"]["content"] else 0.0
 
 
-def evaluate(prompt: str, response: str, metadata: Dict) -> float:
+# TODO: After we refactor experiment.evaluate()
+# Rename these functions to make it easier for users to distinguish between different utils functions
+def evaluate(prompt: str, response: str, _metadata: Dict) -> float:
     r"""
-    Uses auto-evaluation to score the model response.
+    Uses auto-evaluation to score the model response, using "gpt-4".
+
+    Args:
+        prompt (str): The input prompt.
+        response (str): The model response.
     """
     return compute(prompt, response)
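For context, a hypothetical call to the updated util is sketched below. The prompt and response values are made up, the import path is inferred from the file's location in the repo, and it assumes the OpenAI credentials that compute() needs for its "gpt-4" call are already configured.

    # Illustrative usage only; example values are made up.
    from prompttools.utils import autoeval

    # Returns 1.0 if the "gpt-4" judge replies with "RIGHT", otherwise 0.0.
    score = autoeval.evaluate(
        prompt="What is the capital of France?",
        response="Paris",
        _metadata={},  # not used by the function after this change
    )
    print(score)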
