Add support for other models in AutoEval #59

Open · wants to merge 2 commits into base: main
21 changes: 7 additions & 14 deletions prompttools/utils/autoeval.py
@@ -5,12 +5,11 @@
# LICENSE file in the root directory of this source tree.


import os
from typing import Dict
import openai
import pandas.core.series
import jinja2
from .error import PromptToolsUtilityError

from .model_evaluators.EvaluatorUtils import get_evaluator_for_model

EVALUATION_SYSTEM_PROMPT = """
Determine whether or not the response is following directions.
@@ -21,18 +20,14 @@
EVALUATION_USER_TEMPLATE = """
PROMPT: {{prompt}}
RESPONSE: {{response}}
ANSWER:
"""


def _get_messages(prompt: str, response: str):
def _get_user_prompt(prompt: str, response: str):
    environment = jinja2.Environment()
    template = environment.from_string(EVALUATION_USER_TEMPLATE)
    user_message = template.render({"prompt": prompt, "response": response})
    return [
        {"role": "system", "content": EVALUATION_SYSTEM_PROMPT},
        {"role": "user", "content": user_message},
    ]
    user_prompt = template.render({"prompt": prompt, "response": response})
    return user_prompt


def compute(prompt: str, response: str, model: str = "gpt-4") -> float:
@@ -46,10 +41,8 @@ def compute(prompt: str, response: str, model: str = "gpt-4") -> float:
        model (str): The OpenAI chat model to use for generating an expected response.
            Defaults to GPT-4.
    """
    if not os.environ["OPENAI_API_KEY"]:
        raise PromptToolsUtilityError
    evaluation = openai.ChatCompletion.create(model=model, messages=_get_messages(prompt, response))
    return 1.0 if "RIGHT" in evaluation["choices"][0]["message"]["content"] else 0.0
    evaluation = get_evaluator_for_model(model).evaluate(model, prompt, response)
    return 1.0 if "RIGHT" in evaluation else 0.0


def evaluate(prompt: str, response: str, _metadata: Dict) -> float:
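For reference, a minimal usage sketch of the reworked `compute`, assuming the package is importable as `prompttools.utils.autoeval` and `OPENAI_API_KEY` is exported; the prompt, response, and model values are illustrative:

```python
# Sketch only: drives autoeval.compute, which now dispatches to a model evaluator.
from prompttools.utils import autoeval

score = autoeval.compute(
    prompt="Reply with exactly one word: hello",
    response="hello",
    model="gpt-4",  # any model in GptEvaluator.supported_models
)
print(score)  # 1.0 if the grader replied "RIGHT", otherwise 0.0
```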
19 changes: 7 additions & 12 deletions prompttools/utils/autoeval_from_expected.py
@@ -5,11 +5,9 @@
# LICENSE file in the root directory of this source tree.


import os
import openai
import jinja2
import pandas
from .error import PromptToolsUtilityError
from .model_evaluators.EvaluatorUtils import get_evaluator_for_model

EVALUATION_SYSTEM_PROMPT = """
You are a grader evaluating responses to math questions.
@@ -25,14 +23,11 @@
"""


def _get_messages(prompt: str, expected: str, response: str):
def _get_user_prompt(prompt: str, expected: str, response: str):
    environment = jinja2.Environment()
    template = environment.from_string(EVALUATION_USER_TEMPLATE)
    user_message = template.render({"prompt": prompt, "expected": expected, "actual": response})
    return [
        {"role": "system", "content": EVALUATION_SYSTEM_PROMPT},
        {"role": "user", "content": user_message},
    ]
    user_prompt = template.render({"prompt": prompt, "expected": expected, "actual": response})
    return user_prompt


# TODO: Should this be removed since no one is using it?
@@ -47,9 +42,9 @@ def compute(prompt: str, expected: str, response: str, model: str = "gpt-4") -> float:
        model (str): The OpenAI chat model to use for generating an expected response.
            Defaults to GPT-4.
    """
    if not os.environ["OPENAI_API_KEY"]:
        raise PromptToolsUtilityError("Missing API key for evaluation.")
    evaluation = openai.ChatCompletion.create(model=model, messages=_get_messages(prompt, expected, response))
    evaluation = get_evaluator_for_model(model).evaluate(
        model, EVALUATION_SYSTEM_PROMPT, _get_user_prompt(prompt, expected, response)
    )
    return 1.0 if "RIGHT" in evaluation else 0.0


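A similar sketch for the expected-answer grader, again assuming `OPENAI_API_KEY` is set and using illustrative values:

```python
# Sketch only: grades a response against an expected answer with gpt-4.
from prompttools.utils import autoeval_from_expected

score = autoeval_from_expected.compute(
    prompt="What is 12 * 12?",
    expected="144",
    response="144",
)
print(score)  # 1.0 if the grader replied "RIGHT", otherwise 0.0
```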
43 changes: 5 additions & 38 deletions prompttools/utils/autoeval_scoring.py
@@ -9,39 +9,14 @@
import pandas.core.series
import jinja2

from prompttools.utils.model_evaluators.EvaluatorUtils import get_evaluator_for_model

try:
import anthropic
except ImportError:
anthropic = None


AUTO_EVAL_PROMPT_TEMPLATE = """
{{HUMAN_PROMPT}} Given the fact {{fact}}

Evaluate the following Answer on a scale from 1 - 7. Please only respond with an integer from 1 - 7 with no other text.
Lower score means the answer is factually wrong, higher score means the answer is correct. A medium score for
uncertain but not wrong.

Answer: {{model_answer}}

{{AI_PROMPT}}
"""


def _generate_auto_eval_prompt(fact: str, model_answer: str):
    environment = jinja2.Environment()
    template = environment.from_string(AUTO_EVAL_PROMPT_TEMPLATE)
    auto_eval_prompt = template.render(
        {
            "HUMAN_PROMPT": anthropic.HUMAN_PROMPT,
            "AI_PROMPT": anthropic.AI_PROMPT,
            "fact": fact,
            "model_answer": model_answer,
        }
    )
    return auto_eval_prompt


def compute(fact: str, model_answer: str, model: str = "claude-2") -> float:
r"""
Uses a high quality chat model, like claude-2, to automatically score a given
@@ -54,13 +29,10 @@ def compute(fact: str, model_answer: str, model: str = "claude-2") -> float:
        model (str): The model that will judge how close the response is to the truth.
            Defaults to Claude 2.
    """
    if not os.environ["ANTHROPIC_API_KEY"]:
        raise RuntimeError("Missing API key for evaluation.")
    client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
    completion_response = client.completions.create(
        max_tokens_to_sample=100, model=model, prompt=_generate_auto_eval_prompt(fact, model_answer)
    response = get_evaluator_for_model(model).evaluate_and_score(
        model, fact, model_answer
    )
    return int(completion_response.completion)
    return int(response)


def autoeval_scoring(row: pandas.core.series.Series, expected: str, response_column_name: str = "response") -> float:
@@ -73,9 +45,4 @@ def autoeval_scoring(row: pandas.core.series.Series, expected: str, response_column_name: str = "response") -> float:
        expected (str): the expected response
        response_column_name (str): name of the column that contains the model's response, defaults to ``"response"``
    """
    if anthropic is None:
        raise ModuleNotFoundError(
            "Package `anthropic` is required to be installed to use this experiment."
            "Please use `pip install anthropic` to install the package"
        )
    return compute(fact=expected, model_answer=row[response_column_name])
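A short sketch of how `autoeval_scoring` would typically be driven from a DataFrame row, assuming `ANTHROPIC_API_KEY` is set; the column values are illustrative:

```python
# Sketch only: scores one response row against an expected fact via claude-2.
import pandas as pd

from prompttools.utils.autoeval_scoring import autoeval_scoring

row = pd.Series({"prompt": "What is 2 + 2?", "response": "2 + 2 is 4"})
score = autoeval_scoring(row, expected="4")  # reads row["response"] by default
print(score)  # integer 1-7 returned by the Anthropic evaluator
```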
99 changes: 99 additions & 0 deletions prompttools/utils/model_evaluators/AnthropicEvaluator.py
@@ -0,0 +1,99 @@
# Copyright (c) Hegel AI, Inc.
# All rights reserved.
#
# This source code's license can be found in the
# LICENSE file in the root directory of this source tree.


from functools import cached_property
from overrides import override
from .ModelEvaluator import ModelEvaluator
import jinja2
import os

try:
    import anthropic
except ImportError:
    anthropic = None

EVALUATE_AND_SCORE_PROMPT_TEMPLATE = """\
{{HUMAN_PROMPT}} Given the fact {{fact}}
Evaluate the following Answer on a scale from 1 - 7.\
Please only respond with an integer from 1 - 7 with no other text.
Lower score means the answer is factually wrong, higher score means the answer\
is correct. A medium score for uncertain but not wrong.

Answer: {{answer}}
{{AI_PROMPT}}
"""

EVALUATE_RIGHT_OR_WRONG_PROMPT_TEMPLATE = """\
{{HUMAN_PROMPT}} Determine whether or not the response is following directions.\
Your answer should either be "RIGHT" if the response follows directions,\
or "WRONG" if the model is not following directions.

PROMPT: {{prompt}}
RESPONSE: {{response}}
{{AI_PROMPT}}
"""


class AnthropicEvaluator(ModelEvaluator):
    def __init__(self) -> None:
        self.supported_models = ["claude-1", "claude-2"]
        self.right_or_wrong_evaluation_template = jinja2.Environment().from_string(
            EVALUATE_RIGHT_OR_WRONG_PROMPT_TEMPLATE
        )

        self.evaluate_and_score_template = jinja2.Environment().from_string(
            EVALUATE_AND_SCORE_PROMPT_TEMPLATE
        )

    @cached_property
    def get_client(self):
        # Lazily create and cache the Anthropic client; returns None if construction fails.
        try:
            return anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
        except Exception:
            return None

    @override
    def supports_model(self, model: str):
        return model in self.supported_models

    @override
    def evaluate(self, model: str, prompt: str, response: str):
        client = self.validate_and_get_client()
        eval_prompt = self.right_or_wrong_evaluation_template.render(
            HUMAN_PROMPT=anthropic.HUMAN_PROMPT,
            prompt=prompt,
            response=response,
            AI_PROMPT=anthropic.AI_PROMPT,
        )

        completion = client.completions.create(max_tokens_to_sample=100, model=model, prompt=eval_prompt)
        return completion.completion

    @override
    def evaluate_and_score(self, model: str, fact: str, answer: str):
        client = self.validate_and_get_client()
        eval_prompt = self.evaluate_and_score_template.render(
            HUMAN_PROMPT=anthropic.HUMAN_PROMPT,
            fact=fact,
            answer=answer,
            AI_PROMPT=anthropic.AI_PROMPT,
        )

        completion = client.completions.create(max_tokens_to_sample=100, model=model, prompt=eval_prompt)
        return completion.completion

    def validate_and_get_client(self) -> "anthropic.Anthropic":
        if anthropic is None:
            raise ModuleNotFoundError(
                "Package `anthropic` is required to be installed to use this experiment. "
                "Please use `pip install anthropic` to install the package"
            )

        if not os.environ.get("ANTHROPIC_API_KEY"):
            raise RuntimeError("Missing API key for evaluation.")

        client = self.get_client
        if client is None:
            raise RuntimeError("Could not connect to Anthropic Client")

        return client
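A hedged sketch of exercising this evaluator directly, assuming the `anthropic` package is installed and `ANTHROPIC_API_KEY` is set:

```python
# Sketch only: right/wrong check and 1-7 scoring through AnthropicEvaluator.
from prompttools.utils.model_evaluators.AnthropicEvaluator import AnthropicEvaluator

evaluator = AnthropicEvaluator()
verdict = evaluator.evaluate(
    model="claude-2",
    prompt="Answer with one word: what color is the sky?",
    response="Blue",
)  # completion text expected to contain "RIGHT" or "WRONG"
score = evaluator.evaluate_and_score(
    model="claude-2", fact="The sky is blue.", answer="The sky is blue."
)  # completion text expected to be an integer from 1 to 7
```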
17 changes: 17 additions & 0 deletions prompttools/utils/model_evaluators/EvaluatorUtils.py
@@ -0,0 +1,17 @@
# Copyright (c) Hegel AI, Inc.
# All rights reserved.
#
# This source code's license can be found in the
# LICENSE file in the root directory of this source tree.

from .ModelEvaluator import ModelEvaluator
from .GptEvaluator import GptEvaluator
from .AnthropicEvaluator import AnthropicEvaluator

Evaluators = [GptEvaluator(), AnthropicEvaluator()]


def get_evaluator_for_model(model: str) -> ModelEvaluator:
    for evaluator in Evaluators:
        if evaluator.supports_model(model):
            return evaluator
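A minimal sketch of the dispatch this helper provides; the model names come from the two evaluators' `supported_models` lists, and the unsupported case is worth noting because it currently returns `None`:

```python
# Sketch only: get_evaluator_for_model picks an evaluator by model name.
from prompttools.utils.model_evaluators.EvaluatorUtils import get_evaluator_for_model

gpt_evaluator = get_evaluator_for_model("gpt-4")        # GptEvaluator instance
claude_evaluator = get_evaluator_for_model("claude-2")  # AnthropicEvaluator instance
unknown = get_evaluator_for_model("not-a-model")        # None; callers must handle this
```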
95 changes: 95 additions & 0 deletions prompttools/utils/model_evaluators/GptEvaluator.py
@@ -0,0 +1,95 @@
# Copyright (c) Hegel AI, Inc.
# All rights reserved.
#
# This source code's license can be found in the
# LICENSE file in the root directory of this source tree.

import os
import jinja2
from overrides import override

from prompttools.utils.error import PromptToolsUtilityError
from .ModelEvaluator import ModelEvaluator
import openai

EVALUATION_SYSTEM_PROMPT_BY_TYPE = {
    "RIGHT_OR_WRONG": """\
Determine whether or not the response is following directions.\
Your answer should either be "RIGHT" if the response follows directions,\
or "WRONG" if the model is not following directions.""",
    "SCORE": """\
Given the Fact and Answer, Evaluate the statement on a scale from 1 - 7.\
Please only respond with an integer from 1 - 7 with no other text.\
Lower score means the answer is factually wrong, higher score means\
the answer is correct. A medium score for uncertain but not wrong""",
}

EVALUATION_USER_TEMPLATE = """
PROMPT: {{prompt}}
RESPONSE: {{response}}
ANSWER:
"""

EVALUATE_AND_SCORE_USER_TEMPLATE = """
FACT: {{fact}}
ANSWER: {{answer}}
"""


class GptEvaluator(ModelEvaluator):
    def __init__(self) -> None:
        # source: https://platform.openai.com/docs/models/model-endpoint-compatibility
        self.supported_models = [
            "gpt-4",
            "gpt-4-0613",
            "gpt-4-32k",
            "gpt-4-32k-0613",
            "gpt-3.5-turbo",
            "gpt-3.5-turbo-0613",
            "gpt-3.5-turbo-16k",
            "gpt-3.5-turbo-16k-0613",
        ]

        self.evaluation_template = jinja2.Environment().from_string(
            EVALUATION_USER_TEMPLATE
        )

        self.evaluate_and_score_template = jinja2.Environment().from_string(
            EVALUATE_AND_SCORE_USER_TEMPLATE
        )

    @override
    def supports_model(self, model) -> bool:
        return model in self.supported_models

    @override
    def evaluate(self, model: str, prompt: str, response: str):
        if not os.environ.get("OPENAI_API_KEY"):
            raise PromptToolsUtilityError

        eval_prompt = self.evaluation_template.render(prompt=prompt, response=response)
        evaluation = openai.ChatCompletion.create(
            model=model, messages=self.get_messages("RIGHT_OR_WRONG", eval_prompt)
        )

        return evaluation["choices"][0]["message"]["content"]

    @override
    def evaluate_and_score(self, model: str, fact: str, answer: str):
        if not os.environ.get("OPENAI_API_KEY"):
            raise PromptToolsUtilityError

        eval_prompt = self.evaluate_and_score_template.render(fact=fact, answer=answer)
        evaluation = openai.ChatCompletion.create(
            model=model, messages=self.get_messages("SCORE", eval_prompt)
        )

        return evaluation["choices"][0]["message"]["content"]

    def get_messages(self, evaluation_type: str, eval_prompt: str) -> list:
        messages = [
            {"role": "system", "content": EVALUATION_SYSTEM_PROMPT_BY_TYPE[evaluation_type]},
            {"role": "user", "content": eval_prompt},
        ]

        return messages
22 changes: 22 additions & 0 deletions prompttools/utils/model_evaluators/ModelEvaluator.py
@@ -0,0 +1,22 @@
# Copyright (c) Hegel AI, Inc.
# All rights reserved.
#
# This source code's license can be found in the
# LICENSE file in the root directory of this source tree.


from abc import ABC, abstractmethod


class ModelEvaluator(ABC):
    @abstractmethod
    def evaluate(self, model: str, prompt: str, response: str):
        pass

    @abstractmethod
    def supports_model(self, model: str):
        pass

    @abstractmethod
    def evaluate_and_score(self, model: str, fact: str, answer: str):
        pass
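To illustrate the extension point this ABC defines, a new backend would subclass it and be appended to `Evaluators` in `EvaluatorUtils.py`. The class below is purely hypothetical and not part of this PR:

```python
# Sketch only: a hypothetical evaluator showing the three methods a subclass must implement.
from prompttools.utils.model_evaluators.ModelEvaluator import ModelEvaluator


class EchoEvaluator(ModelEvaluator):
    """Toy backend used to illustrate the interface; it calls no real model."""

    def supports_model(self, model: str):
        return model == "echo"

    def evaluate(self, model: str, prompt: str, response: str):
        # A real implementation would ask a chat model to grade the response.
        return "RIGHT"

    def evaluate_and_score(self, model: str, fact: str, answer: str):
        # A real implementation would return the model's 1-7 score as text.
        return "7"
```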