Enforce structured output for LLM as a Judge metrics (#503)
* Enforce structured output for LLM as a Judge metrics

* Update following review
jverre authored Oct 30, 2024
1 parent 103ad95 commit a2ce4a7
Showing 16 changed files with 288 additions and 152 deletions.
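
The commit applies one pattern across the metrics below: each metric declares a pydantic model describing the judge's expected reply, passes it as response_format when calling the model, and reads the reply back using those fixed keys instead of the old module-level VERDICT_KEY/REASON_KEY template constants. A minimal sketch of that flow, where JudgeResponseFormat, FakeJudgeModel, and parse_judge_output are illustrative stand-ins rather than opik SDK names:

import json

import pydantic


class JudgeResponseFormat(pydantic.BaseModel):
    # Mirrors the response-format classes added in this commit
    # (e.g. AnswerRelevanceResponseFormat): one score plus a reason.
    score: float
    reason: str


class FakeJudgeModel:
    # Stand-in for the SDK's model wrapper; a real model would be constrained
    # to the schema passed via response_format.
    def generate_string(self, input: str, response_format: type) -> str:
        return json.dumps({"score": 0.9, "reason": "Directly answers the question."})


def parse_judge_output(content: str) -> JudgeResponseFormat:
    # Construction fails loudly if the reply strays from the declared schema,
    # which is what the metrics' _parse_model_output methods rely on.
    return JudgeResponseFormat(**json.loads(content))


model = FakeJudgeModel()
raw = model.generate_string(
    input="Evaluate the answer against the question...",
    response_format=JudgeResponseFormat,
)
print(parse_judge_output(raw))

Pinning the keys in a response model, rather than interpolating shared constants into the prompt, is what lets the parsers in the hunks below read literal key names such as "answer_relevance_score" and "reason".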
86 changes: 86 additions & 0 deletions sdks/python/examples/metrics.py
@@ -0,0 +1,86 @@
from opik.evaluation import metrics

# Hallucination metric example
print("\n\nHallucination metric example:")

hallucination_metric = metrics.Hallucination()

hallucination_score = hallucination_metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
)
print("hallucination_score:", hallucination_score)

# G-Eval metric example
print("\n\nG-Eval metric example:")

g_eval_metric = metrics.GEval(
    task_introduction="You are an expert judge tasked with evaluating the faithfulness of an AI-generated answer to the given context.",
    evaluation_criteria="The OUTPUT must not introduce new information beyond what's provided in the CONTEXT.",
)

g_eval_score = g_eval_metric.score(
    input={
        "OUTPUT": "What is the capital of France?",
        "CONTEXT": [
            "France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."
        ],
    }
)
print("g_eval_score:", g_eval_score)

# Moderation metric example
print("\n\nModeration metric example:")

moderation_metric = metrics.Moderation()

moderation_score = moderation_metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
    context=[
        "France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."
    ],
)

print("moderation_score:", moderation_score)

# Answer Relevance metric example
print("\n\nAnswer Relevance metric example:")

answer_relevance_metric = metrics.AnswerRelevance()
answer_relevance_score = answer_relevance_metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
    context=[
        "France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."
    ],
)
print("answer_relevance_score:", answer_relevance_score)

# ContextPrecision metric example
print("\n\nContextPrecision metric example:")

context_precision_metric = metrics.ContextPrecision()
context_precision_score = context_precision_metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
    expected_output="Paris",
    context=[
        "France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."
    ],
)
print("context_precision_score:", context_precision_score)

# ContextRecall metric example
print("\n\nContextRecall metric example:")

context_recall_metric = metrics.ContextRecall()
context_recall_score = context_recall_metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
    expected_output="Paris",
    context=[
        "France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."
    ],
)
print("context_recall_score:", context_recall_score)
@@ -1,6 +1,7 @@
import json
import logging
from typing import Any, List, Optional, Union
import pydantic

from opik import logging_messages
from opik.evaluation.metrics import base_metric, score_result
@@ -12,6 +13,11 @@
LOGGER = logging.getLogger(__name__)


class AnswerRelevanceResponseFormat(pydantic.BaseModel):
    answer_relevance_score: float
    reason: str


class AnswerRelevance(base_metric.BaseMetric):
    """
    A metric that evaluates the relevance of an answer to a given input using an LLM.
@@ -77,7 +83,9 @@ def score(
            (between 0.0 and 1.0) and a reason for the score.
        """
        llm_query = template.generate_query(input=input, output=output, context=context)
        model_output = self._model.generate_string(input=llm_query)
        model_output = self._model.generate_string(
            input=llm_query, response_format=AnswerRelevanceResponseFormat
        )

        return self._parse_model_output(model_output)

@@ -100,20 +108,22 @@ async def ascore(
            score_result.ScoreResult: A ScoreResult object with the answer relevance score and reason.
        """
        llm_query = template.generate_query(input=input, output=output, context=context)
        model_output = await self._model.agenerate_string(input=llm_query)
        model_output = await self._model.agenerate_string(
            input=llm_query, response_format=AnswerRelevanceResponseFormat
        )

        return self._parse_model_output(model_output)

    def _parse_model_output(self, content: str) -> score_result.ScoreResult:
        try:
            dict_content = json.loads(content)
            score: float = dict_content[template.VERDICT_KEY]
            score: float = dict_content["answer_relevance_score"]

            if not (0.0 <= score <= 1.0):
                score = 0.5

            return score_result.ScoreResult(
                name=self.name, value=score, reason=dict_content[template.REASON_KEY]
                name=self.name, value=score, reason=dict_content["reason"]
            )
        except Exception:
            raise exceptions.MetricComputationError(
@@ -1,16 +1,12 @@
from typing import List, TypedDict


VERDICT_KEY = "relevance_score"
REASON_KEY = "reason"


class FewShotExampleAnswerRelevance(TypedDict):
    title: str
    input: str
    output: str
    context: List[str]
    relevance_score: float
    answer_relevance_score: float
    reason: str


@@ -23,7 +19,7 @@ class FewShotExampleAnswerRelevance(TypedDict):
            "France is a country in Europe.",
            "Paris is known for its iconic Eiffel Tower.",
        ],
        "relevance_score": 0.2,
        "answer_relevance_score": 0.2,
        "reason": "The answer provides information about the Eiffel Tower, which is related to France, but fails to address the specific question about the capital city. It doesn't directly answer the user's query, resulting in low relevance.",
    },
    {
@@ -34,7 +30,7 @@
            "France is a country in Europe.",
            "Paris is the capital and largest city of France.",
        ],
        "relevance_score": 0.6,
        "answer_relevance_score": 0.6,
        "reason": "The answer mentions Paris, which is the correct capital, but it's presented as just one of many cities rather than explicitly stating it's the capital. The response is partially relevant but lacks directness in addressing the specific question.",
    },
    {
@@ -45,7 +41,7 @@
            "France is a country in Europe.",
            "Paris is the capital and largest city of France.",
        ],
        "relevance_score": 0.9,
        "answer_relevance_score": 0.9,
        "reason": "The answer directly and correctly identifies Paris as the capital of France, which is highly relevant to the user's question. It also provides additional context about the Eiffel Tower, which aligns with the provided context. The response is comprehensive and relevant, though slightly more detailed than necessary, preventing a perfect score.",
    },
]
@@ -66,8 +62,8 @@ def generate_query(
            f"- **Result:**\n"
            f" ```json\n"
            f" {{\n"
            f" \"{VERDICT_KEY}\": {example['relevance_score']},\n"
            f" \"{REASON_KEY}\": \"{example['reason']}\"\n"
            f" \"answer_relevance_score\": {example['answer_relevance_score']},\n"
            f" \"reason\": \"{example['reason']}\"\n"
            f" }}\n"
            f" ```"
            for i, example in enumerate(few_shot_examples)
@@ -102,20 +98,20 @@ def generate_query(
3.2. JUSTIFY THE SCORE WITH A BRIEF EXPLANATION THAT HIGHLIGHTS THE STRENGTHS OR WEAKNESSES OF THE ANSWER.
4. **Generating the JSON Output:**
4.1. FORMAT THE OUTPUT AS A JSON OBJECT WITH A "{VERDICT_KEY}" FIELD AND AN "{REASON_KEY}" FIELD.
4.1. FORMAT THE OUTPUT AS A JSON OBJECT WITH A "answer_relevance_score" FIELD AND AN "reason" FIELD.
4.2. ENSURE THE SCORE IS A FLOATING-POINT NUMBER BETWEEN 0.0 AND 1.0.
###WHAT NOT TO DO###
- DO NOT GIVE A SCORE WITHOUT FULLY ANALYZING BOTH THE CONTEXT AND THE USER INPUT.
- AVOID SCORES THAT DO NOT MATCH THE EXPLANATION PROVIDED.
- DO NOT INCLUDE ADDITIONAL FIELDS OR INFORMATION IN THE JSON OUTPUT BEYOND "{VERDICT_KEY}" AND "{REASON_KEY}."
- DO NOT INCLUDE ADDITIONAL FIELDS OR INFORMATION IN THE JSON OUTPUT BEYOND "answer_relevance_score" AND "reason."
- NEVER ASSIGN A PERFECT SCORE UNLESS THE ANSWER IS FULLY RELEVANT AND FREE OF ANY IRRELEVANT INFORMATION.
###EXAMPLE OUTPUT FORMAT###
{{
"{VERDICT_KEY}": 0.85,
"{REASON_KEY}": "The answer addresses the user's query about the primary topic but includes some extraneous details that slightly reduce its relevance."
"answer_relevance_score": 0.85,
"reason": "The answer addresses the user's query about the primary topic but includes some extraneous details that slightly reduce its relevance."
}}
###FEW-SHOT EXAMPLES###
@@ -1,7 +1,7 @@
import json
import logging
from typing import Any, List, Optional, Union

import pydantic
from opik import logging_messages
from opik.evaluation.metrics import base_metric, score_result
from opik.evaluation.models import base_model, models_factory
@@ -12,6 +12,11 @@
LOGGER = logging.getLogger(__name__)


class ContextPrecisionResponseFormat(pydantic.BaseModel):
    context_precision_score: float
    reason: str


class ContextPrecision(base_metric.BaseMetric):
    """
    A metric that evaluates the context precision of an input-output pair using an LLM.
@@ -87,7 +92,9 @@ def score(
            context=context,
            few_shot_examples=self.few_shot_examples,
        )
        model_output = self._model.generate_string(input=llm_query)
        model_output = self._model.generate_string(
            input=llm_query, response_format=ContextPrecisionResponseFormat
        )

        return self._parse_model_output(model_output)

@@ -122,20 +129,22 @@ async def ascore(
            context=context,
            few_shot_examples=self.few_shot_examples,
        )
        model_output = await self._model.agenerate_string(input=llm_query)
        model_output = await self._model.agenerate_string(
            input=llm_query, response_format=ContextPrecisionResponseFormat
        )

        return self._parse_model_output(model_output)

    def _parse_model_output(self, content: str) -> score_result.ScoreResult:
        try:
            dict_content = json.loads(content)
            score: float = dict_content[template.VERDICT_KEY]
            score: float = dict_content["context_precision_score"]

            if not (0.0 <= score <= 1.0):
                score = 0.5

            return score_result.ScoreResult(
                name=self.name, value=score, reason=dict_content[template.REASON_KEY]
                name=self.name, value=score, reason=dict_content["reason"]
            )
        except Exception:
            raise exceptions.MetricComputationError(
@@ -1,8 +1,5 @@
from typing import List, TypedDict

VERDICT_KEY = "context_precision_score"
REASON_KEY = "reason"


class FewShotExampleContextPrecision(TypedDict):
    title: str
@@ -62,8 +59,8 @@ def generate_query(
            f"- **Result:**\n"
            f" ```json\n"
            f" {{\n"
            f" \"{VERDICT_KEY}\": {example['context_precision_score']},\n"
            f" \"{REASON_KEY}\": \"{example['reason']}\"\n"
            f" \"context_precision_score\": {example['context_precision_score']},\n"
            f" \"reason\": \"{example['reason']}\"\n"
            f" }}\n"
            f" ```"
            for i, example in enumerate(few_shot_examples)
@@ -82,19 +79,19 @@ def generate_query(
###SCALE FOR CONTEXT PRECISION METRIC (0.0 - 1.0)###
- **0.0:** COMPLETELY INACCURATE The LLM's answer is entirely off-topic, irrelevant, or incorrect based on the context and expected answer.
- **0.2:** MOSTLY INACCURATE The answer contains significant errors, misunderstanding of the context, or is largely irrelevant.
- **0.4:** PARTIALLY ACCURATE Some correct elements are present, but the answer is incomplete or partially misaligned with the context and expected answer.
- **0.6:** MOSTLY ACCURATE The answer is generally correct and relevant but may contain minor errors or lack complete precision in aligning with the expected answer.
- **0.8:** HIGHLY ACCURATE The answer is very close to the expected answer, with only minor discrepancies that do not significantly impact the overall correctness.
- **1.0:** PERFECTLY ACCURATE The LLM's answer matches the expected answer precisely, with full adherence to the context and no errors.
- **0.0:** COMPLETELY INACCURATE - The LLM's answer is entirely off-topic, irrelevant, or incorrect based on the context and expected answer.
- **0.2:** MOSTLY INACCURATE - The answer contains significant errors, misunderstanding of the context, or is largely irrelevant.
- **0.4:** PARTIALLY ACCURATE - Some correct elements are present, but the answer is incomplete or partially misaligned with the context and expected answer.
- **0.6:** MOSTLY ACCURATE - The answer is generally correct and relevant but may contain minor errors or lack complete precision in aligning with the expected answer.
- **0.8:** HIGHLY ACCURATE - The answer is very close to the expected answer, with only minor discrepancies that do not significantly impact the overall correctness.
- **1.0:** PERFECTLY ACCURATE - The LLM's answer matches the expected answer precisely, with full adherence to the context and no errors.
2. **PROVIDE A REASON FOR THE SCORE:**
- **JUSTIFY** why the specific score was given, considering the alignment with context, accuracy, relevance, and completeness.
3. **RETURN THE RESULT IN A JSON FORMAT** as follows:
- `"{VERDICT_KEY}"`: The score between 0.0 and 1.0.
- `"{REASON_KEY}"`: A detailed explanation of why the score was assigned.
- `"context_precision_score"`: The score between 0.0 and 1.0.
- `"reason"`: A detailed explanation of why the score was assigned.
###WHAT NOT TO DO###
- **DO NOT** assign a high score to answers that are off-topic or irrelevant, even if they contain some correct information.
@@ -1,6 +1,7 @@
import json
import logging
from typing import Any, List, Optional, Union
import pydantic

from opik import logging_messages
from opik.evaluation.metrics import base_metric, score_result
@@ -12,6 +13,11 @@
LOGGER = logging.getLogger(__name__)


class ContextRecallResponseFormat(pydantic.BaseModel):
    context_recall_score: float
    reason: str


class ContextRecall(base_metric.BaseMetric):
    """
    A metric that evaluates the context recall of an input-output pair using an LLM.
@@ -85,7 +91,9 @@ def score(
            context=context,
            few_shot_examples=self.few_shot_examples,
        )
        model_output = self._model.generate_string(input=llm_query)
        model_output = self._model.generate_string(
            input=llm_query, response_format=ContextRecallResponseFormat
        )

        return self._parse_model_output(model_output)

@@ -120,20 +128,22 @@ async def ascore(
            context=context,
            few_shot_examples=self.few_shot_examples,
        )
        model_output = await self._model.agenerate_string(input=llm_query)
        model_output = await self._model.agenerate_string(
            input=llm_query, response_format=ContextRecallResponseFormat
        )

        return self._parse_model_output(model_output)

    def _parse_model_output(self, content: str) -> score_result.ScoreResult:
        try:
            dict_content = json.loads(content)
            score: float = dict_content[template.VERDICT_KEY]
            score: float = dict_content["context_recall_score"]

            if not (0.0 <= score <= 1.0):
                score = 0.5

            return score_result.ScoreResult(
                name=self.name, value=score, reason=dict_content[template.REASON_KEY]
                name=self.name, value=score, reason=dict_content["reason"]
            )
        except Exception:
            raise exceptions.MetricComputationError(