Enforce structured output for LLM as a Judge metrics (#503)
* Enforce structured output for LLM as a Judge metrics

* Update following review
jverre authored Oct 30, 2024
1 parent 103ad95 commit a2ce4a7
Showing 16 changed files with 288 additions and 152 deletions.
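
The commit applies one pattern across the metrics below: each metric declares a pydantic model describing the judge's expected reply, passes it as response_format when calling the model, and reads the reply back using those fixed keys instead of the old module-level VERDICT_KEY/REASON_KEY template constants. A minimal sketch of that flow, where JudgeResponseFormat, FakeJudgeModel, and parse_judge_output are illustrative stand-ins rather than opik SDK names:

import json

import pydantic


class JudgeResponseFormat(pydantic.BaseModel):
    # Mirrors the response-format classes added in this commit
    # (e.g. AnswerRelevanceResponseFormat): one score plus a reason.
    score: float
    reason: str


class FakeJudgeModel:
    # Stand-in for the SDK's model wrapper; a real model would be constrained
    # to the schema passed via response_format.
    def generate_string(self, input: str, response_format: type) -> str:
        return json.dumps({"score": 0.9, "reason": "Directly answers the question."})


def parse_judge_output(content: str) -> JudgeResponseFormat:
    # Construction fails loudly if the reply strays from the declared schema,
    # which is what the metrics' _parse_model_output methods rely on.
    return JudgeResponseFormat(**json.loads(content))


model = FakeJudgeModel()
raw = model.generate_string(
    input="Evaluate the answer against the question...",
    response_format=JudgeResponseFormat,
)
print(parse_judge_output(raw))

Pinning the keys in a response model, rather than interpolating shared constants into the prompt, is what lets the parsers in the hunks below read literal key names such as "answer_relevance_score" and "reason".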
86 changes: 86 additions & 0 deletions sdks/python/examples/metrics.py
@@ -0,0 +1,86 @@
from opik.evaluation import metrics

# Hallucination metric example
print("\n\nHallucination metric example:")

hallucination_metric = metrics.Hallucination()

hallucination_score = hallucination_metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
)
print("hallucination_score:", hallucination_score)

# G-Eval metric example
print("\n\nG-Eval metric example:")

g_eval_metric = metrics.GEval(
    task_introduction="You are an expert judge tasked with evaluating the faithfulness of an AI-generated answer to the given context.",
    evaluation_criteria="The OUTPUT must not introduce new information beyond what's provided in the CONTEXT.",
)

g_eval_score = g_eval_metric.score(
    input={
        "OUTPUT": "What is the capital of France?",
        "CONTEXT": [
            "France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."
        ],
    }
)
print("g_eval_score:", g_eval_score)

# Moderation metric example
print("\n\nModeration metric example:")

moderation_metric = metrics.Moderation()

moderation_score = moderation_metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
    context=[
        "France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."
    ],
)

print("moderation_score:", moderation_score)

# Answer Relevance metric example
print("\n\nAnswer Relevance metric example:")

answer_relevance_metric = metrics.AnswerRelevance()
answer_relevance_score = answer_relevance_metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
    context=[
        "France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."
    ],
)
print("answer_relevance_score:", answer_relevance_score)

# ContextPrecision metric example
print("\n\nContextPrecision metric example:")

context_precision_metric = metrics.ContextPrecision()
context_precision_score = context_precision_metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
    expected_output="Paris",
    context=[
        "France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."
    ],
)
print("context_precision_score:", context_precision_score)

# ContextRecall metric example
print("\n\nContextRecall metric example:")

context_recall_metric = metrics.ContextRecall()
context_recall_score = context_recall_metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris. It is famous for its iconic Eiffel Tower and rich cultural heritage.",
    expected_output="Paris",
    context=[
        "France is a country in Western Europe. Its capital is Paris, which is known for landmarks like the Eiffel Tower."
    ],
)
print("context_recall_score:", context_recall_score)
@@ -1,6 +1,7 @@
import json
import logging
from typing import Any, List, Optional, Union
import pydantic

from opik import logging_messages
from opik.evaluation.metrics import base_metric, score_result
@@ -12,6 +13,11 @@
LOGGER = logging.getLogger(__name__)


class AnswerRelevanceResponseFormat(pydantic.BaseModel):
    answer_relevance_score: float
    reason: str


class AnswerRelevance(base_metric.BaseMetric):
    """
    A metric that evaluates the relevance of an answer to a given input using an LLM.
@@ -77,7 +83,9 @@ def score(
            (between 0.0 and 1.0) and a reason for the score.
        """
        llm_query = template.generate_query(input=input, output=output, context=context)
        model_output = self._model.generate_string(input=llm_query)
        model_output = self._model.generate_string(
            input=llm_query, response_format=AnswerRelevanceResponseFormat
        )

        return self._parse_model_output(model_output)

@@ -100,20 +108,22 @@ async def ascore(
            score_result.ScoreResult: A ScoreResult object with the answer relevance score and reason.
        """
        llm_query = template.generate_query(input=input, output=output, context=context)
        model_output = await self._model.agenerate_string(input=llm_query)
        model_output = await self._model.agenerate_string(
            input=llm_query, response_format=AnswerRelevanceResponseFormat
        )

        return self._parse_model_output(model_output)

    def _parse_model_output(self, content: str) -> score_result.ScoreResult:
        try:
            dict_content = json.loads(content)
            score: float = dict_content[template.VERDICT_KEY]
            score: float = dict_content["answer_relevance_score"]

            if not (0.0 <= score <= 1.0):
                score = 0.5

            return score_result.ScoreResult(
                name=self.name, value=score, reason=dict_content[template.REASON_KEY]
                name=self.name, value=score, reason=dict_content["reason"]
            )
        except Exception:
            raise exceptions.MetricComputationError(
@@ -1,16 +1,12 @@
from typing import List, TypedDict


VERDICT_KEY = "relevance_score"
REASON_KEY = "reason"


class FewShotExampleAnswerRelevance(TypedDict):
    title: str
    input: str
    output: str
    context: List[str]
    relevance_score: float
    answer_relevance_score: float
    reason: str


@@ -23,7 +19,7 @@ class FewShotExampleAnswerRelevance(TypedDict):
            "France is a country in Europe.",
            "Paris is known for its iconic Eiffel Tower.",
        ],
        "relevance_score": 0.2,
        "answer_relevance_score": 0.2,
        "reason": "The answer provides information about the Eiffel Tower, which is related to France, but fails to address the specific question about the capital city. It doesn't directly answer the user's query, resulting in low relevance.",
    },
    {
@@ -34,7 +30,7 @@
            "France is a country in Europe.",
            "Paris is the capital and largest city of France.",
        ],
        "relevance_score": 0.6,
        "answer_relevance_score": 0.6,
        "reason": "The answer mentions Paris, which is the correct capital, but it's presented as just one of many cities rather than explicitly stating it's the capital. The response is partially relevant but lacks directness in addressing the specific question.",
    },
    {
@@ -45,7 +41,7 @@
            "France is a country in Europe.",
            "Paris is the capital and largest city of France.",
        ],
        "relevance_score": 0.9,
        "answer_relevance_score": 0.9,
        "reason": "The answer directly and correctly identifies Paris as the capital of France, which is highly relevant to the user's question. It also provides additional context about the Eiffel Tower, which aligns with the provided context. The response is comprehensive and relevant, though slightly more detailed than necessary, preventing a perfect score.",
    },
]
@@ -66,8 +62,8 @@ def generate_query(
            f"- **Result:**\n"
            f" ```json\n"
            f" {{\n"
            f" \"{VERDICT_KEY}\": {example['relevance_score']},\n"
            f" \"{REASON_KEY}\": \"{example['reason']}\"\n"
            f" \"answer_relevance_score\": {example['answer_relevance_score']},\n"
            f" \"reason\": \"{example['reason']}\"\n"
            f" }}\n"
            f" ```"
            for i, example in enumerate(few_shot_examples)
@@ -102,20 +98,20 @@ def generate_query(
3.2. JUSTIFY THE SCORE WITH A BRIEF EXPLANATION THAT HIGHLIGHTS THE STRENGTHS OR WEAKNESSES OF THE ANSWER.
4. **Generating the JSON Output:**
4.1. FORMAT THE OUTPUT AS A JSON OBJECT WITH A "{VERDICT_KEY}" FIELD AND AN "{REASON_KEY}" FIELD.
4.1. FORMAT THE OUTPUT AS A JSON OBJECT WITH A "answer_relevance_score" FIELD AND AN "reason" FIELD.
4.2. ENSURE THE SCORE IS A FLOATING-POINT NUMBER BETWEEN 0.0 AND 1.0.
###WHAT NOT TO DO###
- DO NOT GIVE A SCORE WITHOUT FULLY ANALYZING BOTH THE CONTEXT AND THE USER INPUT.
- AVOID SCORES THAT DO NOT MATCH THE EXPLANATION PROVIDED.
- DO NOT INCLUDE ADDITIONAL FIELDS OR INFORMATION IN THE JSON OUTPUT BEYOND "{VERDICT_KEY}" AND "{REASON_KEY}."
- DO NOT INCLUDE ADDITIONAL FIELDS OR INFORMATION IN THE JSON OUTPUT BEYOND "answer_relevance_score" AND "reason."
- NEVER ASSIGN A PERFECT SCORE UNLESS THE ANSWER IS FULLY RELEVANT AND FREE OF ANY IRRELEVANT INFORMATION.
###EXAMPLE OUTPUT FORMAT###
{{
"{VERDICT_KEY}": 0.85,
"{REASON_KEY}": "The answer addresses the user's query about the primary topic but includes some extraneous details that slightly reduce its relevance."
"answer_relevance_score": 0.85,
"reason": "The answer addresses the user's query about the primary topic but includes some extraneous details that slightly reduce its relevance."
}}
###FEW-SHOT EXAMPLES###
@@ -1,7 +1,7 @@
import json
import logging
from typing import Any, List, Optional, Union

import pydantic
from opik import logging_messages
from opik.evaluation.metrics import base_metric, score_result
from opik.evaluation.models import base_model, models_factory
@@ -12,6 +12,11 @@
LOGGER = logging.getLogger(__name__)


class ContextPrecisionResponseFormat(pydantic.BaseModel):
    context_precision_score: float
    reason: str


class ContextPrecision(base_metric.BaseMetric):
    """
    A metric that evaluates the context precision of an input-output pair using an LLM.
@@ -87,7 +92,9 @@ def score(
            context=context,
            few_shot_examples=self.few_shot_examples,
        )
        model_output = self._model.generate_string(input=llm_query)
        model_output = self._model.generate_string(
            input=llm_query, response_format=ContextPrecisionResponseFormat
        )

        return self._parse_model_output(model_output)

@@ -122,20 +129,22 @@ async def ascore(
            context=context,
            few_shot_examples=self.few_shot_examples,
        )
        model_output = await self._model.agenerate_string(input=llm_query)
        model_output = await self._model.agenerate_string(
            input=llm_query, response_format=ContextPrecisionResponseFormat
        )

        return self._parse_model_output(model_output)

    def _parse_model_output(self, content: str) -> score_result.ScoreResult:
        try:
            dict_content = json.loads(content)
            score: float = dict_content[template.VERDICT_KEY]
            score: float = dict_content["context_precision_score"]

            if not (0.0 <= score <= 1.0):
                score = 0.5

            return score_result.ScoreResult(
                name=self.name, value=score, reason=dict_content[template.REASON_KEY]
                name=self.name, value=score, reason=dict_content["reason"]
            )
        except Exception:
            raise exceptions.MetricComputationError(
@@ -1,8 +1,5 @@
from typing import List, TypedDict

VERDICT_KEY = "context_precision_score"
REASON_KEY = "reason"


class FewShotExampleContextPrecision(TypedDict):
    title: str
@@ -62,8 +59,8 @@ def generate_query(
            f"- **Result:**\n"
            f" ```json\n"
            f" {{\n"
            f" \"{VERDICT_KEY}\": {example['context_precision_score']},\n"
            f" \"{REASON_KEY}\": \"{example['reason']}\"\n"
            f" \"context_precision_score\": {example['context_precision_score']},\n"
            f" \"reason\": \"{example['reason']}\"\n"
            f" }}\n"
            f" ```"
            for i, example in enumerate(few_shot_examples)
@@ -82,19 +79,19 @@ def generate_query(
###SCALE FOR CONTEXT PRECISION METRIC (0.0 - 1.0)###
- **0.0:** COMPLETELY INACCURATE The LLM's answer is entirely off-topic, irrelevant, or incorrect based on the context and expected answer.
- **0.2:** MOSTLY INACCURATE The answer contains significant errors, misunderstanding of the context, or is largely irrelevant.
- **0.4:** PARTIALLY ACCURATE Some correct elements are present, but the answer is incomplete or partially misaligned with the context and expected answer.
- **0.6:** MOSTLY ACCURATE The answer is generally correct and relevant but may contain minor errors or lack complete precision in aligning with the expected answer.
- **0.8:** HIGHLY ACCURATE The answer is very close to the expected answer, with only minor discrepancies that do not significantly impact the overall correctness.
- **1.0:** PERFECTLY ACCURATE The LLM's answer matches the expected answer precisely, with full adherence to the context and no errors.
- **0.0:** COMPLETELY INACCURATE - The LLM's answer is entirely off-topic, irrelevant, or incorrect based on the context and expected answer.
- **0.2:** MOSTLY INACCURATE - The answer contains significant errors, misunderstanding of the context, or is largely irrelevant.
- **0.4:** PARTIALLY ACCURATE - Some correct elements are present, but the answer is incomplete or partially misaligned with the context and expected answer.
- **0.6:** MOSTLY ACCURATE - The answer is generally correct and relevant but may contain minor errors or lack complete precision in aligning with the expected answer.
- **0.8:** HIGHLY ACCURATE - The answer is very close to the expected answer, with only minor discrepancies that do not significantly impact the overall correctness.
- **1.0:** PERFECTLY ACCURATE - The LLM's answer matches the expected answer precisely, with full adherence to the context and no errors.
2. **PROVIDE A REASON FOR THE SCORE:**
- **JUSTIFY** why the specific score was given, considering the alignment with context, accuracy, relevance, and completeness.
3. **RETURN THE RESULT IN A JSON FORMAT** as follows:
- `"{VERDICT_KEY}"`: The score between 0.0 and 1.0.
- `"{REASON_KEY}"`: A detailed explanation of why the score was assigned.
- `"context_precision_score"`: The score between 0.0 and 1.0.
- `"reason"`: A detailed explanation of why the score was assigned.
###WHAT NOT TO DO###
- **DO NOT** assign a high score to answers that are off-topic or irrelevant, even if they contain some correct information.
@@ -1,6 +1,7 @@
import json
import logging
from typing import Any, List, Optional, Union
import pydantic

from opik import logging_messages
from opik.evaluation.metrics import base_metric, score_result
@@ -12,6 +13,11 @@
LOGGER = logging.getLogger(__name__)


class ContextRecallResponseFormat(pydantic.BaseModel):
    context_recall_score: float
    reason: str


class ContextRecall(base_metric.BaseMetric):
    """
    A metric that evaluates the context recall of an input-output pair using an LLM.
@@ -85,7 +91,9 @@ def score(
            context=context,
            few_shot_examples=self.few_shot_examples,
        )
        model_output = self._model.generate_string(input=llm_query)
        model_output = self._model.generate_string(
            input=llm_query, response_format=ContextRecallResponseFormat
        )

        return self._parse_model_output(model_output)

@@ -120,20 +128,22 @@ async def ascore(
            context=context,
            few_shot_examples=self.few_shot_examples,
        )
        model_output = await self._model.agenerate_string(input=llm_query)
        model_output = await self._model.agenerate_string(
            input=llm_query, response_format=ContextRecallResponseFormat
        )

        return self._parse_model_output(model_output)

    def _parse_model_output(self, content: str) -> score_result.ScoreResult:
        try:
            dict_content = json.loads(content)
            score: float = dict_content[template.VERDICT_KEY]
            score: float = dict_content["context_recall_score"]

            if not (0.0 <= score <= 1.0):
                score = 0.5

            return score_result.ScoreResult(
                name=self.name, value=score, reason=dict_content[template.REASON_KEY]
                name=self.name, value=score, reason=dict_content["reason"]
            )
        except Exception:
            raise exceptions.MetricComputationError(