Fix llm-as-judge implementation
binkjakub committed Aug 12, 2024
1 parent f29864f commit 7824769
Showing 1 changed file with 51 additions and 40 deletions.
91 changes: 51 additions & 40 deletions juddges/evaluation/eval_structured_llm_judge.py
@@ -5,32 +5,38 @@
 from juddges.evaluation.eval_structured import StructuredEvaluatorBase
 from juddges.evaluation.parse import EMPTY_ANSWER
 
 # TODO: might be a configurable prompt in future
 # Credit: https://github.com/openai/evals
 PROMPT = """
-You are professional assistant comparing a submitted Answer to a Reference written in Polish.
-Assess correctness of the Answer with one of the following options:
-- (Subset) The submitted Answer is a subset, i.e., contains part of information of the Reference and is fully consistent with it.
-- (Superset) The submitted Answer is a superset, i.e., contains all and some extra information of the Reference and is fully consistent with it.
-- (Correct) The submitted Answer contains all the same details as the Reference.
-- (Disagreement) There is a disagreement, either full or partial, between the submitted Answer and the Reference.
+You are comparing the extracted information from a submission to the expert-provided information on a given text in Polish. Here is the data:
 [BEGIN DATA]
 ************
-[Reference]: {gold}
+[Expert Extraction]: {gold}
 ************
-[Answer]: {answer}
+[Submission Extraction]: {answer}
 ************
 [END DATA]
-Format your judgment as only a single word in parentheses, e.g., "(Superset)"
+Compare the factual content of the extracted information with the expert-provided information. Ignore any minor differences in style, grammar, punctuation, or abbreviations.
+The extracted information may either be a subset or superset of the expert extraction, or it may conflict with it. Determine which case applies. Assess the extraction by selecting one of the following options:
+(Subset) The extracted information is a subset, i.e., contains part of the expert-provided information and is fully consistent with it.
+(Superset) The extracted information is a superset, i.e., contains all and some extra information of the expert-provided information and is fully consistent with it.
+(Correct) The extracted information contains all the same details as the expert-provided information.
+(Disagreement) There is a disagreement, either full or partial, between the extracted information and the expert-provided information.
+Format your answer as only a single word in parentheses, e.g., "(Superset)".
 """
 
 INVALID_JUDGMENT = "(non-evaluable)"
+CORRECT_JUDGEMENT = "(Correct)"
+MISSING_ANSWER = "(empty-answer)"
 allowed_answers = [
     "(Subset)",
     "(Superset)",
-    "(Correct)",
+    CORRECT_JUDGEMENT,
     "(Disagreement)",
     INVALID_JUDGMENT,
+    MISSING_ANSWER,
 ]
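For readers who want to exercise the judge prompt above in isolation, here is a minimal sketch of a single judge call against the constants defined in this file. The helper name judge_once and the gpt-4o-mini model choice are illustrative assumptions, not code from this commit, and the import assumes the juddges package is installed so the module is importable.

import asyncio  # noqa: F401  (not needed here; the evaluator itself runs synchronously)

from openai import OpenAI

from juddges.evaluation.eval_structured_llm_judge import INVALID_JUDGMENT, PROMPT, allowed_answers

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment


def judge_once(gold: str, answer: str, model: str = "gpt-4o-mini") -> str:
    """Ask the judge model for a single-word verdict and bucket unexpected output."""
    response = client.chat.completions.create(
        model=model,  # illustrative model; the evaluator uses self.model_name
        messages=[{"role": "user", "content": PROMPT.format(gold=gold, answer=answer)}],
        temperature=0.0,
        n=1,
    )
    verdict = (response.choices[0].message.content or "").strip()
    # Anything outside the expected label set counts as non-evaluable.
    return verdict if verdict in allowed_answers else INVALID_JUDGMENT


# Example: an answer missing one detail should ideally come back as "(Subset)".
# judge_once(gold="Sąd Okręgowy w Warszawie, 2021", answer="Sąd Okręgowy w Warszawie")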


@@ -61,44 +67,49 @@ def evaluate(
         }
 
     def compute_metrics(self, preds: list[str], golds: list[str]) -> dict[str, float]:
         """Assesses single field prediction either by comparing raw string or using LLM-as-judge otherwise."""
         assert len(golds) == len(preds)
-        results = []
+        llm_assessments = []
         num_llm_evals = 0
-        iter_golds_preds = enumerate(zip(golds, preds))
-        with tqdm(iter_golds_preds, total=len(golds), leave=False, desc="Evaluating") as pbar:
-            for i, (g, p) in pbar:
-                if p == EMPTY_ANSWER:
-                    results.append("(incorrect)")
-                elif p == g:
-                    results.append(1)
-                else:
-                    num_llm_evals += 1
-                    # TODO: Further can be improved with asynchronous requests
-                    response = self.oai_client.chat.completions.create(
-                        model=self.model_name,
-                        messages=[{"role": "user", "content": PROMPT.format(gold=g, answer=p)}],
-                        temperature=0.0,
-                        n=1,
-                    )
-
-                    response_msg = response.choices[0].message.content
-                    try:
-                        results.append(response_msg)
-                    except KeyError:
-                        print(f"Unexpected response: {response_msg}")
-                        results.append(-1)
-                pbar.set_postfix({"llm_calls": f"{num_llm_evals}/{i + 1}"})
+        enum_golds_preds = enumerate(zip(golds, preds))
+        for i, (ans_gold, ans_pred) in (
+            pbar := tqdm(enum_golds_preds, total=len(golds), leave=False, desc="Evaluating")
+        ):
+            if ans_pred == EMPTY_ANSWER:
+                llm_assessments.append(MISSING_ANSWER)
+            elif ans_pred == ans_gold:
+                llm_assessments.append(CORRECT_JUDGEMENT)
+            else:
+                num_llm_evals += 1
+                # TODO: Further can be improved with asynchronous requests
+                response = self.oai_client.chat.completions.create(
+                    model=self.model_name,
+                    messages=[
+                        {"role": "user", "content": PROMPT.format(gold=ans_gold, answer=ans_pred)}
+                    ],
+                    temperature=0.0,
+                    n=1,
+                )
+
+                response_msg = response.choices[0].message.content
+                llm_assessments.append(response_msg)
+
+            pbar.set_postfix({"llm_calls": f"{num_llm_evals}/{i + 1}"})
+
+        results_summary = self.answers_to_metrics(llm_assessments)
+
+        return results_summary
+
+    @staticmethod
+    def answers_to_metrics(llm_assessments: list[str]) -> dict[str, float]:
         results_summary = dict.fromkeys(allowed_answers, 0)
-        for judgement in results:
+        for judgement in llm_assessments:
             try:
                 results_summary[judgement] += 1
             except KeyError:
                 results_summary[INVALID_JUDGMENT] += 1
 
-        results_summary = {name: val / len(results) for name, val in results_summary.items()}
-
-        return results_summary
+        return {name: val / len(llm_assessments) for name, val in results_summary.items()}
 
 
 if __name__ == "__main__":
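The TODO about asynchronous requests is left in place by this commit. Below is a sketch of one way it could be addressed with openai's AsyncOpenAI client and a bounded semaphore; the function names, the concurrency cap, and the model argument are assumptions for illustration, not part of the repository.

import asyncio

from openai import AsyncOpenAI

from juddges.evaluation.eval_structured_llm_judge import PROMPT


async def judge_batch(
    golds: list[str], preds: list[str], model: str, max_concurrency: int = 8
) -> list[str]:
    """Send all judge requests concurrently, capped by a semaphore, and return raw verdicts."""
    client = AsyncOpenAI()  # assumes OPENAI_API_KEY is set in the environment
    semaphore = asyncio.Semaphore(max_concurrency)

    async def judge_one(gold: str, answer: str) -> str:
        async with semaphore:
            response = await client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": PROMPT.format(gold=gold, answer=answer)}],
                temperature=0.0,
                n=1,
            )
        return response.choices[0].message.content or ""

    # gather preserves input order, so verdicts line up with (golds, preds) pairs.
    return await asyncio.gather(*(judge_one(g, p) for g, p in zip(golds, preds)))


# Hypothetical usage: verdicts = asyncio.run(judge_batch(golds, preds, model="gpt-4o-mini"))
# The resulting list could then be aggregated with answers_to_metrics, like llm_assessments above.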
