Fix llm-as-judge implementation
binkjakub committed Aug 12, 2024
1 parent f29864f commit 7824769
Showing 1 changed file with 51 additions and 40 deletions.
91 changes: 51 additions & 40 deletions juddges/evaluation/eval_structured_llm_judge.py
@@ -5,32 +5,38 @@
 from juddges.evaluation.eval_structured import StructuredEvaluatorBase
 from juddges.evaluation.parse import EMPTY_ANSWER
 
 # TODO: might be a configurable prompt in future
 # Credit: https://github.com/openai/evals
 PROMPT = """
-You are professional assistant comparing a submitted Answer to a Reference written in Polish.
-Assess correctness of the Answer with one of the following options:
-- (Subset) The submitted Answer is a subset, i.e., contains part of information of the Reference and is fully consistent with it.
-- (Superset) The submitted Answer is a superset, i.e., contains all and some extra information of the Reference and is fully consistent with it.
-- (Correct) The submitted Answer contains all the same details as the Reference.
-- (Disagreement) There is a disagreement, either full or partial, between the submitted Answer and the Reference.
+You are comparing the extracted information from a submission to the expert-provided information on a given text in Polish. Here is the data:
 [BEGIN DATA]
 ************
-[Reference]: {gold}
+[Expert Extraction]: {gold}
 ************
-[Answer]: {answer}
+[Submission Extraction]: {answer}
 ************
 [END DATA]
-Format your judgment as only a single word in parentheses, e.g., "(Superset)"
+Compare the factual content of the extracted information with the expert-provided information. Ignore any minor differences in style, grammar, punctuation, or abbreviations.
+The extracted information may either be a subset or superset of the expert extraction, or it may conflict with it. Determine which case applies. Assess the extraction by selecting one of the following options:
+(Subset) The extracted information is a subset, i.e., contains part of the expert-provided information and is fully consistent with it.
+(Superset) The extracted information is a superset, i.e., contains all and some extra information of the expert-provided information and is fully consistent with it.
+(Correct) The extracted information contains all the same details as the expert-provided information.
+(Disagreement) There is a disagreement, either full or partial, between the extracted information and the expert-provided information.
+Format your answer as only a single word in parentheses, e.g., "(Superset)".
 """
 
 INVALID_JUDGMENT = "(non-evaluable)"
+CORRECT_JUDGEMENT = "(Correct)"
+MISSING_ANSWER = "(empty-answer)"
 allowed_answers = [
     "(Subset)",
     "(Superset)",
-    "(Correct)",
+    CORRECT_JUDGEMENT,
     "(Disagreement)",
     INVALID_JUDGMENT,
+    MISSING_ANSWER,
 ]
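For readers who want to exercise the judge prompt above in isolation, here is a minimal sketch of a single judge call against the constants defined in this file. The helper name judge_once and the gpt-4o-mini model choice are illustrative assumptions, not code from this commit, and the import assumes the juddges package is installed so the module is importable.

import asyncio  # noqa: F401  (not needed here; the evaluator itself runs synchronously)

from openai import OpenAI

from juddges.evaluation.eval_structured_llm_judge import INVALID_JUDGMENT, PROMPT, allowed_answers

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment


def judge_once(gold: str, answer: str, model: str = "gpt-4o-mini") -> str:
    """Ask the judge model for a single-word verdict and bucket unexpected output."""
    response = client.chat.completions.create(
        model=model,  # illustrative model; the evaluator uses self.model_name
        messages=[{"role": "user", "content": PROMPT.format(gold=gold, answer=answer)}],
        temperature=0.0,
        n=1,
    )
    verdict = (response.choices[0].message.content or "").strip()
    # Anything outside the expected label set counts as non-evaluable.
    return verdict if verdict in allowed_answers else INVALID_JUDGMENT


# Example: an answer missing one detail should ideally come back as "(Subset)".
# judge_once(gold="Sąd Okręgowy w Warszawie, 2021", answer="Sąd Okręgowy w Warszawie")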


@@ -61,44 +67,49 @@ def evaluate(
         }
 
     def compute_metrics(self, preds: list[str], golds: list[str]) -> dict[str, float]:
         """Assesses single field prediction either by comparing raw string or using LLM-as-judge otherwise."""
         assert len(golds) == len(preds)
-        results = []
+        llm_assessments = []
         num_llm_evals = 0
-        iter_golds_preds = enumerate(zip(golds, preds))
-        with tqdm(iter_golds_preds, total=len(golds), leave=False, desc="Evaluating") as pbar:
-            for i, (g, p) in pbar:
-                if p == EMPTY_ANSWER:
-                    results.append("(incorrect)")
-                elif p == g:
-                    results.append(1)
-                else:
-                    num_llm_evals += 1
-                    # TODO: Further can be improved with asynchronous requests
-                    response = self.oai_client.chat.completions.create(
-                        model=self.model_name,
-                        messages=[{"role": "user", "content": PROMPT.format(gold=g, answer=p)}],
-                        temperature=0.0,
-                        n=1,
-                    )
-
-                    response_msg = response.choices[0].message.content
-                    try:
-                        results.append(response_msg)
-                    except KeyError:
-                        print(f"Unexpected response: {response_msg}")
-                        results.append(-1)
-                pbar.set_postfix({"llm_calls": f"{num_llm_evals}/{i + 1}"})
+        enum_golds_preds = enumerate(zip(golds, preds))
+        for i, (ans_gold, ans_pred) in (
+            pbar := tqdm(enum_golds_preds, total=len(golds), leave=False, desc="Evaluating")
+        ):
+            if ans_pred == EMPTY_ANSWER:
+                llm_assessments.append(MISSING_ANSWER)
+            elif ans_pred == ans_gold:
+                llm_assessments.append(CORRECT_JUDGEMENT)
+            else:
+                num_llm_evals += 1
+                # TODO: Further can be improved with asynchronous requests
+                response = self.oai_client.chat.completions.create(
+                    model=self.model_name,
+                    messages=[
+                        {"role": "user", "content": PROMPT.format(gold=ans_gold, answer=ans_pred)}
+                    ],
+                    temperature=0.0,
+                    n=1,
+                )
+
+                response_msg = response.choices[0].message.content
+                llm_assessments.append(response_msg)
+
+            pbar.set_postfix({"llm_calls": f"{num_llm_evals}/{i + 1}"})
+
+        results_summary = self.answers_to_metrics(llm_assessments)
+
+        return results_summary
+
+    @staticmethod
+    def answers_to_metrics(llm_assessments: list[str]) -> dict[str, float]:
         results_summary = dict.fromkeys(allowed_answers, 0)
-        for judgement in results:
+        for judgement in llm_assessments:
             try:
                 results_summary[judgement] += 1
             except KeyError:
                 results_summary[INVALID_JUDGMENT] += 1
 
-        results_summary = {name: val / len(results) for name, val in results_summary.items()}
-
-        return results_summary
+        return {name: val / len(llm_assessments) for name, val in results_summary.items()}
 
 
 if __name__ == "__main__":
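The TODO about asynchronous requests is left in place by this commit. Below is a sketch of one way it could be addressed with openai's AsyncOpenAI client and a bounded semaphore; the function names, the concurrency cap, and the model argument are assumptions for illustration, not part of the repository.

import asyncio

from openai import AsyncOpenAI

from juddges.evaluation.eval_structured_llm_judge import PROMPT


async def judge_batch(
    golds: list[str], preds: list[str], model: str, max_concurrency: int = 8
) -> list[str]:
    """Send all judge requests concurrently, capped by a semaphore, and return raw verdicts."""
    client = AsyncOpenAI()  # assumes OPENAI_API_KEY is set in the environment
    semaphore = asyncio.Semaphore(max_concurrency)

    async def judge_one(gold: str, answer: str) -> str:
        async with semaphore:
            response = await client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": PROMPT.format(gold=gold, answer=answer)}],
                temperature=0.0,
                n=1,
            )
        return response.choices[0].message.content or ""

    # gather preserves input order, so verdicts line up with (golds, preds) pairs.
    return await asyncio.gather(*(judge_one(g, p) for g, p in zip(golds, preds)))


# Hypothetical usage: verdicts = asyncio.run(judge_batch(golds, preds, model="gpt-4o-mini"))
# The resulting list could then be aggregated with answers_to_metrics, like llm_assessments above.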
