From a29d3e993021c3af73ede6358366ce90680d187b Mon Sep 17 00:00:00 2001
From: Merlin Kallenborn
Date: Wed, 15 May 2024 15:28:50 +0200
Subject: [PATCH] WIP: feat: Add incremental elo evaluation logic

TASK: IL-394
---
 src/documentation/elo_qa_eval.ipynb           |  15 +-
 .../evaluator/incremental_elo_evaluator.py    | 170 ++++++++++++++++++
 2 files changed, 180 insertions(+), 5 deletions(-)
 create mode 100644 src/intelligence_layer/evaluation/evaluation/evaluator/incremental_elo_evaluator.py

diff --git a/src/documentation/elo_qa_eval.ipynb b/src/documentation/elo_qa_eval.ipynb
index 714b3c628..0c55a33a9 100644
--- a/src/documentation/elo_qa_eval.ipynb
+++ b/src/documentation/elo_qa_eval.ipynb
@@ -378,9 +378,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
     "from intelligence_layer.evaluation import Aggregator\n",
-    "from intelligence_layer.evaluation.aggregation.elo_aggregation import MatchesAggregationLogic\n",
+    "from intelligence_layer.evaluation.aggregation.elo_aggregation import (\n",
+    "    MatchesAggregationLogic,\n",
+    ")\n",
     "\n",
     "aggregator = Aggregator(\n",
     "    evaluation_repository=evaluation_repository,\n",
@@ -509,18 +510,22 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (\n",
+    "    IncrementalEvaluator,\n",
+    ")\n",
+    "\n",
     "high_priority_runs = [\n",
     "    overview.id\n",
     "    for overview in run_repository.run_overviews()\n",
     "    if overview.description.startswith(\"New QA\")\n",
     "]\n",
     "\n",
-    "evaluator_missing_runs = Evaluator(\n",
+    "evaluator_missing_runs = IncrementalEvaluator(\n",
     "    dataset_repository=dataset_repository,\n",
     "    run_repository=run_repository,\n",
     "    evaluation_repository=evaluation_repository,\n",
     "    description=\"ELO QA evaluation for newly added model\",  # this description will be used later to query for specific evaluations\n",
-    "    evaluation_logic=EloEvaluationLogic(\n",
+    "    evaluation_logic=EloQaEvaluationLogic(\n",
     "        client=aa_client,\n",
     "        high_priority_runs=frozenset(high_priority_runs),\n",
     "    ),\n",
@@ -539,7 +544,7 @@
    "source": [
     "# ensure that for each example there are evaluated comparisons\n",
     "for example_evaluation in evaluation_repository.example_evaluations(\n",
-    "    new_evaluation_overview.id, Match\n",
+    "    new_evaluation_overview.id, Matches\n",
     "):\n",
     "    assert (\n",
     "        len(example_evaluation.result.matches) > 0\n",
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_elo_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_elo_evaluator.py
new file mode 100644
index 000000000..99cfb4038
--- /dev/null
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator/incremental_elo_evaluator.py
@@ -0,0 +1,170 @@
+import math
+from itertools import combinations
+from typing import Mapping, Sequence
+
+from aleph_alpha_client import Prompt
+from liquid import Template
+
+from intelligence_layer.core.detect_language import Language
+from intelligence_layer.core.model import CompleteInput, CompleteOutput, ControlModel
+from intelligence_layer.core.tracer.tracer import NoOpTracer, TaskSpan, Tracer
+from intelligence_layer.evaluation.dataset.domain import Example
+from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import (
+    ComparisonEvaluation,
+    EloGradingInput,
+    MatchOutcome,
+    Matches,
+)
+from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
+    IncrementalEvaluationLogic,
+)
+from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput
+from intelligence_layer.examples.qa.single_chunk_qa import (
+    QA_INSTRUCTIONS,
+    SingleChunkQaInput,
+    SingleChunkQaOutput,
+)
+
+
+class IncrementalEloQaEvaluationLogic(
+    IncrementalEvaluationLogic[
+        SingleChunkQaInput, SingleChunkQaOutput, SingleChunkQaOutput, Matches
+    ]
+):
+    """Incremental ELO evaluation logic for single-chunk QA outputs.
+
+    Builds pairwise matches between run outputs and lets a grader model decide
+    which answer is more correct. Pairs in which both runs were already part of
+    a previous evaluation are skipped.
+    """
+
+    INPUT_TEMPLATE = """
+Your task is to compare two answers to an instruction on one metric.
+
+Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
+
+The Instruction for the answers was:{instruction}
+
+Evaluation Procedure:
+1. Read both answers carefully and identify the main facts and details they present.
+2. Check if the answers contain any factual errors that are not supported by the instruction.
+3. Evaluate which answer is more correct.
+
+Answer A:{first_completion}
+
+Answer B:{second_completion}
+
+Which answer is more correct given the Instruction and Evaluation Procedure, Answer A or Answer B?
+
+Response: Answer """
+    VALUES = [
+        " A",
+        " B",
+    ]  # The space before the A and B is important due to tokenization
+
+    def __init__(
+        self,
+        model: ControlModel,
+        tracer: Tracer = NoOpTracer(),
+    ):
+        super().__init__()
+        self._model = model
+        self.tracer = tracer
+
+    def do_incremental_evaluate(
+        self,
+        example: Example[SingleChunkQaInput, SingleChunkQaOutput],
+        outputs: list[SuccessfulExampleOutput[SingleChunkQaOutput]],
+        already_evaluated_outputs: list[
+            list[SuccessfulExampleOutput[SingleChunkQaOutput]]
+        ],
+    ) -> Matches:
+        pairs = combinations(outputs, 2)
+
+        # Collect the run ids that were already compared in previous evaluations.
+        unique_pre_evaluated_runs: set[str] = set()
+        for pre_run_output in already_evaluated_outputs:
+            for current_output in pre_run_output:
+                unique_pre_evaluated_runs.add(current_output.run_id)
+
+        # Only grade pairs in which at least one run has not been evaluated yet.
+        return Matches(
+            comparison_evaluations=[
+                ComparisonEvaluation(
+                    first_player=player_a.run_id,
+                    second_player=player_b.run_id,
+                    outcome=self.grade(player_a, player_b, example),
+                )
+                for player_a, player_b in pairs
+                if not (
+                    player_a.run_id in unique_pre_evaluated_runs
+                    and player_b.run_id in unique_pre_evaluated_runs
+                )
+            ]
+        )
+
+    def grade(
+        self,
+        first: SuccessfulExampleOutput[SingleChunkQaOutput],
+        second: SuccessfulExampleOutput[SingleChunkQaOutput],
+        example: Example[SingleChunkQaInput, SingleChunkQaOutput],
+    ) -> MatchOutcome:
+        grading_input = self._create_grading_input(first, second, example)
+
+        return MatchOutcome(
+            self.do_run(
+                grading_input,
+                self.tracer.task_span(
+                    task_name="elo_qa_run_grader", input=grading_input
+                ),
+            )
+        )
+
+    def _create_grading_input(
+        self,
+        first: SuccessfulExampleOutput[SingleChunkQaOutput],
+        second: SuccessfulExampleOutput[SingleChunkQaOutput],
+        example: Example[SingleChunkQaInput, SingleChunkQaOutput],
+    ) -> EloGradingInput:
+        qa_instruction = Template(
+            QA_INSTRUCTIONS[Language("en")].unformatted_instruction
+        ).render(question=example.input.question)
+
+        no_answer = "There is no answer."
+        return EloGradingInput(
+            instruction=f"{example.input.chunk} {qa_instruction}",
+            first_completion=(
+                first.output.answer if first.output.answer is not None else no_answer
+            ),
+            second_completion=(
+                second.output.answer if second.output.answer is not None else no_answer
+            ),
+        )
+
+    def do_run(self, input: EloGradingInput, task_span: TaskSpan) -> MatchOutcome:
+        text = self.INPUT_TEMPLATE.format(
+            instruction=input.instruction,
+            first_completion=input.first_completion,
+            second_completion=input.second_completion,
+        )
+
+        complete_input = CompleteInput(
+            prompt=Prompt.from_text(text),
+            maximum_tokens=1,
+            log_probs=3,
+            disable_optimizations=True,
+        )
+        complete_output = self._model.complete_task().run(complete_input, task_span)
+
+        return self.calculate_winners(complete_output)
+
+    def calculate_winners(self, complete_output: CompleteOutput) -> MatchOutcome:
+        # Turn the grader's log-probabilities for " A" and " B" into a normalized
+        # probability that answer A wins and map it to a match outcome.
+        default_log_prob = float("-inf")
+
+        def get_normalized_prob(
+            log_prob_list: Sequence[Mapping[str, float | None]] | None,
+        ) -> float:
+            assert log_prob_list is not None
+            log_probs = log_prob_list[0]
+            values = [
+                math.exp(log_probs.get(str(key), default_log_prob) or default_log_prob)
+                for key in self.VALUES
+            ]
+            if all(v == 0 for v in values):
+                raise ValueError(
+                    f"LLM evaluation response does not contain logprobs for the required tokens for the values: {self.VALUES}"
+                )
+            return values[0] / sum(values)
+
+        def categorize_value(value: float) -> MatchOutcome:
+            if value > 0.7:
+                return MatchOutcome.A_WINS
+            elif value < 0.3:
+                return MatchOutcome.B_WINS
+            else:
+                return MatchOutcome.DRAW
+
+        normalized_probability = get_normalized_prob(
+            complete_output.completions[0].log_probs
+        )
+        return categorize_value(normalized_probability)
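
Usage sketch (illustrative, not part of the patch): the snippet below shows one way the new IncrementalEloQaEvaluationLogic could be wired into the IncrementalEvaluator that the notebook cell above constructs. The in-memory repositories, the LuminousControlModel grader, and the evaluate_runs call are assumptions made for illustration; substitute the repositories, model, and run ids from your own setup.

# Hypothetical wiring example; repository classes, the model name, and the
# evaluate_runs call are assumptions, not part of this patch.
from intelligence_layer.core import LuminousControlModel
from intelligence_layer.evaluation import (
    InMemoryDatasetRepository,
    InMemoryEvaluationRepository,
    InMemoryRunRepository,
)
from intelligence_layer.evaluation.evaluation.evaluator.incremental_elo_evaluator import (
    IncrementalEloQaEvaluationLogic,
)
from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (
    IncrementalEvaluator,
)

# The grader model decides which of two answers is more correct (see do_run above).
evaluation_logic = IncrementalEloQaEvaluationLogic(
    model=LuminousControlModel("luminous-supreme-control"),  # assumed grader model
)

evaluator = IncrementalEvaluator(
    dataset_repository=InMemoryDatasetRepository(),
    run_repository=InMemoryRunRepository(),
    evaluation_repository=InMemoryEvaluationRepository(),
    description="Incremental ELO QA evaluation",
    evaluation_logic=evaluation_logic,
)

# Placeholder run ids. How previously evaluated outputs are passed to
# do_incremental_evaluate, so that already compared pairs are skipped, is part
# of the IncrementalEvaluator machinery and is not shown in this patch.
evaluation_overview = evaluator.evaluate_runs("existing-run-id", "new-run-id")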