diff --git a/src/intelligence_layer/evaluation/__init__.py b/src/intelligence_layer/evaluation/__init__.py index 971b77539..afbd226a0 100644 --- a/src/intelligence_layer/evaluation/__init__.py +++ b/src/intelligence_layer/evaluation/__init__.py @@ -7,7 +7,7 @@ from .aggregation.domain import AggregatedEvaluation as AggregatedEvaluation from .aggregation.domain import AggregationOverview as AggregationOverview from .aggregation.elo_aggregation import ( - ComparisonAggregationLogic as ComparisonAggregationLogic, + ComparisonEvaluationAggregationLogic as ComparisonEvaluationAggregationLogic, ) from .aggregation.elo_aggregation import EloCalculator as EloCalculator from .aggregation.elo_aggregation import WinRateCalculator as WinRateCalculator diff --git a/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py b/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py index a2f9a5243..3877839e0 100644 --- a/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py +++ b/src/intelligence_layer/evaluation/aggregation/elo_aggregation.py @@ -8,6 +8,58 @@ from intelligence_layer.evaluation import ComparisonEvaluation, MatchOutcome from intelligence_layer.evaluation.aggregation.accumulator import MeanAccumulator from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic +from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import Matches + + +class PlayerScore(BaseModel): + elo: float + elo_standard_error: float + win_rate: float + num_matches: int + + +class AggregatedComparison(BaseModel): + scores: Mapping[str, PlayerScore] + + +class EloAggregationAdapter: + @staticmethod + def aggregate(evaluations: Iterable[ComparisonEvaluation]) -> AggregatedComparison: + evaluations = list(evaluations) + player_counter = Counter( + player + for comparison_evaluation in evaluations + for player in [ + comparison_evaluation.first_player, + comparison_evaluation.second_player, + ] + ) + + player_counts = dict(player_counter) + players = player_counts.keys() + + accumulators = {p: MeanAccumulator() for p in players} + for _ in range(100): + elo_calc = EloCalculator(players) + random.shuffle(evaluations) + elo_calc.calculate(evaluations) + for p in players: + accumulators[p].add(elo_calc.ratings[p]) + + win_rate_calc = WinRateCalculator(players) + win_rate = win_rate_calc.calculate(evaluations) + + return AggregatedComparison( + scores={ + p: PlayerScore( + elo=acc.extract(), + elo_standard_error=acc.standard_error(), + win_rate=win_rate[p], + num_matches=player_counts[p], + ) + for p, acc in accumulators.items() + }, + ) class EloCalculator: @@ -48,13 +100,15 @@ def _calc_difs( actual_b - expected_win_rate_b ) - def calculate(self, matches: Sequence[tuple[str, str, MatchOutcome]]) -> None: - for a, b, o in matches: - dif_a, dif_b = self._calc_difs(o, a, b) - self.ratings[a] += dif_a - self.ratings[b] += dif_b - self._match_counts[a] += 1 - self._match_counts[b] += 1 + def calculate(self, matches: Sequence[ComparisonEvaluation]) -> None: + for match in matches: + dif_a, dif_b = self._calc_difs( + match.outcome, match.first_player, match.second_player + ) + self.ratings[match.first_player] += dif_a + self.ratings[match.second_player] += dif_b + self._match_counts[match.first_player] += 1 + self._match_counts[match.second_player] += 1 class WinRateCalculator: @@ -62,14 +116,12 @@ def __init__(self, players: Iterable[str]) -> None: self.match_count: dict[str, int] = {p: 0 for p in players} self.win_count: dict[str, float] = {p: 0 for p in players} - def calculate( - self, matches: Sequence[tuple[str, str, MatchOutcome]] - ) -> Mapping[str, float]: - for a, b, o in matches: - self.match_count[a] += 1 - self.match_count[b] += 1 - self.win_count[a] += o.payoff[0] - self.win_count[b] += o.payoff[1] + def calculate(self, matches: Sequence[ComparisonEvaluation]) -> Mapping[str, float]: + for match in matches: + self.match_count[match.first_player] += 1 + self.match_count[match.second_player] += 1 + self.win_count[match.first_player] += match.outcome.payoff[0] + self.win_count[match.second_player] += match.outcome.payoff[1] return { player: self.win_count[player] / match_count @@ -77,56 +129,20 @@ def calculate( } -class PlayerScore(BaseModel): - elo: float - elo_standard_error: float - win_rate: float - num_matches: int - - -class AggregatedComparison(BaseModel): - scores: Mapping[str, PlayerScore] - - -class ComparisonAggregationLogic( +class ComparisonEvaluationAggregationLogic( AggregationLogic[ComparisonEvaluation, AggregatedComparison] ): def aggregate( self, evaluations: Iterable[ComparisonEvaluation] ) -> AggregatedComparison: - flattened_evaluations = [ - ( - evaluation.first_player, - evaluation.second_player, - evaluation.outcome, - ) - for evaluation in evaluations - ] - player_counter = Counter( - player for match in flattened_evaluations for player in [match[0], match[1]] - ) - player_counts = dict(player_counter) - players = player_counts.keys() + return EloAggregationAdapter.aggregate(evaluations) - accumulators = {p: MeanAccumulator() for p in players} - for _ in range(100): - elo_calc = EloCalculator(players) - random.shuffle(flattened_evaluations) - elo_calc.calculate(flattened_evaluations) - for p in players: - accumulators[p].add(elo_calc.ratings[p]) - win_rate_calc = WinRateCalculator(players) - win_rate = win_rate_calc.calculate(flattened_evaluations) - - return AggregatedComparison( - scores={ - p: PlayerScore( - elo=acc.extract(), - elo_standard_error=acc.standard_error(), - win_rate=win_rate[p], - num_matches=player_counts[p], - ) - for p, acc in accumulators.items() - }, - ) +class MatchesAggregationLogic(AggregationLogic[Matches, AggregatedComparison]): + def aggregate(self, evaluations: Iterable[Matches]) -> AggregatedComparison: + flattened_matches = [ + comparison_evaluation + for match in evaluations + for comparison_evaluation in match.comparison_evaluations + ] + return EloAggregationAdapter.aggregate(flattened_matches) diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py index fd336fa9b..9bd794cc7 100644 --- a/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py +++ b/src/intelligence_layer/evaluation/evaluation/evaluator/elo_evaluator.py @@ -44,7 +44,7 @@ class ComparisonEvaluation(BaseModel): class Matches(BaseModel): - matches: Sequence[ComparisonEvaluation] + comparison_evaluations: Sequence[ComparisonEvaluation] class EloGradingInput(BaseModel): @@ -70,7 +70,7 @@ def do_evaluate( ) -> Matches: pairs = combinations(output, 2) return Matches( - matches=[ + comparison_evaluations=[ ComparisonEvaluation( first_player=player_a.run_id, second_player=player_b.run_id, diff --git a/tests/evaluation/test_argilla_evaluator.py b/tests/evaluation/test_argilla_evaluator.py index 56e3b9cf0..c0167d641 100644 --- a/tests/evaluation/test_argilla_evaluator.py +++ b/tests/evaluation/test_argilla_evaluator.py @@ -16,8 +16,8 @@ ArgillaEvaluationLogic, ArgillaEvaluator, AsyncInMemoryEvaluationRepository, - ComparisonAggregationLogic, ComparisonEvaluation, + ComparisonEvaluationAggregationLogic, DatasetRepository, Example, InMemoryDatasetRepository, @@ -327,7 +327,7 @@ def test_argilla_evaluator_abort_on_error_works( def test_argilla_aggregation_logic_works() -> None: - argilla_aggregation_logic = ComparisonAggregationLogic() + argilla_aggregation_logic = ComparisonEvaluationAggregationLogic() evaluations = ( ComparisonEvaluation( first_player="player_1", diff --git a/tests/evaluation/test_elo_calculator.py b/tests/evaluation/test_elo_calculator.py index b32525308..d70102b14 100644 --- a/tests/evaluation/test_elo_calculator.py +++ b/tests/evaluation/test_elo_calculator.py @@ -5,6 +5,9 @@ from pytest import fixture from intelligence_layer.evaluation import EloCalculator, MatchOutcome, WinRateCalculator +from intelligence_layer.evaluation.evaluation.evaluator.elo_evaluator import ( + ComparisonEvaluation, +) @fixture @@ -33,7 +36,7 @@ def test_match_outcome_serializes() -> None: def test_elo_calculator_works( - players: Sequence[str], matches: Sequence[tuple[str, str, MatchOutcome]] + players: Sequence[str], matches: Sequence[ComparisonEvaluation] ) -> None: elo_calculator = EloCalculator(players) elo_calculator.calculate(matches) @@ -52,7 +55,7 @@ def test_elo_calculator_works( def test_win_rate_calculator_works( - players: Sequence[str], matches: Sequence[tuple[str, str, MatchOutcome]] + players: Sequence[str], matches: Sequence[ComparisonEvaluation] ) -> None: win_rate_calculator = WinRateCalculator(players) scores = win_rate_calculator.calculate(matches) diff --git a/tests/evaluation/test_elo_evaluator.py b/tests/evaluation/test_elo_evaluator.py index 7f7f3c930..51cec543a 100644 --- a/tests/evaluation/test_elo_evaluator.py +++ b/tests/evaluation/test_elo_evaluator.py @@ -183,7 +183,7 @@ def test_evaluate_runs_creates_correct_matches_for_elo_qa_eval( 0 ].evaluation.result assert isinstance(eval_result, Matches) - matches = eval_result.matches + matches = eval_result.comparison_evaluations for match in matches: assert isinstance(match, ComparisonEvaluation) diff --git a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py index 3b0db8e65..d6e85e2be 100644 --- a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py +++ b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py @@ -19,8 +19,8 @@ Aggregator, ArgillaEvaluator, AsyncInMemoryEvaluationRepository, - ComparisonAggregationLogic, ComparisonEvaluation, + ComparisonEvaluationAggregationLogic, EloCalculator, Example, ExampleOutput, @@ -109,8 +109,8 @@ def any_instruct_output() -> CompleteOutput: @fixture -def argilla_aggregation_logic() -> ComparisonAggregationLogic: - return ComparisonAggregationLogic() +def argilla_aggregation_logic() -> ComparisonEvaluationAggregationLogic: + return ComparisonEvaluationAggregationLogic() def create_dummy_dataset( @@ -165,7 +165,7 @@ def test_evaluate_run_submits_pairwise_comparison_records( in_memory_run_repository: InMemoryRunRepository, async_in_memory_evaluation_repository: AsyncInMemoryEvaluationRepository, in_memory_aggregation_repository: InMemoryAggregationRepository, - argilla_aggregation_logic: ComparisonAggregationLogic, + argilla_aggregation_logic: ComparisonEvaluationAggregationLogic, any_instruct_output: CompleteOutput, argilla_fake: ArgillaFake, ) -> None: @@ -244,10 +244,10 @@ def test_elo_calculating_works_as_expected() -> None: player1 = "player1" player2 = "player2" matches = [ - ( - player1, - player2, - MatchOutcome.A_WINS, + ComparisonEvaluation( + first_player=player1, + second_player=player2, + outcome=MatchOutcome.A_WINS, ) for _ in range(10) ] @@ -258,10 +258,10 @@ def test_elo_calculating_works_as_expected() -> None: assert elo.ratings[player2] < 1500 comeback_matches = [ - ( - player1, - player2, - MatchOutcome.B_WINS, + ComparisonEvaluation( + first_player=player1, + second_player=player2, + outcome=MatchOutcome.B_WINS, ) for i in range(10) ]