diff --git a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py
index b678f8fdd..dcb08ced1 100644
--- a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py
+++ b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py
@@ -22,7 +22,7 @@ class RunLineage(BaseModel, Generic[Input, ExpectedOutput, Output]):
     output: ExampleOutput[Output]


-class EvalLineage(BaseModel, Generic[Input, ExpectedOutput, Output, Evaluation]):
+class EvaluationLineage(BaseModel, Generic[Input, ExpectedOutput, Output, Evaluation]):
     example: Example[Input, ExpectedOutput]
     outputs: Sequence[ExampleOutput[Output]]
     evaluation: ExampleEvaluation[Evaluation]
@@ -41,7 +41,7 @@ def __init__(
         self._run_repository = run_repository
         self._eval_repository = evaluation_repository

-    def run_data(
+    def run_lineages(
         self,
         run_id: str,
         input_type: type[Input],
@@ -50,7 +50,7 @@ def run_data(
     ) -> Iterable[RunLineage[Input, ExpectedOutput, Output]]:
         run_overview = self._run_repository.run_overview(run_id)
         if run_overview is None:
-            return []
+            raise ValueError(f"Run repository does not contain a run with id {run_id}.")

         examples = list(
             self._dataset_repository.examples(
@@ -69,38 +69,53 @@ def run_data(
             if example.id == example_output.example_id:
                 yield RunLineage(example=example, output=example_output)

-    def eval_data(
+    def evaluation_lineages(
         self,
-        eval_id: str,
+        evaluation_id: str,
         input_type: type[Input],
         expected_output_type: type[ExpectedOutput],
         output_type: type[Output],
         evaluation_type: type[Evaluation],
-    ) -> Iterable[EvalLineage[Input, ExpectedOutput, Output, Evaluation]]:
+    ) -> Iterable[EvaluationLineage[Input, ExpectedOutput, Output, Evaluation]]:
         if self._eval_repository is None:
             raise ValueError("Evaluation Repository is not set, but required.")
-        eval_overview = self._eval_repository.evaluation_overview(eval_id)
+
+        eval_overview = self._eval_repository.evaluation_overview(evaluation_id)
         if eval_overview is None:
-            return []
+            raise ValueError(
+                f"Evaluation repository does not contain an evaluation with id {evaluation_id}."
+            )

         evaluations = list(
-            self._eval_repository.example_evaluations(eval_id, evaluation_type)
+            self._eval_repository.example_evaluations(evaluation_id, evaluation_type)
         )
-        run_lineages = itertools.chain.from_iterable(
-            self.run_data(overview.id, input_type, expected_output_type, output_type)
-            for overview in eval_overview.run_overviews
+        run_lineages = list(
+            itertools.chain.from_iterable(
+                self.run_lineages(
+                    overview.id, input_type, expected_output_type, output_type
+                )
+                for overview in eval_overview.run_overviews
+            )
         )

         # join
-        for run_lineage, evaluation in itertools.product(run_lineages, evaluations):
-            if run_lineage.example.id == evaluation.example_id:
-                yield EvalLineage(
-                    example=run_lineage.example,
-                    output=run_lineage.output,
-                    evaluation=evaluation,
+        for evaluation in evaluations:
+            example = None
+            outputs = []
+            for run_lineage in run_lineages:
+                if run_lineage.example.id == evaluation.example_id:
+                    if example is None:
+                        # the evaluation has only one example
+                        # and all relevant run lineages contain the same example
+                        example = run_lineage.example
+                    outputs.append(run_lineage.output)
+
+            if example is not None:
+                yield EvaluationLineage(
+                    example=example, outputs=outputs, evaluation=evaluation
                 )

-    def run_single_example(
+    def run_lineage(
         self,
         run_id: str,
         example_id: str,
@@ -111,42 +126,61 @@

         run_overview = self._run_repository.run_overview(run_id)
         if run_overview is None:
-            return None
+            raise ValueError(f"Run repository does not contain a run with id {run_id}.")

         example = self._dataset_repository.example(
             run_overview.dataset_id, example_id, input_type, expected_output_type
         )
+        if example is None:
+            return None
+
         example_output = self._run_repository.example_output(
             run_id, example_id, output_type
         )
+        if example_output is None:
+            return None

         return RunLineage(example=example, output=example_output)

-    def eval_single_example(
+    def evaluation_lineage(
         self,
-        eval_id: str,
+        evaluation_id: str,
         example_id: str,
         input_type: type[Input],
         expected_output_type: type[ExpectedOutput],
         output_type: type[Output],
         evaluation_type: type[Evaluation],
-    ) -> Sequence[EvalLineage[Input, ExpectedOutput, Output, Evaluation]] | None:
+    ) -> EvaluationLineage[Input, ExpectedOutput, Output, Evaluation] | None:

-        eval_overview = self._eval_repository.evaluation_overview(eval_id)
+        if self._eval_repository is None:
+            raise ValueError("Evaluation Repository is not set, but required.")
+
+        eval_overview = self._eval_repository.evaluation_overview(evaluation_id)
         if eval_overview is None:
-            return None
+            raise ValueError(
+                f"Evaluation repository does not contain an evaluation with id {evaluation_id}."
+            )

         run_lineages = [
-            self.run_single_example(
+            self.run_lineage(
                 overview.id, example_id, input_type, expected_output_type, output_type
             )
             for overview in eval_overview.run_overviews
         ]
+        existing_run_lineages = [
+            lineage for lineage in run_lineages if lineage is not None
+        ]
+        if len(existing_run_lineages) == 0:
+            return None

         example_evaluation = self._eval_repository.example_evaluation(
-            eval_id, example_id, evaluation_type
+            evaluation_id, example_id, evaluation_type
         )
+        if example_evaluation is None:
+            return None

-        return EvalLineage(
-            example=example, output=example_output, evaluation=example_evaluation
+        return EvaluationLineage(
+            example=existing_run_lineages[0].example,
+            outputs=[lineage.output for lineage in existing_run_lineages],
+            evaluation=example_evaluation,
         )
diff --git a/tests/evaluation/test_repository_navigator.py b/tests/evaluation/test_repository_navigator.py
index 5aef81ff7..7302b1b42 100644
--- a/tests/evaluation/test_repository_navigator.py
+++ b/tests/evaluation/test_repository_navigator.py
@@ -145,7 +145,7 @@ def test_works_on_run_overviews(
     run_overview: RunOverview,
 ) -> None:
     # when
-    res = list(repository_navigator.run_data(run_overview.id, str, str, str))
+    res = list(repository_navigator.run_lineages(run_overview.id, str, str, str))

     # then
     res = sorted(res, key=lambda result: result.example.input)
@@ -161,7 +161,9 @@ def test_works_on_evaluation(
 ) -> None:
     # when
     res = list(
-        repository_navigator.eval_data(eval_overview.id, str, str, str, DummyEval)
+        repository_navigator.evaluation_lineages(
+            eval_overview.id, str, str, str, DummyEval
+        )
     )

     # then
@@ -182,20 +184,23 @@ def test_initialization_gives_warning_if_not_compatible() -> None:
     x = RepositoryNavigator(dataset_repository, run_repository)

     with pytest.raises(ValueError):
-        list(x.eval_data("irrelevant", str, str, str, DummyEval))
+        list(x.evaluation_lineages("irrelevant", str, str, str, DummyEval))
+    with pytest.raises(ValueError):
+        x.evaluation_lineage("irrelevant", "irrelevant", str, str, str, DummyEval)


 def test_get_run_lineage_for_single_example(
     examples: Sequence[DummyExample],
     repository_navigator: RepositoryNavigator,
     run_overview: RunOverview,
-):
+) -> None:
     # when
-    res = repository_navigator.run_single_example(
+    res = repository_navigator.run_lineage(
         run_overview.id, examples[0].id, str, str, str
     )

     # Then
+    assert res is not None
     assert res.example.input == "input0"
     assert res.output.output == "input0 -> output"

@@ -204,13 +209,55 @@ def test_get_eval_lineage_for_single_example(
     examples: Sequence[DummyExample],
     repository_navigator: RepositoryNavigator,
     eval_overview: EvaluationOverview,
-):
+) -> None:
     # when
-    res = repository_navigator.eval_single_example(
+    res = repository_navigator.evaluation_lineage(
         eval_overview.id, examples[0].id, str, str, str, DummyEval
     )

     # Then
+    assert res is not None
     assert res.example.input == "input0"
-    assert res.output.output == "input0 -> output"
-    assert res.evaluation.result.startswith("input0")
+    assert res.outputs[0].output == "input0 -> output"
+    assert len(res.outputs) == 2
+    eval_result = res.evaluation.result
+    assert isinstance(eval_result, DummyEval)
+    assert eval_result.eval.startswith("input0")
+
+
+def test_get_run_lineage_for_non_existent_example_returns_none(
+    repository_navigator: RepositoryNavigator,
+    run_overview: RunOverview,
+) -> None:
+    res = repository_navigator.run_lineage(
+        run_overview.id, "non-existent-id", str, str, str
+    )
+
+    assert res is None
+
+
+def test_get_eval_lineage_for_non_existent_example_returns_none(
+    repository_navigator: RepositoryNavigator,
+    eval_overview: EvaluationOverview,
+) -> None:
+    res = repository_navigator.evaluation_lineage(
+        eval_overview.id, "non-existent-id", str, str, str, DummyEval
+    )
+
+    assert res is None
+
+
+def test_get_run_lineage_for_non_existent_run_id_returns_none(
+    repository_navigator: RepositoryNavigator,
+) -> None:
+    with pytest.raises(ValueError):
+        repository_navigator.run_lineage("non-existent-id", "irrelevant", str, str, str)
+
+
+def test_get_eval_lineage_for_non_existent_eval_id_returns_none(
+    repository_navigator: RepositoryNavigator,
+) -> None:
+    with pytest.raises(ValueError):
+        repository_navigator.evaluation_lineage(
+            "non-existent-id", "irrelevant", str, str, str, DummyEval
+        )
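Usage note (not part of the diff): a minimal sketch of how the renamed navigator API is called after this change. The import path is assumed from the location of the touched file, the constructor argument order is assumed from the test above, and the repositories, ids, and evaluation type are caller-supplied placeholders.

# Sketch only; repository instances, ids, and evaluation_type are hypothetical placeholders.
from intelligence_layer.evaluation.infrastructure.repository_navigator import (
    RepositoryNavigator,  # assumed import path, mirroring the file path in the diff
)


def inspect_results(dataset_repo, run_repo, eval_repo, run_id, evaluation_id, evaluation_type):
    navigator = RepositoryNavigator(dataset_repo, run_repo, eval_repo)

    # Bulk access: unknown run or evaluation ids now raise ValueError
    # instead of silently yielding nothing.
    for run_lineage in navigator.run_lineages(run_id, str, str, str):
        print(run_lineage.example.input, run_lineage.output.output)

    for eval_lineage in navigator.evaluation_lineages(
        evaluation_id, str, str, str, evaluation_type
    ):
        # An EvaluationLineage now bundles one output per contributing run.
        print(eval_lineage.example.id, len(eval_lineage.outputs))

    # Single-example access returns None when the example, its output,
    # or its evaluation is missing.
    single = navigator.evaluation_lineage(
        evaluation_id, "some-example-id", str, str, str, evaluation_type
    )
    if single is not None:
        print(single.outputs[0].output, single.evaluation.result)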