feat: IL-405 added `Runner.failed_runs(..)`, `RunRepository.failed_example_outputs(..)`, extended `test_runner_runs_dataset`
Aleph-Alpha · Apr 3, 2024 · c51a4dd · c51a4dd
1 parent 877e916
commit c51a4dd
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 3 deletions.
diff --git a/src/intelligence_layer/evaluation/run/run_repository.py b/src/intelligence_layer/evaluation/run/run_repository.py
@@ -2,7 +2,11 @@
 from typing import Iterable, Optional, Sequence
 
 from intelligence_layer.core import Output, Tracer
-from intelligence_layer.evaluation.run.domain import ExampleOutput, RunOverview
+from intelligence_layer.evaluation.run.domain import (
+    ExampleOutput,
+    FailedExampleRun,
+    RunOverview,
+)
 from intelligence_layer.evaluation.run.trace import ExampleTrace
 
 
@@ -132,3 +136,18 @@ def example_output_ids(self, run_id: str) -> Sequence[str]:
             A :class:`Sequence` of all :class:`ExampleOutput` IDs.
         """
         ...
+
+    def failed_example_outputs(
+        self, run_id: str, output_type: type[Output]
+    ) -> Iterable[ExampleOutput[Output]]:
+        """Returns all :class:`ExampleOutput` for failed example runs with a given run-overview ID sorted by their example ID.
+
+        Args:
+            run_id: The ID of the run overview.
+            output_type: Type of output that the `Task` returned in :func:`Task.do_run`
+
+        Returns:
+            :class:`Iterable` of :class:`ExampleOutput`s.
+        """
+        results = self.example_outputs(run_id, output_type)
+        return [r for r in results if isinstance(r.output, FailedExampleRun)]
diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py
@@ -150,6 +150,27 @@ def run(
         self._run_repository.store_run_overview(run_overview)
         return run_overview
 
+    def failed_runs(
+        self, run_id: str, expected_output_type: type[ExpectedOutput]
+    ) -> Iterable[RunLineage[Input, ExpectedOutput, Output]]:
+        """Returns the `RunLineage` objects for all failed example runs that belong to the given run ID.
+
+        Args:
+            run_id: The ID of the run overview.
+            expected_output_type: Type of the expected output stored with each `Example` of the dataset the run was executed on.
+
+        Returns:
+            :class:`Iterable` of :class:`RunLineage`s.
+        """
+        failed_example_outputs = self._run_repository.failed_example_outputs(
+            run_id, output_type=self.output_type()
+        )
+        lineages = [
+            self.run_lineage(run_id, output.example_id, expected_output_type)
+            for output in failed_example_outputs
+        ]
+        return [lineage for lineage in lineages if lineage is not None]
+
     def run_lineages(
         self,
         run_id: str,

diff --git a/tests/evaluation/test_runner.py b/tests/evaluation/test_runner.py
@@ -5,7 +5,7 @@
     InMemoryRunRepository,
     Runner,
 )
-from tests.evaluation.conftest import FAIL_IN_EVAL_INPUT, FAIL_IN_TASK_INPUT, DummyTask
+from tests.evaluation.conftest import FAIL_IN_TASK_INPUT, DummyTask
 
 
 def test_runner_runs_dataset(
@@ -19,7 +19,6 @@ def test_runner_runs_dataset(
     examples = [
         Example(input="success", expected_output=None),
         Example(input=FAIL_IN_TASK_INPUT, expected_output=None),
-        Example(input=FAIL_IN_EVAL_INPUT, expected_output=None),
     ]
 
     dataset_id = in_memory_dataset_repository.create_dataset(
@@ -36,6 +35,10 @@ def test_runner_runs_dataset(
         example.id for example in examples
     )
 
+    failed_runs = list(runner.failed_runs(overview.id, type(None)))
+    assert len(failed_runs) == 1
+    assert failed_runs[0].example.id == examples[1].id
+
 
 def test_runner_runs_n_examples(
     in_memory_dataset_repository: InMemoryDatasetRepository,