Skip to content

Commit

Permalink
feat: IL-405 added Runner.failed_runs(..), `RunRepository.failed_ex…
Browse files Browse the repository at this point in the history
…ample_outputs(..)`, extended `test_runner_runs_dataset`
  • Loading branch information
FelixFehseTNG committed Apr 3, 2024
1 parent 877e916 commit c51a4dd
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 3 deletions.
21 changes: 20 additions & 1 deletion src/intelligence_layer/evaluation/run/run_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
from typing import Iterable, Optional, Sequence

from intelligence_layer.core import Output, Tracer
from intelligence_layer.evaluation.run.domain import ExampleOutput, RunOverview
from intelligence_layer.evaluation.run.domain import (
ExampleOutput,
FailedExampleRun,
RunOverview,
)
from intelligence_layer.evaluation.run.trace import ExampleTrace


Expand Down Expand Up @@ -132,3 +136,18 @@ def example_output_ids(self, run_id: str) -> Sequence[str]:
A :class:`Sequence` of all :class:`ExampleOutput` IDs.
"""
...

def failed_example_outputs(
    self, run_id: str, output_type: type[Output]
) -> Iterable[ExampleOutput[Output]]:
    """Returns the :class:`ExampleOutput` of every failed example run of a run overview.

    An example run counts as failed when the ``output`` of its
    :class:`ExampleOutput` is a :class:`FailedExampleRun`. The results keep
    the order in which :func:`example_outputs` yields them (presumably sorted
    by example ID -- confirm against the concrete repository implementation).

    Args:
        run_id: The ID of the run overview.
        output_type: Type of output that the `Task` returned in :func:`Task.do_run`

    Returns:
        :class:`Iterable` of :class:`ExampleOutput`s.
    """
    failed = []
    for example_output in self.example_outputs(run_id, output_type):
        # Only outputs that carry a failure marker are of interest here.
        if isinstance(example_output.output, FailedExampleRun):
            failed.append(example_output)
    return failed
21 changes: 21 additions & 0 deletions src/intelligence_layer/evaluation/run/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,27 @@ def run(
self._run_repository.store_run_overview(run_overview)
return run_overview

def failed_runs(
    self, run_id: str, expected_output_type: type[ExpectedOutput]
) -> Iterable[RunLineage[Input, ExpectedOutput, Output]]:
    """Returns the `RunLineage` objects for all failed example runs that belong to the given run ID.

    The failed example outputs are fetched from the run repository, then one
    lineage is looked up per failed example via :func:`run_lineage`. Lookups
    that yield ``None`` are left out of the result.

    Args:
        run_id: The ID of the run overview
        expected_output_type: Type of the expected output; forwarded
            unchanged to :func:`run_lineage`.

    Returns:
        :class:`Iterable` of :class:`RunLineage`s.
    """
    failures = self._run_repository.failed_example_outputs(
        run_id, output_type=self.output_type()
    )
    found = []
    for failure in failures:
        lineage = self.run_lineage(
            run_id, failure.example_id, expected_output_type
        )
        # `run_lineage` may return None; such entries are skipped.
        if lineage is None:
            continue
        found.append(lineage)
    return found

def run_lineages(
self,
run_id: str,
Expand Down
7 changes: 5 additions & 2 deletions tests/evaluation/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
InMemoryRunRepository,
Runner,
)
from tests.evaluation.conftest import FAIL_IN_EVAL_INPUT, FAIL_IN_TASK_INPUT, DummyTask
from tests.evaluation.conftest import FAIL_IN_TASK_INPUT, DummyTask


def test_runner_runs_dataset(
Expand All @@ -19,7 +19,6 @@ def test_runner_runs_dataset(
examples = [
Example(input="success", expected_output=None),
Example(input=FAIL_IN_TASK_INPUT, expected_output=None),
Example(input=FAIL_IN_EVAL_INPUT, expected_output=None),
]

dataset_id = in_memory_dataset_repository.create_dataset(
Expand All @@ -36,6 +35,10 @@ def test_runner_runs_dataset(
example.id for example in examples
)

failed_runs = list(runner.failed_runs(overview.id, type(None)))
assert len(failed_runs) == 1
assert failed_runs[0].example.id == examples[1].id


def test_runner_runs_n_examples(
in_memory_dataset_repository: InMemoryDatasetRepository,
Expand Down

0 comments on commit c51a4dd

Please sign in to comment.