feat: IL-405 added tests for abort_on_error flag in Runner and `Evaluator`
FelixFehseTNG committed Apr 3, 2024
1 parent 20a75a4 commit 00b8269
Showing 5 changed files with 62 additions and 39 deletions.
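For orientation, the contract exercised by the new tests, condensed from the test code in this diff (the `RuntimeError` comes from the dummy task and evaluation logic used in the test suite, not from a guaranteed exception type of the library):

import pytest

def exercise_abort_on_error(runner, evaluator, dataset_id):
    # Default behaviour: failing examples are captured in the overview
    # instead of raising, so the run completes.
    run_overview = runner.run_dataset(dataset_id)

    # With abort_on_error=True the first failure is re-raised to the caller.
    with pytest.raises(RuntimeError):
        runner.run_dataset(dataset_id, abort_on_error=True)
    with pytest.raises(RuntimeError):
        evaluator.evaluate_runs(run_overview.id, abort_on_error=True)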
35 changes: 12 additions & 23 deletions src/intelligence_layer/evaluation/evaluation/evaluator.py
@@ -1,3 +1,4 @@
import typing
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
@@ -297,7 +298,7 @@ def generate_evaluation_inputs() -> Iterable[
current_example = 0
for example_outputs in examples_zipped:
successful_example_outputs = [
output
typing.cast(SuccessfulExampleOutput[Output], output)
for output in example_outputs
if not isinstance(output.output, FailedExampleRun)
]
@@ -324,31 +325,19 @@ def generate_evaluation_inputs() -> Iterable[
yield (
example,
eval_id,
[
SuccessfulExampleOutput(
run_id=example_output.run_id,
example_id=example_output.example_id,
output=example_output.output,
)
for example_output in successful_example_outputs
if not isinstance(example_output.output, FailedExampleRun)
],
successful_example_outputs,
)

def evaluate(
args: Tuple[
Example[Input, ExpectedOutput],
str,
Sequence[SuccessfulExampleOutput[Output]],
],
) -> None:
example, eval_id, example_outputs = args
self.evaluate(example, eval_id, abort_on_error, *example_outputs)

with ThreadPoolExecutor(max_workers=10) as executor:
tqdm(
executor.map(evaluate, generate_evaluation_inputs()),
desc="Evaluating",
list( # the list is needed to consume the iterator returned from the executor.map
tqdm(
executor.map(
lambda args: self.evaluate(
args[0], args[1], abort_on_error, *args[2]
),
generate_evaluation_inputs(),
)
)
)

partial_overview = EvaluationOverview(
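The `list(...)` wrapper introduced above is what lets `abort_on_error` actually surface in the threaded path: `ThreadPoolExecutor.map` returns a lazy iterator, and an exception raised inside the mapped callable is only re-raised in the calling thread once that result is consumed. A standalone illustration of that behaviour (not repository code):

from concurrent.futures import ThreadPoolExecutor

def work(n: int) -> int:
    if n == 3:
        raise RuntimeError("boom")
    return n * n

with ThreadPoolExecutor(max_workers=4) as executor:
    lazy_results = executor.map(work, range(5))
    # Nothing has been raised yet. Consuming the iterator retrieves results
    # in submission order and re-raises the worker's exception here.
    try:
        squares = list(lazy_results)
    except RuntimeError as error:
        print(f"aborted: {error}")  # -> aborted: boom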
9 changes: 9 additions & 0 deletions tests/evaluation/conftest.py
@@ -60,6 +60,15 @@ class DummyAggregatedEvaluationWithResultList(BaseModel):
results: Sequence[DummyEvaluation]


@fixture
def sequence_examples() -> Iterable[Example[str, None]]:
return [
Example(input="success", expected_output=None, id="example-1"),
Example(input=FAIL_IN_TASK_INPUT, expected_output=None, id="example-2"),
Example(input=FAIL_IN_EVAL_INPUT, expected_output=None, id="example-3"),
]


@fixture
def evaluation_id() -> str:
return "evaluation-id-1"
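Moving the `sequence_examples` fixture into `conftest.py` lets both `test_evaluator.py` and `test_runner.py` request it by parameter name without any import, since pytest auto-discovers `conftest.py` fixtures for every test module under that directory. A generic illustration of the mechanism (not repository code):

# conftest.py
from typing import Iterable
from pytest import fixture

@fixture
def sample_inputs() -> Iterable[str]:
    return ["success", "fail-in-task", "fail-in-eval"]

# test_anything.py -- no import of the fixture is needed
def test_sees_shared_fixture(sample_inputs: Iterable[str]) -> None:
    assert list(sample_inputs)[0] == "success"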
24 changes: 15 additions & 9 deletions tests/evaluation/test_evaluator.py
@@ -1,5 +1,6 @@
from typing import Generic, Iterable, Optional, TypeVar

import pytest
from pydantic import BaseModel
from pytest import fixture

@@ -117,15 +118,6 @@ def do_run(self, input: str, tracer: Tracer): # type: ignore
return input


@fixture
def sequence_examples() -> Iterable[Example[str, None]]:
return [
Example(input="success", expected_output=None, id="example-1"),
Example(input=FAIL_IN_TASK_INPUT, expected_output=None, id="example-2"),
Example(input=FAIL_IN_EVAL_INPUT, expected_output=None, id="example-3"),
]


@fixture
def sequence_good_examples() -> Iterable[Example[str, None]]:
return [
@@ -255,6 +247,20 @@ def test_eval_and_aggregate_runs_returns_generic_statistics(
assert aggregation_overview.failed_evaluation_count == 2


def test_evaluator_aborts_on_error(
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_aggregator: Aggregator[
DummyEvaluation, DummyAggregatedEvaluationWithResultList
],
dummy_runner: Runner[str, str],
dataset_id: str,
) -> None:
run_overview = dummy_runner.run_dataset(dataset_id)

with pytest.raises(RuntimeError):
dummy_evaluator.evaluate_runs(run_overview.id, abort_on_error=True)


def test_eval_and_aggregate_runs_uses_passed_tracer(
dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
dummy_aggregator: Aggregator[
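The new test pins down behaviour rather than mechanism: with `abort_on_error=True`, `evaluate_runs` must let the underlying exception escape instead of recording a failed evaluation. A hypothetical, self-contained sketch of that branch (the names here are illustrative, not the library's internals):

from dataclasses import dataclass
from typing import Callable, Union

@dataclass
class FailedResult:
    error_message: str

def evaluate_example(
    do_evaluate: Callable[[str], str],
    example_input: str,
    abort_on_error: bool,
) -> Union[str, FailedResult]:
    try:
        return do_evaluate(example_input)
    except Exception as error:
        if abort_on_error:
            raise  # surfaces to the caller, as test_evaluator_aborts_on_error expects
        # Default: record the failure and keep evaluating the remaining examples.
        return FailedResult(error_message=str(error))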
6 changes: 3 additions & 3 deletions tests/evaluation/test_run_repository.py
@@ -304,11 +304,11 @@ def test_successful_example_outputs_returns_only_successful_examples(
ExampleOutput(run_id=run_overview.id, example_id="2", output=None)
)

failed_outputs = list(
successful_outputs = list(
run_repository.successful_example_outputs(
run_id=run_overview.id, output_type=type(None)
)
)

assert len(failed_outputs) == 1
assert failed_outputs[0].example_id == "2"
assert len(successful_outputs) == 1
assert successful_outputs[0].example_id == "2"
27 changes: 23 additions & 4 deletions tests/evaluation/test_runner.py
@@ -1,3 +1,7 @@
from typing import Iterable

import pytest

from intelligence_layer.core import InMemoryTracer
from intelligence_layer.evaluation import (
Example,
@@ -11,15 +15,13 @@
def test_runner_runs_dataset(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
sequence_examples: Iterable[Example[str, None]],
) -> None:
examples = list(sequence_examples)
task = DummyTask()
runner = Runner(
task, in_memory_dataset_repository, in_memory_run_repository, "dummy-runner"
)
examples = [
Example(input="success", expected_output=None),
Example(input=FAIL_IN_TASK_INPUT, expected_output=None),
]

dataset_id = in_memory_dataset_repository.create_dataset(
examples=examples, dataset_name="test-dataset"
@@ -40,6 +42,23 @@ def test_runner_runs_dataset(
assert failed_runs[0].example.id == examples[1].id


def test_runner_aborts_on_error(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
sequence_examples: Iterable[Example[str, None]],
) -> None:
task = DummyTask()
runner = Runner(
task, in_memory_dataset_repository, in_memory_run_repository, "dummy-runner"
)

dataset_id = in_memory_dataset_repository.create_dataset(
examples=sequence_examples, dataset_name="test-dataset"
).id
with pytest.raises(RuntimeError):
runner.run_dataset(dataset_id, abort_on_error=True)


def test_runner_runs_n_examples(
in_memory_dataset_repository: InMemoryDatasetRepository,
in_memory_run_repository: InMemoryRunRepository,
