diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd33839ca..c6bb4f600 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # Changelog
 
+## 0.8.0
+
+### Breaking Changes
+
+### New Features
+- feature: Error information is printed to the console on failed runs and evaluations.
+- feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object.
+- feature: `Runner.run_dataset` and `Evaluator.evaluate_runs` accept an optional `abort_on_error` flag to stop running/evaluating when an error occurs.
+- feature: Added `Runner.failed_runs` and `Evaluator.failed_evaluations` to retrieve the lineages of all failed runs/evaluations.
+- feature: Added `.successful_example_outputs` and `.failed_example_outputs` to `RunRepository` to match the evaluation repository.
+
+### Fixes
+
 ## 0.7.0
 
 ### Breaking Changes
diff --git a/src/intelligence_layer/evaluation/evaluation/domain.py b/src/intelligence_layer/evaluation/evaluation/domain.py
index 187865155..e4cd7d254 100644
--- a/src/intelligence_layer/evaluation/evaluation/domain.py
+++ b/src/intelligence_layer/evaluation/evaluation/domain.py
@@ -1,3 +1,4 @@
+import traceback
 from datetime import datetime
 from typing import Generic, Optional, TypeVar
 
@@ -22,7 +23,7 @@ class FailedExampleEvaluation(BaseModel):
 
     @staticmethod
     def from_exception(exception: Exception) -> "FailedExampleEvaluation":
         return FailedExampleEvaluation(
-            error_message=f"{type(exception)}: {str(exception)}"
+            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
         )
 
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator.py
index 819299b6a..4315556d8 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator.py
@@ -1,3 +1,4 @@
+import typing
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
@@ -222,7 +223,10 @@ def evaluation_type(self) -> type[Evaluation]:
 
     @final
     def evaluate_runs(
-        self, *run_ids: str, num_examples: Optional[int] = None
+        self,
+        *run_ids: str,
+        num_examples: Optional[int] = None,
+        abort_on_error: bool = False,
     ) -> EvaluationOverview:
         """Evaluates all generated outputs in the run.
 
@@ -239,6 +243,7 @@ def evaluate_runs(
                 specific evaluation. The method compares all run of the provided ids to each other.
             num_examples: The number of examples which should be evaluated from the given runs.
                 Always the first n runs stored in the evaluation repository
+            abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
 
         Returns:
             EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -293,7 +298,7 @@ def generate_evaluation_inputs() -> Iterable[
             current_example = 0
             for example_outputs in examples_zipped:
                 successful_example_outputs = [
-                    output
+                    typing.cast(SuccessfulExampleOutput[Output], output)
                     for output in example_outputs
                     if not isinstance(output.output, FailedExampleRun)
                 ]
@@ -320,31 +325,19 @@ def generate_evaluation_inputs() -> Iterable[
                 yield (
                     example,
                     eval_id,
-                    [
-                        SuccessfulExampleOutput(
-                            run_id=example_output.run_id,
-                            example_id=example_output.example_id,
-                            output=example_output.output,
-                        )
-                        for example_output in successful_example_outputs
-                        if not isinstance(example_output.output, FailedExampleRun)
-                    ],
+                    successful_example_outputs,
                 )
 
-        def evaluate(
-            args: Tuple[
-                Example[Input, ExpectedOutput],
-                str,
-                Sequence[SuccessfulExampleOutput[Output]],
-            ],
-        ) -> None:
-            example, eval_id, example_outputs = args
-            self.evaluate(example, eval_id, *example_outputs)
-
         with ThreadPoolExecutor(max_workers=10) as executor:
-            tqdm(
-                executor.map(evaluate, generate_evaluation_inputs()),
-                desc="Evaluating",
+            list(  # the list is needed to consume the iterator returned by executor.map
+                tqdm(
+                    executor.map(
+                        lambda args: self.evaluate(
+                            args[0], args[1], abort_on_error, *args[2]
+                        ),
+                        generate_evaluation_inputs(),
+                    )
+                )
             )
 
         partial_overview = EvaluationOverview(
@@ -362,6 +355,7 @@ def evaluate(
         self,
         example: Example[Input, ExpectedOutput],
         evaluation_id: str,
+        abort_on_error: bool,
         *example_outputs: SuccessfulExampleOutput[Output],
     ) -> None:
         try:
@@ -372,6 +366,11 @@ def evaluate(
                 )
             )
         except Exception as e:
+            if abort_on_error:
+                raise e
+            print(
+                f'FAILED EVALUATION: example "{example.id}", {type(e).__qualname__}: "{e}"'
+            )
             result = FailedExampleEvaluation.from_exception(e)
             self._evaluation_repository.store_example_evaluation(
                 ExampleEvaluation(
@@ -379,6 +378,28 @@ def evaluate(
                 )
             )
 
+    def failed_evaluations(
+        self, evaluation_id: str
+    ) -> Iterable[EvaluationLineage[Input, ExpectedOutput, Output, Evaluation]]:
+        """Returns the `EvaluationLineage` objects for all failed example evaluations that belong to the given evaluation ID.
+
+        Args:
+            evaluation_id: The ID of the evaluation overview
+
+        Returns:
+            :class:`Iterable` of :class:`EvaluationLineage`s.
+        """
+        failed_example_evaluations = (
+            self._evaluation_repository.failed_example_evaluations(
+                evaluation_id, evaluation_type=self.evaluation_type()
+            )
+        )
+        lineages = (
+            self.evaluation_lineage(evaluation_id, output.example_id)
+            for output in failed_example_evaluations
+        )
+        return (lineage for lineage in lineages if lineage is not None)
+
     def evaluation_lineages(
         self, evaluation_id: str
     ) -> Iterable[EvaluationLineage[Input, ExpectedOutput, Output, Evaluation]]:
diff --git a/src/intelligence_layer/evaluation/run/domain.py b/src/intelligence_layer/evaluation/run/domain.py
index 446487be4..5fbcbb60b 100644
--- a/src/intelligence_layer/evaluation/run/domain.py
+++ b/src/intelligence_layer/evaluation/run/domain.py
@@ -1,3 +1,4 @@
+import traceback
 from datetime import datetime
 from typing import Generic
 
@@ -18,7 +19,9 @@ class FailedExampleRun(BaseModel):
 
     @staticmethod
     def from_exception(exception: Exception) -> "FailedExampleRun":
-        return FailedExampleRun(error_message=f"{type(exception)}: {str(exception)}")
+        return FailedExampleRun(
+            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
+        )
 
 
 class ExampleOutput(BaseModel, Generic[Output]):
diff --git a/src/intelligence_layer/evaluation/run/run_repository.py b/src/intelligence_layer/evaluation/run/run_repository.py
index 7f7b9bcfb..54292f832 100644
--- a/src/intelligence_layer/evaluation/run/run_repository.py
+++ b/src/intelligence_layer/evaluation/run/run_repository.py
@@ -2,7 +2,11 @@
 from typing import Iterable, Optional, Sequence
 
 from intelligence_layer.core import Output, Tracer
-from intelligence_layer.evaluation.run.domain import ExampleOutput, RunOverview
+from intelligence_layer.evaluation.run.domain import (
+    ExampleOutput,
+    FailedExampleRun,
+    RunOverview,
+)
 from intelligence_layer.evaluation.run.trace import ExampleTrace
 
 
@@ -132,3 +136,33 @@ def example_output_ids(self, run_id: str) -> Sequence[str]:
             A :class:`Sequence` of all :class:`ExampleOutput` IDs.
         """
         ...
+
+    def successful_example_outputs(
+        self, run_id: str, output_type: type[Output]
+    ) -> Iterable[ExampleOutput[Output]]:
+        """Returns all :class:`ExampleOutput` for successful example runs with a given run-overview ID sorted by their example ID.
+
+        Args:
+            run_id: The ID of the run overview.
+            output_type: Type of output that the `Task` returned in :func:`Task.do_run`
+
+        Returns:
+            :class:`Iterable` of :class:`ExampleOutput`s.
+        """
+        results = self.example_outputs(run_id, output_type)
+        return (r for r in results if not isinstance(r.output, FailedExampleRun))
+
+    def failed_example_outputs(
+        self, run_id: str, output_type: type[Output]
+    ) -> Iterable[ExampleOutput[Output]]:
+        """Returns all :class:`ExampleOutput` for failed example runs with a given run-overview ID sorted by their example ID.
+
+        Args:
+            run_id: The ID of the run overview.
+            output_type: Type of output that the `Task` returned in :func:`Task.do_run`
+
+        Returns:
+            :class:`Iterable` of :class:`ExampleOutput`s.
+        """
+        results = self.example_outputs(run_id, output_type)
+        return (r for r in results if isinstance(r.output, FailedExampleRun))
diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py
index ee6b95122..b1e919ed5 100644
--- a/src/intelligence_layer/evaluation/run/runner.py
+++ b/src/intelligence_layer/evaluation/run/runner.py
@@ -75,6 +75,7 @@ def run_dataset(
         dataset_id: str,
         tracer: Optional[Tracer] = None,
         num_examples: Optional[int] = None,
+        abort_on_error: bool = False,
     ) -> RunOverview:
         """Generates all outputs for the provided dataset.
 
@@ -86,6 +87,7 @@ def run_dataset(
             tracer: An optional :class:`Tracer` to trace all the runs from each example
             num_examples: An optional int to specify how many examples from the dataset should be run.
                 Always the first n examples will be taken.
+            abort_on_error: Flag to abort all runs when an error occurs. Defaults to False.
 
         Returns:
             An overview of the run. Outputs will not be returned but instead stored in the
@@ -101,7 +103,11 @@ def run(
             try:
                 return example.id, self._task.run(example.input, evaluate_tracer)
             except Exception as e:
-                print(e)
+                if abort_on_error:
+                    raise e
+                print(
+                    f'FAILED RUN: example "{example.id}", {type(e).__qualname__}: "{e}"'
+                )
                 return example.id, FailedExampleRun.from_exception(e)
 
         # mypy does not like union types
@@ -144,6 +150,27 @@ def run(
         self._run_repository.store_run_overview(run_overview)
         return run_overview
 
+    def failed_runs(
+        self, run_id: str, expected_output_type: type[ExpectedOutput]
+    ) -> Iterable[RunLineage[Input, ExpectedOutput, Output]]:
+        """Returns the `RunLineage` objects for all failed example runs that belong to the given run ID.
+
+        Args:
+            run_id: The ID of the run overview
+            expected_output_type: Type of the expected output of the :class:`Example`s in the dataset
+
+        Returns:
+            :class:`Iterable` of :class:`RunLineage`s.
+        """
+        failed_example_outputs = self._run_repository.failed_example_outputs(
+            run_id, output_type=self.output_type()
+        )
+        lineages = (
+            self.run_lineage(run_id, output.example_id, expected_output_type)
+            for output in failed_example_outputs
+        )
+        return (lineage for lineage in lineages if lineage is not None)
+
     def run_lineages(
         self,
         run_id: str,
diff --git a/tests/evaluation/conftest.py b/tests/evaluation/conftest.py
index 95efdb87c..20ce8ef6d 100644
--- a/tests/evaluation/conftest.py
+++ b/tests/evaluation/conftest.py
@@ -60,6 +60,15 @@ class DummyAggregatedEvaluationWithResultList(BaseModel):
     results: Sequence[DummyEvaluation]
 
 
+@fixture
+def sequence_examples() -> Iterable[Example[str, None]]:
+    return [
+        Example(input="success", expected_output=None, id="example-1"),
+        Example(input=FAIL_IN_TASK_INPUT, expected_output=None, id="example-2"),
+        Example(input=FAIL_IN_EVAL_INPUT, expected_output=None, id="example-3"),
+    ]
+
+
 @fixture
 def evaluation_id() -> str:
     return "evaluation-id-1"
diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py
index 251860ec3..11188e249 100644
--- a/tests/evaluation/test_evaluator.py
+++ b/tests/evaluation/test_evaluator.py
@@ -1,5 +1,6 @@
 from typing import Generic, Iterable, Optional, TypeVar
 
+import pytest
 from pydantic import BaseModel
 from pytest import fixture
 
@@ -117,15 +118,6 @@ def do_run(self, input: str, tracer: Tracer):  # type: ignore
         return input
 
 
-@fixture
-def sequence_examples() -> Iterable[Example[str, None]]:
-    return [
-        Example(input="success", expected_output=None, id="example-1"),
-        Example(input=FAIL_IN_TASK_INPUT, expected_output=None, id="example-2"),
-        Example(input=FAIL_IN_EVAL_INPUT, expected_output=None, id="example-3"),
-    ]
-
-
 @fixture
 def sequence_good_examples() -> Iterable[Example[str, None]]:
     return [
@@ -255,6 +247,20 @@ def test_eval_and_aggregate_runs_returns_generic_statistics(
     assert aggregation_overview.failed_evaluation_count == 2
 
 
+def test_evaluator_aborts_on_error(
+    dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
+    dummy_aggregator: Aggregator[
+        DummyEvaluation, DummyAggregatedEvaluationWithResultList
+    ],
+    dummy_runner: Runner[str, str],
+    dataset_id: str,
+) -> None:
+    run_overview = dummy_runner.run_dataset(dataset_id)
+
+    with pytest.raises(RuntimeError):
+        dummy_evaluator.evaluate_runs(run_overview.id, abort_on_error=True)
+
+
 def test_eval_and_aggregate_runs_uses_passed_tracer(
     dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
     dummy_aggregator: Aggregator[
         DummyEvaluation, DummyAggregatedEvaluationWithResultList
     ],
@@ -274,39 +280,34 @@ def test_eval_and_aggregate_runs_uses_passed_tracer(
 
 
 def test_eval_and_aggregate_runs_stores_example_evaluations(
+    dummy_runner: Runner[str, str],
     dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
     dummy_aggregator: Aggregator[
         DummyEvaluation, DummyAggregatedEvaluationWithResultList
     ],
     dataset_id: str,
-    dummy_runner: Runner[str, str],
 ) -> None:
     evaluation_repository = dummy_evaluator._evaluation_repository
     dataset_repository = dummy_evaluator._dataset_repository
-    dataset_id = list(dataset_repository.dataset_ids())[0]
-    dataset: Optional[Iterable[Example[str, None]]] = dataset_repository.examples(
-        dataset_id, str, type(None)
-    )
-    assert dataset is not None
+    examples = list(dataset_repository.examples(dataset_id, str, type(None)))
 
     run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer())
     evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
     aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id)
+    assert next(iter(aggregation_overview.evaluation_overviews)) == evaluation_overview
 
-    examples = list(dataset)
-    eval_overview = next(iter(aggregation_overview.evaluation_overviews))
     success_result = evaluation_repository.example_evaluation(
-        eval_overview.id,
+        evaluation_overview.id,
         examples[0].id,
         DummyEvaluation,
     )
     failure_result_task = evaluation_repository.example_evaluation(
-        eval_overview.id,
+        evaluation_overview.id,
         examples[1].id,
         DummyEvaluation,
     )
     failure_result_eval = evaluation_repository.example_evaluation(
-        eval_overview.id,
+        evaluation_overview.id,
         examples[2].id,
         DummyEvaluation,
     )
@@ -318,6 +319,23 @@ def test_eval_and_aggregate_runs_stores_example_evaluations(
     )
 
 
+def test_failed_evaluations_returns_only_failed_evaluations(
+    dummy_runner: Runner[str, str],
+    dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
+    dataset_id: str,
+    sequence_examples: Iterable[Example[str, None]],
+) -> None:
+    run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer())
+    evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
+    failed_evaluations = list(
+        dummy_evaluator.failed_evaluations(evaluation_overview.id)
+    )
+
+    assert len(failed_evaluations) == 1
+    assert isinstance(failed_evaluations[0].evaluation.result, FailedExampleEvaluation)
+    assert failed_evaluations[0].example.id == list(sequence_examples)[-1].id
+
+
 def test_eval_and_aggregate_runs_stores_example_traces(
     dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
     dummy_aggregator: Aggregator[
diff --git a/tests/evaluation/test_run_repository.py b/tests/evaluation/test_run_repository.py
index d9edca43b..d931f1db8 100644
--- a/tests/evaluation/test_run_repository.py
+++ b/tests/evaluation/test_run_repository.py
@@ -19,6 +19,7 @@
     RunRepository,
     TaskSpanTrace,
 )
+from intelligence_layer.evaluation.run.domain import FailedExampleRun
 from tests.conftest import DummyStringInput
 
 test_repository_fixtures = [
@@ -249,3 +250,65 @@ def test_run_overview_ids_returns_all_sorted_ids(
     stored_run_overview_ids = list(run_repository.run_overview_ids())
 
     assert stored_run_overview_ids == sorted(run_overview_ids)
+
+
+@mark.parametrize(
+    "repository_fixture",
+    test_repository_fixtures,
+)
+def test_failed_example_outputs_returns_only_failed_examples(
+    repository_fixture: str, request: FixtureRequest, run_overview: RunOverview
+) -> None:
+    run_repository: RunRepository = request.getfixturevalue(repository_fixture)
+    run_repository.store_run_overview(run_overview)
+
+    run_repository.store_example_output(
+        ExampleOutput(
+            run_id=run_overview.id,
+            example_id="1",
+            output=FailedExampleRun(error_message="test"),
+        )
+    )
+    run_repository.store_example_output(
+        ExampleOutput(run_id=run_overview.id, example_id="2", output=None)
+    )
+
+    failed_outputs = list(
+        run_repository.failed_example_outputs(
+            run_id=run_overview.id, output_type=type(None)
+        )
+    )
+
+    assert len(failed_outputs) == 1
+    assert failed_outputs[0].example_id == "1"
+
+
+@mark.parametrize(
+    "repository_fixture",
+    test_repository_fixtures,
+)
+def test_successful_example_outputs_returns_only_successful_examples(
+    repository_fixture: str, request: FixtureRequest, run_overview: RunOverview
+) -> None:
+    run_repository: RunRepository = request.getfixturevalue(repository_fixture)
+    run_repository.store_run_overview(run_overview)
+
+    run_repository.store_example_output(
+        ExampleOutput(
+            run_id=run_overview.id,
+            example_id="1",
+            output=FailedExampleRun(error_message="test"),
+        )
+    )
+    run_repository.store_example_output(
+        ExampleOutput(run_id=run_overview.id, example_id="2", output=None)
+    )
+
+    successful_outputs = list(
+        run_repository.successful_example_outputs(
+            run_id=run_overview.id, output_type=type(None)
+        )
+    )
+
+    assert len(successful_outputs) == 1
+    assert successful_outputs[0].example_id == "2"
diff --git a/tests/evaluation/test_runner.py b/tests/evaluation/test_runner.py
index 55970d4d0..d07021b0b 100644
--- a/tests/evaluation/test_runner.py
+++ b/tests/evaluation/test_runner.py
@@ -1,3 +1,7 @@
+from typing import Iterable
+
+import pytest
+
 from intelligence_layer.core import InMemoryTracer
 from intelligence_layer.evaluation import (
     Example,
@@ -5,22 +9,19 @@
     InMemoryRunRepository,
     Runner,
 )
-from tests.evaluation.conftest import FAIL_IN_EVAL_INPUT, FAIL_IN_TASK_INPUT, DummyTask
+from tests.evaluation.conftest import FAIL_IN_TASK_INPUT, DummyTask
 
 
 def test_runner_runs_dataset(
     in_memory_dataset_repository: InMemoryDatasetRepository,
     in_memory_run_repository: InMemoryRunRepository,
+    sequence_examples: Iterable[Example[str, None]],
 ) -> None:
+    examples = list(sequence_examples)
     task = DummyTask()
    runner = Runner(
        task, in_memory_dataset_repository, in_memory_run_repository, "dummy-runner"
    )
-    examples = [
-        Example(input="success", expected_output=None),
-        Example(input=FAIL_IN_TASK_INPUT, expected_output=None),
-        Example(input=FAIL_IN_EVAL_INPUT, expected_output=None),
-    ]
 
     dataset_id = in_memory_dataset_repository.create_dataset(
         examples=examples, dataset_name="test-dataset"
@@ -36,6 +37,27 @@ def test_runner_runs_dataset(
         example.id for example in examples
     )
 
+    failed_runs = list(runner.failed_runs(overview.id, type(None)))
+    assert len(failed_runs) == 1
+    assert failed_runs[0].example.id == examples[1].id
+
+
+def test_runner_aborts_on_error(
+    in_memory_dataset_repository: InMemoryDatasetRepository,
+    in_memory_run_repository: InMemoryRunRepository,
+    sequence_examples: Iterable[Example[str, None]],
+) -> None:
+    task = DummyTask()
+    runner = Runner(
+        task, in_memory_dataset_repository, in_memory_run_repository, "dummy-runner"
+    )
+
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=sequence_examples, dataset_name="test-dataset"
+    ).id
+    with pytest.raises(RuntimeError):
+        runner.run_dataset(dataset_id, abort_on_error=True)
+
 
 def test_runner_runs_n_examples(
     in_memory_dataset_repository: InMemoryDatasetRepository,
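Reviewer note, not part of the patch: the sketch below shows how the pieces introduced in this diff are meant to fit together: the `abort_on_error` flag on `Runner.run_dataset` / `Evaluator.evaluate_runs`, and the `Runner.failed_runs` / `Evaluator.failed_evaluations` lineage helpers. It leans only on API that is visible in the diff itself; the `EchoTask` helper (modelled on the tests' `DummyTask`), the dataset name, and the assumption that `Task` and `Tracer` are importable from `intelligence_layer.core` are illustrative and not established by this change.

from typing import Any

from intelligence_layer.core import Task, Tracer
from intelligence_layer.evaluation import (
    Example,
    InMemoryDatasetRepository,
    InMemoryRunRepository,
    Runner,
)


class EchoTask(Task[str, str]):
    # Same shape as the DummyTask used in tests/evaluation/conftest.py.
    def do_run(self, input: str, tracer: Tracer) -> str:  # type: ignore
        return input


def demo_runner_failure_handling() -> None:
    dataset_repository = InMemoryDatasetRepository()
    run_repository = InMemoryRunRepository()
    runner = Runner(EchoTask(), dataset_repository, run_repository, "sketch-runner")

    dataset_id = dataset_repository.create_dataset(
        examples=[Example(input="success", expected_output=None)],
        dataset_name="sketch-dataset",
    ).id

    # Default behaviour: a failing example is printed, stored as a FailedExampleRun
    # (now including the stack trace), and the run continues.
    overview = runner.run_dataset(dataset_id)
    for lineage in runner.failed_runs(overview.id, type(None)):
        print(lineage.example.id)

    # Fail-fast behaviour: the first exception is re-raised instead of being stored.
    runner.run_dataset(dataset_id, abort_on_error=True)


def demo_evaluator_failure_handling(evaluator: Any, run_id: str) -> None:
    # `evaluator` is assumed to be an already configured Evaluator instance;
    # its concrete wiring is outside the scope of this diff.
    evaluation_overview = evaluator.evaluate_runs(run_id)  # or abort_on_error=True
    for lineage in evaluator.failed_evaluations(evaluation_overview.id):
        print(lineage.example.id, lineage.evaluation.result.error_message)


if __name__ == "__main__":
    demo_runner_failure_handling()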