diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd33839ca..f7b8b9e40 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,16 @@
 # Changelog
 
+## 0.8.0
+
+### Breaking Changes
+
+### New Features
+- feature: Error information is printed to the console on failed runs and evaluations.
+- feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object.
+- feature: `Runner.run_dataset` and `Evaluator.evaluate_runs` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
+
+### Fixes
+
 ## 0.7.0
 
 ### Breaking Changes
diff --git a/src/intelligence_layer/evaluation/evaluation/domain.py b/src/intelligence_layer/evaluation/evaluation/domain.py
index 187865155..e4cd7d254 100644
--- a/src/intelligence_layer/evaluation/evaluation/domain.py
+++ b/src/intelligence_layer/evaluation/evaluation/domain.py
@@ -1,3 +1,4 @@
+import traceback
 from datetime import datetime
 from typing import Generic, Optional, TypeVar
 
@@ -22,7 +23,7 @@ class FailedExampleEvaluation(BaseModel):
     @staticmethod
     def from_exception(exception: Exception) -> "FailedExampleEvaluation":
         return FailedExampleEvaluation(
-            error_message=f"{type(exception)}: {str(exception)}"
+            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
         )
 
 
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator.py
index 819299b6a..d9747102d 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator.py
@@ -222,7 +222,10 @@ def evaluation_type(self) -> type[Evaluation]:
 
     @final
     def evaluate_runs(
-        self, *run_ids: str, num_examples: Optional[int] = None
+        self,
+        *run_ids: str,
+        num_examples: Optional[int] = None,
+        abort_on_error: bool = False,
     ) -> EvaluationOverview:
         """Evaluates all generated outputs in the run.
 
@@ -239,6 +242,7 @@
                 specific evaluation. The method compares all run of the provided ids to each other.
             num_examples: The number of examples which should be evaluated from the given runs.
                 Always the first n runs stored in the evaluation repository
+            abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
 
         Returns:
             EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -339,7 +343,7 @@ def evaluate(
             ],
         ) -> None:
             example, eval_id, example_outputs = args
-            self.evaluate(example, eval_id, *example_outputs)
+            self.evaluate(example, eval_id, abort_on_error, *example_outputs)
 
         with ThreadPoolExecutor(max_workers=10) as executor:
             tqdm(
@@ -362,6 +366,7 @@ def evaluate(
         self,
         example: Example[Input, ExpectedOutput],
         evaluation_id: str,
+        abort_on_error: bool,
         *example_outputs: SuccessfulExampleOutput[Output],
     ) -> None:
         try:
@@ -372,6 +377,11 @@ def evaluate(
                 )
             )
         except Exception as e:
+            if abort_on_error:
+                raise e
+            print(
+                f'FAILED EVALUATION: example {example.id}, {type(e).__qualname__}: "{e}"'
+            )
             result = FailedExampleEvaluation.from_exception(e)
             self._evaluation_repository.store_example_evaluation(
                 ExampleEvaluation(
diff --git a/src/intelligence_layer/evaluation/run/domain.py b/src/intelligence_layer/evaluation/run/domain.py
index 446487be4..5fbcbb60b 100644
--- a/src/intelligence_layer/evaluation/run/domain.py
+++ b/src/intelligence_layer/evaluation/run/domain.py
@@ -1,3 +1,4 @@
+import traceback
 from datetime import datetime
 from typing import Generic
 
@@ -18,7 +19,9 @@ class FailedExampleRun(BaseModel):
 
     @staticmethod
     def from_exception(exception: Exception) -> "FailedExampleRun":
-        return FailedExampleRun(error_message=f"{type(exception)}: {str(exception)}")
+        return FailedExampleRun(
+            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
+        )
 
 
 class ExampleOutput(BaseModel, Generic[Output]):
diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py
index ee6b95122..f817902ef 100644
--- a/src/intelligence_layer/evaluation/run/runner.py
+++ b/src/intelligence_layer/evaluation/run/runner.py
@@ -75,6 +75,7 @@ def run_dataset(
         dataset_id: str,
         tracer: Optional[Tracer] = None,
         num_examples: Optional[int] = None,
+        abort_on_error: bool = False,
     ) -> RunOverview:
         """Generates all outputs for the provided dataset.
 
@@ -86,6 +87,7 @@ def run_dataset(
             tracer: An optional :class:`Tracer` to trace all the runs from each example
             num_examples: An optional int to specify how many examples from the dataset should be run.
                 Always the first n examples will be taken.
+            abort_on_error: Flag to abort the whole run when an error occurs. Defaults to False.
 
         Returns:
             An overview of the run. Outputs will not be returned but instead stored in the
@@ -101,7 +103,11 @@ def run(
             try:
                 return example.id, self._task.run(example.input, evaluate_tracer)
             except Exception as e:
-                print(e)
+                if abort_on_error:
+                    raise e
+                print(
+                    f'FAILED RUN: example {example.id}, {type(e).__qualname__}: "{e}"'
+                )
                 return example.id, FailedExampleRun.from_exception(e)
 
         # mypy does not like union types
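Below is a minimal, hypothetical usage sketch of the new `abort_on_error` flag. It assumes an already-constructed `Runner` and `Evaluator` (their constructors are not part of this diff), that both classes are importable from the modules changed here, and that `RunOverview` exposes the run id as `id`; only the `run_dataset`/`evaluate_runs` signatures shown above come from the change itself.

```python
from intelligence_layer.evaluation.evaluation.evaluator import Evaluator
from intelligence_layer.evaluation.run.runner import Runner


def run_and_evaluate(runner: Runner, evaluator: Evaluator, dataset_id: str) -> None:
    # Default behaviour (abort_on_error=False): a failing example is reported on the
    # console, stored as a FailedExampleRun / FailedExampleEvaluation (now carrying
    # the full stack trace), and processing continues with the next example.
    run_overview = runner.run_dataset(dataset_id)
    evaluator.evaluate_runs(run_overview.id)  # assumes RunOverview.id holds the run id

    # Fail-fast behaviour: the first exception is re-raised, aborting the whole
    # run or evaluation immediately.
    run_overview = runner.run_dataset(dataset_id, abort_on_error=True)
    evaluator.evaluate_runs(run_overview.id, abort_on_error=True)
```

Note that the richer `error_message` relies on `traceback.format_exc()`, which only returns a stack trace for the exception currently being handled; this works because `from_exception` is called inside the `except` block that caught the error.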