feat: IL-405 print info on failed runs and evaluations to console and `FailedExample...`

The user is now informed when the run/evaluation pipeline crashes for individual examples.
* Information on failed runs and evaluations is printed to the console.
* The stack trace is stored in the `FailedExampleRun`/`FailedExampleEvaluation` object.
* The `Runner` and `Evaluator` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs (see the usage sketch below).
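A hedged usage sketch of the new flag. Constructor arguments and the `run_overview.id` field are placeholders for illustration; they are not part of this commit:

```python
# Hypothetical usage; Runner/Evaluator construction is elided because their
# constructors are not part of this diff.
runner = Runner(...)        # placeholder construction
evaluator = Evaluator(...)  # placeholder construction

# Default (abort_on_error=False): failures are printed and recorded per
# example, and the remaining examples still run.
run_overview = runner.run_dataset("my-dataset-id", abort_on_error=False)

# Fail fast: the first exception is re-raised and aborts the whole call.
evaluation_overview = evaluator.evaluate_runs(run_overview.id, abort_on_error=True)
```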
FelixFehseTNG committed Apr 3, 2024
1 parent 65f4052 commit 877e916
Showing 5 changed files with 36 additions and 5 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,16 @@
 # Changelog
 
+## 0.8.0
+
+### Breaking Changes
+
+### New Features
+- feature: Error information is printed to the console on failed runs and evaluations.
+- feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object.
+- feature: The `Runner.run_dataset` and `Evaluator.evaluate_runs` methods have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
+
+### Fixes
+
 ## 0.7.0
 
 ### Breaking Changes
3 changes: 2 additions & 1 deletion src/intelligence_layer/evaluation/evaluation/domain.py
@@ -1,3 +1,4 @@
+import traceback
 from datetime import datetime
 from typing import Generic, Optional, TypeVar
 
@@ -22,7 +23,7 @@ class FailedExampleEvaluation(BaseModel):
     @staticmethod
     def from_exception(exception: Exception) -> "FailedExampleEvaluation":
         return FailedExampleEvaluation(
-            error_message=f"{type(exception)}: {str(exception)}"
+            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
         )


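Worth noting: `traceback.format_exc()` formats the exception *currently being handled*, so `from_exception` only captures a stack trace when called inside an `except` block, which both call sites in this commit do. A minimal runnable sketch, assuming `BaseModel` in `domain.py` is pydantic's:

```python
import traceback

from pydantic import BaseModel  # assumption: domain.py uses pydantic models


class FailedExampleEvaluation(BaseModel):
    error_message: str

    @staticmethod
    def from_exception(exception: Exception) -> "FailedExampleEvaluation":
        # format_exc() renders the exception currently being handled, so this
        # helper must run inside an `except` block to include a stack trace.
        return FailedExampleEvaluation(
            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
        )


try:
    raise ValueError("bad model output")
except Exception as e:
    failed = FailedExampleEvaluation.from_exception(e)
    print(failed.error_message)  # error message followed by the full stack trace
```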
14 changes: 12 additions & 2 deletions src/intelligence_layer/evaluation/evaluation/evaluator.py
@@ -222,7 +222,10 @@ def evaluation_type(self) -> type[Evaluation]:
 
     @final
     def evaluate_runs(
-        self, *run_ids: str, num_examples: Optional[int] = None
+        self,
+        *run_ids: str,
+        num_examples: Optional[int] = None,
+        abort_on_error: bool = False,
     ) -> EvaluationOverview:
         """Evaluates all generated outputs in the run.
 
@@ -239,6 +242,7 @@ def evaluate_runs(
                 specific evaluation. The method compares all runs of the provided ids to each other.
             num_examples: The number of examples which should be evaluated from the given runs.
                 Always the first n runs stored in the evaluation repository
+            abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
 
         Returns:
             EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -339,7 +343,7 @@ def evaluate(
             ],
         ) -> None:
             example, eval_id, example_outputs = args
-            self.evaluate(example, eval_id, *example_outputs)
+            self.evaluate(example, eval_id, abort_on_error, *example_outputs)
 
         with ThreadPoolExecutor(max_workers=10) as executor:
             tqdm(
@@ -362,6 +366,7 @@ def evaluate(
         self,
         example: Example[Input, ExpectedOutput],
         evaluation_id: str,
+        abort_on_error: bool,
         *example_outputs: SuccessfulExampleOutput[Output],
     ) -> None:
         try:
@@ -372,6 +377,11 @@ def evaluate(
                 )
             )
         except Exception as e:
+            if abort_on_error:
+                raise e
+            print(
+                f'FAILED EVALUATION: example {example.id}, {type(e).__qualname__}: "{e}"'
+            )
             result = FailedExampleEvaluation.from_exception(e)
             self._evaluation_repository.store_example_evaluation(
                 ExampleEvaluation(
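The pattern above, extracted into a standalone sketch. Here `evaluate_one` and `store_failure` are stand-ins for `Evaluator.evaluate` and `store_example_evaluation`, not real APIs:

```python
# Standalone sketch of the fail-fast vs. record-and-continue pattern.
def store_failure(example_id: str, error: Exception) -> None:
    ...  # stand-in for self._evaluation_repository.store_example_evaluation(...)


def evaluate_one(example_id: str, abort_on_error: bool) -> None:
    try:
        raise RuntimeError("evaluation logic failed")  # stand-in for real work
    except Exception as e:
        if abort_on_error:
            # Propagate the exception; this aborts the surrounding
            # evaluate_runs call on the first failure.
            raise e
        # Otherwise report to the console, persist the failure, and let the
        # remaining examples continue.
        print(f'FAILED EVALUATION: example {example_id}, {type(e).__qualname__}: "{e}"')
        store_failure(example_id, e)


evaluate_one("example-1", abort_on_error=False)  # prints, records, continues
```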
5 changes: 4 additions & 1 deletion src/intelligence_layer/evaluation/run/domain.py
@@ -1,3 +1,4 @@
+import traceback
 from datetime import datetime
 from typing import Generic
 
@@ -18,7 +19,9 @@ class FailedExampleRun(BaseModel):
 
     @staticmethod
     def from_exception(exception: Exception) -> "FailedExampleRun":
-        return FailedExampleRun(error_message=f"{type(exception)}: {str(exception)}")
+        return FailedExampleRun(
+            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
+        )
 
 
 class ExampleOutput(BaseModel, Generic[Output]):
8 changes: 7 additions & 1 deletion src/intelligence_layer/evaluation/run/runner.py
@@ -75,6 +75,7 @@ def run_dataset(
         dataset_id: str,
         tracer: Optional[Tracer] = None,
         num_examples: Optional[int] = None,
+        abort_on_error: bool = False,
     ) -> RunOverview:
         """Generates all outputs for the provided dataset.
 
@@ -86,6 +87,7 @@ def run_dataset(
             tracer: An optional :class:`Tracer` to trace all the runs from each example
             num_examples: An optional int to specify how many examples from the dataset should be run.
                 Always the first n examples will be taken.
+            abort_on_error: Flag to abort all runs when an error occurs. Defaults to False.
 
         Returns:
             An overview of the run. Outputs will not be returned but instead stored in the
@@ -101,7 +103,11 @@ def run(
             try:
                 return example.id, self._task.run(example.input, evaluate_tracer)
             except Exception as e:
-                print(e)
+                if abort_on_error:
+                    raise e
+                print(
+                    f'FAILED RUN: example {example.id}, {type(e).__qualname__}: "{e}"'
+                )
                 return example.id, FailedExampleRun.from_exception(e)
 
         # mypy does not like union types
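One detail worth calling out: the new messages use `type(e).__qualname__`, which yields a readable class name (including any enclosing class), whereas the old `print(e)` dropped the exception type entirely. A small self-contained demo of the resulting console format:

```python
# Demonstrates the console format used by the new failure messages.
class Parser:
    class ParseError(Exception):
        pass


example_id = "example-123"  # illustrative id

try:
    raise Parser.ParseError("unexpected token")
except Exception as e:
    # __qualname__ includes the enclosing class, e.g. 'Parser.ParseError'.
    print(f'FAILED RUN: example {example_id}, {type(e).__qualname__}: "{e}"')
    # -> FAILED RUN: example example-123, Parser.ParseError: "unexpected token"
```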
