feat: IL-405 print info on failed runs and evaluations to console and `FailedExample...`

The user is now informed when the run/evaluation pipeline crashes for individual examples.
* Information on failed runs and evaluations is printed to the console.
* The stack trace is stored in the `FailedExampleRun`/`FailedExampleEvaluation` object.
* The `Runner` and `Evaluator` have an optional flag `abort_on_error` to stop running/evaluating when an error occurs (see the usage sketch below).
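A hedged usage sketch of the new flag. Constructor arguments and the `run_overview.id` field are placeholders for illustration; they are not part of this commit:

```python
# Hypothetical usage; Runner/Evaluator construction is elided because their
# constructors are not part of this diff.
runner = Runner(...)        # placeholder construction
evaluator = Evaluator(...)  # placeholder construction

# Default (abort_on_error=False): failures are printed and recorded per
# example, and the remaining examples still run.
run_overview = runner.run_dataset("my-dataset-id", abort_on_error=False)

# Fail fast: the first exception is re-raised and aborts the whole call.
evaluation_overview = evaluator.evaluate_runs(run_overview.id, abort_on_error=True)
```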
FelixFehseTNG committed Apr 3, 2024
1 parent 65f4052 commit 877e916
Showing 5 changed files with 36 additions and 5 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,16 @@
 # Changelog
 
+## 0.8.0
+
+### Breaking Changes
+
+### New Features
+- feature: Error information is printed to the console on failed runs and evaluations.
+- feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object.
+- feature: The `Runner.run_dataset` and `Evaluator.evaluate_runs` methods have an optional flag `abort_on_error` to stop running/evaluating when an error occurs.
+
+### Fixes
+
 ## 0.7.0
 
 ### Breaking Changes
3 changes: 2 additions & 1 deletion src/intelligence_layer/evaluation/evaluation/domain.py
@@ -1,3 +1,4 @@
+import traceback
 from datetime import datetime
 from typing import Generic, Optional, TypeVar
 
@@ -22,7 +23,7 @@ class FailedExampleEvaluation(BaseModel):
     @staticmethod
     def from_exception(exception: Exception) -> "FailedExampleEvaluation":
         return FailedExampleEvaluation(
-            error_message=f"{type(exception)}: {str(exception)}"
+            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
         )


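Worth noting: `traceback.format_exc()` formats the exception *currently being handled*, so `from_exception` only captures a stack trace when called inside an `except` block, which both call sites in this commit do. A minimal runnable sketch, assuming `BaseModel` in `domain.py` is pydantic's:

```python
import traceback

from pydantic import BaseModel  # assumption: domain.py uses pydantic models


class FailedExampleEvaluation(BaseModel):
    error_message: str

    @staticmethod
    def from_exception(exception: Exception) -> "FailedExampleEvaluation":
        # format_exc() renders the exception currently being handled, so this
        # helper must run inside an `except` block to include a stack trace.
        return FailedExampleEvaluation(
            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
        )


try:
    raise ValueError("bad model output")
except Exception as e:
    failed = FailedExampleEvaluation.from_exception(e)
    print(failed.error_message)  # error message followed by the full stack trace
```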
14 changes: 12 additions & 2 deletions src/intelligence_layer/evaluation/evaluation/evaluator.py
@@ -222,7 +222,10 @@ def evaluation_type(self) -> type[Evaluation]:
 
     @final
     def evaluate_runs(
-        self, *run_ids: str, num_examples: Optional[int] = None
+        self,
+        *run_ids: str,
+        num_examples: Optional[int] = None,
+        abort_on_error: bool = False,
     ) -> EvaluationOverview:
         """Evaluates all generated outputs in the run.
 
@@ -239,6 +242,7 @@ def evaluate_runs(
                 specific evaluation. The method compares all runs of the provided ids to each other.
             num_examples: The number of examples which should be evaluated from the given runs.
                 Always the first n runs stored in the evaluation repository
+            abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
 
         Returns:
             EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -339,7 +343,7 @@ def evaluate(
             ],
         ) -> None:
             example, eval_id, example_outputs = args
-            self.evaluate(example, eval_id, *example_outputs)
+            self.evaluate(example, eval_id, abort_on_error, *example_outputs)
 
         with ThreadPoolExecutor(max_workers=10) as executor:
             tqdm(
@@ -362,6 +366,7 @@ def evaluate(
         self,
         example: Example[Input, ExpectedOutput],
         evaluation_id: str,
+        abort_on_error: bool,
         *example_outputs: SuccessfulExampleOutput[Output],
     ) -> None:
         try:
@@ -372,6 +377,11 @@ def evaluate(
                 )
             )
         except Exception as e:
+            if abort_on_error:
+                raise e
+            print(
+                f'FAILED EVALUATION: example {example.id}, {type(e).__qualname__}: "{e}"'
+            )
             result = FailedExampleEvaluation.from_exception(e)
             self._evaluation_repository.store_example_evaluation(
                 ExampleEvaluation(
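The pattern above, extracted into a standalone sketch. Here `evaluate_one` and `store_failure` are stand-ins for `Evaluator.evaluate` and `store_example_evaluation`, not real APIs:

```python
# Standalone sketch of the fail-fast vs. record-and-continue pattern.
def store_failure(example_id: str, error: Exception) -> None:
    ...  # stand-in for self._evaluation_repository.store_example_evaluation(...)


def evaluate_one(example_id: str, abort_on_error: bool) -> None:
    try:
        raise RuntimeError("evaluation logic failed")  # stand-in for real work
    except Exception as e:
        if abort_on_error:
            # Propagate the exception; this aborts the surrounding
            # evaluate_runs call on the first failure.
            raise e
        # Otherwise report to the console, persist the failure, and let the
        # remaining examples continue.
        print(f'FAILED EVALUATION: example {example_id}, {type(e).__qualname__}: "{e}"')
        store_failure(example_id, e)


evaluate_one("example-1", abort_on_error=False)  # prints, records, continues
```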
5 changes: 4 additions & 1 deletion src/intelligence_layer/evaluation/run/domain.py
@@ -1,3 +1,4 @@
+import traceback
 from datetime import datetime
 from typing import Generic
 
@@ -18,7 +19,9 @@ class FailedExampleRun(BaseModel):
 
     @staticmethod
     def from_exception(exception: Exception) -> "FailedExampleRun":
-        return FailedExampleRun(error_message=f"{type(exception)}: {str(exception)}")
+        return FailedExampleRun(
+            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
+        )
 
 
 class ExampleOutput(BaseModel, Generic[Output]):
8 changes: 7 additions & 1 deletion src/intelligence_layer/evaluation/run/runner.py
@@ -75,6 +75,7 @@ def run_dataset(
         dataset_id: str,
         tracer: Optional[Tracer] = None,
         num_examples: Optional[int] = None,
+        abort_on_error: bool = False,
     ) -> RunOverview:
         """Generates all outputs for the provided dataset.
 
@@ -86,6 +87,7 @@ def run_dataset(
             tracer: An optional :class:`Tracer` to trace all the runs from each example
             num_examples: An optional int to specify how many examples from the dataset should be run.
                 Always the first n examples will be taken.
+            abort_on_error: Flag to abort all runs when an error occurs. Defaults to False.
 
         Returns:
             An overview of the run. Outputs will not be returned but instead stored in the
@@ -101,7 +103,11 @@ def run(
             try:
                 return example.id, self._task.run(example.input, evaluate_tracer)
             except Exception as e:
-                print(e)
+                if abort_on_error:
+                    raise e
+                print(
+                    f'FAILED RUN: example {example.id}, {type(e).__qualname__}: "{e}"'
+                )
                 return example.id, FailedExampleRun.from_exception(e)
 
         # mypy does not like union types
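One detail worth calling out: the new messages use `type(e).__qualname__`, which yields a readable class name (including any enclosing class), whereas the old `print(e)` dropped the exception type entirely. A small self-contained demo of the resulting console format:

```python
# Demonstrates the console format used by the new failure messages.
class Parser:
    class ParseError(Exception):
        pass


example_id = "example-123"  # illustrative id

try:
    raise Parser.ParseError("unexpected token")
except Exception as e:
    # __qualname__ includes the enclosing class, e.g. 'Parser.ParseError'.
    print(f'FAILED RUN: example {example_id}, {type(e).__qualname__}: "{e}"')
    # -> FAILED RUN: example example-123, Parser.ParseError: "unexpected token"
```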
