diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd33839ca..c6bb4f600 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # Changelog
 
+## 0.8.0
+
+### Breaking Changes
+
+### New Features
+- feature: Error information is printed to the console on failed runs and evaluations.
+- feature: The stack trace of a failed run/evaluation is included in the `FailedExampleRun`/`FailedExampleEvaluation` object.
+- feature: `Runner.run_dataset` and `Evaluator.evaluate_runs` accept an optional `abort_on_error` flag to stop running/evaluating when an error occurs.
+- feature: Added `Runner.failed_runs` and `Evaluator.failed_evaluations` to retrieve the lineages of all failed runs/evaluations.
+- feature: Added `.successful_example_outputs` and `.failed_example_outputs` to `RunRepository` to match the evaluation repository.
+
+### Fixes
+
 ## 0.7.0
 
 ### Breaking Changes
diff --git a/src/intelligence_layer/evaluation/evaluation/domain.py b/src/intelligence_layer/evaluation/evaluation/domain.py
index 187865155..e4cd7d254 100644
--- a/src/intelligence_layer/evaluation/evaluation/domain.py
+++ b/src/intelligence_layer/evaluation/evaluation/domain.py
@@ -1,3 +1,4 @@
+import traceback
 from datetime import datetime
 from typing import Generic, Optional, TypeVar
 
@@ -22,7 +23,7 @@ class FailedExampleEvaluation(BaseModel):
 
     @staticmethod
     def from_exception(exception: Exception) -> "FailedExampleEvaluation":
         return FailedExampleEvaluation(
-            error_message=f"{type(exception)}: {str(exception)}"
+            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
         )
 
diff --git a/src/intelligence_layer/evaluation/evaluation/evaluator.py b/src/intelligence_layer/evaluation/evaluation/evaluator.py
index 819299b6a..4315556d8 100644
--- a/src/intelligence_layer/evaluation/evaluation/evaluator.py
+++ b/src/intelligence_layer/evaluation/evaluation/evaluator.py
@@ -1,3 +1,4 @@
+import typing
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor
 from functools import lru_cache
@@ -222,7 +223,10 @@ def evaluation_type(self) -> type[Evaluation]:
 
     @final
     def evaluate_runs(
-        self, *run_ids: str, num_examples: Optional[int] = None
+        self,
+        *run_ids: str,
+        num_examples: Optional[int] = None,
+        abort_on_error: bool = False,
     ) -> EvaluationOverview:
         """Evaluates all generated outputs in the run.
 
@@ -239,6 +243,7 @@ def evaluate_runs(
                 specific evaluation. The method compares all run of the provided ids to each other.
             num_examples: The number of examples which should be evaluated from the given runs.
                 Always the first n runs stored in the evaluation repository
+            abort_on_error: Flag to abort all evaluations when an error occurs. Defaults to False.
 
         Returns:
             EvaluationOverview: An overview of the evaluation. Individual :class:`Evaluation`s will not be
@@ -293,7 +298,7 @@ def generate_evaluation_inputs() -> Iterable[
             current_example = 0
             for example_outputs in examples_zipped:
                 successful_example_outputs = [
-                    output
+                    typing.cast(SuccessfulExampleOutput[Output], output)
                     for output in example_outputs
                     if not isinstance(output.output, FailedExampleRun)
                 ]
@@ -320,31 +325,19 @@ def generate_evaluation_inputs() -> Iterable[
                 yield (
                     example,
                     eval_id,
-                    [
-                        SuccessfulExampleOutput(
-                            run_id=example_output.run_id,
-                            example_id=example_output.example_id,
-                            output=example_output.output,
-                        )
-                        for example_output in successful_example_outputs
-                        if not isinstance(example_output.output, FailedExampleRun)
-                    ],
+                    successful_example_outputs,
                 )
 
-        def evaluate(
-            args: Tuple[
-                Example[Input, ExpectedOutput],
-                str,
-                Sequence[SuccessfulExampleOutput[Output]],
-            ],
-        ) -> None:
-            example, eval_id, example_outputs = args
-            self.evaluate(example, eval_id, *example_outputs)
-
         with ThreadPoolExecutor(max_workers=10) as executor:
-            tqdm(
-                executor.map(evaluate, generate_evaluation_inputs()),
-                desc="Evaluating",
+            list(  # the list is needed to consume the iterator returned by executor.map
+                tqdm(
+                    executor.map(
+                        lambda args: self.evaluate(
+                            args[0], args[1], abort_on_error, *args[2]
+                        ),
+                        generate_evaluation_inputs(),
+                    )
+                )
             )
 
         partial_overview = EvaluationOverview(
@@ -362,6 +355,7 @@ def evaluate(
         self,
         example: Example[Input, ExpectedOutput],
         evaluation_id: str,
+        abort_on_error: bool,
         *example_outputs: SuccessfulExampleOutput[Output],
     ) -> None:
         try:
@@ -372,6 +366,11 @@ def evaluate(
                 )
             )
         except Exception as e:
+            if abort_on_error:
+                raise e
+            print(
+                f'FAILED EVALUATION: example "{example.id}", {type(e).__qualname__}: "{e}"'
+            )
             result = FailedExampleEvaluation.from_exception(e)
             self._evaluation_repository.store_example_evaluation(
                 ExampleEvaluation(
@@ -379,6 +378,28 @@ def evaluate(
                 )
             )
 
+    def failed_evaluations(
+        self, evaluation_id: str
+    ) -> Iterable[EvaluationLineage[Input, ExpectedOutput, Output, Evaluation]]:
+        """Returns the `EvaluationLineage` objects for all failed example evaluations that belong to the given evaluation ID.
+
+        Args:
+            evaluation_id: The ID of the evaluation overview
+
+        Returns:
+            :class:`Iterable` of :class:`EvaluationLineage`s.
+        """
+        failed_example_evaluations = (
+            self._evaluation_repository.failed_example_evaluations(
+                evaluation_id, evaluation_type=self.evaluation_type()
+            )
+        )
+        lineages = (
+            self.evaluation_lineage(evaluation_id, output.example_id)
+            for output in failed_example_evaluations
+        )
+        return (lineage for lineage in lineages if lineage is not None)
+
     def evaluation_lineages(
         self, evaluation_id: str
     ) -> Iterable[EvaluationLineage[Input, ExpectedOutput, Output, Evaluation]]:
diff --git a/src/intelligence_layer/evaluation/run/domain.py b/src/intelligence_layer/evaluation/run/domain.py
index 446487be4..5fbcbb60b 100644
--- a/src/intelligence_layer/evaluation/run/domain.py
+++ b/src/intelligence_layer/evaluation/run/domain.py
@@ -1,3 +1,4 @@
+import traceback
 from datetime import datetime
 from typing import Generic
 
@@ -18,7 +19,9 @@ class FailedExampleRun(BaseModel):
 
     @staticmethod
     def from_exception(exception: Exception) -> "FailedExampleRun":
-        return FailedExampleRun(error_message=f"{type(exception)}: {str(exception)}")
+        return FailedExampleRun(
+            error_message=f"{type(exception)}: {str(exception)}\n{traceback.format_exc()}"
+        )
 
 
 class ExampleOutput(BaseModel, Generic[Output]):
diff --git a/src/intelligence_layer/evaluation/run/run_repository.py b/src/intelligence_layer/evaluation/run/run_repository.py
index 7f7b9bcfb..54292f832 100644
--- a/src/intelligence_layer/evaluation/run/run_repository.py
+++ b/src/intelligence_layer/evaluation/run/run_repository.py
@@ -2,7 +2,11 @@
 from typing import Iterable, Optional, Sequence
 
 from intelligence_layer.core import Output, Tracer
-from intelligence_layer.evaluation.run.domain import ExampleOutput, RunOverview
+from intelligence_layer.evaluation.run.domain import (
+    ExampleOutput,
+    FailedExampleRun,
+    RunOverview,
+)
 from intelligence_layer.evaluation.run.trace import ExampleTrace
 
 
@@ -132,3 +136,33 @@ def example_output_ids(self, run_id: str) -> Sequence[str]:
             A :class:`Sequence` of all :class:`ExampleOutput` IDs.
         """
         ...
+
+    def successful_example_outputs(
+        self, run_id: str, output_type: type[Output]
+    ) -> Iterable[ExampleOutput[Output]]:
+        """Returns all :class:`ExampleOutput` for successful example runs with a given run-overview ID sorted by their example ID.
+
+        Args:
+            run_id: The ID of the run overview.
+            output_type: Type of output that the `Task` returned in :func:`Task.do_run`
+
+        Returns:
+            :class:`Iterable` of :class:`ExampleOutput`s.
+        """
+        results = self.example_outputs(run_id, output_type)
+        return (r for r in results if not isinstance(r.output, FailedExampleRun))
+
+    def failed_example_outputs(
+        self, run_id: str, output_type: type[Output]
+    ) -> Iterable[ExampleOutput[Output]]:
+        """Returns all :class:`ExampleOutput` for failed example runs with a given run-overview ID sorted by their example ID.
+
+        Args:
+            run_id: The ID of the run overview.
+            output_type: Type of output that the `Task` returned in :func:`Task.do_run`
+
+        Returns:
+            :class:`Iterable` of :class:`ExampleOutput`s.
+        """
+        results = self.example_outputs(run_id, output_type)
+        return (r for r in results if isinstance(r.output, FailedExampleRun))
diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py
index ee6b95122..b1e919ed5 100644
--- a/src/intelligence_layer/evaluation/run/runner.py
+++ b/src/intelligence_layer/evaluation/run/runner.py
@@ -75,6 +75,7 @@ def run_dataset(
         dataset_id: str,
         tracer: Optional[Tracer] = None,
         num_examples: Optional[int] = None,
+        abort_on_error: bool = False,
     ) -> RunOverview:
         """Generates all outputs for the provided dataset.
 
@@ -86,6 +87,7 @@ def run_dataset(
             tracer: An optional :class:`Tracer` to trace all the runs from each example
             num_examples: An optional int to specify how many examples from the dataset should be run.
                 Always the first n examples will be taken.
+            abort_on_error: Flag to abort all runs when an error occurs. Defaults to False.
 
         Returns:
             An overview of the run. Outputs will not be returned but instead stored in the
@@ -101,7 +103,11 @@ def run(
             try:
                 return example.id, self._task.run(example.input, evaluate_tracer)
             except Exception as e:
-                print(e)
+                if abort_on_error:
+                    raise e
+                print(
+                    f'FAILED RUN: example "{example.id}", {type(e).__qualname__}: "{e}"'
+                )
                 return example.id, FailedExampleRun.from_exception(e)
 
         # mypy does not like union types
@@ -144,6 +150,27 @@ def run(
         self._run_repository.store_run_overview(run_overview)
         return run_overview
 
+    def failed_runs(
+        self, run_id: str, expected_output_type: type[ExpectedOutput]
+    ) -> Iterable[RunLineage[Input, ExpectedOutput, Output]]:
+        """Returns the `RunLineage` objects for all failed example runs that belong to the given run ID.
+
+        Args:
+            run_id: The ID of the run overview
+            expected_output_type: Type of the expected output of the :class:`Example`s in the dataset
+
+        Returns:
+            :class:`Iterable` of :class:`RunLineage`s.
+        """
+        failed_example_outputs = self._run_repository.failed_example_outputs(
+            run_id, output_type=self.output_type()
+        )
+        lineages = (
+            self.run_lineage(run_id, output.example_id, expected_output_type)
+            for output in failed_example_outputs
+        )
+        return (lineage for lineage in lineages if lineage is not None)
+
     def run_lineages(
         self,
         run_id: str,
diff --git a/tests/evaluation/conftest.py b/tests/evaluation/conftest.py
index 95efdb87c..20ce8ef6d 100644
--- a/tests/evaluation/conftest.py
+++ b/tests/evaluation/conftest.py
@@ -60,6 +60,15 @@ class DummyAggregatedEvaluationWithResultList(BaseModel):
     results: Sequence[DummyEvaluation]
 
 
+@fixture
+def sequence_examples() -> Iterable[Example[str, None]]:
+    return [
+        Example(input="success", expected_output=None, id="example-1"),
+        Example(input=FAIL_IN_TASK_INPUT, expected_output=None, id="example-2"),
+        Example(input=FAIL_IN_EVAL_INPUT, expected_output=None, id="example-3"),
+    ]
+
+
 @fixture
 def evaluation_id() -> str:
     return "evaluation-id-1"
diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py
index 251860ec3..11188e249 100644
--- a/tests/evaluation/test_evaluator.py
+++ b/tests/evaluation/test_evaluator.py
@@ -1,5 +1,6 @@
 from typing import Generic, Iterable, Optional, TypeVar
 
+import pytest
 from pydantic import BaseModel
 from pytest import fixture
 
@@ -117,15 +118,6 @@ def do_run(self, input: str, tracer: Tracer):  # type: ignore
         return input
 
 
-@fixture
-def sequence_examples() -> Iterable[Example[str, None]]:
-    return [
-        Example(input="success", expected_output=None, id="example-1"),
-        Example(input=FAIL_IN_TASK_INPUT, expected_output=None, id="example-2"),
-        Example(input=FAIL_IN_EVAL_INPUT, expected_output=None, id="example-3"),
-    ]
-
-
 @fixture
 def sequence_good_examples() -> Iterable[Example[str, None]]:
     return [
@@ -255,6 +247,20 @@ def test_eval_and_aggregate_runs_returns_generic_statistics(
     assert aggregation_overview.failed_evaluation_count == 2
 
 
+def test_evaluator_aborts_on_error(
+    dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
+    dummy_aggregator: Aggregator[
+        DummyEvaluation, DummyAggregatedEvaluationWithResultList
+    ],
+    dummy_runner: Runner[str, str],
+    dataset_id: str,
+) -> None:
+    run_overview = dummy_runner.run_dataset(dataset_id)
+
+    with pytest.raises(RuntimeError):
+        dummy_evaluator.evaluate_runs(run_overview.id, abort_on_error=True)
+
+
 def test_eval_and_aggregate_runs_uses_passed_tracer(
     dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
     dummy_aggregator: Aggregator[
         DummyEvaluation, DummyAggregatedEvaluationWithResultList
     ],
@@ -274,39 +280,34 @@ def test_eval_and_aggregate_runs_uses_passed_tracer(
 
 
 def test_eval_and_aggregate_runs_stores_example_evaluations(
+    dummy_runner: Runner[str, str],
     dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
     dummy_aggregator: Aggregator[
         DummyEvaluation, DummyAggregatedEvaluationWithResultList
     ],
     dataset_id: str,
-    dummy_runner: Runner[str, str],
 ) -> None:
     evaluation_repository = dummy_evaluator._evaluation_repository
     dataset_repository = dummy_evaluator._dataset_repository
-    dataset_id = list(dataset_repository.dataset_ids())[0]
-    dataset: Optional[Iterable[Example[str, None]]] = dataset_repository.examples(
-        dataset_id, str, type(None)
-    )
-    assert dataset is not None
+    examples = list(dataset_repository.examples(dataset_id, str, type(None)))
 
     run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer())
     evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
     aggregation_overview = dummy_aggregator.aggregate_evaluation(evaluation_overview.id)
+    assert next(iter(aggregation_overview.evaluation_overviews)) == evaluation_overview
 
-    examples = list(dataset)
-    eval_overview = next(iter(aggregation_overview.evaluation_overviews))
     success_result = evaluation_repository.example_evaluation(
-        eval_overview.id,
+        evaluation_overview.id,
         examples[0].id,
         DummyEvaluation,
     )
     failure_result_task = evaluation_repository.example_evaluation(
-        eval_overview.id,
+        evaluation_overview.id,
         examples[1].id,
         DummyEvaluation,
     )
     failure_result_eval = evaluation_repository.example_evaluation(
-        eval_overview.id,
+        evaluation_overview.id,
         examples[2].id,
         DummyEvaluation,
     )
@@ -318,6 +319,23 @@ def test_eval_and_aggregate_runs_stores_example_evaluations(
     )
 
 
+def test_failed_evaluations_returns_only_failed_evaluations(
+    dummy_runner: Runner[str, str],
+    dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
+    dataset_id: str,
+    sequence_examples: Iterable[Example[str, None]],
+) -> None:
+    run_overview = dummy_runner.run_dataset(dataset_id, NoOpTracer())
+    evaluation_overview = dummy_evaluator.evaluate_runs(run_overview.id)
+    failed_evaluations = list(
+        dummy_evaluator.failed_evaluations(evaluation_overview.id)
+    )
+
+    assert len(failed_evaluations) == 1
+    assert isinstance(failed_evaluations[0].evaluation.result, FailedExampleEvaluation)
+    assert failed_evaluations[0].example.id == list(sequence_examples)[-1].id
+
+
 def test_eval_and_aggregate_runs_stores_example_traces(
     dummy_evaluator: Evaluator[str, str, None, DummyEvaluation],
     dummy_aggregator: Aggregator[
diff --git a/tests/evaluation/test_run_repository.py b/tests/evaluation/test_run_repository.py
index d9edca43b..d931f1db8 100644
--- a/tests/evaluation/test_run_repository.py
+++ b/tests/evaluation/test_run_repository.py
@@ -19,6 +19,7 @@
     RunRepository,
     TaskSpanTrace,
 )
+from intelligence_layer.evaluation.run.domain import FailedExampleRun
 from tests.conftest import DummyStringInput
 
 test_repository_fixtures = [
@@ -249,3 +250,65 @@ def test_run_overview_ids_returns_all_sorted_ids(
     stored_run_overview_ids = list(run_repository.run_overview_ids())
 
     assert stored_run_overview_ids == sorted(run_overview_ids)
+
+
+@mark.parametrize(
+    "repository_fixture",
+    test_repository_fixtures,
+)
+def test_failed_example_outputs_returns_only_failed_examples(
+    repository_fixture: str, request: FixtureRequest, run_overview: RunOverview
+) -> None:
+    run_repository: RunRepository = request.getfixturevalue(repository_fixture)
+    run_repository.store_run_overview(run_overview)
+
+    run_repository.store_example_output(
+        ExampleOutput(
+            run_id=run_overview.id,
+            example_id="1",
+            output=FailedExampleRun(error_message="test"),
+        )
+    )
+    run_repository.store_example_output(
+        ExampleOutput(run_id=run_overview.id, example_id="2", output=None)
+    )
+
+    failed_outputs = list(
+        run_repository.failed_example_outputs(
+            run_id=run_overview.id, output_type=type(None)
+        )
+    )
+
+    assert len(failed_outputs) == 1
+    assert failed_outputs[0].example_id == "1"
+
+
+@mark.parametrize(
+    "repository_fixture",
+    test_repository_fixtures,
+)
+def test_successful_example_outputs_returns_only_successful_examples(
+    repository_fixture: str, request: FixtureRequest, run_overview: RunOverview
+) -> None:
+    run_repository: RunRepository = request.getfixturevalue(repository_fixture)
+    run_repository.store_run_overview(run_overview)
+
+    run_repository.store_example_output(
+        ExampleOutput(
+            run_id=run_overview.id,
+            example_id="1",
+            output=FailedExampleRun(error_message="test"),
+        )
+    )
+    run_repository.store_example_output(
+        ExampleOutput(run_id=run_overview.id, example_id="2", output=None)
+    )
+
+    successful_outputs = list(
+        run_repository.successful_example_outputs(
+            run_id=run_overview.id, output_type=type(None)
+        )
+    )
+
+    assert len(successful_outputs) == 1
+    assert successful_outputs[0].example_id == "2"
diff --git a/tests/evaluation/test_runner.py b/tests/evaluation/test_runner.py
index 55970d4d0..d07021b0b 100644
--- a/tests/evaluation/test_runner.py
+++ b/tests/evaluation/test_runner.py
@@ -1,3 +1,7 @@
+from typing import Iterable
+
+import pytest
+
 from intelligence_layer.core import InMemoryTracer
 from intelligence_layer.evaluation import (
     Example,
@@ -5,22 +9,19 @@
     InMemoryRunRepository,
     Runner,
 )
-from tests.evaluation.conftest import FAIL_IN_EVAL_INPUT, FAIL_IN_TASK_INPUT, DummyTask
+from tests.evaluation.conftest import FAIL_IN_TASK_INPUT, DummyTask
 
 
 def test_runner_runs_dataset(
     in_memory_dataset_repository: InMemoryDatasetRepository,
     in_memory_run_repository: InMemoryRunRepository,
+    sequence_examples: Iterable[Example[str, None]],
 ) -> None:
+    examples = list(sequence_examples)
     task = DummyTask()
    runner = Runner(
        task, in_memory_dataset_repository, in_memory_run_repository, "dummy-runner"
    )
-    examples = [
-        Example(input="success", expected_output=None),
-        Example(input=FAIL_IN_TASK_INPUT, expected_output=None),
-        Example(input=FAIL_IN_EVAL_INPUT, expected_output=None),
-    ]
 
     dataset_id = in_memory_dataset_repository.create_dataset(
         examples=examples, dataset_name="test-dataset"
@@ -36,6 +37,27 @@ def test_runner_runs_dataset(
         example.id for example in examples
     )
 
+    failed_runs = list(runner.failed_runs(overview.id, type(None)))
+    assert len(failed_runs) == 1
+    assert failed_runs[0].example.id == examples[1].id
+
+
+def test_runner_aborts_on_error(
+    in_memory_dataset_repository: InMemoryDatasetRepository,
+    in_memory_run_repository: InMemoryRunRepository,
+    sequence_examples: Iterable[Example[str, None]],
+) -> None:
+    task = DummyTask()
+    runner = Runner(
+        task, in_memory_dataset_repository, in_memory_run_repository, "dummy-runner"
+    )
+
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=sequence_examples, dataset_name="test-dataset"
+    ).id
+    with pytest.raises(RuntimeError):
+        runner.run_dataset(dataset_id, abort_on_error=True)
+
 
 def test_runner_runs_n_examples(
     in_memory_dataset_repository: InMemoryDatasetRepository,
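Reviewer note, not part of the patch: the sketch below shows how the pieces introduced in this diff are meant to fit together: the `abort_on_error` flag on `Runner.run_dataset` / `Evaluator.evaluate_runs`, and the `Runner.failed_runs` / `Evaluator.failed_evaluations` lineage helpers. It leans only on API that is visible in the diff itself; the `EchoTask` helper (modelled on the tests' `DummyTask`), the dataset name, and the assumption that `Task` and `Tracer` are importable from `intelligence_layer.core` are illustrative and not established by this change.

from typing import Any

from intelligence_layer.core import Task, Tracer
from intelligence_layer.evaluation import (
    Example,
    InMemoryDatasetRepository,
    InMemoryRunRepository,
    Runner,
)


class EchoTask(Task[str, str]):
    # Same shape as the DummyTask used in tests/evaluation/conftest.py.
    def do_run(self, input: str, tracer: Tracer) -> str:  # type: ignore
        return input


def demo_runner_failure_handling() -> None:
    dataset_repository = InMemoryDatasetRepository()
    run_repository = InMemoryRunRepository()
    runner = Runner(EchoTask(), dataset_repository, run_repository, "sketch-runner")

    dataset_id = dataset_repository.create_dataset(
        examples=[Example(input="success", expected_output=None)],
        dataset_name="sketch-dataset",
    ).id

    # Default behaviour: a failing example is printed, stored as a FailedExampleRun
    # (now including the stack trace), and the run continues.
    overview = runner.run_dataset(dataset_id)
    for lineage in runner.failed_runs(overview.id, type(None)):
        print(lineage.example.id)

    # Fail-fast behaviour: the first exception is re-raised instead of being stored.
    runner.run_dataset(dataset_id, abort_on_error=True)


def demo_evaluator_failure_handling(evaluator: Any, run_id: str) -> None:
    # `evaluator` is assumed to be an already configured Evaluator instance;
    # its concrete wiring is outside the scope of this diff.
    evaluation_overview = evaluator.evaluate_runs(run_id)  # or abort_on_error=True
    for lineage in evaluator.failed_evaluations(evaluation_overview.id):
        print(lineage.example.id, lineage.evaluation.result.error_message)


if __name__ == "__main__":
    demo_runner_failure_handling()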