diff --git a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb index 9b23112c..99bb5d6f 100644 --- a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb +++ b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb @@ -12,21 +12,32 @@ "\n", "from documentation.how_tos.example_data import (\n", " DummyAggregationLogic,\n", - " DummyEvaluationLogic,\n", + " DummyEvaluation,\n", " DummyTask,\n", " example_data,\n", ")\n", - "from intelligence_layer.connectors.studio.studio import StudioClient\n", - "from intelligence_layer.evaluation.benchmark.studio_benchmark import (\n", + "from intelligence_layer.connectors import StudioClient\n", + "from intelligence_layer.evaluation import (\n", + " EvaluationLogic,\n", + " Example,\n", " StudioBenchmarkRepository,\n", - ")\n", - "from intelligence_layer.evaluation.dataset.studio_dataset_repository import (\n", " StudioDatasetRepository,\n", + " SuccessfulExampleOutput,\n", ")\n", "\n", "load_dotenv()\n", "my_example_data = example_data()\n", - "examples = my_example_data.examples" + "examples = my_example_data.examples\n", + "\n", + "\n", + "class DummyEvaluationLogic(EvaluationLogic[str, str, str, DummyEvaluation]):\n", + " def do_evaluate(\n", + " self, example: Example[str, str], *output: SuccessfulExampleOutput[str]\n", + " ) -> DummyEvaluation:\n", + " output_str = \"(\" + (\", \".join(o.output for o in output)) + \")\"\n", + " return DummyEvaluation(\n", + " eval=f\"{example.input}, {example.expected_output}, {output_str} -> evaluation\"\n", + " )" ] }, { @@ -89,7 +100,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-LP3DLT23-py3.12", + "display_name": ".venv", "language": "python", "name": "python3" }, diff --git a/src/intelligence_layer/connectors/studio/studio.py b/src/intelligence_layer/connectors/studio/studio.py index d5f88671..a72ed3da 100644 --- a/src/intelligence_layer/connectors/studio/studio.py +++ b/src/intelligence_layer/connectors/studio/studio.py @@ -122,8 +122,8 @@ class PostBenchmarkExecution(BaseModel): run_end: datetime run_successful_count: int run_failed_count: int - run_success_avg_latency: int - run_success_avg_token_count: int + run_success_avg_latency: float + run_success_avg_token_count: float # Eval Overview eval_start: datetime eval_end: datetime @@ -143,7 +143,7 @@ class GetDatasetExamplesResponse(BaseModel, Generic[Input, ExpectedOutput]): items: Sequence[StudioExample[Input, ExpectedOutput]] -class BenchmarkLineage(BaseModel, Generic[Input, Output, ExpectedOutput, Evaluation]): +class BenchmarkLineage(BaseModel, Generic[Input, ExpectedOutput, Output, Evaluation]): trace_id: str input: Input expected_output: ExpectedOutput diff --git a/src/intelligence_layer/evaluation/__init__.py b/src/intelligence_layer/evaluation/__init__.py index f97a162e..055b155f 100644 --- a/src/intelligence_layer/evaluation/__init__.py +++ b/src/intelligence_layer/evaluation/__init__.py @@ -23,6 +23,10 @@ from .aggregation.in_memory_aggregation_repository import ( InMemoryAggregationRepository as InMemoryAggregationRepository, ) +from .benchmark.studio_benchmark import StudioBenchmark as StudioBenchmark +from .benchmark.studio_benchmark import ( + StudioBenchmarkRepository as StudioBenchmarkRepository, +) from .dataset.dataset_repository import DatasetRepository as DatasetRepository from .dataset.domain import Dataset as Dataset from .dataset.domain import Example as Example diff --git 
a/src/intelligence_layer/evaluation/benchmark/get_code.py b/src/intelligence_layer/evaluation/benchmark/get_code.py new file mode 100644 index 00000000..f2e897ba --- /dev/null +++ b/src/intelligence_layer/evaluation/benchmark/get_code.py @@ -0,0 +1,277 @@ +""" +The following code is taken from https://github.com/wandb/weave and is licensed under the license below. +The following modifications have been made: + - Adapt the type hints to be more specific to this project + - Change the control flow of get_source_notebook_safe so that the notebook source is only retrieved once inspect.getsource fails. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner.
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" # noqa + +import ast +import inspect +import textwrap + +from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic +from intelligence_layer.evaluation.evaluation.evaluator.evaluator import EvaluationLogic + + +class NotInteractiveEnvironmentError(Exception): ... + + +def is_running_interactively() -> bool: + """Check if the code is running in an interactive environment.""" + try: + from IPython import get_ipython + + return get_ipython() is not None + except ModuleNotFoundError: + return False + + +def get_notebook_source() -> str: + """Get the source code of the running notebook.""" + from IPython import get_ipython + + shell = get_ipython() + if shell is None: + raise NotInteractiveEnvironmentError + + if not hasattr(shell, "user_ns"): + raise AttributeError("Cannot access user namespace") + + # This is the list of input cells in the notebook + in_list = shell.user_ns["In"] + + # Stitch them back into a single "file" + full_source = "\n\n".join(cell for cell in in_list[1:] if cell) + + return full_source + + +def get_class_source(cls: type) -> str: + """Get the latest source definition of a class in the notebook.""" + notebook_source = get_notebook_source() + tree = ast.parse(notebook_source) + class_name = cls.__name__ + + # We need to walk the entire tree and take the last match, since that is the most recent definition of the class + segment = None + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef) and node.name == class_name: + segment = ast.get_source_segment(notebook_source, node) + + if segment is not None: + return segment + + raise ValueError(f"Class '{class_name}' not found in the notebook") + + +def get_source_notebook_safe(logic: EvaluationLogic | AggregationLogic) -> str: + # In ipython, we can't use inspect.getsource on classes defined in the notebook + logic_class = type(logic) + try: + src = inspect.getsource(logic_class) + except OSError: + if is_running_interactively() and inspect.isclass(logic_class): + src = get_class_source(logic_class) + else: + # outside a notebook there is no fallback, so re-raise the original error + raise + return textwrap.dedent(src) diff --git a/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py b/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py index 23235024..ca6cf80d 100644 --- a/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py +++ b/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py @@ -1,4 +1,4 @@ -import
inspect +import itertools from collections.abc import Sequence from datetime import datetime from http import HTTPStatus @@ -17,6 +17,7 @@ ) from intelligence_layer.core import Input, Output from intelligence_layer.core.task import Task +from intelligence_layer.core.tracer.tracer import ExportedSpan from intelligence_layer.evaluation.aggregation.aggregator import ( AggregationLogic, Aggregator, @@ -29,6 +30,11 @@ Benchmark, BenchmarkRepository, ) +from intelligence_layer.evaluation.benchmark.get_code import get_source_notebook_safe +from intelligence_layer.evaluation.benchmark.trace_information import ( + extract_latency_from_trace, + extract_token_count_from_trace, +) from intelligence_layer.evaluation.dataset.domain import ExpectedOutput from intelligence_layer.evaluation.dataset.studio_dataset_repository import ( StudioDatasetRepository, @@ -118,7 +124,30 @@ def execute( end = datetime.now() - data = PostBenchmarkExecution( + evaluation_lineages = list( + self.evaluator.evaluation_lineages(evaluation_overview.id) + ) + + run_traces = [ + self._trace_from_lineage(lineage) for lineage in evaluation_lineages + ] + tokens_per_trace = [ + extract_token_count_from_trace(trace) for trace in run_traces + ] + latency_per_trace = [extract_latency_from_trace(trace) for trace in run_traces] + + tokens_per_successful_trace, latency_per_successful_trace = ( + self._filter_for_successful_runs( + (tokens_per_trace, latency_per_trace), + source_lineage_list=evaluation_lineages, + run_id=run_overview.id, + ) + ) + + def average_or_zero(values: list) -> float: + return sum(values) / len(values) if len(values) > 0 else 0 + + benchmark_execution_data = PostBenchmarkExecution( name=name, description=description, labels=labels, @@ -129,8 +158,8 @@ def execute( run_end=run_overview.end, run_successful_count=run_overview.successful_example_count, run_failed_count=run_overview.failed_example_count, - run_success_avg_latency=0, # TODO: Implement this - run_success_avg_token_count=0, # TODO: Implement this + run_success_avg_latency=average_or_zero(latency_per_successful_trace), + run_success_avg_token_count=average_or_zero(tokens_per_successful_trace), eval_start=evaluation_overview.start_date, eval_end=evaluation_overview.end_date, eval_successful_count=evaluation_overview.successful_evaluation_count, @@ -141,23 +170,21 @@ ) benchmark_execution_id = self.client.submit_benchmark_execution( - benchmark_id=self.id, data=data + benchmark_id=self.id, data=benchmark_execution_data ) - evaluation_lineages = list( - self.evaluator.evaluation_lineages(evaluation_overview.id) - ) trace_ids = [] - for lineage in tqdm(evaluation_lineages, desc="Submitting traces to Studio"): - trace = lineage.tracers[0] - assert trace - trace_id = self.client.submit_trace(trace.export_for_viewing()) + for trace in tqdm(run_traces, desc="Submitting traces to Studio"): + trace_id = self.client.submit_trace(trace) trace_ids.append(trace_id) benchmark_lineages = self._create_benchmark_lineages( eval_lineages=evaluation_lineages, trace_ids=trace_ids, + latencies_per_trace=latency_per_trace, + tokens_per_trace=tokens_per_trace, ) + self.client.submit_benchmark_lineages( benchmark_lineages=benchmark_lineages, execution_id=benchmark_execution_id, @@ -166,22 +193,67 @@ return benchmark_execution_id + def _filter_for_successful_runs( + self, + lists_to_filter: tuple[list, ...], + source_lineage_list: list[ + EvaluationLineage[Input, ExpectedOutput, Output, Evaluation] + ], + run_id: str, + ) -> tuple[list, ...]: + """This method
assumes that each list in lists_to_filter has the same length as source_lineage_list.""" + failed_example_output_ids = [ + example_output.example_id + for example_output in self.run_repository.failed_example_outputs( + run_id=run_id, output_type=self.evaluator.output_type() + ) + ] + + is_successful_run = [ + lineage.example.id not in failed_example_output_ids + for lineage in source_lineage_list + ] + return tuple( + list(itertools.compress(sublist, is_successful_run)) + for sublist in lists_to_filter + ) + + def _trace_from_lineage( + self, eval_lineage: EvaluationLineage[Input, ExpectedOutput, Output, Evaluation] + ) -> Sequence[ExportedSpan]: + # since we have only one output in this scenario, we expect exactly one tracer + trace = eval_lineage.tracers[0] + assert trace, "eval lineages should always have at least one tracer" + return trace.export_for_viewing() + def _create_benchmark_lineages( self, eval_lineages: list[ EvaluationLineage[Input, ExpectedOutput, Output, Evaluation] ], trace_ids: list[str], - ) -> Sequence[BenchmarkLineage[Input, Output, ExpectedOutput, Evaluation]]: + latencies_per_trace: list[int], + tokens_per_trace: list[int], + ) -> Sequence[BenchmarkLineage[Input, ExpectedOutput, Output, Evaluation]]: return [ - self._create_benchmark_lineage(eval_lineage, trace_id) - for eval_lineage, trace_id in zip(eval_lineages, trace_ids, strict=True) + self._create_benchmark_lineage( + eval_lineage, trace_id, run_latency, run_tokens + ) + for eval_lineage, trace_id, run_latency, run_tokens in zip( + eval_lineages, + trace_ids, + latencies_per_trace, + tokens_per_trace, + strict=True, + ) ] def _create_benchmark_lineage( self, eval_lineage: EvaluationLineage[Input, ExpectedOutput, Output, Evaluation], trace_id: str, + run_latency: int, + run_tokens: int, ) -> BenchmarkLineage: return BenchmarkLineage( trace_id=trace_id, @@ -190,8 +262,8 @@ def _create_benchmark_lineage( example_metadata=eval_lineage.example.metadata, output=eval_lineage.outputs[0].output, evaluation=eval_lineage.evaluation.result, - run_latency=0, # TODO: Implement this - run_tokens=0, # TODO: Implement this + run_latency=run_latency, + run_tokens=run_tokens, ) @@ -269,7 +341,7 @@ def create_evaluation_logic_identifier( evaluation_logic=eval_logic, ) return EvaluationLogicIdentifier( - logic=inspect.getsource(type(eval_logic)), + logic=get_source_notebook_safe(eval_logic), input_schema=type_to_schema(evaluator.input_type()), output_schema=type_to_schema(evaluator.output_type()), expected_output_schema=type_to_schema(evaluator.expected_output_type()), @@ -287,7 +359,7 @@ def create_aggregation_logic_identifier( aggregation_logic=aggregation_logic, ) return AggregationLogicIdentifier( - logic=inspect.getsource(type(aggregation_logic)), + logic=get_source_notebook_safe(aggregation_logic), evaluation_schema=type_to_schema(aggregator.evaluation_type()), aggregation_schema=type_to_schema(aggregator.aggregated_evaluation_type()), ) diff --git a/src/intelligence_layer/evaluation/benchmark/trace_information.py b/src/intelligence_layer/evaluation/benchmark/trace_information.py new file mode 100644 index 00000000..a4f73eec --- /dev/null +++ b/src/intelligence_layer/evaluation/benchmark/trace_information.py @@ -0,0 +1,66 @@ +from collections.abc import Sequence +from datetime import timedelta +from typing import cast + +from aleph_alpha_client import CompletionResponse + +from intelligence_layer.core import ExportedSpan +from intelligence_layer.core.model import _Complete +from intelligence_layer.core.tracer.tracer import
SpanType + + +def _get_root(trace: Sequence[ExportedSpan]) -> ExportedSpan | None: + root_spans = [span for span in trace if span.parent_id is None] + if len(root_spans) != 1: + return None + return root_spans[0] + + +def extract_latency_from_trace(trace: Sequence[ExportedSpan]) -> int: + """Extract the total duration of a given trace based on its root span. + + Args: + trace: trace to analyze + + Returns: + The duration of the trace in microseconds. + """ + root_span = _get_root(trace) + if root_span is None: + raise ValueError("No root span found in the trace") + latency = (root_span.end_time - root_span.start_time) / timedelta(microseconds=1) + return int(latency) + + +def _is_complete_request(span: ExportedSpan) -> bool: + # Assuming that LLM requests have a specific name or attribute + return span.name == _Complete.__name__ + + +def _extract_tokens_from_complete_request(span: ExportedSpan) -> int: + if not hasattr(span.attributes, "output"): + raise ValueError( + "Function expects a complete span with attributes.output. Output was not present." + ) + completion_output = cast(CompletionResponse, span.attributes.output) + return completion_output.num_tokens_generated + + +def extract_token_count_from_trace(trace: Sequence[ExportedSpan]) -> int: + """Extract the number of tokens generated in a trace based on its completion requests. + + Note: Does not support traces of streamed responses. + + Args: + trace: trace to analyze. + + Returns: + The sum of newly generated tokens across all spans in the given trace. + """ + token_count = 0 + for span in trace: + if span.attributes.type != SpanType.TASK_SPAN: + continue + if _is_complete_request(span): + token_count += _extract_tokens_from_complete_request(span) + return token_count diff --git a/src/intelligence_layer/evaluation/dataset/studio_dataset_repository.py b/src/intelligence_layer/evaluation/dataset/studio_dataset_repository.py index 4c0f4ffc..e9c91b66 100644 --- a/src/intelligence_layer/evaluation/dataset/studio_dataset_repository.py +++ b/src/intelligence_layer/evaluation/dataset/studio_dataset_repository.py @@ -10,9 +10,9 @@ StudioExample, ) from intelligence_layer.core import Input -from intelligence_layer.evaluation import ( +from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository +from intelligence_layer.evaluation.dataset.domain import ( Dataset, - DatasetRepository, Example, ExpectedOutput, ) diff --git a/tests/evaluation/benchmark/test_benchmark.py b/tests/evaluation/benchmark/test_benchmark.py index da3cb932..81afd027 100644 --- a/tests/evaluation/benchmark/test_benchmark.py +++ b/tests/evaluation/benchmark/test_benchmark.py @@ -1,5 +1,6 @@ from datetime import datetime -from typing import Optional +from typing import Optional, cast +from unittest.mock import Mock, patch from uuid import UUID, uuid4 import pytest @@ -8,7 +9,9 @@ from requests import HTTPError, Response from intelligence_layer.connectors.studio.studio import ( + BenchmarkLineage, GetBenchmarkResponse, + PostBenchmarkExecution, StudioClient, StudioExample, ) @@ -19,6 +22,7 @@ type_to_schema, ) from tests.evaluation.conftest import ( + FAIL_IN_TASK_INPUT, DummyAggregationLogic, DummyEvaluationLogic, DummyTask, @@ -140,7 +144,8 @@ def test_create_benchmark( aggregation_logic: DummyAggregationLogic, ) -> None: dataset_id = "fake_dataset_id" - mock_studio_client.submit_benchmark.return_value = str(uuid4()) # type: ignore + mock_submit_benchmark = cast(Mock, mock_studio_client.submit_benchmark) + mock_submit_benchmark.return_value =
str(uuid4()) benchmark = studio_benchmark_repository.create_benchmark( dataset_id, evaluation_logic, aggregation_logic, "benchmark_name" @@ -148,7 +153,7 @@ def test_create_benchmark( uuid = UUID(benchmark.id) assert uuid assert benchmark.dataset_id == dataset_id - studio_benchmark_repository.client.submit_benchmark.assert_called_once() # type: ignore + mock_submit_benchmark.assert_called_once() def test_create_benchmark_with_non_existing_dataset( @@ -161,7 +166,7 @@ def test_create_benchmark_with_non_existing_dataset( response = Response() response.status_code = 400 - mock_studio_client.submit_benchmark.side_effect = HTTPError( # type: ignore + cast(Mock, mock_studio_client.submit_benchmark).side_effect = HTTPError( "400 Client Error: Bad Request for url", response=response ) @@ -180,7 +185,7 @@ def test_get_benchmark( datatset_id: str, ) -> None: benchmark_id = "benchmark_id" - mock_studio_client.get_benchmark.return_value = get_benchmark_response # type: ignore + cast(Mock, mock_studio_client.get_benchmark).return_value = get_benchmark_response benchmark = studio_benchmark_repository.get_benchmark( benchmark_id, evaluation_logic, aggregation_logic @@ -199,7 +204,7 @@ def test_get_non_existing_benchmark( evaluation_logic: DummyEvaluationLogic, aggregation_logic: DummyAggregationLogic, ) -> None: - mock_studio_client.get_benchmark.return_value = None # type: ignore + cast(Mock, mock_studio_client.get_benchmark).return_value = None assert ( studio_benchmark_repository.get_benchmark( @@ -209,29 +214,134 @@ def test_get_non_existing_benchmark( ) +@patch( + "intelligence_layer.evaluation.benchmark.studio_benchmark.extract_token_count_from_trace" +) def test_execute_benchmark( + mock_extract_tokens: Mock, studio_benchmark_repository: StudioBenchmarkRepository, mock_studio_client: StudioClient, evaluation_logic: DummyEvaluationLogic, get_benchmark_response: GetBenchmarkResponse, aggregation_logic: DummyAggregationLogic, - task, + task: DummyTask, ) -> None: - mock_studio_client.get_benchmark.return_value = get_benchmark_response # type: ignore - mock_studio_client.submit_trace.return_value = str(uuid4()) # type: ignore + mock_submit_trace = cast(Mock, mock_studio_client.submit_trace) + mock_submit_trace.return_value = str(uuid4()) + mock_submit_execution = cast(Mock, mock_studio_client.submit_benchmark_execution) + mock_submit_lineage = cast(Mock, mock_studio_client.submit_benchmark_lineages) + + expected_generated_tokens = 100 + mock_extract_tokens.return_value = expected_generated_tokens + + cast(Mock, mock_studio_client.get_benchmark).return_value = get_benchmark_response examples = [ StudioExample(input="input0", expected_output="expected_output0"), StudioExample(input="input1", expected_output="expected_output1"), StudioExample(input="input2", expected_output="expected_output2"), StudioExample(input="input3", expected_output="expected_output3"), ] + cast(Mock, mock_studio_client.get_dataset_examples).return_value = examples + benchmark = studio_benchmark_repository.get_benchmark( + "benchmark_id", evaluation_logic, aggregation_logic + ) + assert benchmark + + # when + benchmark.execute( + task, + name="name", + description="description", + metadata={"key": "value"}, + labels={"label"}, + ) - mock_studio_client.get_dataset_examples.return_value = examples # type: ignore + # then + mock_submit_execution.assert_called_once() + uploaded_execution = cast( + PostBenchmarkExecution, mock_submit_execution.call_args[1]["data"] + ) + assert uploaded_execution.run_success_avg_latency > 0 + assert 
uploaded_execution.run_success_avg_token_count == expected_generated_tokens + + assert mock_submit_trace.call_count == 4 + + mock_submit_lineage.assert_called_once() + uploaded_lineages = mock_submit_lineage.call_args[1]["benchmark_lineages"] + for lineage in uploaded_lineages: + lineage = cast(BenchmarkLineage, lineage) + assert lineage.run_latency > 0 + # this assumes that each lineage consists of traces that only have a single span + assert lineage.run_tokens == expected_generated_tokens + + +def test_execute_benchmark_on_empty_examples_uploads_example_and_calculates_correctly( + studio_benchmark_repository: StudioBenchmarkRepository, + mock_studio_client: StudioClient, + evaluation_logic: DummyEvaluationLogic, + get_benchmark_response: GetBenchmarkResponse, + aggregation_logic: DummyAggregationLogic, + task: DummyTask, +) -> None: + mock_submit_trace = cast(Mock, mock_studio_client.submit_trace) + mock_submit_execution = cast(Mock, mock_studio_client.submit_benchmark_execution) + + cast(Mock, mock_studio_client.get_benchmark).return_value = get_benchmark_response + cast(Mock, mock_studio_client.get_dataset_examples).return_value = [] benchmark = studio_benchmark_repository.get_benchmark( "benchmark_id", evaluation_logic, aggregation_logic ) + assert benchmark + # when + benchmark.execute( + task, + name="name", + description="description", + metadata={"key": "value"}, + labels={"label"}, + ) + + # then + mock_submit_execution.assert_called_once() + uploaded_execution = cast( + PostBenchmarkExecution, mock_submit_execution.call_args[1]["data"] + ) + assert uploaded_execution.run_success_avg_latency == 0 + assert uploaded_execution.run_success_avg_token_count == 0 + + assert mock_submit_trace.call_count == 0 + + +@patch( + "intelligence_layer.evaluation.benchmark.studio_benchmark.extract_token_count_from_trace" +) +def test_execute_benchmark_failing_examples_calculates_correctly( + mock_extract_tokens: Mock, + studio_benchmark_repository: StudioBenchmarkRepository, + mock_studio_client: StudioClient, + evaluation_logic: DummyEvaluationLogic, + get_benchmark_response: GetBenchmarkResponse, + aggregation_logic: DummyAggregationLogic, + task: DummyTask, +) -> None: + mock_submit_trace = cast(Mock, mock_studio_client.submit_trace) + mock_submit_execution = cast(Mock, mock_studio_client.submit_benchmark_execution) + + cast(Mock, mock_studio_client.get_benchmark).return_value = get_benchmark_response + examples = [ + StudioExample(input=FAIL_IN_TASK_INPUT, expected_output="expected_output0"), + ] + cast(Mock, mock_studio_client.get_dataset_examples).return_value = examples + benchmark = studio_benchmark_repository.get_benchmark( + "benchmark_id", evaluation_logic, aggregation_logic + ) + + expected_generated_tokens = 0 + mock_extract_tokens.return_value = expected_generated_tokens + 1 assert benchmark + + # when benchmark.execute( task, name="name", @@ -240,6 +350,13 @@ def test_execute_benchmark( labels={"label"}, ) - mock_studio_client.submit_benchmark_execution.assert_called_once() # type: ignore - assert mock_studio_client.submit_trace.call_count == 4 # type: ignore - mock_studio_client.submit_benchmark_lineages.assert_called_once() # type: ignore + # then + mock_submit_execution.assert_called_once() + uploaded_execution = cast( + PostBenchmarkExecution, mock_submit_execution.call_args[1]["data"] + ) + assert uploaded_execution.run_success_avg_latency == 0 + assert uploaded_execution.run_success_avg_token_count == expected_generated_tokens + assert uploaded_execution.run_successful_count 
== 0 + + assert mock_submit_trace.call_count == 0 diff --git a/tests/evaluation/benchmark/test_trace_information.py b/tests/evaluation/benchmark/test_trace_information.py new file mode 100644 index 00000000..b115448c --- /dev/null +++ b/tests/evaluation/benchmark/test_trace_information.py @@ -0,0 +1,158 @@ +from collections.abc import Mapping, Sequence +from datetime import datetime, timedelta, timezone +from typing import Any +from uuid import uuid4 + +import pytest +from aleph_alpha_client import CompletionRequest, CompletionResponse +from aleph_alpha_client.completion import CompletionResult + +from intelligence_layer.connectors.limited_concurrency_client import ( + AlephAlphaClientProtocol, +) +from intelligence_layer.core import ( + Context, + ExportedSpan, + SpanAttributes, + SpanStatus, + TaskSpanAttributes, +) +from intelligence_layer.core.model import CompleteInput, Message, Pharia1ChatModel +from intelligence_layer.core.tracer.in_memory_tracer import InMemoryTracer +from intelligence_layer.evaluation.benchmark.trace_information import ( + extract_latency_from_trace, + extract_token_count_from_trace, +) + + +@pytest.fixture +def root_span() -> ExportedSpan: + trace_id = uuid4() + return ExportedSpan( + context=Context( + trace_id=trace_id, + span_id=trace_id, + ), + name="root", + parent_id=None, + start_time=datetime(2022, 1, 1, 12, 0, 0, tzinfo=timezone.utc), + end_time=datetime(2022, 1, 1, 12, 0, 1, tzinfo=timezone.utc), + attributes=SpanAttributes(), + events=[], + status=SpanStatus.OK, + ) + + +@pytest.fixture +def child_span(root_span: ExportedSpan) -> ExportedSpan: + return ExportedSpan( + context=Context( + trace_id=root_span.context.trace_id, + span_id=uuid4(), + ), + name="child", + parent_id=root_span.context.span_id, + start_time=datetime(2022, 1, 1, 12, 0, 0, 500000, tzinfo=timezone.utc), + end_time=datetime(2022, 1, 1, 12, 0, 1, 500000, tzinfo=timezone.utc), + attributes=TaskSpanAttributes( + input="input", + output="output", + ), + events=[], + status=SpanStatus.OK, + ) + + +SECOND_IN_MICROSECOND = 10**6 + + +def test_extract_latency_from_trace_root_span(root_span: ExportedSpan): + latency = extract_latency_from_trace([root_span]) + assert latency == SECOND_IN_MICROSECOND + + +def test_extract_latency_from_trace_root_span_can_do_ns_differences( + root_span: ExportedSpan, +): + root_span.end_time = root_span.start_time + timedelta(microseconds=1) + latency = extract_latency_from_trace([root_span]) + assert latency == 1 + + +def test_extract_latency_from_trace_root_span_with_child( + root_span: ExportedSpan, child_span: ExportedSpan +): + latency = extract_latency_from_trace([root_span, child_span]) + assert latency == SECOND_IN_MICROSECOND + + latency = extract_latency_from_trace([child_span, root_span]) + assert latency == SECOND_IN_MICROSECOND + + +def test_extract_latency_from_trace_no_root_span(child_span: ExportedSpan): + # Call the function with the child span + with pytest.raises(ValueError): + extract_latency_from_trace([child_span]) + + +class MockClient(AlephAlphaClientProtocol): + def __init__(self, generated_tokens: int): + self.generated_tokens = generated_tokens + + def complete( + self, + request: CompletionRequest, + model: str, + ) -> CompletionResponse: + return CompletionResponse( + model_version="---", + completions=[CompletionResult()], + num_tokens_generated=self.generated_tokens, + num_tokens_prompt_total=20, + ) + + def models(self) -> Sequence[Mapping[str, Any]]: + return [{"name": "pharia-1-llm-7b-control"}] + + +def 
test_extract_token_count_from_trace_works_without_llm_spans( + root_span: ExportedSpan, +): + result = extract_token_count_from_trace([root_span]) + assert result == 0 + + +def test_extract_token_count_from_trace_works_with_complete_spans(): + tokens_to_generate = 10 + model_str = "pharia-1-llm-7b-control" + model = Pharia1ChatModel( + name=model_str, client=MockClient(generated_tokens=tokens_to_generate) + ) + tracer = InMemoryTracer() + with tracer.span("root") as root: + model.complete(CompleteInput(prompt=model.to_instruct_prompt("test")), root) + model.complete(CompleteInput(prompt=model.to_instruct_prompt("test")), root) + + complete_trace = tracer.export_for_viewing() + + result = extract_token_count_from_trace(complete_trace) + assert result == tokens_to_generate * 2 + + +def test_extract_token_count_from_trace_works_chat(): + tokens_to_generate = 10 + model_str = "pharia-1-llm-7b-control" + model = Pharia1ChatModel( + name=model_str, client=MockClient(generated_tokens=tokens_to_generate) + ) + tracer = InMemoryTracer() + with tracer.span("root") as root: + model.generate_chat( + messages=[Message(role="user", content="dummy")], + response_prefix=None, + tracer=root, + ) + complete_trace = tracer.export_for_viewing() + + result = extract_token_count_from_trace(complete_trace) + assert result == tokens_to_generate
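Reviewer note (not part of the diff): a minimal usage sketch of how the new helpers in trace_information.py combine into the values this PR reports, mirroring the per-lineage run_latency/run_tokens fields and the average_or_zero aggregation in StudioBenchmark.execute. The imports and calls are the ones exercised in the tests above; the empty span body and the avg_latency variable are illustrative assumptions only.

from intelligence_layer.core.tracer.in_memory_tracer import InMemoryTracer
from intelligence_layer.evaluation.benchmark.trace_information import (
    extract_latency_from_trace,
    extract_token_count_from_trace,
)

tracer = InMemoryTracer()
with tracer.span("root"):
    pass  # run the task or model calls you want to measure inside this span

trace = tracer.export_for_viewing()

# Per-trace values, as stored on each BenchmarkLineage.
run_latency = extract_latency_from_trace(trace)  # microseconds, taken from the root span
run_tokens = extract_token_count_from_trace(trace)  # generated tokens summed over complete spans

# Averaging over successful runs, as done for PostBenchmarkExecution.
latencies = [run_latency]
avg_latency = sum(latencies) / len(latencies) if len(latencies) > 0 else 0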