Aleph-Alpha · NiklasKoehneckeAA · Apr 18, 2024 · Apr 18, 2024
diff --git a/src/examples/how_tos/example_data.py b/src/examples/how_tos/example_data.py
@@ -1,4 +1,4 @@
-from typing import Sequence
+from typing import Iterable, Sequence
 
 from pydantic import BaseModel
 
@@ -16,6 +16,7 @@
     RunOverview,
     SuccessfulExampleOutput,
 )
+from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic
 
 
 class DummyExample(Example[str, str]):
@@ -41,6 +42,15 @@ def do_evaluate(
         )
 
 
+class DummyAggregation(BaseModel):  # aggregation result of DummyAggregationLogic
+    num_evaluations: int  # how many evaluations were passed to aggregate()
+
+
+class DummyAggregationLogic(AggregationLogic[DummyEvaluation, DummyAggregation]):
+    def aggregate(self, evaluations: Iterable[DummyEvaluation]) -> DummyAggregation:
+        return DummyAggregation(num_evaluations=sum(1 for _ in evaluations))
+
+
 class ExampleData:
     examples: Sequence[DummyExample]
     dataset_repository: InMemoryDatasetRepository
@@ -51,7 +61,8 @@ class ExampleData:
     dataset: Dataset
     run_overview_1: RunOverview
     run_overview_2: RunOverview
-    evaluation_overview: EvaluationOverview
+    evaluation_overview_1: EvaluationOverview
+    evaluation_overview_2: EvaluationOverview
 
 
 def example_data() -> ExampleData:
@@ -78,7 +89,12 @@ def example_data() -> ExampleData:
         "my-evaluator",
         DummyEvaluationLogic(),
     )
-    evaluation_overview = evaluator.evaluate_runs(run_overview_1.id, run_overview_2.id)
+    evaluation_overview_1 = evaluator.evaluate_runs(
+        run_overview_1.id, run_overview_2.id
+    )
+    evaluation_overview_2 = evaluator.evaluate_runs(
+        run_overview_1.id, run_overview_2.id
+    )
 
     example_data = ExampleData()
     example_data.examples = examples
@@ -90,5 +106,7 @@ def example_data() -> ExampleData:
     example_data.dataset = dataset
     example_data.run_overview_1 = run_overview_1
     example_data.run_overview_2 = run_overview_2
-    example_data.evaluation_overview = evaluation_overview
+    example_data.evaluation_overview_1 = evaluation_overview_1
+    example_data.evaluation_overview_2 = evaluation_overview_2
+
     return example_data
diff --git a/src/examples/how_tos/how_to_aggregate_evaluations.ipynb b/src/examples/how_tos/how_to_aggregate_evaluations.ipynb
@@ -1,5 +1,19 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from example_data import DummyAggregationLogic, example_data\n",
+    "\n",
+    "from intelligence_layer.evaluation.aggregation.aggregator import Aggregator\n",
+    "from intelligence_layer.evaluation.aggregation.in_memory_aggregation_repository import (\n",
+    "    InMemoryAggregationRepository,\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -20,15 +34,21 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%%script false --no-raise-error # the following code does not execute as the evaluations do not exist\n",
-    "\n",
     "# Step 0\n",
-    "evaluation_ids = [\"eval_of_interest\", \"other_eval_of_interest\"]\n",
+    "\n",
+    "\n",
+    "my_example_data = example_data()\n",
+    "print()\n",
+    "\n",
+    "evaluation_ids = [\n",
+    "    my_example_data.evaluation_overview_1.id,\n",
+    "    my_example_data.evaluation_overview_2.id,\n",
+    "]\n",
     "\n",
     "# Step 1\n",
-    "evaluation_repository = InMemoryEvaluationRepository()\n",
+    "evaluation_repository = my_example_data.evaluation_repository\n",
     "aggregation_repository = InMemoryAggregationRepository()\n",
-    "aggregation_logic = SingleLabelClassifyAggregationLogic()\n",
+    "aggregation_logic = DummyAggregationLogic()\n",
     "\n",
     "# Step 2\n",
     "aggregator = Aggregator(\n",
@@ -42,13 +62,6 @@
     "# Step 3\n",
     "print(aggregation_overview.id)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

diff --git a/src/examples/how_tos/how_to_evaluate_runs.ipynb b/src/examples/how_tos/how_to_evaluate_runs.ipynb
@@ -1,5 +1,19 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from example_data import DummyEvaluationLogic, example_data\n",
+    "\n",
+    "from intelligence_layer.evaluation.evaluation.evaluator import Evaluator\n",
+    "from intelligence_layer.evaluation.evaluation.in_memory_evaluation_repository import (\n",
+    "    InMemoryEvaluationRepository,\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -25,34 +39,31 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%%script false --no-raise-error # the following code does not execute as the runs do not exist\n",
-    "\n",
     "# Step 0\n",
-    "run_ids = [\"run_id_of_interest\", \"other_run_id_of_interest\"]\n",
+    "my_example_data = example_data()\n",
+    "print()\n",
+    "run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_2.id]\n",
     "\n",
     "# Step 1\n",
-    "dataset_repository = InMemoryDatasetRepository()\n",
-    "run_repository = InMemoryRunRepository()\n",
+    "dataset_repository = my_example_data.dataset_repository\n",
+    "run_repository = my_example_data.run_repository\n",
     "evaluation_repository = InMemoryEvaluationRepository()\n",
-    "evaluation_logic = SingleLabelClassifyEvaluationLogic()\n",
+    "evaluation_logic = DummyEvaluationLogic()\n",
     "\n",
     "# Step 3\n",
-    "evaluator = Evaluator(dataset_repository, run_repository, evaluation_repository, \"My joke evaluation\", evaluation_logic)\n",
+    "evaluator = Evaluator(\n",
+    "    dataset_repository,\n",
+    "    run_repository,\n",
+    "    evaluation_repository,\n",
+    "    \"My dummy evaluation\",\n",
+    "    evaluation_logic,\n",
+    ")\n",
     "\n",
     "evaluation_overview = evaluator.evaluate_runs(*run_ids)\n",
     "\n",
     "# Step 4\n",
     "print(evaluation_overview.id)"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "\n",
-    "```python\n",
-    "```"
-   ]
   }
  ],
  "metadata": {

diff --git a/src/examples/how_tos/how_to_retrieve_data_for_analysis.ipynb b/src/examples/how_tos/how_to_retrieve_data_for_analysis.ipynb
@@ -59,7 +59,7 @@
     ")\n",
     "\n",
     "# retrieve all evaluations, and an evaluation for an example\n",
-    "my_evaluation_id = example_data.evaluation_overview.id\n",
+    "my_evaluation_id = example_data.evaluation_overview_1.id\n",
     "my_evaluations = evaluation_repository.example_evaluations(\n",
     "    my_evaluation_id, evaluation_type=DummyEvaluation\n",
     ")\n",
@@ -117,13 +117,6 @@
     "my_lineage = my_evaluator.evaluation_lineage(my_evaluation_id, my_example_id)\n",
     "display(my_lineage)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

diff --git a/src/examples/how_tos/how_to_run_a_task_on_a_dataset.ipynb b/src/examples/how_tos/how_to_run_a_task_on_a_dataset.ipynb
@@ -1,12 +1,26 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from example_data import DummyTask, example_data\n",
+    "\n",
+    "from intelligence_layer.evaluation.run.in_memory_run_repository import (\n",
+    "    InMemoryRunRepository,\n",
+    ")\n",
+    "from intelligence_layer.evaluation.run.runner import Runner"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# How to run a task on a dataset\n",
     "0. Create a suitable dataset (see [here](./how_to_create_a_dataset.ipynb)) and a task (see [here](./how_to_implement_a_task.ipynb)).\n",
-    "1. Initialize the task, a `DatasetRepository` and a `RunRepository`\n",
+    "1. Initialize the task and a `RunRepository`, and open the correct `DatasetRepository`\n",
     "    - The `DatasetRepository` needs to contain the dataset.\n",
     "    - The `RunRepository` stores results.\n",
     "2. Use the `Runner` to run the task on the given dataset via `run_dataset`\n",
@@ -21,25 +35,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%%script false --no-raise-error # the following code does not execute as the dataset does not exist\n",
-    "\n",
     "# Step 0\n",
-    "dataset_id = \"my-dataset-id\"\n",
-    "\n",
+    "my_example_data = example_data()\n",
+    "print()\n",
     "\n",
     "# Step 1\n",
-    "class DummyTask(Task[None, None]):\n",
-    "    def do_run(self, input: None, task_span: TaskSpan) -> None:\n",
-    "        return None\n",
-    "\n",
-    "\n",
-    "dataset_repository = InMemoryDatasetRepository()\n",
+    "dataset_repository = my_example_data.dataset_repository\n",
     "run_repository = InMemoryRunRepository()\n",
     "task = DummyTask()\n",
     "\n",
     "# Step 2\n",
     "runner = Runner(task, dataset_repository, run_repository, \"MyRunDescription\")\n",
-    "run_overview = runner.run_dataset(dataset_id)\n",
+    "run_overview = runner.run_dataset(my_example_data.dataset.id)\n",
     "\n",
     "# Step 3\n",
     "print(run_overview.id)"