From 908d1c80b988eacc2bfdc2901bf0179e6ba96f8a Mon Sep 17 00:00:00 2001 From: Niklas Koehnecke Date: Thu, 18 Apr 2024 15:46:58 +0200 Subject: [PATCH] refactor: make most how-tos runnable --- src/examples/how_tos/example_data.py | 26 +++++++++-- .../how_to_aggregate_evaluations.ipynb | 37 ++++++++++------ .../how_tos/how_to_evaluate_runs.ipynb | 43 ++++++++++++------- .../how_to_retrieve_data_for_analysis.ipynb | 9 +--- .../how_to_run_a_task_on_a_dataset.ipynb | 31 +++++++------ 5 files changed, 94 insertions(+), 52 deletions(-) diff --git a/src/examples/how_tos/example_data.py b/src/examples/how_tos/example_data.py index 1c4005e35..c9d60dd33 100644 --- a/src/examples/how_tos/example_data.py +++ b/src/examples/how_tos/example_data.py @@ -1,4 +1,4 @@ -from typing import Sequence +from typing import Iterable, Sequence from pydantic import BaseModel @@ -16,6 +16,7 @@ RunOverview, SuccessfulExampleOutput, ) +from intelligence_layer.evaluation.aggregation.aggregator import AggregationLogic class DummyExample(Example[str, str]): @@ -41,6 +42,15 @@ def do_evaluate( ) +class DummyAggregation(BaseModel): + num_evaluations: int + + +class DummyAggregationLogic(AggregationLogic[DummyEvaluation, DummyAggregation]): + def aggregate(self, evaluations: Iterable[DummyEvaluation]) -> DummyAggregation: + return DummyAggregation(num_evaluations=len(list(evaluations))) + + class ExampleData: examples: Sequence[DummyExample] dataset_repository: InMemoryDatasetRepository @@ -51,7 +61,8 @@ class ExampleData: dataset: Dataset run_overview_1: RunOverview run_overview_2: RunOverview - evaluation_overview: EvaluationOverview + evaluation_overview_1: EvaluationOverview + evaluation_overview_2: EvaluationOverview def example_data() -> ExampleData: @@ -78,7 +89,12 @@ def example_data() -> ExampleData: "my-evaluator", DummyEvaluationLogic(), ) - evaluation_overview = evaluator.evaluate_runs(run_overview_1.id, run_overview_2.id) + evaluation_overview_1 = evaluator.evaluate_runs( + run_overview_1.id, run_overview_2.id + ) + evaluation_overview_2 = evaluator.evaluate_runs( + run_overview_1.id, run_overview_2.id + ) example_data = ExampleData() example_data.examples = examples @@ -90,5 +106,7 @@ def example_data() -> ExampleData: example_data.dataset = dataset example_data.run_overview_1 = run_overview_1 example_data.run_overview_2 = run_overview_2 - example_data.evaluation_overview = evaluation_overview + example_data.evaluation_overview_1 = evaluation_overview_1 + example_data.evaluation_overview_2 = evaluation_overview_2 + return example_data diff --git a/src/examples/how_tos/how_to_aggregate_evaluations.ipynb b/src/examples/how_tos/how_to_aggregate_evaluations.ipynb index 5404436f1..2dadcce47 100644 --- a/src/examples/how_tos/how_to_aggregate_evaluations.ipynb +++ b/src/examples/how_tos/how_to_aggregate_evaluations.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from example_data import DummyAggregationLogic, example_data\n", + "\n", + "from intelligence_layer.evaluation.aggregation.aggregator import Aggregator\n", + "from intelligence_layer.evaluation.aggregation.in_memory_aggregation_repository import (\n", + " InMemoryAggregationRepository,\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -20,15 +34,21 @@ "metadata": {}, "outputs": [], "source": [ - "%%script false --no-raise-error # the following code does not execute as the evaluations do not exist\n", - "\n", "# Step 0\n", - "evaluation_ids = [\"eval_of_interest\", \"other_eval_of_interest\"]\n", + "\n", + "\n", + "my_example_data = example_data()\n", + "print()\n", + "\n", + "evaluation_ids = [\n", + " my_example_data.evaluation_overview_1.id,\n", + " my_example_data.evaluation_overview_2.id,\n", + "]\n", "\n", "# Step 1\n", - "evaluation_repository = InMemoryEvaluationRepository()\n", + "evaluation_repository = my_example_data.evaluation_repository\n", "aggregation_repository = InMemoryAggregationRepository()\n", - "aggregation_logic = SingleLabelClassifyAggregationLogic()\n", + "aggregation_logic = DummyAggregationLogic()\n", "\n", "# Step 2\n", "aggregator = Aggregator(\n", @@ -42,13 +62,6 @@ "# Step 3\n", "print(aggregation_overview.id)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/src/examples/how_tos/how_to_evaluate_runs.ipynb b/src/examples/how_tos/how_to_evaluate_runs.ipynb index 8d471b173..403ebcb1a 100644 --- a/src/examples/how_tos/how_to_evaluate_runs.ipynb +++ b/src/examples/how_tos/how_to_evaluate_runs.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from example_data import DummyEvaluationLogic, example_data\n", + "\n", + "from intelligence_layer.evaluation.evaluation.evaluator import Evaluator\n", + "from intelligence_layer.evaluation.evaluation.in_memory_evaluation_repository import (\n", + " InMemoryEvaluationRepository,\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -25,34 +39,31 @@ "metadata": {}, "outputs": [], "source": [ - "%%script false --no-raise-error # the following code does not execute as the runs do not exist\n", - "\n", "# Step 0\n", - "run_ids = [\"run_id_of_interest\", \"other_run_id_of_interest\"]\n", + "my_example_data = example_data()\n", + "print()\n", + "run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_2.id]\n", "\n", "# Step 1\n", - "dataset_repository = InMemoryDatasetRepository()\n", - "run_repository = InMemoryRunRepository()\n", + "dataset_repository = my_example_data.dataset_repository\n", + "run_repository = my_example_data.run_repository\n", "evaluation_repository = InMemoryEvaluationRepository()\n", - "evaluation_logic = SingleLabelClassifyEvaluationLogic()\n", + "evaluation_logic = DummyEvaluationLogic()\n", "\n", "# Step 3\n", - "evaluator = Evaluator(dataset_repository, run_repository, evaluation_repository, \"My joke evaluation\", evaluation_logic)\n", + "evaluator = Evaluator(\n", + " dataset_repository,\n", + " run_repository,\n", + " evaluation_repository,\n", + " \"My dummy evaluation\",\n", + " evaluation_logic,\n", + ")\n", "\n", "evaluation_overview = evaluator.evaluate_runs(*run_ids)\n", "\n", "# Step 4\n", "print(evaluation_overview.id)" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "```python\n", - "```" - ] } ], "metadata": { diff --git a/src/examples/how_tos/how_to_retrieve_data_for_analysis.ipynb b/src/examples/how_tos/how_to_retrieve_data_for_analysis.ipynb index 521772357..57875aa8a 100644 --- a/src/examples/how_tos/how_to_retrieve_data_for_analysis.ipynb +++ b/src/examples/how_tos/how_to_retrieve_data_for_analysis.ipynb @@ -59,7 +59,7 @@ ")\n", "\n", "# retrieve all evaluations, and an evaluation for an example\n", - "my_evaluation_id = example_data.evaluation_overview.id\n", + "my_evaluation_id = example_data.evaluation_overview_1.id\n", "my_evaluations = evaluation_repository.example_evaluations(\n", " my_evaluation_id, evaluation_type=DummyEvaluation\n", ")\n", @@ -117,13 +117,6 @@ "my_lineage = my_evaluator.evaluation_lineage(my_evaluation_id, my_example_id)\n", "display(my_lineage)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/src/examples/how_tos/how_to_run_a_task_on_a_dataset.ipynb b/src/examples/how_tos/how_to_run_a_task_on_a_dataset.ipynb index 28607804c..8b9358268 100644 --- a/src/examples/how_tos/how_to_run_a_task_on_a_dataset.ipynb +++ b/src/examples/how_tos/how_to_run_a_task_on_a_dataset.ipynb @@ -1,12 +1,26 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from example_data import DummyTask, example_data\n", + "\n", + "from intelligence_layer.evaluation.run.in_memory_run_repository import (\n", + " InMemoryRunRepository,\n", + ")\n", + "from intelligence_layer.evaluation.run.runner import Runner" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "# How to run a task on a dataset\n", "0. Create a suitable dataset (see [here](./how_to_create_a_dataset.ipynb)) and a task (see [here](./how_to_implement_a_task.ipynb)).\n", - "1. Initialize the task, a `DatasetRepository` and a `RunRepository`\n", + "1. Initialize the task and a `RunRepository`, and open the correct `DatasetRepository`\n", " - The `DatasetRepository` needs to contain the dataset.\n", " - The `RunRepository` stores results.\n", "2. Use the `Runner` to run the task on the given dataset via `run_dataset`\n", @@ -21,25 +35,18 @@ "metadata": {}, "outputs": [], "source": [ - "%%script false --no-raise-error # the following code does not execute as the dataset does not exist\n", - "\n", "# Step 0\n", - "dataset_id = \"my-dataset-id\"\n", - "\n", + "my_example_data = example_data()\n", + "print()\n", "\n", "# Step 1\n", - "class DummyTask(Task[None, None]):\n", - " def do_run(self, input: None, task_span: TaskSpan) -> None:\n", - " return None\n", - "\n", - "\n", - "dataset_repository = InMemoryDatasetRepository()\n", + "dataset_repository = my_example_data.dataset_repository\n", "run_repository = InMemoryRunRepository()\n", "task = DummyTask()\n", "\n", "# Step 2\n", "runner = Runner(task, dataset_repository, run_repository, \"MyRunDescription\")\n", - "run_overview = runner.run_dataset(dataset_id)\n", + "run_overview = runner.run_dataset(my_example_data.dataset.id)\n", "\n", "# Step 3\n", "print(run_overview.id)"