diff --git a/CHANGELOG.md b/CHANGELOG.md index 82be2fb01..01fa80a0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ ... ### New Features - - Add `how_to_implement_complete_incremental_evaluation_flow` + - Add `how_to_implement_incremental_evaluation`. ### Fixes - The document index client now correctly URL-encodes document names in its queries. diff --git a/README.md b/README.md index f33b089f8..dcd03d42b 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,7 @@ The how-tos are quick lookups about how to do things. Compared to the tutorials, | [...retrieve data for analysis](./src/documentation/how_tos/how_to_retrieve_data_for_analysis.ipynb) | Retrieve experiment data in multiple different ways | | [...implement a custom human evaluation](./src/documentation/how_tos/how_to_human_evaluation_via_argilla.ipynb) | Necessary steps to create an evaluation with humans as a judge via Argilla | | [...implement elo evaluations](./src/documentation/how_tos/how_to_implement_elo_evaluations.ipynb) | Evaluate runs and create ELO ranking for them | -| [...implement complete incremental evaluation flow](./src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb) | Run complete incremental evaluation flow from runner to aggretation +| [...implement incremental evaluation](./src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb) | Implement and run an incremental evaluation | # Models Currently, we support a bunch of models accessible via the Aleph Alpha API. Depending on your local setup, you may even have additional models available. 
diff --git a/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb b/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb deleted file mode 100644 index 163e15238..000000000 --- a/src/documentation/how_tos/how_to_implement_complete_incremental_evaluation_flow.ipynb +++ /dev/null @@ -1,205 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from documentation.how_tos.example_data import (\n", - " DummyAggregationLogic,\n", - " DummyEvaluation,\n", - " DummyExample,\n", - " DummyTask,\n", - ")\n", - "from intelligence_layer.evaluation import (\n", - " Aggregator,\n", - " IncrementalEvaluator,\n", - " InMemoryAggregationRepository,\n", - " InMemoryEvaluationRepository,\n", - " InMemoryRunRepository,\n", - " Runner,\n", - ")\n", - "from intelligence_layer.evaluation.dataset.domain import Example\n", - "from intelligence_layer.evaluation.dataset.in_memory_dataset_repository import (\n", - " InMemoryDatasetRepository,\n", - ")\n", - "from intelligence_layer.evaluation.evaluation.evaluator.incremental_evaluator import (\n", - " IncrementalEvaluationLogic,\n", - ")\n", - "from intelligence_layer.evaluation.run.domain import SuccessfulExampleOutput" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# How to implement complete incremental evaluation workflows from running (multiple) tasks to aggregation\n", - "This notebook outlines how to:\n", - " - run multiple tasks and configurations on the same dataset\n", - " - perform evaluations in an incremental fashion, i.e., adding additional runs to your existing evaluations without the need for recalculation\n", - " - run aggregation on these evaluations\n", - " \n", - "## Step-by-Step Guide\n", - "1. 
Setup:\n", - "- Initialize all necessary repositories: \n", - " - dataset\n", - " - run\n", - " - evaluation\n", - " - aggregation\n", - "- Create dataset from example(s)\n", - "- Initialized task(s)\n", - "- Initialize `Runner` for each task \n", - "2. Run task(s) for the dataset (see [here](./how_to_run_a_task_on_a_dataset.ipynb))\n", - "3. Compose a list of IDs of runs you want to evaluate.\n", - "4. Define and initialize an `IncrementalEvaluationLogic`; This is similar to a normal `EvaluationLogic` (see [here](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)) but you also have to implement your own `do_incremental_evaluate` method\n", - "5. Initialize an `IncrementalEvaluator` with the repositories and your custom `IncrementalEvaluationLogic`\n", - "6. Call the `evaluate_runs` method of the `IncrementalEvaluator` to evaluate the run(s) and create a single `EvaluationOverview`\n", - "7. Aggregate your evaluation of the run(s) using the [standard aggregation](./how_to_aggregate_evaluations.ipynb) or using a [custom aggregation logic](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)\n", - "\n", - "#### Steps for addition of new runs \n", - "8. Define and run some new task(s)\n", - "9. Define a list for runs that should not be re-evaluated\n", - "10. Call the `evaluate_additional_runs` method of the `IncrementalEvaluator`:\n", - " - `run_ids`: Runs to be included in the evaluation results, including those that have been evaluated before\n", - " - `previous_evaluation_ids`: Runs **not** to be re-evaluated, depending on the specific implementation of the `do_incremental_evaluate` method\n", - "11. 
Aggregate all your `EvaluationOverview`s in your `EvaluationRepository`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Preparation\n", - "examples = [\n", - " DummyExample(input=\"input1\", expected_output=\"expected_output1\", data=\"data1\")\n", - "]\n", - "\n", - "# Step 1\n", - "dataset_repository = InMemoryDatasetRepository()\n", - "run_repository = InMemoryRunRepository()\n", - "evaluation_repository = InMemoryEvaluationRepository()\n", - "aggregation_repository = InMemoryAggregationRepository()\n", - "\n", - "my_dataset = dataset_repository.create_dataset(examples, \"MyDataset\")\n", - "\n", - "first_task = DummyTask()\n", - "first_runner = Runner(first_task, dataset_repository, run_repository, \"MyFirstRun\")\n", - "\n", - "# Step 2\n", - "first_run_overview = first_runner.run_dataset(my_dataset.id)\n", - "print(f\"ID of first run: {first_run_overview.id}\")\n", - "\n", - "# Step 3\n", - "run_overview_ids_for_first_evaluation = []\n", - "for run_overview in run_repository.run_overviews():\n", - " if (\n", - " run_overview.description == \"MyFirstRun\"\n", - " ): ## This is filter for all the runs you want to include\n", - " run_overview_ids_for_first_evaluation.append(run_overview.id)\n", - "\n", - "\n", - "# Step 4\n", - "class DummyIncrementalEvaluationLogic(\n", - " IncrementalEvaluationLogic[str, str, str, DummyEvaluation]\n", - "):\n", - " def do_incremental_evaluate(\n", - " self,\n", - " example: Example[str, str],\n", - " outputs: list[SuccessfulExampleOutput[str]],\n", - " already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],\n", - " ) -> DummyEvaluation:\n", - " output_str = \"(\" + (\", \".join(o.output for o in outputs)) + \")\"\n", - " return DummyEvaluation(\n", - " eval=f\"{example.input}, {example.expected_output}, {output_str}, {already_evaluated_outputs} -> evaluation\"\n", - " )\n", - "\n", - "\n", - "incremental_evaluation_logic = 
DummyIncrementalEvaluationLogic()\n", - "\n", - "# Step 5\n", - "incremental_evaluator = IncrementalEvaluator(\n", - " dataset_repository,\n", - " run_repository,\n", - " evaluation_repository,\n", - " \"My incremental evaluation\",\n", - " incremental_evaluation_logic,\n", - ")\n", - "\n", - "# Step 6\n", - "evaluation_overview_first_task = incremental_evaluator.evaluate_runs(\n", - " *run_overview_ids_for_first_evaluation\n", - ")\n", - "\n", - "# Step 7\n", - "aggregation_logic = DummyAggregationLogic()\n", - "aggregator = Aggregator(\n", - " evaluation_repository, aggregation_repository, \"MyAggregator\", aggregation_logic\n", - ")\n", - "first_aggregation_overview = aggregator.aggregate_evaluation(\n", - " *evaluation_repository.evaluation_overview_ids()\n", - ")\n", - "print(f\"First aggregation: {first_aggregation_overview}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Addition of new task/run\n", - "# Step 8\n", - "second_task = DummyTask()\n", - "second_runner = Runner(second_task, dataset_repository, run_repository, \"MySecondRun\")\n", - "second_run_overview = second_runner.run_dataset(my_dataset.id)\n", - "print(f\"ID of second run: {second_run_overview.id}\")\n", - "\n", - "# Step 9\n", - "already_evaluated_run_ids = evaluation_repository.evaluation_overview_ids()\n", - "\n", - "# Step 10\n", - "incremental_evaluator.evaluate_additional_runs(\n", - " *run_repository.run_overview_ids(),\n", - " previous_evaluation_ids=already_evaluated_run_ids,\n", - ")\n", - "\n", - "# Step 11\n", - "second_aggregation_overview = aggregator.aggregate_evaluation(\n", - " *evaluation_repository.evaluation_overview_ids()\n", - ")\n", - "print(second_aggregation_overview)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - 
"name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb b/src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb new file mode 100644 index 000000000..52278c53e --- /dev/null +++ b/src/documentation/how_tos/how_to_implement_incremental_evaluation.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from documentation.how_tos.example_data import (\n", + " DummyAggregationLogic,\n", + " DummyEvaluation,\n", + " DummyExample,\n", + " example_data,\n", + ")\n", + "from intelligence_layer.evaluation import (\n", + " Aggregator,\n", + " Example,\n", + " IncrementalEvaluationLogic,\n", + " IncrementalEvaluator,\n", + " InMemoryAggregationRepository,\n", + " InMemoryEvaluationRepository,\n", + " SuccessfulExampleOutput,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to implement incremental evaluation\n", + "This notebook outlines how to perform evaluations in an incremental fashion, i.e., adding additional runs to your existing evaluations without the need for recalculation.\n", + " \n", + "## Step-by-Step Guide\n", + "0. Run your tasks on the datasets on which you want to evaluate them (see [here](./how_to_run_a_task_on_a_dataset.ipynb))\n", + " - When evaluating multiple runs, all of them need the same data types \n", + "1. 
Initialize all necessary repositories and define your `IncrementalEvaluationLogic`. It is similar to a normal `EvaluationLogic` (see [here](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)), but you additionally have to implement your own `do_incremental_evaluate` method\n", + "2. Initialize an `IncrementalEvaluator` with the repositories and your custom `IncrementalEvaluationLogic`\n", + "3. Call the `evaluate_runs` method of the `IncrementalEvaluator`\n", + "4. Aggregate your evaluations using the [standard aggregation](./how_to_aggregate_evaluations.ipynb) or using a [custom aggregation logic](./how_to_implement_a_simple_evaluation_and_aggregation_logic.ipynb)\n", + "\n", + "#### Steps for addition of new runs \n", + "5. Call the `evaluate_additional_runs` method of the `IncrementalEvaluator`:\n", + " - `run_ids`: IDs of all runs to be included in the evaluation results, including those that have been evaluated before\n", + " - `previous_evaluation_ids`: IDs of the previous evaluation overviews; the runs they cover are **not** re-evaluated, depending on the specific implementation of the `do_incremental_evaluate` method\n", + "6. 
Aggregate all your `EvaluationOverview`s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 0: reuse the dataset and run repositories prepared by the example data\n", + "examples = [\n", + " DummyExample(input=\"input1\", expected_output=\"expected_output1\", data=\"data1\")\n", + "]\n", + "my_example_data = example_data()\n", + "\n", + "dataset_repository = my_example_data.dataset_repository\n", + "run_repository = my_example_data.run_repository\n", + "\n", + "# Step 1: repositories for the results and the custom incremental logic\n", + "evaluation_repository = InMemoryEvaluationRepository()\n", + "aggregation_repository = InMemoryAggregationRepository()\n", + "\n", + "\n", + "class DummyIncrementalEvaluationLogic(\n", + " IncrementalEvaluationLogic[str, str, str, DummyEvaluation]\n", + "):\n", + " def do_incremental_evaluate(\n", + " self,\n", + " example: Example[str, str],\n", + " outputs: list[SuccessfulExampleOutput[str]],\n", + " already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],\n", + " ) -> DummyEvaluation:\n", + " return DummyEvaluation(eval=\"DummyEvalResult\")\n", + "\n", + "\n", + "# Step 2: wire the repositories and the logic into the evaluator\n", + "incremental_evaluator = IncrementalEvaluator(\n", + " dataset_repository,\n", + " run_repository,\n", + " evaluation_repository,\n", + " \"My incremental evaluation\",\n", + " DummyIncrementalEvaluationLogic(),\n", + ")\n", + "\n", + "# Step 3: evaluate the first run only\n", + "incremental_evaluator.evaluate_runs(my_example_data.run_overview_1.id)\n", + "\n", + "# Step 4: aggregate every evaluation overview stored so far\n", + "aggregation_logic = DummyAggregationLogic()\n", + "aggregator = Aggregator(\n", + " evaluation_repository, aggregation_repository, \"MyAggregator\", aggregation_logic\n", + ")\n", + "aggregation_overview = aggregator.aggregate_evaluation(\n", + " *evaluation_repository.evaluation_overview_ids()\n", + ")\n", + "print(aggregation_overview)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Addition of new task/run\n", + "# Step 5: pass the run evaluated in step 3 together with the new, second run\n", + "run_ids = [my_example_data.run_overview_1.id,
my_example_data.run_overview_2.id]\n", + "incremental_evaluator.evaluate_additional_runs(\n", + " *run_ids,\n", + " previous_evaluation_ids=evaluation_repository.evaluation_overview_ids(),\n", + ")\n", + "\n", + "# Step 6: aggregate again, now including the additional evaluation\n", + "second_aggregation_overview = aggregator.aggregate_evaluation(\n", + " *evaluation_repository.evaluation_overview_ids()\n", + ")\n", + "print(second_aggregation_overview)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}