Commit

feat: Check if run has already happened (#1079)
Co-authored-by: Sebastian Niehus <[email protected]>
JohannesWesch and SebastianNiehusAA authored Oct 15, 2024
1 parent 592e746 commit 6dbf754
Showing 7 changed files with 133 additions and 5 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
@@ -2,7 +2,9 @@
## Unreleased

### Features
...
- Add Catalan and Polish support to `DetectLanguage`.
- Add utility function `run_is_already_computed` to `Runner` to check if a run with the given metadata has already been computed.
- The `parameter_optimization` notebook describes how to use the `run_is_already_computed` function.
### Fixes
...
### Deprecations
@@ -14,7 +16,6 @@
## 7.0.0

### Features
- Add Catalan and Polish support to `detectLanguage`.
- You can now specify a `hybrid_index` when creating an index for the document index to use hybrid (semantic and keyword) search.
- `min_score` and `max_results` are now optional parameters in `DocumentIndexClient.SearchQuery`.
- `k` is now an optional parameter in `DocumentIndexRetriever`.
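For context, here is a minimal sketch of how the new `run_is_already_computed` utility is intended to be used, mirroring the `parameter_optimization` notebook and the test added further down in this commit. The dummy task, example data, and repository set-up below are placeholders for illustration, not part of the commit:

# Illustrative sketch only: DummyEchoTask and the in-memory repositories stand in
# for whatever task and repositories an experiment actually uses.
from intelligence_layer.core import Task, TaskSpan
from intelligence_layer.evaluation import (
    Example,
    InMemoryDatasetRepository,
    InMemoryRunRepository,
    Runner,
)


class DummyEchoTask(Task[str, str]):
    def do_run(self, input: str, task_span: TaskSpan) -> str:
        return input


dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
runner = Runner(DummyEchoTask(), dataset_repository, run_repository, "demo-experiment")

dataset_id = dataset_repository.create_dataset(
    examples=[Example(input="Hello", expected_output=None)], dataset_name="demo"
).id

metadata = {"model": "model a", "prompt": "prompt 1"}
# Only start the run if no run with identical metadata exists in the run repository.
if not runner.run_is_already_computed(metadata):
    runner.run_dataset(dataset_id, metadata=metadata)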
33 changes: 31 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -31,6 +31,7 @@ rouge-score = "^0.1.2"
sacrebleu = "^2.4.3"
lingua-language-detector = "^2.0.2"
argilla = "^2.1.0"
dict-hash = "^1.3.4"

[tool.poetry.group.dev.dependencies]
# lint & format
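The new `dict-hash` dependency is what the `run_is_already_computed` check below relies on: it turns a metadata dictionary into a deterministic hash so that two runs with equal metadata can be detected. A minimal sketch, assuming the PyPI `dict-hash` package behaves as it is used in `runner.py` below:

# Sketch of the dict-hash behaviour the new check relies on (assumption: the
# PyPI dict-hash package, used the same way as in runner.py below).
from dict_hash import dict_hash

metadata_a = {"model": "model a", "prompt": "prompt 1"}
metadata_b = {"prompt": "prompt 1", "model": "model a"}  # same content, different key order

# dict_hash produces a consistent hash for equal dictionaries, so both metadata
# dicts map to the same value, which is what makes the "already computed?" check work.
assert dict_hash(metadata_a) == dict_hash(metadata_b)
print(dict_hash(metadata_a))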
@@ -74,6 +74,7 @@
" outputs: list[SuccessfulExampleOutput[str]],\n",
" already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],\n",
" ) -> DummyEvaluation:\n",
" # Here we just return the dummy evaluation. In a real use case one could also use `already_evaluated_outputs' to skip previous evaluations,\n",
" return DummyEvaluation(eval=\"DummyEvalResult\")\n",
"\n",
"\n",
@@ -108,7 +109,7 @@
"source": [
"## Addition of new task/run\n",
"# Step 5\n",
"run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_1.id]\n",
"run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_2.id]\n",
"incremental_evaluator.evaluate_additional_runs(\n",
" *run_ids,\n",
" previous_evaluation_ids=evaluation_repository.evaluation_overview_ids(),\n",
57 changes: 57 additions & 0 deletions src/documentation/parameter_optimization.ipynb
@@ -299,6 +299,63 @@
"source": [
"With these results, it's easy to see which prompt is best to optimize our score! The model on the other hand does not seem to have a big impact on our metrics."
]
},
{
"cell_type": "markdown",
"id": "11",
"metadata": {},
"source": [
"## Adding Different Run Configurations"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12",
"metadata": {},
"outputs": [],
"source": [
"# Add a new model to the run configuration\n",
"model_list.append(\"model d\")\n",
"\n",
"# Loop over all combinations of parameters and run the `Task` for each combination.\n",
"# We set `recompute_if_metadata_changed` to `True` to only run the new model `model d` with the existing prompts.\n",
"# The previous runs are not recomputed and keep their old results, unless the metadata of the run has changed.\n",
"# Note, that currently the evaluations are recomputed for all runs.\n",
"for i, (model, prompt) in enumerate(itertools.product(model_list, prompt_list)):\n",
" dummy_task = DummyTask(model=model, prompt=prompt)\n",
"\n",
" # Model and prompt are stored in the metadata to specify the configuration of the current experiment\n",
" metadata = dict({\"model\": model, \"prompt\": prompt})\n",
" description = f\"Evaluate dummy task {i}\"\n",
" runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n",
" if not runner.run_is_already_computed(metadata):\n",
" run_overview = runner.run_dataset(\n",
" dataset.id,\n",
" metadata=metadata,\n",
" description=description,\n",
" labels=labels,\n",
" )\n",
"\n",
" eval_overview = evaluator.evaluate_runs(\n",
" run_overview.id,\n",
" metadata=metadata,\n",
" description=description,\n",
" labels=labels,\n",
" )\n",
"\n",
" aggregator.aggregate_evaluation(\n",
" eval_overview.id, metadata=metadata, description=description, labels=labels\n",
" )\n",
"\n",
"# Let's print the number of run overviews. Because we have not recomputed the 12 (3 models times 4 examples) runs we now expect 16 runs in total.\n",
"# Without the `recompute_if_metadata_changed` flag we would get 28 runs!\n",
"print(len(list(run_repository.run_overviews())))\n",
"print(len(list(evaluation_repository.evaluation_overviews())))\n",
"print(\n",
" len(list(aggregation_repository.aggregation_overviews(DummyAggregatedEvaluation)))\n",
")"
]
}
],
"metadata": {
19 changes: 19 additions & 0 deletions src/intelligence_layer/evaluation/run/runner.py
@@ -6,6 +6,7 @@
from typing import Generic, Optional, cast
from uuid import uuid4

from dict_hash import dict_hash # type: ignore
from pydantic import JsonValue

from intelligence_layer.connectors.base.json_serializable import (
@@ -279,3 +280,21 @@ def run_lineage(
expected_output_type=expected_output_type,
output_type=self.output_type(),
)

def run_is_already_computed(
    self,
    metadata: SerializableDict,
) -> bool:
    """Checks if a run with the given metadata has already been computed.

    Args:
        metadata: The metadata dictionary to check.

    Returns:
        True if a run with the same metadata has already been computed. False otherwise.
    """
    previous_runs = {
        dict_hash(run_overview.metadata)
        for run_overview in self._run_repository.run_overviews()
    }
    return dict_hash(metadata) in previous_runs
20 changes: 20 additions & 0 deletions tests/evaluation/run/test_runner.py
Original file line number Diff line number Diff line change
@@ -215,3 +215,23 @@ def test_runner_run_overview_has_specified_metadata_and_labels(

assert overview.metadata == run_metadata
assert overview.labels == run_labels


def test_run_is_already_computed_works(
    in_memory_dataset_repository: InMemoryDatasetRepository,
    in_memory_run_repository: InMemoryRunRepository,
    sequence_examples: Iterable[Example[str, None]],
) -> None:
    old_model = "old_model"
    examples = list(sequence_examples)
    task = DummyTask()
    runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "foo")
    dataset_id = in_memory_dataset_repository.create_dataset(
        examples=examples, dataset_name=""
    ).id

    run_metadata: SerializableDict = dict({"model": old_model})
    runner.run_dataset(dataset_id, metadata=run_metadata)

    assert runner.run_is_already_computed(dict({"model": old_model}))
    assert not runner.run_is_already_computed(dict({"model": "new_model"}))
