Commit

feat: Check if run has already happened (#1079)
Co-authored-by: Sebastian Niehus <[email protected]>
JohannesWesch and SebastianNiehusAA authored Oct 15, 2024
1 parent 592e746 commit 6dbf754
Showing 7 changed files with 133 additions and 5 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
@@ -2,7 +2,9 @@
## Unreleased

### Features
...
- Add Catalan and Polish support to `DetectLanguage`.
- Add utility function `run_is_already_computed` to `Runner` to check if a run with the given metadata has already been computed.
- The `parameter_optimization` notebook describes how to use the `run_is_already_computed` function.
### Fixes
...
### Deprecations
@@ -14,7 +16,6 @@
## 7.0.0

### Features
- Add Catalan and Polish support to `detectLanguage`.
- You can now specify a `hybrid_index` when creating an index for the document index to use hybrid (semantic and keyword) search.
- `min_score` and `max_results` are now optional parameters in `DocumentIndexClient.SearchQuery`.
- `k` is now an optional parameter in `DocumentIndexRetriever`.
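For context, here is a minimal sketch of how the new `run_is_already_computed` utility is intended to be used, mirroring the `parameter_optimization` notebook and the test added further down in this commit. The dummy task, example data, and repository set-up below are placeholders for illustration, not part of the commit:

# Illustrative sketch only: DummyEchoTask and the in-memory repositories stand in
# for whatever task and repositories an experiment actually uses.
from intelligence_layer.core import Task, TaskSpan
from intelligence_layer.evaluation import (
    Example,
    InMemoryDatasetRepository,
    InMemoryRunRepository,
    Runner,
)


class DummyEchoTask(Task[str, str]):
    def do_run(self, input: str, task_span: TaskSpan) -> str:
        return input


dataset_repository = InMemoryDatasetRepository()
run_repository = InMemoryRunRepository()
runner = Runner(DummyEchoTask(), dataset_repository, run_repository, "demo-experiment")

dataset_id = dataset_repository.create_dataset(
    examples=[Example(input="Hello", expected_output=None)], dataset_name="demo"
).id

metadata = {"model": "model a", "prompt": "prompt 1"}
# Only start the run if no run with identical metadata exists in the run repository.
if not runner.run_is_already_computed(metadata):
    runner.run_dataset(dataset_id, metadata=metadata)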
33 changes: 31 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -31,6 +31,7 @@ rouge-score = "^0.1.2"
sacrebleu = "^2.4.3"
lingua-language-detector = "^2.0.2"
argilla = "^2.1.0"
dict-hash = "^1.3.4"

[tool.poetry.group.dev.dependencies]
# lint & format
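The new `dict-hash` dependency is what the `run_is_already_computed` check below relies on: it turns a metadata dictionary into a deterministic hash so that two runs with equal metadata can be detected. A minimal sketch, assuming the PyPI `dict-hash` package behaves as it is used in `runner.py` below:

# Sketch of the dict-hash behaviour the new check relies on (assumption: the
# PyPI dict-hash package, used the same way as in runner.py below).
from dict_hash import dict_hash

metadata_a = {"model": "model a", "prompt": "prompt 1"}
metadata_b = {"prompt": "prompt 1", "model": "model a"}  # same content, different key order

# dict_hash produces a consistent hash for equal dictionaries, so both metadata
# dicts map to the same value, which is what makes the "already computed?" check work.
assert dict_hash(metadata_a) == dict_hash(metadata_b)
print(dict_hash(metadata_a))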
@@ -74,6 +74,7 @@
" outputs: list[SuccessfulExampleOutput[str]],\n",
" already_evaluated_outputs: list[list[SuccessfulExampleOutput[str]]],\n",
" ) -> DummyEvaluation:\n",
" # Here we just return the dummy evaluation. In a real use case one could also use `already_evaluated_outputs' to skip previous evaluations,\n",
" return DummyEvaluation(eval=\"DummyEvalResult\")\n",
"\n",
"\n",
@@ -108,7 +109,7 @@
"source": [
"## Addition of new task/run\n",
"# Step 5\n",
"run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_1.id]\n",
"run_ids = [my_example_data.run_overview_1.id, my_example_data.run_overview_2.id]\n",
"incremental_evaluator.evaluate_additional_runs(\n",
" *run_ids,\n",
" previous_evaluation_ids=evaluation_repository.evaluation_overview_ids(),\n",
57 changes: 57 additions & 0 deletions src/documentation/parameter_optimization.ipynb
@@ -299,6 +299,63 @@
"source": [
"With these results, it's easy to see which prompt is best to optimize our score! The model on the other hand does not seem to have a big impact on our metrics."
]
},
{
"cell_type": "markdown",
"id": "11",
"metadata": {},
"source": [
"## Adding Different Run Configurations"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12",
"metadata": {},
"outputs": [],
"source": [
"# Add a new model to the run configuration\n",
"model_list.append(\"model d\")\n",
"\n",
"# Loop over all combinations of parameters and run the `Task` for each combination.\n",
"# We set `recompute_if_metadata_changed` to `True` to only run the new model `model d` with the existing prompts.\n",
"# The previous runs are not recomputed and keep their old results, unless the metadata of the run has changed.\n",
"# Note, that currently the evaluations are recomputed for all runs.\n",
"for i, (model, prompt) in enumerate(itertools.product(model_list, prompt_list)):\n",
" dummy_task = DummyTask(model=model, prompt=prompt)\n",
"\n",
" # Model and prompt are stored in the metadata to specify the configuration of the current experiment\n",
" metadata = dict({\"model\": model, \"prompt\": prompt})\n",
" description = f\"Evaluate dummy task {i}\"\n",
" runner = Runner(dummy_task, dataset_repository, run_repository, EXPERIMENT_NAME)\n",
" if not runner.run_is_already_computed(metadata):\n",
" run_overview = runner.run_dataset(\n",
" dataset.id,\n",
" metadata=metadata,\n",
" description=description,\n",
" labels=labels,\n",
" )\n",
"\n",
" eval_overview = evaluator.evaluate_runs(\n",
" run_overview.id,\n",
" metadata=metadata,\n",
" description=description,\n",
" labels=labels,\n",
" )\n",
"\n",
" aggregator.aggregate_evaluation(\n",
" eval_overview.id, metadata=metadata, description=description, labels=labels\n",
" )\n",
"\n",
"# Let's print the number of run overviews. Because we have not recomputed the 12 (3 models times 4 examples) runs we now expect 16 runs in total.\n",
"# Without the `recompute_if_metadata_changed` flag we would get 28 runs!\n",
"print(len(list(run_repository.run_overviews())))\n",
"print(len(list(evaluation_repository.evaluation_overviews())))\n",
"print(\n",
" len(list(aggregation_repository.aggregation_overviews(DummyAggregatedEvaluation)))\n",
")"
]
}
],
"metadata": {
19 changes: 19 additions & 0 deletions src/intelligence_layer/evaluation/run/runner.py
@@ -6,6 +6,7 @@
from typing import Generic, Optional, cast
from uuid import uuid4

from dict_hash import dict_hash # type: ignore
from pydantic import JsonValue

from intelligence_layer.connectors.base.json_serializable import (
@@ -279,3 +280,21 @@ def run_lineage(
expected_output_type=expected_output_type,
output_type=self.output_type(),
)

def run_is_already_computed(
    self,
    metadata: SerializableDict,
) -> bool:
    """Checks if a run with the given metadata has already been computed.

    Args:
        metadata: The metadata dictionary to check.

    Returns:
        True if a run with the same metadata has already been computed. False otherwise.
    """
    previous_runs = {
        dict_hash(run_overview.metadata)
        for run_overview in self._run_repository.run_overviews()
    }
    return dict_hash(metadata) in previous_runs
20 changes: 20 additions & 0 deletions tests/evaluation/run/test_runner.py
Original file line number Diff line number Diff line change
@@ -215,3 +215,23 @@ def test_runner_run_overview_has_specified_metadata_and_labels(

assert overview.metadata == run_metadata
assert overview.labels == run_labels


def test_run_is_already_computed_works(
    in_memory_dataset_repository: InMemoryDatasetRepository,
    in_memory_run_repository: InMemoryRunRepository,
    sequence_examples: Iterable[Example[str, None]],
) -> None:
    old_model = "old_model"
    examples = list(sequence_examples)
    task = DummyTask()
    runner = Runner(task, in_memory_dataset_repository, in_memory_run_repository, "foo")
    dataset_id = in_memory_dataset_repository.create_dataset(
        examples=examples, dataset_name=""
    ).id

    run_metadata: SerializableDict = dict({"model": old_model})
    runner.run_dataset(dataset_id, metadata=run_metadata)

    assert runner.run_is_already_computed(dict({"model": old_model}))
    assert not runner.run_is_already_computed(dict({"model": "new_model"}))
