From 85eed4ea8798edd69117d8cc89af5e277a47b8f7 Mon Sep 17 00:00:00 2001
From: Valentina Galata <valentina.galata@tngtech.com>
Date: Tue, 12 Mar 2024 11:44:15 +0100
Subject: [PATCH] feat(dataset-repository): add Dataset model to
 dataset-repository

Task: IL-238
---
 src/examples/human_evaluation.ipynb           |   7 +-
 src/examples/quickstart_task.ipynb            |   8 +-
 .../data_storage/dataset_repository.py        | 191 +++++++++++++-----
 src/intelligence_layer/evaluation/domain.py   |  12 ++
 .../evaluation/hugging_face.py                |   8 +-
 .../use_cases/classify/evaluation.ipynb       |  10 +-
 tests/evaluation/conftest.py                  |   4 +-
 tests/evaluation/test_dataset_repository.py   | 135 +++++++++++--
 tests/evaluation/test_evaluator.py            |   8 +-
 tests/evaluation/test_hugging_face.py         |  47 +++--
 ...t_instruct_comparison_argilla_evaluator.py |   7 +-
 tests/evaluation/test_run.py                  |   8 +-
 tests/evaluation/test_runner.py               |   8 +-
 tests/use_cases/classify/test_classify.py     |   8 +-
 .../classify/test_prompt_based_classify.py    |  18 +-
 tests/use_cases/summarize/test_summarize.py   |  16 +-
 16 files changed, 364 insertions(+), 131 deletions(-)

diff --git a/src/examples/human_evaluation.ipynb b/src/examples/human_evaluation.ipynb
index 9251890ef..75764683d 100644
--- a/src/examples/human_evaluation.ipynb
+++ b/src/examples/human_evaluation.ipynb
@@ -42,7 +42,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
     "from typing import Iterable, cast\n",
     "\n",
     "from datasets import load_dataset\n",
@@ -77,7 +76,6 @@
     "    Runner,\n",
     "    SuccessfulExampleOutput,\n",
     ")\n",
-    "\n",
     "from intelligence_layer.evaluation.argilla import ArgillaAggregator\n",
     "\n",
     "load_dotenv()\n",
@@ -163,8 +161,9 @@
     "            id=str(dataset[\"meta\"][i][\"id\"]),\n",
     "        )\n",
     "        for i in range(num_examples)\n",
-    "    ]\n",
-    ")"
+    "    ],\n",
+    "    dataset_name=\"human-evaluation-dataset\",\n",
+    ").id"
    ]
   },
   {
diff --git a/src/examples/quickstart_task.ipynb b/src/examples/quickstart_task.ipynb
index 693ce1871..4e1a24ca8 100644
--- a/src/examples/quickstart_task.ipynb
+++ b/src/examples/quickstart_task.ipynb
@@ -299,8 +299,9 @@
     "expected_output = KeywordExtractionExpectedOutput(keywords=[\"dolphins\", \"sharks\"])\n",
     "\n",
     "single_example_dataset = dataset_repository.create_dataset(\n",
-    "    examples=[Example(input=model_input, expected_output=expected_output)]\n",
-    ")\n",
+    "    examples=[Example(input=model_input, expected_output=expected_output)],\n",
+    "    dataset_name=\"quickstart-task-single-example-dataset\",\n",
+    ").id\n",
     "\n",
     "run_overview = runner.run_dataset(single_example_dataset, NoOpTracer())"
    ]
@@ -483,7 +484,8 @@
     "            ),\n",
     "        ),\n",
     "    ],\n",
-    ")\n",
+    "    dataset_name=\"human-evaluation-multiple-examples-dataset\",\n",
+    ").id\n",
     "\n",
     "run = runner.run_dataset(dataset_id)\n",
     "evaluation_overview = evaluator.evaluate_runs(run.id)\n",
diff --git a/src/intelligence_layer/evaluation/data_storage/dataset_repository.py b/src/intelligence_layer/evaluation/data_storage/dataset_repository.py
index 3f7140f45..72aae428f 100644
--- a/src/intelligence_layer/evaluation/data_storage/dataset_repository.py
+++ b/src/intelligence_layer/evaluation/data_storage/dataset_repository.py
@@ -1,13 +1,12 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Dict, Iterable, Optional, Sequence, cast
-from uuid import uuid4
+from typing import Iterable, Optional, Sequence, Tuple, cast
 
 from fsspec import AbstractFileSystem  # type: ignore
 from fsspec.implementations.local import LocalFileSystem  # type: ignore
 
 from intelligence_layer.core import Input, JsonSerializer, PydanticSerializable
-from intelligence_layer.evaluation.domain import Example, ExpectedOutput
+from intelligence_layer.evaluation.domain import Dataset, Example, ExpectedOutput
 
 
 class DatasetRepository(ABC):
@@ -18,16 +17,16 @@ class DatasetRepository(ABC):
 
     @abstractmethod
     def create_dataset(
-        self,
-        examples: Iterable[Example[Input, ExpectedOutput]],
-    ) -> str:
+        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+    ) -> Dataset:
         """Creates a dataset from given :class:`Example`s and returns the ID of that dataset.
 
         Args:
             examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset.
+            dataset_name: A name for the dataset.
 
         Returns:
-            The ID of the created dataset.
+            The created :class:`Dataset`.
         """
         pass
 
@@ -40,6 +39,29 @@ def delete_dataset(self, dataset_id: str) -> None:
         """
         pass
 
+    @abstractmethod
+    def dataset(self, dataset_id: str) -> Optional[Dataset]:
+        """Returns a dataset identified by the given dataset ID.
+
+        Args:
+            dataset_id: Dataset ID of the dataset to retrieve.
+
+        Returns:
+            :class:`Dataset` if it was found, `None` otherwise.
+        """
+        pass
+
+    def datasets(self) -> Iterable[Dataset]:
+        """Returns all :class:`Dataset`s sorted by their ID.
+
+        Returns:
+            :class:`Iterable` of :class:`Dataset`s.
+        """
+        for dataset_id in self.dataset_ids():
+            dataset = self.dataset(dataset_id)
+            if dataset is not None:
+                yield dataset
+
     @abstractmethod
     def dataset_ids(self) -> Iterable[str]:
         """Returns all sorted dataset IDs.
@@ -93,40 +115,79 @@ def examples(
 class FileSystemDatasetRepository(DatasetRepository):
     _REPO_TYPE = "dataset"
 
-    def __init__(self, fs: AbstractFileSystem, root_directory: str) -> None:
+    def __init__(self, filesystem: AbstractFileSystem, root_directory: Path) -> None:
         super().__init__()
-        assert root_directory[-1] != "/"
-        self._fs = fs
+
+        assert str(root_directory)[-1] != "/"
+        root_directory.mkdir(parents=True, exist_ok=True)
+
+        self._file_system = filesystem
         self._root_directory = root_directory
 
-    def create_dataset(self, examples: Iterable[Example[Input, ExpectedOutput]]) -> str:
-        dataset_id = str(uuid4())
-        dataset_path = self._dataset_path(dataset_id)
-        if self._fs.exists(dataset_path):
-            raise ValueError(f"Dataset name {dataset_id} already taken")
+    def create_dataset(
+        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+    ) -> Dataset:
+        dataset = Dataset(name=dataset_name)
+        try:
+            self._dataset_directory(dataset.id).mkdir(exist_ok=False)
+        except OSError:
+            raise ValueError(
+                f"Created random dataset ID already exists for dataset {dataset}. This should not happen."
+            )
+
+        dataset_path = self._dataset_path(dataset.id)
+        examples_path = self._dataset_examples_path(dataset.id)
+        if self._file_system.exists(dataset_path) or self._file_system.exists(
+            examples_path
+        ):
+            raise ValueError(
+                f"One of the dataset files already exist for dataset {dataset}. This should not happen. Files: {dataset_path}, {examples_path}."
+            )
 
-        with self._fs.open(dataset_path, "w", encoding="utf-8") as examples_file:
+        with self._file_system.open(
+            str(dataset_path), "w", encoding="utf-8"
+        ) as dataset_file:
+            dataset_file.write(JsonSerializer(root=dataset).model_dump_json() + "\n")
+
+        with self._file_system.open(
+            str(examples_path), "w", encoding="utf-8"
+        ) as examples_file:
             for example in examples:
                 serialized_result = JsonSerializer(root=example)
                 text = serialized_result.model_dump_json() + "\n"
                 examples_file.write(text)
-        return dataset_id
+
+        return dataset
 
     def delete_dataset(self, dataset_id: str) -> None:
-        dataset_path = self._dataset_path(dataset_id)
         try:
-            self._fs.rm(dataset_path, recursive=True)
+            self._file_system.rm(
+                str(self._dataset_directory(dataset_id)), recursive=True
+            )
         except FileNotFoundError:
             pass
 
+    def dataset(self, dataset_id: str) -> Optional[Dataset]:
+        file_path = self._dataset_path(dataset_id)
+        if not file_path.exists():
+            return None
+
+        with self._file_system.open(
+            str(file_path), "r", encoding="utf-8"
+        ) as file_content:
+            # we save only one dataset per file
+            return [
+                Dataset.model_validate_json(dataset_string)
+                for dataset_string in file_content
+            ][0]
+
     def dataset_ids(self) -> Iterable[str]:
-        return sorted(
-            [
-                Path(f["name"]).stem
-                for f in self._fs.ls(self._root_directory, detail=True)
-                if isinstance(f, Dict) and Path(f["name"]).suffix == ".jsonl"
-            ]
+        dataset_files = self._file_system.glob(
+            path=str(self._root_directory) + "/**/*.json",
+            maxdepth=2,
+            detail=False,
         )
+        return sorted([Path(f).stem for f in dataset_files])
 
     def example(
         self,
@@ -135,11 +196,13 @@ def example(
         input_type: type[Input],
         expected_output_type: type[ExpectedOutput],
     ) -> Optional[Example[Input, ExpectedOutput]]:
-        example_path = self._dataset_path(dataset_id)
-        if not self._fs.exists(example_path):
+        example_path = self._dataset_examples_path(dataset_id)
+        if not self._file_system.exists(example_path):
             return None
 
-        with self._fs.open(example_path, "r", encoding="utf-8") as examples_file:
+        with self._file_system.open(
+            str(example_path), "r", encoding="utf-8"
+        ) as examples_file:
             for example in examples_file:
                 # mypy does not accept dynamic types
                 validated_example = Example[input_type, expected_output_type].model_validate_json(json_data=example)  # type: ignore
@@ -153,49 +216,63 @@ def examples(
         input_type: type[Input],
         expected_output_type: type[ExpectedOutput],
     ) -> Iterable[Example[Input, ExpectedOutput]]:
-        example_path = self._dataset_path(dataset_id)
-        if not self._fs.exists(example_path):
+        example_path = self._dataset_examples_path(dataset_id)
+        if not self._file_system.exists(example_path):
             return []
 
-        with self._fs.open(example_path, "r", encoding="utf-8") as examples_file:
+        with self._file_system.open(
+            str(example_path), "r", encoding="utf-8"
+        ) as examples_file:
             # Mypy does not accept dynamic types
             examples = [Example[input_type, expected_output_type].model_validate_json(json_data=example) for example in examples_file]  # type: ignore
 
         return sorted(examples, key=lambda example: example.id)
 
-    def _dataset_path(self, dataset_id: str) -> str:
-        return self._root_directory + f"/{dataset_id}.jsonl"
+    def _dataset_directory(self, dataset_id: str) -> Path:
+        return self._root_directory / f"{dataset_id}"
+
+    def _dataset_path(self, dataset_id: str) -> Path:
+        return self._dataset_directory(dataset_id) / f"{dataset_id}.json"
+
+    def _dataset_examples_path(self, dataset_id: str) -> Path:
+        return self._dataset_directory(dataset_id) / f"{dataset_id}.jsonl"
 
 
 class InMemoryDatasetRepository(DatasetRepository):
     def __init__(self) -> None:
-        self._datasets: dict[
-            str, Sequence[Example[PydanticSerializable, PydanticSerializable]]
+        self._datasets_and_examples: dict[
+            str,
+            Tuple[
+                Dataset, Sequence[Example[PydanticSerializable, PydanticSerializable]]
+            ],
         ] = {}
 
     def create_dataset(
-        self,
-        examples: Iterable[Example[Input, ExpectedOutput]],
-    ) -> str:
-        dataset_id = str(uuid4())
-        if dataset_id in self._datasets:
-            raise ValueError(f"Dataset name {dataset_id} already taken")
-
-        in_memory_examples = [
-            cast(
-                Example[PydanticSerializable, PydanticSerializable],
-                example,
+        self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str
+    ) -> Dataset:
+        dataset = Dataset(name=dataset_name)
+        if dataset.id in self._datasets_and_examples:
+            raise ValueError(
+                f"Created random dataset ID already exists for dataset {dataset}. This should not happen."
             )
-            for example in examples
-        ]
-        self._datasets[dataset_id] = in_memory_examples
-        return dataset_id
+
+        examples_casted = cast(
+            Sequence[Example[PydanticSerializable, PydanticSerializable]], examples
+        )
+        self._datasets_and_examples[dataset.id] = (dataset, examples_casted)
+
+        return dataset
 
     def delete_dataset(self, dataset_id: str) -> None:
-        self._datasets.pop(dataset_id, None)
+        self._datasets_and_examples.pop(dataset_id, None)
+
+    def dataset(self, dataset_id: str) -> Optional[Dataset]:
+        if dataset_id in self._datasets_and_examples:
+            return self._datasets_and_examples[dataset_id][0]
+        return None
 
     def dataset_ids(self) -> Iterable[str]:
-        return sorted(list(self._datasets.keys()))
+        return sorted(list(self._datasets_and_examples.keys()))
 
     def example(
         self,
@@ -214,13 +291,17 @@ def examples(
         input_type: type[Input],
         expected_output_type: type[ExpectedOutput],
     ) -> Iterable[Example[Input, ExpectedOutput]]:
+        if dataset_id not in self._datasets_and_examples:
+            return []
         return cast(
             Iterable[Example[Input, ExpectedOutput]],
-            sorted(self._datasets.get(dataset_id, []), key=lambda example: example.id),
+            sorted(
+                self._datasets_and_examples[dataset_id][1],
+                key=lambda example: example.id,
+            ),
         )
 
 
 class FileDatasetRepository(FileSystemDatasetRepository):
     def __init__(self, root_directory: Path) -> None:
-        super().__init__(LocalFileSystem(), str(root_directory))
-        root_directory.mkdir(parents=True, exist_ok=True)
+        super().__init__(LocalFileSystem(), root_directory)
diff --git a/src/intelligence_layer/evaluation/domain.py b/src/intelligence_layer/evaluation/domain.py
index e7a28cb78..bfcdc1579 100644
--- a/src/intelligence_layer/evaluation/domain.py
+++ b/src/intelligence_layer/evaluation/domain.py
@@ -24,6 +24,18 @@
 AggregatedEvaluation = TypeVar("AggregatedEvaluation", bound=BaseModel, covariant=True)
 
 
+class Dataset(BaseModel):
+    """Represents a dataset linked to multiple examples.
+
+    Attributes:
+        id: Dataset ID.
+        name: A short name of the dataset.
+    """
+
+    id: str = Field(default_factory=lambda: str(uuid4()))
+    name: str
+
+
 class FailedExampleRun(BaseModel):
     """Captures an exception raised when running a single example with a :class:`Task`.
 
diff --git a/src/intelligence_layer/evaluation/hugging_face.py b/src/intelligence_layer/evaluation/hugging_face.py
index c6573586d..b777ff081 100644
--- a/src/intelligence_layer/evaluation/hugging_face.py
+++ b/src/intelligence_layer/evaluation/hugging_face.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import huggingface_hub  # type: ignore
 from huggingface_hub import HfFileSystem, create_repo
 
@@ -19,14 +21,12 @@ def __init__(self, database_name: str, token: str, private: bool) -> None:
             private=private,
         )
         self._database_name = database_name
-        fs = HfFileSystem(token=token)
-        root_directory = f"datasets/{database_name}"
-        super().__init__(fs, root_directory)
+        super().__init__(HfFileSystem(token=token), Path(f"datasets/{database_name}"))
 
     def delete_repository(self) -> None:
         huggingface_hub.delete_repo(
             database_name=self._database_name,
-            token=self._fs.token,
+            token=self._file_system.token,
             repo_type=HuggingFaceDatasetRepository._REPO_TYPE,
             missing_ok=True,
         )
diff --git a/src/intelligence_layer/use_cases/classify/evaluation.ipynb b/src/intelligence_layer/use_cases/classify/evaluation.ipynb
index 678585d50..ad453c2c1 100644
--- a/src/intelligence_layer/use_cases/classify/evaluation.ipynb
+++ b/src/intelligence_layer/use_cases/classify/evaluation.ipynb
@@ -108,8 +108,9 @@
     ")\n",
     "\n",
     "single_example_dataset = dataset_repository.create_dataset(\n",
-    "    examples=[Example(input=classify_input, expected_output=\"positive\")]\n",
-    ")\n",
+    "    examples=[Example(input=classify_input, expected_output=\"positive\")],\n",
+    "    dataset_name=\"evaluation-single-example-dataset\",\n",
+    ").id\n",
     "\n",
     "run_overview = runner.run_dataset(single_example_dataset, NoOpTracer())\n",
     "evaluation_overview = evaluator.evaluate_runs(run_overview.id)\n",
@@ -193,8 +194,9 @@
     "            expected_output=d[\"label_name\"],\n",
     "        )\n",
     "        for d in data\n",
-    "    ]\n",
-    ")"
+    "    ],\n",
+    "    dataset_name=\"evaluation-examples-with-labels-dataset\",\n",
+    ").id"
    ]
   },
   {
diff --git a/tests/evaluation/conftest.py b/tests/evaluation/conftest.py
index 661c5bbf9..904bd92be 100644
--- a/tests/evaluation/conftest.py
+++ b/tests/evaluation/conftest.py
@@ -95,7 +95,9 @@ def string_dataset_id(
     dummy_string_examples: Iterable[Example[DummyStringInput, DummyStringOutput]],
     in_memory_dataset_repository: DatasetRepository,
 ) -> str:
-    return in_memory_dataset_repository.create_dataset(dummy_string_examples)
+    return in_memory_dataset_repository.create_dataset(
+        examples=dummy_string_examples, dataset_name="test-dataset"
+    ).id
 
 
 @fixture
diff --git a/tests/evaluation/test_dataset_repository.py b/tests/evaluation/test_dataset_repository.py
index 08911c7b7..e413ea6ed 100644
--- a/tests/evaluation/test_dataset_repository.py
+++ b/tests/evaluation/test_dataset_repository.py
@@ -1,7 +1,8 @@
 from pathlib import Path
-from typing import Iterable
+from typing import Any, Iterable
+from unittest.mock import patch
 
-from pytest import FixtureRequest, fixture, mark
+from pytest import FixtureRequest, fixture, mark, raises
 
 from intelligence_layer.evaluation import (
     DatasetRepository,
@@ -28,50 +29,134 @@ def file_dataset_repository(tmp_path: Path) -> FileDatasetRepository:
 )
 def test_dataset_repository_can_create_and_store_a_dataset(
     repository_fixture: str,
-    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
     request: FixtureRequest,
+    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
 ) -> None:
     dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
 
-    dataset_id = dataset_repository.create_dataset(examples=[dummy_string_example])
+    dataset = dataset_repository.create_dataset(
+        examples=[dummy_string_example], dataset_name="test-dataset"
+    )
+    stored_dataset = dataset_repository.dataset(dataset.id)
     stored_examples = list(
         dataset_repository.examples(
-            dataset_id,
+            dataset.id,
             input_type=DummyStringInput,
             expected_output_type=DummyStringOutput,
         )
     )
 
+    assert stored_dataset == dataset
     assert len(stored_examples) == 1
     assert stored_examples[0] == dummy_string_example
 
 
+@patch(target="intelligence_layer.evaluation.domain.uuid4", return_value="12345")
+@mark.parametrize(
+    "repository_fixture",
+    test_repository_fixtures,
+)
+def test_dataset_repository_ensures_unique_dataset_ids(
+    _mock_uuid4: Any,
+    repository_fixture: str,
+    request: FixtureRequest,
+    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
+) -> None:
+    dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
+
+    dataset_repository.create_dataset(
+        examples=[dummy_string_example], dataset_name="test-dataset"
+    )
+    with raises(ValueError):
+        dataset_repository.create_dataset(
+            examples=[dummy_string_example], dataset_name="test-dataset"
+        )
+
+
+@patch(
+    target="intelligence_layer.evaluation.data_storage.dataset_repository.LocalFileSystem.exists",
+    return_value=True,
+)
+def test_file_system_dataset_repository_avoids_overriding_existing_files(
+    _mock: Any,
+    tmp_path: Path,
+    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
+) -> None:
+    dataset_repository = FileDatasetRepository(root_directory=tmp_path)
+
+    with raises(ValueError):
+        dataset_repository.create_dataset(
+            examples=[dummy_string_example], dataset_name="test-dataset"
+        )
+
+
+@mark.parametrize(
+    "repository_fixture",
+    test_repository_fixtures,
+)
+def test_dataset_returns_none_for_not_existing_dataset_id(
+    repository_fixture: str,
+    request: FixtureRequest,
+) -> None:
+    dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
+
+    stored_dataset = dataset_repository.dataset("not-existing-dataset-id")
+
+    assert stored_dataset is None
+
+
 @mark.parametrize(
     "repository_fixture",
     test_repository_fixtures,
 )
 def test_delete_dataset_deletes_a_dataset(
     repository_fixture: str,
-    dummy_string_examples: Iterable[Example[DummyStringInput, DummyStringOutput]],
     request: FixtureRequest,
+    dummy_string_examples: Iterable[Example[DummyStringInput, DummyStringOutput]],
 ) -> None:
     dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
-    dataset_id = dataset_repository.create_dataset(dummy_string_examples)
+    dataset = dataset_repository.create_dataset(
+        examples=dummy_string_examples, dataset_name="test-dataset"
+    )
 
-    dataset_repository.delete_dataset(dataset_id)
-    dataset_ids = dataset_repository.dataset_ids()
+    dataset_repository.delete_dataset(dataset.id)
+
+    stored_dataset = dataset_repository.dataset(dataset.id)
     examples = list(
-        dataset_repository.examples(dataset_id, DummyStringInput, DummyStringOutput)
+        dataset_repository.examples(dataset.id, DummyStringInput, DummyStringOutput)
     )
 
-    assert dataset_id not in dataset_ids
+    assert stored_dataset is None
     assert len(examples) == 0
 
     dataset_repository.delete_dataset(
-        dataset_id
+        dataset.id
     )  # tests whether function is idempotent
 
 
+@mark.parametrize(
+    "repository_fixture",
+    test_repository_fixtures,
+)
+def test_datasets_returns_all_sorted_dataset(
+    repository_fixture: str,
+    request: FixtureRequest,
+    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
+) -> None:
+    dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
+    datasets = []
+    for i in range(10):
+        datasets.append(
+            dataset_repository.create_dataset(
+                examples=[dummy_string_example], dataset_name=f"test-dataset_{i}"
+            )
+        )
+
+    stored_datasets = list(dataset_repository.datasets())
+
+    assert stored_datasets == sorted(datasets, key=lambda dataset: dataset.id)
+
+
 @mark.parametrize(
     "repository_fixture",
     test_repository_fixtures,
@@ -83,7 +168,9 @@ def test_dataset_ids_returns_all_sorted_ids(
 ) -> None:
     dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
     dataset_ids = [
-        dataset_repository.create_dataset(examples=dummy_string_examples)
+        dataset_repository.create_dataset(
+            examples=dummy_string_examples, dataset_name="test-dataset"
+        ).id
         for _ in range(10)
     ]
 
@@ -108,10 +195,12 @@ def test_examples_returns_all_examples_sorted_by_their_id(
         )
         for i in range(0, 10)
     ]
-    dataset_id = dataset_repository.create_dataset(examples=examples)
+    dataset = dataset_repository.create_dataset(
+        examples=examples, dataset_name="test-dataset"
+    )
 
     stored_examples = list(
-        dataset_repository.examples(dataset_id, DummyStringInput, DummyStringOutput)
+        dataset_repository.examples(dataset.id, DummyStringInput, DummyStringOutput)
     )
 
     assert stored_examples == sorted(examples, key=lambda example: example.id)
@@ -141,14 +230,16 @@ def test_examples_returns_an_empty_list_for_not_existing_dataset_id(
 )
 def test_example_returns_example_for_existing_dataset_id_and_example_id(
     repository_fixture: str,
-    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
     request: FixtureRequest,
+    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
 ) -> None:
     dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
-    dataset_id = dataset_repository.create_dataset(examples=[dummy_string_example])
+    dataset = dataset_repository.create_dataset(
+        examples=[dummy_string_example], dataset_name="test-dataset"
+    )
 
     example = dataset_repository.example(
-        dataset_id, dummy_string_example.id, DummyStringInput, DummyStringOutput
+        dataset.id, dummy_string_example.id, DummyStringInput, DummyStringOutput
     )
 
     assert example == dummy_string_example
@@ -157,11 +248,13 @@ def test_example_returns_example_for_existing_dataset_id_and_example_id(
 @mark.parametrize("repository_fixture", test_repository_fixtures)
 def test_example_returns_none_for_not_existing_dataset_id_or_example_id(
     repository_fixture: str,
-    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
     request: FixtureRequest,
+    dummy_string_example: Example[DummyStringInput, DummyStringOutput],
 ) -> None:
     dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture)
-    dataset_id = dataset_repository.create_dataset(examples=[dummy_string_example])
+    dataset = dataset_repository.create_dataset(
+        examples=[dummy_string_example], dataset_name="test-dataset"
+    )
 
     examples = [
         dataset_repository.example(
@@ -171,7 +264,7 @@ def test_example_returns_none_for_not_existing_dataset_id_or_example_id(
             DummyStringOutput,
         ),
         dataset_repository.example(
-            dataset_id, "not_existing_example_id", DummyStringInput, DummyStringOutput
+            dataset.id, "not_existing_example_id", DummyStringInput, DummyStringOutput
         ),
         dataset_repository.example(
             "not_existing_dataset_id",
diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py
index 924bb312f..0152c9db8 100644
--- a/tests/evaluation/test_evaluator.py
+++ b/tests/evaluation/test_evaluator.py
@@ -180,7 +180,9 @@ def dataset_id(
     sequence_examples: Iterable[Example[str, None]],
     in_memory_dataset_repository: InMemoryDatasetRepository,
 ) -> str:
-    return in_memory_dataset_repository.create_dataset(sequence_examples)
+    return in_memory_dataset_repository.create_dataset(
+        examples=sequence_examples, dataset_name="test-dataset"
+    ).id
 
 
 @fixture
@@ -188,7 +190,9 @@ def good_dataset_id(
     sequence_good_examples: Iterable[Example[str, None]],
     in_memory_dataset_repository: InMemoryDatasetRepository,
 ) -> str:
-    return in_memory_dataset_repository.create_dataset(sequence_good_examples)
+    return in_memory_dataset_repository.create_dataset(
+        examples=sequence_good_examples, dataset_name="test-dataset"
+    ).id
 
 
 @fixture
diff --git a/tests/evaluation/test_hugging_face.py b/tests/evaluation/test_hugging_face.py
index f81d96048..024b46593 100644
--- a/tests/evaluation/test_hugging_face.py
+++ b/tests/evaluation/test_hugging_face.py
@@ -32,40 +32,63 @@ def example2() -> Example[str, str]:
     return Example(input="ho", expected_output="hey", id="1")
 
 
-def requires_token() -> MarkDecorator:
+def skip_if_required_token_not_set() -> MarkDecorator:
     return pytest.mark.skipif(
         "HUGGING_FACE_TOKEN" in os.environ.keys(),
         reason="HUGGING_FACE_TOKEN not set, necessary for for current test",
     )
 
 
-@requires_token()
+@skip_if_required_token_not_set()
 def test_hf_database_non_existing(hf_repository: HuggingFaceDatasetRepository) -> None:
+    # not existing IDs
     assert hf_repository.examples("lol", str, str) == []
     assert hf_repository.example("lol", "abc", str, str) is None
-    hf_repository.delete_dataset("lol")
-    # make sure random files are not actually datasets
+
+    # deleting a not-existing dataset
+    try:
+        hf_repository.delete_dataset("lol")
+    except Exception:
+        assert False, "Deleting a not-existing dataset should not throw an exception"
+
+    # non-dataset files are not retrieved as datasets
     datasets = list(hf_repository.dataset_ids())
+
     assert ".gitattributes" not in datasets
     assert "README.md" not in datasets
 
 
-@requires_token()
+@skip_if_required_token_not_set()
 def test_hf_database_operations(
     hf_repository: HuggingFaceDatasetRepository,
     example1: Example[str, str],
     example2: Example[str, str],
 ) -> None:
-    original_examples = [example1, example2]
-    dataset_id = hf_repository.create_dataset(original_examples)
+    examples = [example1, example2]
+
+    dataset_id = hf_repository.create_dataset(examples, "test-hg-dataset").id
+
     try:
-        assert dataset_id in list(hf_repository.dataset_ids())
-        examples = hf_repository.examples(dataset_id, str, str)
-        assert examples != []
-        assert list(examples) == original_examples
-        assert hf_repository.example(dataset_id, example1.id, str, str) == example1
+        stored_dataset_ids = list(hf_repository.dataset_ids())
+
+        # non-dataset files are not retrieved as datasets
+        assert ".gitattributes" not in stored_dataset_ids
+        assert "README.md" not in stored_dataset_ids
+
+        # created dataset is stored
+        assert dataset_id in stored_dataset_ids
+
+        # given examples are stored and can be accessed via their ID
+        assert list(hf_repository.examples(dataset_id, str, str)) == examples
+        for example in examples:
+            assert hf_repository.example(dataset_id, example.id, str, str) == example
+
+        # example() with not-existing example ID returns None
         assert hf_repository.example(dataset_id, "abc", str, str) is None
+
+        # deleting a dataset works
         hf_repository.delete_dataset(dataset_id)
+
         assert hf_repository.examples(dataset_id, str, str) == []
     finally:
         hf_repository.delete_dataset(dataset_id)
diff --git a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py
index 846616404..a3a9dcba6 100644
--- a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py
+++ b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py
@@ -155,14 +155,15 @@ def create_dummy_dataset(
     instruction_input = "some text"
 
     return in_memory_dataset_repository.create_dataset(
-        [
+        examples=[
             Example(
                 id=example_id,
                 input=InstructInput(instruction=instruction, input=instruction_input),
                 expected_output=None,
             )
-        ]
-    )
+        ],
+        dataset_name="test-dataset",
+    ).id
 
 
 def create_dummy_runs(
diff --git a/tests/evaluation/test_run.py b/tests/evaluation/test_run.py
index 83ac25201..0cd794e6f 100644
--- a/tests/evaluation/test_run.py
+++ b/tests/evaluation/test_run.py
@@ -65,7 +65,9 @@ def test_run_evaluation(
 ) -> None:
     dataset_path = tmp_path / "dataset"
     dataset_repository = FileDatasetRepository(dataset_path)
-    dataset_id = dataset_repository.create_dataset(examples)
+    dataset_id = dataset_repository.create_dataset(
+        examples=examples, dataset_name="test-dataset"
+    ).id
 
     aggregation_path = tmp_path / "eval"
     aggregation_repository = FileAggregationRepository(aggregation_path)
@@ -101,7 +103,9 @@ def test_run_evaluation_with_task_with_client(
 ) -> None:
     dataset_path = tmp_path / "dataset"
     dataset_repository = FileDatasetRepository(dataset_path)
-    dataset_id = dataset_repository.create_dataset(examples)
+    dataset_id = dataset_repository.create_dataset(
+        examples=examples, dataset_name="test-dataset"
+    ).id
 
     eval_path = tmp_path / "eval"
 
diff --git a/tests/evaluation/test_runner.py b/tests/evaluation/test_runner.py
index f77a6b891..55970d4d0 100644
--- a/tests/evaluation/test_runner.py
+++ b/tests/evaluation/test_runner.py
@@ -22,7 +22,9 @@ def test_runner_runs_dataset(
         Example(input=FAIL_IN_EVAL_INPUT, expected_output=None),
     ]
 
-    dataset_id = in_memory_dataset_repository.create_dataset(examples=examples)
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=examples, dataset_name="test-dataset"
+    ).id
     overview = runner.run_dataset(dataset_id)
     outputs = list(
         in_memory_run_repository.example_outputs(
@@ -49,7 +51,9 @@ def test_runner_runs_n_examples(
         Example(input=FAIL_IN_TASK_INPUT, expected_output=None, id="example-2"),
     ]
 
-    dataset_id = in_memory_dataset_repository.create_dataset(examples=examples)
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=examples, dataset_name="test-dataset"
+    ).id
     overview = runner.run_dataset(dataset_id)
     overview_with_tracer = runner.run_dataset(dataset_id, tracer, num_examples=1)
 
diff --git a/tests/use_cases/classify/test_classify.py b/tests/use_cases/classify/test_classify.py
index 227b9b825..b18eaf9cf 100644
--- a/tests/use_cases/classify/test_classify.py
+++ b/tests/use_cases/classify/test_classify.py
@@ -119,7 +119,9 @@ def single_entry_dataset_name(
     in_memory_dataset_repository: InMemoryDatasetRepository,
     embedding_based_classify_example: Iterable[Example[ClassifyInput, Sequence[str]]],
 ) -> str:
-    return in_memory_dataset_repository.create_dataset(embedding_based_classify_example)
+    return in_memory_dataset_repository.create_dataset(
+        examples=embedding_based_classify_example, dataset_name="test-dataset"
+    ).id
 
 
 @fixture
@@ -128,8 +130,8 @@ def multiple_entries_dataset_name(
     embedding_based_classify_examples: Iterable[Example[ClassifyInput, Sequence[str]]],
 ) -> str:
     return in_memory_dataset_repository.create_dataset(
-        embedding_based_classify_examples
-    )
+        examples=embedding_based_classify_examples, dataset_name="test-dataset"
+    ).id
 
 
 @fixture
diff --git a/tests/use_cases/classify/test_prompt_based_classify.py b/tests/use_cases/classify/test_prompt_based_classify.py
index 87d2d2d7e..c3c50f872 100644
--- a/tests/use_cases/classify/test_prompt_based_classify.py
+++ b/tests/use_cases/classify/test_prompt_based_classify.py
@@ -200,9 +200,11 @@ def test_can_evaluate_classify(
         expected_output=["positive"],
     )
 
-    dataset_name = in_memory_dataset_repository.create_dataset([example])
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=[example], dataset_name="test-dataset"
+    ).id
 
-    run_overview = classify_runner.run_dataset(dataset_name)
+    run_overview = classify_runner.run_dataset(dataset_id)
     evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id)
 
     evaluation = in_memory_evaluation_repository.example_evaluations(
@@ -243,11 +245,11 @@ def test_can_aggregate_evaluations(
         ),
         expected_output=positive_lst,
     )
-    dataset_name = in_memory_dataset_repository.create_dataset(
-        [correct_example, incorrect_example]
-    )
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=[correct_example, incorrect_example], dataset_name="test-dataset"
+    ).id
 
-    run_overview = classify_runner.run_dataset(dataset_name)
+    run_overview = classify_runner.run_dataset(dataset_id)
     evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id)
     aggregation_overview = classify_aggregator.aggregate_evaluation(
         evaluation_overview.id
@@ -270,7 +272,9 @@ def test_aggregating_evaluations_works_with_empty_list(
     classify_runner: Runner[ClassifyInput, SingleLabelClassifyOutput],
     in_memory_dataset_repository: DatasetRepository,
 ) -> None:
-    dataset_id = in_memory_dataset_repository.create_dataset([])
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=[], dataset_name="test-dataset"
+    ).id
     run_overview = classify_runner.run_dataset(dataset_id)
     evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id)
     aggregation_overview = classify_aggregator.aggregate_evaluation(
diff --git a/tests/use_cases/summarize/test_summarize.py b/tests/use_cases/summarize/test_summarize.py
index 938a2417c..e5102cf0c 100644
--- a/tests/use_cases/summarize/test_summarize.py
+++ b/tests/use_cases/summarize/test_summarize.py
@@ -178,10 +178,10 @@ def test_single_chunk_summarize_evaluator(
         expected_output="The brown bear is a large mammal that lives in Eurasia and North America.",
         id="good",
     )
-    dataset_name = in_memory_dataset_repository.create_dataset(
-        [good_example, bad_example]
-    )
-    run_overview = single_chunk_summarize_runner.run_dataset(dataset_name)
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=[good_example, bad_example], dataset_name="test-dataset"
+    ).id
+    run_overview = single_chunk_summarize_runner.run_dataset(dataset_id)
 
     evaluation_overview = single_chunk_summarize_evaluator.evaluate_runs(
         run_overview.id
@@ -236,10 +236,10 @@ def test_long_context_summarize_evaluator(
         expected_output="The brown bear is a large mammal that lives in Eurasia and North America.",
         id="good",
     )
-    dataset_name = in_memory_dataset_repository.create_dataset(
-        [good_example, bad_example]
-    )
-    run_overview = long_context_summarize_runner.run_dataset(dataset_name)
+    dataset_id = in_memory_dataset_repository.create_dataset(
+        examples=[good_example, bad_example], dataset_name="test-dataset"
+    ).id
+    run_overview = long_context_summarize_runner.run_dataset(dataset_id)
 
     evaluation_overview = long_context_summarize_evaluator.evaluate_runs(
         run_overview.id