From 85eed4ea8798edd69117d8cc89af5e277a47b8f7 Mon Sep 17 00:00:00 2001 From: Valentina Galata Date: Tue, 12 Mar 2024 11:44:15 +0100 Subject: [PATCH] feat(dataset-repository): add Dataset model to dataset-repository Task: IL-238 --- src/examples/human_evaluation.ipynb | 7 +- src/examples/quickstart_task.ipynb | 8 +- .../data_storage/dataset_repository.py | 191 +++++++++++++----- src/intelligence_layer/evaluation/domain.py | 12 ++ .../evaluation/hugging_face.py | 8 +- .../use_cases/classify/evaluation.ipynb | 10 +- tests/evaluation/conftest.py | 4 +- tests/evaluation/test_dataset_repository.py | 135 +++++++++++-- tests/evaluation/test_evaluator.py | 8 +- tests/evaluation/test_hugging_face.py | 47 +++-- ...t_instruct_comparison_argilla_evaluator.py | 7 +- tests/evaluation/test_run.py | 8 +- tests/evaluation/test_runner.py | 8 +- tests/use_cases/classify/test_classify.py | 8 +- .../classify/test_prompt_based_classify.py | 18 +- tests/use_cases/summarize/test_summarize.py | 16 +- 16 files changed, 364 insertions(+), 131 deletions(-) diff --git a/src/examples/human_evaluation.ipynb b/src/examples/human_evaluation.ipynb index 9251890ef..75764683d 100644 --- a/src/examples/human_evaluation.ipynb +++ b/src/examples/human_evaluation.ipynb @@ -42,7 +42,6 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", "from typing import Iterable, cast\n", "\n", "from datasets import load_dataset\n", @@ -77,7 +76,6 @@ " Runner,\n", " SuccessfulExampleOutput,\n", ")\n", - "\n", "from intelligence_layer.evaluation.argilla import ArgillaAggregator\n", "\n", "load_dotenv()\n", @@ -163,8 +161,9 @@ " id=str(dataset[\"meta\"][i][\"id\"]),\n", " )\n", " for i in range(num_examples)\n", - " ]\n", - ")" + " ],\n", + " dataset_name=\"human-evaluation-dataset\",\n", + ").id" ] }, { diff --git a/src/examples/quickstart_task.ipynb b/src/examples/quickstart_task.ipynb index 693ce1871..4e1a24ca8 100644 --- a/src/examples/quickstart_task.ipynb +++ b/src/examples/quickstart_task.ipynb @@ -299,8 
+299,9 @@ "expected_output = KeywordExtractionExpectedOutput(keywords=[\"dolphins\", \"sharks\"])\n", "\n", "single_example_dataset = dataset_repository.create_dataset(\n", - " examples=[Example(input=model_input, expected_output=expected_output)]\n", - ")\n", + " examples=[Example(input=model_input, expected_output=expected_output)],\n", + " dataset_name=\"quickstart-task-single-example-dataset\",\n", + ").id\n", "\n", "run_overview = runner.run_dataset(single_example_dataset, NoOpTracer())" ] @@ -483,7 +484,8 @@ " ),\n", " ),\n", " ],\n", - ")\n", + " dataset_name=\"human-evaluation-multiple-examples-dataset\",\n", + ").id\n", "\n", "run = runner.run_dataset(dataset_id)\n", "evaluation_overview = evaluator.evaluate_runs(run.id)\n", diff --git a/src/intelligence_layer/evaluation/data_storage/dataset_repository.py b/src/intelligence_layer/evaluation/data_storage/dataset_repository.py index 3f7140f45..72aae428f 100644 --- a/src/intelligence_layer/evaluation/data_storage/dataset_repository.py +++ b/src/intelligence_layer/evaluation/data_storage/dataset_repository.py @@ -1,13 +1,12 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import Dict, Iterable, Optional, Sequence, cast -from uuid import uuid4 +from typing import Iterable, Optional, Sequence, Tuple, cast from fsspec import AbstractFileSystem # type: ignore from fsspec.implementations.local import LocalFileSystem # type: ignore from intelligence_layer.core import Input, JsonSerializer, PydanticSerializable -from intelligence_layer.evaluation.domain import Example, ExpectedOutput +from intelligence_layer.evaluation.domain import Dataset, Example, ExpectedOutput class DatasetRepository(ABC): @@ -18,16 +17,16 @@ class DatasetRepository(ABC): @abstractmethod def create_dataset( - self, - examples: Iterable[Example[Input, ExpectedOutput]], - ) -> str: + self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str + ) -> Dataset: """Creates a dataset from given 
:class:`Example`s and returns the ID of that dataset. Args: examples: An :class:`Iterable` of :class:`Example`s to be saved in the same dataset. + dataset_name: A name for the dataset. Returns: - The ID of the created dataset. + The created :class:`Dataset`. """ pass @@ -40,6 +39,29 @@ def delete_dataset(self, dataset_id: str) -> None: """ pass + @abstractmethod + def dataset(self, dataset_id: str) -> Optional[Dataset]: + """Returns a dataset identified by the given dataset ID. + + Args: + dataset_id: Dataset ID of the dataset to retrieve. + + Returns: + :class:`Dataset` if it was found, `None` otherwise. + """ + pass + + def datasets(self) -> Iterable[Dataset]: + """Returns all :class:`Dataset`s sorted by their ID. + + Returns: + :class:`Iterable` of :class:`Dataset`s. + """ + for dataset_id in self.dataset_ids(): + dataset = self.dataset(dataset_id) + if dataset is not None: + yield dataset + @abstractmethod def dataset_ids(self) -> Iterable[str]: """Returns all sorted dataset IDs. @@ -93,40 +115,79 @@ def examples( class FileSystemDatasetRepository(DatasetRepository): _REPO_TYPE = "dataset" - def __init__(self, fs: AbstractFileSystem, root_directory: str) -> None: + def __init__(self, filesystem: AbstractFileSystem, root_directory: Path) -> None: super().__init__() - assert root_directory[-1] != "/" - self._fs = fs + + assert str(root_directory)[-1] != "/" + root_directory.mkdir(parents=True, exist_ok=True) + + self._file_system = filesystem self._root_directory = root_directory - def create_dataset(self, examples: Iterable[Example[Input, ExpectedOutput]]) -> str: - dataset_id = str(uuid4()) - dataset_path = self._dataset_path(dataset_id) - if self._fs.exists(dataset_path): - raise ValueError(f"Dataset name {dataset_id} already taken") + def create_dataset( + self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str + ) -> Dataset: + dataset = Dataset(name=dataset_name) + try: + self._dataset_directory(dataset.id).mkdir(exist_ok=False) + except
OSError: + raise ValueError( + f"Created random dataset ID already exists for dataset {dataset}. This should not happen." + ) + + dataset_path = self._dataset_path(dataset.id) + examples_path = self._dataset_examples_path(dataset.id) + if self._file_system.exists(dataset_path) or self._file_system.exists( + examples_path + ): + raise ValueError( + f"One of the dataset files already exist for dataset {dataset}. This should not happen. Files: {dataset_path}, {examples_path}." + ) - with self._fs.open(dataset_path, "w", encoding="utf-8") as examples_file: + with self._file_system.open( + str(dataset_path), "w", encoding="utf-8" + ) as dataset_file: + dataset_file.write(JsonSerializer(root=dataset).model_dump_json() + "\n") + + with self._file_system.open( + str(examples_path), "w", encoding="utf-8" + ) as examples_file: for example in examples: serialized_result = JsonSerializer(root=example) text = serialized_result.model_dump_json() + "\n" examples_file.write(text) - return dataset_id + + return dataset def delete_dataset(self, dataset_id: str) -> None: - dataset_path = self._dataset_path(dataset_id) try: - self._fs.rm(dataset_path, recursive=True) + self._file_system.rm( + str(self._dataset_directory(dataset_id)), recursive=True + ) except FileNotFoundError: pass + def dataset(self, dataset_id: str) -> Optional[Dataset]: + file_path = self._dataset_path(dataset_id) + if not file_path.exists(): + return None + + with self._file_system.open( + str(file_path), "r", encoding="utf-8" + ) as file_content: + # we save only one dataset per file + return [ + Dataset.model_validate_json(dataset_string) + for dataset_string in file_content + ][0] + def dataset_ids(self) -> Iterable[str]: - return sorted( - [ - Path(f["name"]).stem - for f in self._fs.ls(self._root_directory, detail=True) - if isinstance(f, Dict) and Path(f["name"]).suffix == ".jsonl" - ] + dataset_files = self._file_system.glob( + path=str(self._root_directory) + "/**/*.json", + maxdepth=2, + detail=False, ) 
+ return sorted([Path(f).stem for f in dataset_files]) def example( self, @@ -135,11 +196,13 @@ def example( input_type: type[Input], expected_output_type: type[ExpectedOutput], ) -> Optional[Example[Input, ExpectedOutput]]: - example_path = self._dataset_path(dataset_id) - if not self._fs.exists(example_path): + example_path = self._dataset_examples_path(dataset_id) + if not self._file_system.exists(example_path): return None - with self._fs.open(example_path, "r", encoding="utf-8") as examples_file: + with self._file_system.open( + str(example_path), "r", encoding="utf-8" + ) as examples_file: for example in examples_file: # mypy does not accept dynamic types validated_example = Example[input_type, expected_output_type].model_validate_json(json_data=example) # type: ignore @@ -153,49 +216,63 @@ def examples( input_type: type[Input], expected_output_type: type[ExpectedOutput], ) -> Iterable[Example[Input, ExpectedOutput]]: - example_path = self._dataset_path(dataset_id) - if not self._fs.exists(example_path): + example_path = self._dataset_examples_path(dataset_id) + if not self._file_system.exists(example_path): return [] - with self._fs.open(example_path, "r", encoding="utf-8") as examples_file: + with self._file_system.open( + str(example_path), "r", encoding="utf-8" + ) as examples_file: # Mypy does not accept dynamic types examples = [Example[input_type, expected_output_type].model_validate_json(json_data=example) for example in examples_file] # type: ignore return sorted(examples, key=lambda example: example.id) - def _dataset_path(self, dataset_id: str) -> str: - return self._root_directory + f"/{dataset_id}.jsonl" + def _dataset_directory(self, dataset_id: str) -> Path: + return self._root_directory / f"{dataset_id}" + + def _dataset_path(self, dataset_id: str) -> Path: + return self._dataset_directory(dataset_id) / f"{dataset_id}.json" + + def _dataset_examples_path(self, dataset_id: str) -> Path: + return self._dataset_directory(dataset_id) / 
f"{dataset_id}.jsonl" class InMemoryDatasetRepository(DatasetRepository): def __init__(self) -> None: - self._datasets: dict[ - str, Sequence[Example[PydanticSerializable, PydanticSerializable]] + self._datasets_and_examples: dict[ + str, + Tuple[ + Dataset, Sequence[Example[PydanticSerializable, PydanticSerializable]] + ], ] = {} def create_dataset( - self, - examples: Iterable[Example[Input, ExpectedOutput]], - ) -> str: - dataset_id = str(uuid4()) - if dataset_id in self._datasets: - raise ValueError(f"Dataset name {dataset_id} already taken") - - in_memory_examples = [ - cast( - Example[PydanticSerializable, PydanticSerializable], - example, + self, examples: Iterable[Example[Input, ExpectedOutput]], dataset_name: str + ) -> Dataset: + dataset = Dataset(name=dataset_name) + if dataset.id in self._datasets_and_examples: + raise ValueError( + f"Created random dataset ID already exists for dataset {dataset}. This should not happen." ) - for example in examples - ] - self._datasets[dataset_id] = in_memory_examples - return dataset_id + + examples_casted = cast( + Sequence[Example[PydanticSerializable, PydanticSerializable]], examples + ) + self._datasets_and_examples[dataset.id] = (dataset, examples_casted) + + return dataset def delete_dataset(self, dataset_id: str) -> None: - self._datasets.pop(dataset_id, None) + self._datasets_and_examples.pop(dataset_id, None) + + def dataset(self, dataset_id: str) -> Optional[Dataset]: + if dataset_id in self._datasets_and_examples: + return self._datasets_and_examples[dataset_id][0] + return None def dataset_ids(self) -> Iterable[str]: - return sorted(list(self._datasets.keys())) + return sorted(list(self._datasets_and_examples.keys())) def example( self, @@ -214,13 +291,17 @@ def examples( input_type: type[Input], expected_output_type: type[ExpectedOutput], ) -> Iterable[Example[Input, ExpectedOutput]]: + if dataset_id not in self._datasets_and_examples: + return [] return cast( Iterable[Example[Input, ExpectedOutput]], - 
sorted(self._datasets.get(dataset_id, []), key=lambda example: example.id), + sorted( + self._datasets_and_examples[dataset_id][1], + key=lambda example: example.id, + ), ) class FileDatasetRepository(FileSystemDatasetRepository): def __init__(self, root_directory: Path) -> None: - super().__init__(LocalFileSystem(), str(root_directory)) - root_directory.mkdir(parents=True, exist_ok=True) + super().__init__(LocalFileSystem(), root_directory) diff --git a/src/intelligence_layer/evaluation/domain.py b/src/intelligence_layer/evaluation/domain.py index e7a28cb78..bfcdc1579 100644 --- a/src/intelligence_layer/evaluation/domain.py +++ b/src/intelligence_layer/evaluation/domain.py @@ -24,6 +24,18 @@ AggregatedEvaluation = TypeVar("AggregatedEvaluation", bound=BaseModel, covariant=True) +class Dataset(BaseModel): + """Represents a dataset linked to multiple examples + + Attributes: + id: Dataset ID. + name: A short name of the dataset. + """ + + id: str = Field(default_factory=lambda: str(uuid4())) + name: str + + class FailedExampleRun(BaseModel): """Captures an exception raised when running a single example with a :class:`Task`. 
diff --git a/src/intelligence_layer/evaluation/hugging_face.py b/src/intelligence_layer/evaluation/hugging_face.py index c6573586d..b777ff081 100644 --- a/src/intelligence_layer/evaluation/hugging_face.py +++ b/src/intelligence_layer/evaluation/hugging_face.py @@ -1,3 +1,5 @@ +from pathlib import Path + import huggingface_hub # type: ignore from huggingface_hub import HfFileSystem, create_repo @@ -19,14 +21,12 @@ def __init__(self, database_name: str, token: str, private: bool) -> None: private=private, ) self._database_name = database_name - fs = HfFileSystem(token=token) - root_directory = f"datasets/{database_name}" - super().__init__(fs, root_directory) + super().__init__(HfFileSystem(token=token), Path(f"datasets/{database_name}")) def delete_repository(self) -> None: huggingface_hub.delete_repo( database_name=self._database_name, - token=self._fs.token, + token=self._file_system.token, repo_type=HuggingFaceDatasetRepository._REPO_TYPE, missing_ok=True, ) diff --git a/src/intelligence_layer/use_cases/classify/evaluation.ipynb b/src/intelligence_layer/use_cases/classify/evaluation.ipynb index 678585d50..ad453c2c1 100644 --- a/src/intelligence_layer/use_cases/classify/evaluation.ipynb +++ b/src/intelligence_layer/use_cases/classify/evaluation.ipynb @@ -108,8 +108,9 @@ ")\n", "\n", "single_example_dataset = dataset_repository.create_dataset(\n", - " examples=[Example(input=classify_input, expected_output=\"positive\")]\n", - ")\n", + " examples=[Example(input=classify_input, expected_output=\"positive\")],\n", + " dataset_name=\"evaluation-single-example-dataset\",\n", + ").id\n", "\n", "run_overview = runner.run_dataset(single_example_dataset, NoOpTracer())\n", "evaluation_overview = evaluator.evaluate_runs(run_overview.id)\n", @@ -193,8 +194,9 @@ " expected_output=d[\"label_name\"],\n", " )\n", " for d in data\n", - " ]\n", - ")" + " ],\n", + " dataset_name=\"evaluation-examples-with-labels-dataset\",\n", + ").id" ] }, { diff --git 
a/tests/evaluation/conftest.py b/tests/evaluation/conftest.py index 661c5bbf9..904bd92be 100644 --- a/tests/evaluation/conftest.py +++ b/tests/evaluation/conftest.py @@ -95,7 +95,9 @@ def string_dataset_id( dummy_string_examples: Iterable[Example[DummyStringInput, DummyStringOutput]], in_memory_dataset_repository: DatasetRepository, ) -> str: - return in_memory_dataset_repository.create_dataset(dummy_string_examples) + return in_memory_dataset_repository.create_dataset( + examples=dummy_string_examples, dataset_name="test-dataset" + ).id @fixture diff --git a/tests/evaluation/test_dataset_repository.py b/tests/evaluation/test_dataset_repository.py index 08911c7b7..e413ea6ed 100644 --- a/tests/evaluation/test_dataset_repository.py +++ b/tests/evaluation/test_dataset_repository.py @@ -1,7 +1,8 @@ from pathlib import Path -from typing import Iterable +from typing import Any, Iterable +from unittest.mock import patch -from pytest import FixtureRequest, fixture, mark +from pytest import FixtureRequest, fixture, mark, raises from intelligence_layer.evaluation import ( DatasetRepository, @@ -28,50 +29,134 @@ def file_dataset_repository(tmp_path: Path) -> FileDatasetRepository: ) def test_dataset_repository_can_create_and_store_a_dataset( repository_fixture: str, - dummy_string_example: Example[DummyStringInput, DummyStringOutput], request: FixtureRequest, + dummy_string_example: Example[DummyStringInput, DummyStringOutput], ) -> None: dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) - dataset_id = dataset_repository.create_dataset(examples=[dummy_string_example]) + dataset = dataset_repository.create_dataset( + examples=[dummy_string_example], dataset_name="test-dataset" + ) + stored_dataset = dataset_repository.dataset(dataset.id) stored_examples = list( dataset_repository.examples( - dataset_id, + dataset.id, input_type=DummyStringInput, expected_output_type=DummyStringOutput, ) ) + assert stored_dataset == dataset assert 
len(stored_examples) == 1 assert stored_examples[0] == dummy_string_example +@patch(target="intelligence_layer.evaluation.domain.uuid4", return_value="12345") +@mark.parametrize( + "repository_fixture", + test_repository_fixtures, +) +def test_dataset_repository_ensures_unique_dataset_ids( + _mock_uuid4: Any, + repository_fixture: str, + request: FixtureRequest, + dummy_string_example: Example[DummyStringInput, DummyStringOutput], +) -> None: + dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) + + dataset_repository.create_dataset( + examples=[dummy_string_example], dataset_name="test-dataset" + ) + with raises(ValueError): + dataset_repository.create_dataset( + examples=[dummy_string_example], dataset_name="test-dataset" + ) + + +@patch( + target="intelligence_layer.evaluation.data_storage.dataset_repository.LocalFileSystem.exists", + return_value=True, +) +def test_file_system_dataset_repository_avoids_overriding_existing_files( + _mock: Any, + tmp_path: Path, + dummy_string_example: Example[DummyStringInput, DummyStringOutput], +) -> None: + dataset_repository = FileDatasetRepository(root_directory=tmp_path) + + with raises(ValueError): + dataset_repository.create_dataset( + examples=[dummy_string_example], dataset_name="test-dataset" + ) + + +@mark.parametrize( + "repository_fixture", + test_repository_fixtures, +) +def test_dataset_returns_none_for_not_existing_dataset_id( + repository_fixture: str, + request: FixtureRequest, +) -> None: + dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) + + stored_dataset = dataset_repository.dataset("not-existing-dataset-id") + + assert stored_dataset is None + + @mark.parametrize( "repository_fixture", test_repository_fixtures, ) def test_delete_dataset_deletes_a_dataset( repository_fixture: str, - dummy_string_examples: Iterable[Example[DummyStringInput, DummyStringOutput]], request: FixtureRequest, + dummy_string_examples: 
Iterable[Example[DummyStringInput, DummyStringOutput]], ) -> None: dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) - dataset_id = dataset_repository.create_dataset(dummy_string_examples) + dataset = dataset_repository.create_dataset( + examples=dummy_string_examples, dataset_name="test-dataset" + ) - dataset_repository.delete_dataset(dataset_id) - dataset_ids = dataset_repository.dataset_ids() + dataset_repository.delete_dataset(dataset.id) + + stored_dataset = dataset_repository.dataset(dataset.id) examples = list( - dataset_repository.examples(dataset_id, DummyStringInput, DummyStringOutput) + dataset_repository.examples(dataset.id, DummyStringInput, DummyStringOutput) ) - assert dataset_id not in dataset_ids + assert stored_dataset is None assert len(examples) == 0 dataset_repository.delete_dataset( - dataset_id + dataset.id ) # tests whether function is idempotent +@mark.parametrize( + "repository_fixture", + test_repository_fixtures, +) +def test_datasets_returns_all_sorted_dataset( + repository_fixture: str, + request: FixtureRequest, + dummy_string_example: Example[DummyStringInput, DummyStringOutput], +) -> None: + dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) + datasets = [] + for i in range(10): + datasets.append( + dataset_repository.create_dataset( + examples=[dummy_string_example], dataset_name=f"test-dataset_{i}" + ) + ) + + stored_datasets = list(dataset_repository.datasets()) + + assert stored_datasets == sorted(datasets, key=lambda dataset: dataset.id) + + @mark.parametrize( "repository_fixture", test_repository_fixtures, @@ -83,7 +168,9 @@ def test_dataset_ids_returns_all_sorted_ids( ) -> None: dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) dataset_ids = [ - dataset_repository.create_dataset(examples=dummy_string_examples) + dataset_repository.create_dataset( + examples=dummy_string_examples, dataset_name="test-dataset" + ).id for _ in 
range(10) ] @@ -108,10 +195,12 @@ def test_examples_returns_all_examples_sorted_by_their_id( ) for i in range(0, 10) ] - dataset_id = dataset_repository.create_dataset(examples=examples) + dataset = dataset_repository.create_dataset( + examples=examples, dataset_name="test-dataset" + ) stored_examples = list( - dataset_repository.examples(dataset_id, DummyStringInput, DummyStringOutput) + dataset_repository.examples(dataset.id, DummyStringInput, DummyStringOutput) ) assert stored_examples == sorted(examples, key=lambda example: example.id) @@ -141,14 +230,16 @@ def test_examples_returns_an_empty_list_for_not_existing_dataset_id( ) def test_example_returns_example_for_existing_dataset_id_and_example_id( repository_fixture: str, - dummy_string_example: Example[DummyStringInput, DummyStringOutput], request: FixtureRequest, + dummy_string_example: Example[DummyStringInput, DummyStringOutput], ) -> None: dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) - dataset_id = dataset_repository.create_dataset(examples=[dummy_string_example]) + dataset = dataset_repository.create_dataset( + examples=[dummy_string_example], dataset_name="test-dataset" + ) example = dataset_repository.example( - dataset_id, dummy_string_example.id, DummyStringInput, DummyStringOutput + dataset.id, dummy_string_example.id, DummyStringInput, DummyStringOutput ) assert example == dummy_string_example @@ -157,11 +248,13 @@ def test_example_returns_example_for_existing_dataset_id_and_example_id( @mark.parametrize("repository_fixture", test_repository_fixtures) def test_example_returns_none_for_not_existing_dataset_id_or_example_id( repository_fixture: str, - dummy_string_example: Example[DummyStringInput, DummyStringOutput], request: FixtureRequest, + dummy_string_example: Example[DummyStringInput, DummyStringOutput], ) -> None: dataset_repository: DatasetRepository = request.getfixturevalue(repository_fixture) - dataset_id = 
dataset_repository.create_dataset(examples=[dummy_string_example]) + dataset = dataset_repository.create_dataset( + examples=[dummy_string_example], dataset_name="test-dataset" + ) examples = [ dataset_repository.example( @@ -171,7 +264,7 @@ def test_example_returns_none_for_not_existing_dataset_id_or_example_id( DummyStringOutput, ), dataset_repository.example( - dataset_id, "not_existing_example_id", DummyStringInput, DummyStringOutput + dataset.id, "not_existing_example_id", DummyStringInput, DummyStringOutput ), dataset_repository.example( "not_existing_dataset_id", diff --git a/tests/evaluation/test_evaluator.py b/tests/evaluation/test_evaluator.py index 924bb312f..0152c9db8 100644 --- a/tests/evaluation/test_evaluator.py +++ b/tests/evaluation/test_evaluator.py @@ -180,7 +180,9 @@ def dataset_id( sequence_examples: Iterable[Example[str, None]], in_memory_dataset_repository: InMemoryDatasetRepository, ) -> str: - return in_memory_dataset_repository.create_dataset(sequence_examples) + return in_memory_dataset_repository.create_dataset( + examples=sequence_examples, dataset_name="test-dataset" + ).id @fixture @@ -188,7 +190,9 @@ def good_dataset_id( sequence_good_examples: Iterable[Example[str, None]], in_memory_dataset_repository: InMemoryDatasetRepository, ) -> str: - return in_memory_dataset_repository.create_dataset(sequence_good_examples) + return in_memory_dataset_repository.create_dataset( + examples=sequence_good_examples, dataset_name="test-dataset" + ).id @fixture diff --git a/tests/evaluation/test_hugging_face.py b/tests/evaluation/test_hugging_face.py index f81d96048..024b46593 100644 --- a/tests/evaluation/test_hugging_face.py +++ b/tests/evaluation/test_hugging_face.py @@ -32,40 +32,63 @@ def example2() -> Example[str, str]: return Example(input="ho", expected_output="hey", id="1") -def requires_token() -> MarkDecorator: +def skip_if_required_token_not_set() -> MarkDecorator: return pytest.mark.skipif( "HUGGING_FACE_TOKEN" in os.environ.keys(), 
reason="HUGGING_FACE_TOKEN not set, necessary for for current test", ) -@requires_token() +@skip_if_required_token_not_set() def test_hf_database_non_existing(hf_repository: HuggingFaceDatasetRepository) -> None: + # not existing IDs assert hf_repository.examples("lol", str, str) == [] assert hf_repository.example("lol", "abc", str, str) is None - hf_repository.delete_dataset("lol") - # make sure random files are not actually datasets + + # deleting a not-existing dataset + try: + hf_repository.delete_dataset("lol") + except Exception: + assert False, "Deleting a not-existing dataset should not throw an exception" + + # non-dataset files are not retrieved as datasets datasets = list(hf_repository.dataset_ids()) + assert ".gitattributes" not in datasets assert "README.md" not in datasets -@requires_token() +@skip_if_required_token_not_set() def test_hf_database_operations( hf_repository: HuggingFaceDatasetRepository, example1: Example[str, str], example2: Example[str, str], ) -> None: - original_examples = [example1, example2] - dataset_id = hf_repository.create_dataset(original_examples) + examples = [example1, example2] + + dataset_id = hf_repository.create_dataset(examples, "test-hg-dataset").id + try: - assert dataset_id in list(hf_repository.dataset_ids()) - examples = hf_repository.examples(dataset_id, str, str) - assert examples != [] - assert list(examples) == original_examples - assert hf_repository.example(dataset_id, example1.id, str, str) == example1 + stored_dataset_ids = list(hf_repository.dataset_ids()) + + # non-dataset files are not retrieved as datasets + assert ".gitattributes" not in stored_dataset_ids + assert "README.md" not in stored_dataset_ids + + # created dataset is stored + assert dataset_id in stored_dataset_ids + + # given examples are stored and can be accessed via their ID + assert list(hf_repository.examples(dataset_id, str, str)) == examples + for example in examples: + assert hf_repository.example(dataset_id, example.id, str, str) 
== example + + # example() with not-existing example ID returns None assert hf_repository.example(dataset_id, "abc", str, str) is None + + # deleting a dataset works hf_repository.delete_dataset(dataset_id) + assert hf_repository.examples(dataset_id, str, str) == [] finally: hf_repository.delete_dataset(dataset_id) diff --git a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py index 846616404..a3a9dcba6 100644 --- a/tests/evaluation/test_instruct_comparison_argilla_evaluator.py +++ b/tests/evaluation/test_instruct_comparison_argilla_evaluator.py @@ -155,14 +155,15 @@ def create_dummy_dataset( instruction_input = "some text" return in_memory_dataset_repository.create_dataset( - [ + examples=[ Example( id=example_id, input=InstructInput(instruction=instruction, input=instruction_input), expected_output=None, ) - ] - ) + ], + dataset_name="test-dataset", + ).id def create_dummy_runs( diff --git a/tests/evaluation/test_run.py b/tests/evaluation/test_run.py index 83ac25201..0cd794e6f 100644 --- a/tests/evaluation/test_run.py +++ b/tests/evaluation/test_run.py @@ -65,7 +65,9 @@ def test_run_evaluation( ) -> None: dataset_path = tmp_path / "dataset" dataset_repository = FileDatasetRepository(dataset_path) - dataset_id = dataset_repository.create_dataset(examples) + dataset_id = dataset_repository.create_dataset( + examples=examples, dataset_name="test-dataset" + ).id aggregation_path = tmp_path / "eval" aggregation_repository = FileAggregationRepository(aggregation_path) @@ -101,7 +103,9 @@ def test_run_evaluation_with_task_with_client( ) -> None: dataset_path = tmp_path / "dataset" dataset_repository = FileDatasetRepository(dataset_path) - dataset_id = dataset_repository.create_dataset(examples) + dataset_id = dataset_repository.create_dataset( + examples=examples, dataset_name="test-dataset" + ).id eval_path = tmp_path / "eval" diff --git a/tests/evaluation/test_runner.py 
b/tests/evaluation/test_runner.py index f77a6b891..55970d4d0 100644 --- a/tests/evaluation/test_runner.py +++ b/tests/evaluation/test_runner.py @@ -22,7 +22,9 @@ def test_runner_runs_dataset( Example(input=FAIL_IN_EVAL_INPUT, expected_output=None), ] - dataset_id = in_memory_dataset_repository.create_dataset(examples=examples) + dataset_id = in_memory_dataset_repository.create_dataset( + examples=examples, dataset_name="test-dataset" + ).id overview = runner.run_dataset(dataset_id) outputs = list( in_memory_run_repository.example_outputs( @@ -49,7 +51,9 @@ def test_runner_runs_n_examples( Example(input=FAIL_IN_TASK_INPUT, expected_output=None, id="example-2"), ] - dataset_id = in_memory_dataset_repository.create_dataset(examples=examples) + dataset_id = in_memory_dataset_repository.create_dataset( + examples=examples, dataset_name="test-dataset" + ).id overview = runner.run_dataset(dataset_id) overview_with_tracer = runner.run_dataset(dataset_id, tracer, num_examples=1) diff --git a/tests/use_cases/classify/test_classify.py b/tests/use_cases/classify/test_classify.py index 227b9b825..b18eaf9cf 100644 --- a/tests/use_cases/classify/test_classify.py +++ b/tests/use_cases/classify/test_classify.py @@ -119,7 +119,9 @@ def single_entry_dataset_name( in_memory_dataset_repository: InMemoryDatasetRepository, embedding_based_classify_example: Iterable[Example[ClassifyInput, Sequence[str]]], ) -> str: - return in_memory_dataset_repository.create_dataset(embedding_based_classify_example) + return in_memory_dataset_repository.create_dataset( + examples=embedding_based_classify_example, dataset_name="test-dataset" + ).id @fixture @@ -128,8 +130,8 @@ def multiple_entries_dataset_name( embedding_based_classify_examples: Iterable[Example[ClassifyInput, Sequence[str]]], ) -> str: return in_memory_dataset_repository.create_dataset( - embedding_based_classify_examples - ) + examples=embedding_based_classify_examples, dataset_name="test-dataset" + ).id @fixture diff --git 
a/tests/use_cases/classify/test_prompt_based_classify.py b/tests/use_cases/classify/test_prompt_based_classify.py index 87d2d2d7e..c3c50f872 100644 --- a/tests/use_cases/classify/test_prompt_based_classify.py +++ b/tests/use_cases/classify/test_prompt_based_classify.py @@ -200,9 +200,11 @@ def test_can_evaluate_classify( expected_output=["positive"], ) - dataset_name = in_memory_dataset_repository.create_dataset([example]) + dataset_id = in_memory_dataset_repository.create_dataset( + examples=[example], dataset_name="test-dataset" + ).id - run_overview = classify_runner.run_dataset(dataset_name) + run_overview = classify_runner.run_dataset(dataset_id) evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id) evaluation = in_memory_evaluation_repository.example_evaluations( @@ -243,11 +245,11 @@ def test_can_aggregate_evaluations( ), expected_output=positive_lst, ) - dataset_name = in_memory_dataset_repository.create_dataset( - [correct_example, incorrect_example] - ) + dataset_id = in_memory_dataset_repository.create_dataset( + examples=[correct_example, incorrect_example], dataset_name="test-dataset" + ).id - run_overview = classify_runner.run_dataset(dataset_name) + run_overview = classify_runner.run_dataset(dataset_id) evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id) aggregation_overview = classify_aggregator.aggregate_evaluation( evaluation_overview.id @@ -270,7 +272,9 @@ def test_aggregating_evaluations_works_with_empty_list( classify_runner: Runner[ClassifyInput, SingleLabelClassifyOutput], in_memory_dataset_repository: DatasetRepository, ) -> None: - dataset_id = in_memory_dataset_repository.create_dataset([]) + dataset_id = in_memory_dataset_repository.create_dataset( + examples=[], dataset_name="test-dataset" + ).id run_overview = classify_runner.run_dataset(dataset_id) evaluation_overview = classify_evaluator.evaluate_runs(run_overview.id) aggregation_overview = classify_aggregator.aggregate_evaluation( diff --git 
a/tests/use_cases/summarize/test_summarize.py b/tests/use_cases/summarize/test_summarize.py index 938a2417c..e5102cf0c 100644 --- a/tests/use_cases/summarize/test_summarize.py +++ b/tests/use_cases/summarize/test_summarize.py @@ -178,10 +178,10 @@ def test_single_chunk_summarize_evaluator( expected_output="The brown bear is a large mammal that lives in Eurasia and North America.", id="good", ) - dataset_name = in_memory_dataset_repository.create_dataset( - [good_example, bad_example] - ) - run_overview = single_chunk_summarize_runner.run_dataset(dataset_name) + dataset_id = in_memory_dataset_repository.create_dataset( + examples=[good_example, bad_example], dataset_name="test-dataset" + ).id + run_overview = single_chunk_summarize_runner.run_dataset(dataset_id) evaluation_overview = single_chunk_summarize_evaluator.evaluate_runs( run_overview.id @@ -236,10 +236,10 @@ def test_long_context_summarize_evaluator( expected_output="The brown bear is a large mammal that lives in Eurasia and North America.", id="good", ) - dataset_name = in_memory_dataset_repository.create_dataset( - [good_example, bad_example] - ) - run_overview = long_context_summarize_runner.run_dataset(dataset_name) + dataset_id = in_memory_dataset_repository.create_dataset( + examples=[good_example, bad_example], dataset_name="test-dataset" + ).id + run_overview = long_context_summarize_runner.run_dataset(dataset_id) evaluation_overview = long_context_summarize_evaluator.evaluate_runs( run_overview.id