From cbbb5a364595c739037952593d96d2f30ebf2fec Mon Sep 17 00:00:00 2001
From: Johannes Wesch
Date: Tue, 29 Oct 2024 14:34:48 +0100
Subject: [PATCH] feat: add raise_for_status to studio client

---
 CHANGELOG.md                                  |  2 +
 ...o_upload_existing_datasets_to_studio.ipynb |  1 -
 .../connectors/studio/studio.py               | 53 ++++++++++++++-----
 .../evaluation/run/runner.py                  |  2 +-
 tests/connectors/studio/test_studio.py        | 24 ++++-----
 5 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5b59988f..aa775882 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@
 - Add support for Llama3InstructModel in PromptBasedClassify
 - Add TextControl to 'to_instruct_prompt' for instruct models
 - Add 'attention_manipulation_with_text_controls.ipynb' to tutorial notebooks
+- Add `submit_dataset` function to `StudioClient`
+  - Add `how_to_upload_existing_datasets_to_studio.ipynb` to how-tos
 
 ### Fixes
 ...

diff --git a/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb b/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb
index d9e6a4e9..bbdec0e9 100644
--- a/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb
+++ b/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb
@@ -46,7 +46,6 @@
    "outputs": [],
    "source": [
     "# Step 0\n",
-    "\n",
     "existing_dataset_repo = InMemoryDatasetRepository()\n",
     "existing_dataset = existing_dataset_repo.dataset(dataset_id=\"my_existing_dataset_id\")\n",
     "assert existing_dataset, \"Make sure your dataset still exists.\"\n",

diff --git a/src/intelligence_layer/connectors/studio/studio.py b/src/intelligence_layer/connectors/studio/studio.py
index 25c98479..c0b1cc62 100644
--- a/src/intelligence_layer/connectors/studio/studio.py
+++ b/src/intelligence_layer/connectors/studio/studio.py
@@ -2,13 +2,22 @@
 import os
 from collections import defaultdict
 from collections.abc import Iterable, Sequence
-from typing import Optional
+from http import HTTPStatus
+from typing import ClassVar, Optional
 from urllib.parse import urljoin
 
 import requests
 from pydantic import BaseModel
 from requests.exceptions import ConnectionError, MissingSchema
 
+from intelligence_layer.connectors.data.exceptions import (
+    DataError,
+    DataExternalServiceUnavailable,
+    DataForbiddenError,
+    DataInternalError,
+    DataInvalidInput,
+    DataResourceNotFound,
+)
 from intelligence_layer.core.task import Input
 from intelligence_layer.core.tracer.tracer import (  # Import to be fixed with PHS-731
     ExportedSpan,
@@ -57,7 +66,6 @@ def __init__(
                 "'AA_TOKEN' is not set and auth_token is not given as a parameter. Please provide one or the other."
             )
         self._headers = {
-            "Content-Type": "application/json",
             "Accept": "application/json",
             "Authorization": f"Bearer {self._token}",
         }
@@ -214,29 +222,50 @@ def submit_dataset(
             id of created dataset
         """
         url = urljoin(self.url, f"/api/projects/{self.project_id}/datasets")
-
         source_data_list = [
             example.model_dump_json()
             for example in sorted(examples, key=lambda x: x.id)
         ]
-        file_data = "\n".join(source_data_list).encode()
+
+        source_data_file = "\n".join(source_data_list).encode()
         data = {
             "name": dataset.name,
             "labels": list(dataset.labels) if dataset.labels is not None else [],
             "total_datapoints": len(source_data_list),
-            "metadata": json.dumps(dataset.metadata) if dataset.metadata else None,
         }
+
+        # Handle metadata separately to avoid double JSON encoding
+        if dataset.metadata:
+            if isinstance(dataset.metadata, str):
+                data["metadata"] = dataset.metadata
+            else:
+                data["metadata"] = json.dumps(dataset.metadata)
+
         response = requests.post(
             url,
-            files={"source_data": file_data},
+            files={"source_data": source_data_file},
            data=data,
             headers=self._headers,
         )
-        match response.status_code:
-            case 409:
-                raise ValueError("Dataset already exists")
-            case _:
-                response.raise_for_status()
-        return str(response.json())
+        self._raise_for_status(response)
+        return response.text
+
+    def _raise_for_status(self, response: requests.Response) -> None:
+        try:
+            response.raise_for_status()
+        except requests.HTTPError as e:
+            exception_factory = self._status_code_to_exception.get(
+                HTTPStatus(response.status_code), DataInternalError
+            )
+            raise exception_factory(
+                response.text, HTTPStatus(response.status_code)
+            ) from e
+
+    _status_code_to_exception: ClassVar[dict[HTTPStatus, type[DataError]]] = {
+        HTTPStatus.SERVICE_UNAVAILABLE: DataExternalServiceUnavailable,
+        HTTPStatus.NOT_FOUND: DataResourceNotFound,
+        HTTPStatus.UNPROCESSABLE_ENTITY: DataInvalidInput,
+        HTTPStatus.FORBIDDEN: DataForbiddenError,
+    }

diff --git a/src/intelligence_layer/evaluation/run/runner.py b/src/intelligence_layer/evaluation/run/runner.py
index 089e86dc..6a4f4c7f 100644
--- a/src/intelligence_layer/evaluation/run/runner.py
+++ b/src/intelligence_layer/evaluation/run/runner.py
@@ -6,7 +6,7 @@
 from typing import Generic, Optional, cast
 from uuid import uuid4
 
-from dict_hash import dict_hash
+from dict_hash import dict_hash  # type: ignore
 from pydantic import JsonValue
 
 from intelligence_layer.connectors.base.json_serializable import (

diff --git a/tests/connectors/studio/test_studio.py b/tests/connectors/studio/test_studio.py
index 6aaffe1e..32c9f9d4 100644
--- a/tests/connectors/studio/test_studio.py
+++ b/tests/connectors/studio/test_studio.py
@@ -175,7 +175,17 @@ def test_submit_from_tracer_works_with_empty_tracer(
     assert len(empty_trace_id_list) == 0
 
 
-def test_can_upload_dataset(
+def test_can_upload_dataset_with_minimal_request_body(
+    studio_client: StudioClient,
+    examples: Sequence[Example[str, str]],
+) -> None:
+    dataset_repo = InMemoryDatasetRepository()
+    dataset = dataset_repo.create_dataset(examples, "my_dataset")
+    result = studio_client.submit_dataset(dataset=dataset, examples=examples)
+    assert result is not None
+
+
+def test_can_upload_dataset_with_complete_request_body(
     studio_client: StudioClient,
     examples: Sequence[Example[str, str]],
     labels: set[str],
@@ -187,14 +197,4 @@
     )
 
     result = studio_client.submit_dataset(dataset=dataset, examples=examples)
-    assert result == dataset.id
-
-
-def test_cannot_upload_same_dataset_twice(studio_client: StudioClient) -> None:
-    example = Example(input="input_str", expected_output="output_str")
-    dataset_repo = InMemoryDatasetRepository()
-    dataset = dataset_repo.create_dataset(examples=[example], dataset_name="my_dataset")
-    studio_client.submit_dataset(dataset=dataset, examples=[example])
-
-    with pytest.raises(ValueError):
-        studio_client.submit_dataset(dataset=dataset, examples=[example])
+    assert result is not None
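
A minimal usage sketch of the error handling this patch introduces. The `DataError` hierarchy, `StudioClient`, and `submit_dataset` come from the diff above; the `StudioClient` constructor arguments and the re-export of `Example` and `InMemoryDatasetRepository` from `intelligence_layer.evaluation` are assumptions for illustration, not part of the patch.

    # Sketch only: callers can now catch typed DataError subclasses instead of
    # a bare requests.HTTPError or the removed 409-specific ValueError.
    from intelligence_layer.connectors.data.exceptions import (
        DataError,
        DataResourceNotFound,
    )
    from intelligence_layer.connectors.studio.studio import StudioClient
    from intelligence_layer.evaluation import (  # assumed re-export path
        Example,
        InMemoryDatasetRepository,
    )

    client = StudioClient(project="my-project")  # assumed signature; token read from AA_TOKEN
    dataset_repo = InMemoryDatasetRepository()
    example = Example(input="input_str", expected_output="output_str")
    dataset = dataset_repo.create_dataset(examples=[example], dataset_name="my_dataset")

    try:
        dataset_id = client.submit_dataset(dataset=dataset, examples=[example])
    except DataResourceNotFound:
        ...  # 404: project or datasets route does not exist
    except DataError:
        ...  # other non-2xx responses

Note that `_status_code_to_exception` has no entry for 409, so the duplicate-dataset conflict covered by the removed `test_cannot_upload_same_dataset_twice` now surfaces as the fallback `DataInternalError` rather than `ValueError`.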