feat: add raise_for_status to studio client
JohannesWesch authored and MerlinKallenbornAA committed Oct 30, 2024
1 parent 99bfc5f commit cbbb5a3
Showing 5 changed files with 56 additions and 26 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,8 @@
 - Add support for Llama3InstructModel in PromptBasedClassify
 - Add TextControl to 'to_instruct_prompt' for instruct models
 - Add 'attention_manipulation_with_text_controls.ipynb' to tutorial notebooks
+- Add submit_dataset function to StudioClient
+- Add `how_to_upload_existing_datasets_to_studio.ipynb` to how-tos

 ### Fixes
...
1 change: 0 additions & 1 deletion how_to_upload_existing_datasets_to_studio.ipynb
@@ -46,7 +46,6 @@
 "outputs": [],
 "source": [
 "# Step 0\n",
-"\n",
 "existing_dataset_repo = InMemoryDatasetRepository()\n",
 "existing_dataset = existing_dataset_repo.dataset(dataset_id=\"my_existing_dataset_id\")\n",
 "assert existing_dataset, \"Make sure your dataset still exists.\"\n",
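
For orientation, the flow this how-to notebook walks through looks roughly like the sketch below: read a dataset and its examples back out of an existing repository, then push them to Studio. The StudioClient constructor arguments and the examples() accessor signature are assumptions for illustration; submit_dataset(dataset=..., examples=...) matches the client code in this commit.

    # Minimal upload sketch; AA_TOKEN is expected in the environment,
    # and the project name is illustrative.
    from intelligence_layer.connectors.studio.studio import StudioClient
    from intelligence_layer.evaluation import InMemoryDatasetRepository  # import path assumed

    # Step 0: look up the dataset in an existing repository.
    existing_dataset_repo = InMemoryDatasetRepository()
    existing_dataset = existing_dataset_repo.dataset(dataset_id="my_existing_dataset_id")
    assert existing_dataset, "Make sure your dataset still exists."

    # Re-read the stored examples (accessor name and signature assumed).
    examples = list(
        existing_dataset_repo.examples(
            dataset_id=existing_dataset.id, input_type=str, expected_output_type=str
        )
    )

    # Step 1: submit to Studio; submit_dataset returns the created dataset's id.
    studio_client = StudioClient(project="my-project")  # constructor parameters assumed
    created_dataset_id = studio_client.submit_dataset(
        dataset=existing_dataset, examples=examples
    )
    print(created_dataset_id)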
53 changes: 41 additions & 12 deletions src/intelligence_layer/connectors/studio/studio.py
@@ -2,13 +2,22 @@
 import os
 from collections import defaultdict
 from collections.abc import Iterable, Sequence
-from typing import Optional
+from http import HTTPStatus
+from typing import ClassVar, Optional
 from urllib.parse import urljoin

 import requests
 from pydantic import BaseModel
 from requests.exceptions import ConnectionError, MissingSchema

+from intelligence_layer.connectors.data.exceptions import (
+    DataError,
+    DataExternalServiceUnavailable,
+    DataForbiddenError,
+    DataInternalError,
+    DataInvalidInput,
+    DataResourceNotFound,
+)
 from intelligence_layer.core.task import Input
 from intelligence_layer.core.tracer.tracer import (  # Import to be fixed with PHS-731
     ExportedSpan,
@@ -57,7 +66,6 @@ def __init__(
                 "'AA_TOKEN' is not set and auth_token is not given as a parameter. Please provide one or the other."
             )
         self._headers = {
-            "Content-Type": "application/json",
             "Accept": "application/json",
             "Authorization": f"Bearer {self._token}",
         }
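
Dropping the hard-coded Content-Type here is what lets the multipart upload below work: submit_dataset posts via requests' files= argument, and requests has to generate the multipart/form-data Content-Type (with its boundary) itself; a preset application/json header would take precedence and corrupt the request.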
@@ -214,29 +222,50 @@ def submit_dataset(
             id of created dataset
         """
         url = urljoin(self.url, f"/api/projects/{self.project_id}/datasets")

         source_data_list = [
             example.model_dump_json()
             for example in sorted(examples, key=lambda x: x.id)
         ]
-        file_data = "\n".join(source_data_list).encode()
+        source_data_file = "\n".join(source_data_list).encode()

         data = {
             "name": dataset.name,
             "labels": list(dataset.labels) if dataset.labels is not None else [],
             "total_datapoints": len(source_data_list),
-            "metadata": json.dumps(dataset.metadata) if dataset.metadata else None,
         }

+        # Handle metadata separately to avoid double JSON encoding
+        if dataset.metadata:
+            if isinstance(dataset.metadata, str):
+                data["metadata"] = dataset.metadata
+            else:
+                data["metadata"] = json.dumps(dataset.metadata)

         response = requests.post(
             url,
-            files={"source_data": file_data},
+            files={"source_data": source_data_file},
             data=data,
             headers=self._headers,
         )

-        match response.status_code:
-            case 409:
-                raise ValueError("Dataset already exists")
-            case _:
-                response.raise_for_status()
-        return str(response.json())
+        self._raise_for_status(response)
+        return str(response.text)
+
+    def _raise_for_status(self, response: requests.Response) -> None:
+        try:
+            response.raise_for_status()
+        except requests.HTTPError as e:
+            exception_factory = self._status_code_to_exception.get(
+                HTTPStatus(response.status_code), DataInternalError
+            )
+            raise exception_factory(
+                response.text, HTTPStatus(response.status_code)
+            ) from e
+
+    _status_code_to_exception: ClassVar[dict[int, type[DataError]]] = {
+        HTTPStatus.SERVICE_UNAVAILABLE: DataExternalServiceUnavailable,
+        HTTPStatus.NOT_FOUND: DataResourceNotFound,
+        HTTPStatus.UNPROCESSABLE_ENTITY: DataInvalidInput,
+        HTTPStatus.FORBIDDEN: DataForbiddenError,
+    }
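
Net effect for callers: the 409 special case (ValueError on duplicate datasets) is gone, and every non-2xx response now surfaces as a typed DataError subclass — 503, 404, 422, and 403 via the mapping above, everything else (including 409) through the DataInternalError fallback. A hedged usage sketch, with client construction elided:

    from intelligence_layer.connectors.data.exceptions import (
        DataError,
        DataInvalidInput,
        DataResourceNotFound,
    )

    def upload_or_report(studio_client, dataset, examples):
        # submit_dataset now raises DataError subclasses instead of
        # ValueError / bare requests.HTTPError.
        try:
            return studio_client.submit_dataset(dataset=dataset, examples=examples)
        except DataInvalidInput as e:      # 422: payload rejected
            print(f"Invalid input: {e}")
        except DataResourceNotFound as e:  # 404: project or endpoint missing
            print(f"Not found: {e}")
        except DataError as e:             # 403, 503, or the DataInternalError fallback
            print(f"Upload failed: {e}")
        return None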
2 changes: 1 addition & 1 deletion src/intelligence_layer/evaluation/run/runner.py
@@ -6,7 +6,7 @@
 from typing import Generic, Optional, cast
 from uuid import uuid4

-from dict_hash import dict_hash
+from dict_hash import dict_hash  # type: ignore
 from pydantic import JsonValue

 from intelligence_layer.connectors.base.json_serializable import (
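
The added # type: ignore is presumably needed because dict_hash ships without type stubs, so strict mypy runs would otherwise fail on this untyped import.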
24 changes: 12 additions & 12 deletions tests/connectors/studio/test_studio.py
@@ -175,7 +175,17 @@ def test_submit_from_tracer_works_with_empty_tracer(
     assert len(empty_trace_id_list) == 0


-def test_can_upload_dataset(
+def test_can_upload_dataset_with_minimal_request_body(
+    studio_client: StudioClient,
+    examples: Sequence[Example[str, str]],
+) -> None:
+    dataset_repo = InMemoryDatasetRepository()
+    dataset = dataset_repo.create_dataset(examples, "my_dataset")
+    result = studio_client.submit_dataset(dataset=dataset, examples=examples)
+    assert result is not None
+
+
+def test_can_upload_dataset_with_complete_request_body(
     studio_client: StudioClient,
     examples: Sequence[Example[str, str]],
     labels: set[str],
@@ -187,14 +197,4 @@ def test_can_upload_dataset(
     )
     result = studio_client.submit_dataset(dataset=dataset, examples=examples)

-    assert result == dataset.id
-
-
-def test_cannot_upload_same_dataset_twice(studio_client: StudioClient) -> None:
-    example = Example(input="input_str", expected_output="output_str")
-    dataset_repo = InMemoryDatasetRepository()
-    dataset = dataset_repo.create_dataset(examples=[example], dataset_name="my_dataset")
-    studio_client.submit_dataset(dataset=dataset, examples=[example])
-
-    with pytest.raises(ValueError):
-        studio_client.submit_dataset(dataset=dataset, examples=[example])
+    assert result is not None
