Skip to content

Commit

Permalink
Improve test reliability (#1074)
Browse files Browse the repository at this point in the history
Co-authored-by: JanEricNitschkeAA <[email protected]>
  • Loading branch information
NiklasKoehneckeAA and JanEricNitschkeAA authored Oct 15, 2024
1 parent 6dbf754 commit 75e516d
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 19 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
- Add utility function `run_is_already_computed` to `Runner` to check if a run with the given metadata has already been computed.
- The `parameter_optimization` notebook describes how to use the `run_is_already_computed` function.
### Fixes
...
- The default `max_retry_time` for the `LimitedConcurrencyClient` has been reduced from one day to 3 minutes. If you have long-running evaluations that need a longer retry window, you can pass a larger `max_retry_time` to the constructor.


### Deprecations
...
### Breaking Changes
Expand Down
18 changes: 13 additions & 5 deletions src/documentation/elo_qa_eval.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
"load_dotenv()\n",
"\n",
"aa_client = Client(getenv(\"AA_TOKEN\"))\n",
"limited_concurrency_client = LimitedConcurrencyClient(aa_client)"
"limited_concurrency_client = LimitedConcurrencyClient(aa_client, max_retry_time=60)"
]
},
{
Expand Down Expand Up @@ -204,8 +204,8 @@
"outputs": [],
"source": [
"models = [\n",
" LuminousControlModel(name=\"luminous-base-control-20240215\", client=aa_client),\n",
" LuminousControlModel(name=\"luminous-supreme-control-20240215\", client=aa_client),\n",
" LuminousControlModel(name=\"luminous-base-control\", client=aa_client),\n",
" LuminousControlModel(name=\"luminous-supreme-control\", client=aa_client),\n",
"]\n",
"\n",
"for model in models:\n",
Expand All @@ -226,9 +226,17 @@
"source": [
"# ensure that all examples succeeded\n",
"for run_overview in run_repository.run_overviews():\n",
" error_examples = \"\\n\".join(\n",
" [\n",
" str(output.output)\n",
" for output in run_repository.failed_example_outputs(\n",
" run_overview.id, SingleChunkQaOutput\n",
" )\n",
" ]\n",
" )\n",
" assert (\n",
" run_overview.failed_example_count == 0\n",
" ), f\"There are failed runs for run overview ID {run_overview.id}\""
" ), f\"There are failed runs for run overview ID {run_overview.id}: \\n{error_examples}\""
]
},
{
Expand Down Expand Up @@ -592,7 +600,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "intelligence-layer-tfT-HG2V-py3.11",
"display_name": "intelligence-layer-LP3DLT23-py3.12",
"language": "python",
"name": "python3"
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def __init__(
self,
client: AlephAlphaClientProtocol,
max_concurrency: int = 10,
max_retry_time: int = 24 * 60 * 60, # one day in seconds
max_retry_time: int = 3 * 60, # three minutes in seconds
) -> None:
self._client = client
self._concurrency_limit_semaphore = Semaphore(max_concurrency)
Expand Down Expand Up @@ -151,8 +151,9 @@ def _retry_on_busy_error(self, func: Callable[[], T]) -> T:
retries = 0
start_time = time.time()
latest_exception = None
current_time = start_time
while (
time.time() - start_time < self._max_retry_time or self._max_retry_time < 0
current_time - start_time < self._max_retry_time or self._max_retry_time < 0
):
try:
return func()
Expand All @@ -161,10 +162,11 @@ def _retry_on_busy_error(self, func: Callable[[], T]) -> T:
time.sleep(
min(
2**retries,
self._max_retry_time - (time.time() - start_time),
self._max_retry_time - (current_time - start_time),
)
)
retries += 1
current_time = time.time()
continue
assert latest_exception is not None
raise latest_exception
Expand Down
4 changes: 3 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ def client(token: str) -> AlephAlphaClientProtocol:
Args:
token: AA Token
"""
return LimitedConcurrencyClient(Client(token), max_concurrency=10)
return LimitedConcurrencyClient(
Client(token), max_concurrency=10, max_retry_time=2 * 60
)


@fixture(scope="session")
Expand Down
43 changes: 41 additions & 2 deletions tests/connectors/test_limited_concurrency_client.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import time
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from time import sleep
Expand Down Expand Up @@ -40,14 +41,16 @@ def complete(self, request: CompletionRequest, model: str) -> CompletionResponse

class BusyClient:
def __init__(
self,
return_value: CompletionResponse | Exception,
self, return_value: CompletionResponse | Exception, wait_time: int | None = None
) -> None:
self.number_of_retries: int = 0
self.return_value = return_value
self.wait_time = wait_time

def complete(self, request: CompletionRequest, model: str) -> CompletionResponse:
self.number_of_retries += 1
if self.wait_time:
time.sleep(self.wait_time)
if self.number_of_retries < 2:
raise BusyError(503) # type: ignore
else:
Expand Down Expand Up @@ -106,6 +109,42 @@ def test_limited_concurrency_client_retries() -> None:
assert completion == expected_completion


def test_limited_concurrency_client_stops_retrying_after_max_retry() -> None:
    """Expect ``BusyError`` when the retry budget is only one second.

    The wrapped ``BusyClient`` is given a completion it could return, but the
    call through ``LimitedConcurrencyClient`` (``max_retry_time=1``) must end
    in ``BusyError`` rather than yielding that completion.
    """
    # Stub response the busy client would hand back if it were asked again.
    response_fields = {
        "model_version": "model-version",
        "completions": [],
        "optimized_prompt": None,
        "num_tokens_generated": 0,
        "num_tokens_prompt_total": 0,
    }
    stub_response = CompletionResponse(**response_fields)
    flaky_backend = BusyClient(return_value=stub_response)

    # The cast only satisfies the type checker; BusyClient is a test double.
    backend_as_protocol = cast(AlephAlphaClientProtocol, flaky_backend)
    client_under_test = LimitedConcurrencyClient(
        backend_as_protocol, max_retry_time=1
    )

    with pytest.raises(BusyError):
        request = CompletionRequest(prompt=Prompt(""))
        client_under_test.complete(request, "model")


def test_limited_concurrency_client_handles_long_running_functions_properly() -> None:
    """Expect ``BusyError`` when the wrapped call itself is slow.

    The ``BusyClient`` is created with ``wait_time=1``, so it sleeps for one
    second inside each ``complete`` call. With ``max_retry_time=1`` on the
    ``LimitedConcurrencyClient``, the call must end in ``BusyError``.
    """
    # Stub response the busy client would hand back if it were asked again.
    response_fields = {
        "model_version": "model-version",
        "completions": [],
        "optimized_prompt": None,
        "num_tokens_generated": 0,
        "num_tokens_prompt_total": 0,
    }
    stub_response = CompletionResponse(**response_fields)
    slow_backend = BusyClient(return_value=stub_response, wait_time=1)

    # The cast only satisfies the type checker; BusyClient is a test double.
    backend_as_protocol = cast(AlephAlphaClientProtocol, slow_backend)
    client_under_test = LimitedConcurrencyClient(
        backend_as_protocol, max_retry_time=1
    )

    with pytest.raises(BusyError):
        request = CompletionRequest(prompt=Prompt(""))
        client_under_test.complete(request, "model")


def test_limited_concurrency_client_throws_exception() -> None:
expected_exception = Exception(404)
busy_client = BusyClient(return_value=expected_exception)
Expand Down
2 changes: 1 addition & 1 deletion tests/core/tracer/test_open_telemetry_tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def test_open_telemetry_tracer_works_with_jaeger(
input_value = str(uuid4())
tracer_test_task.run(input_value, jaeger_compatible_tracer)
# the processor needs time to submit the trace to jaeger
time.sleep(1)
time.sleep(3)

res = get_current_traces(url)

Expand Down
6 changes: 4 additions & 2 deletions tests/examples/qa/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ def single_chunk_qa(luminous_control_model: LuminousControlModel) -> SingleChunk


@fixture
def multiple_chunk_qa(single_chunk_qa: SingleChunkQa) -> MultipleChunkQa:
return MultipleChunkQa(single_chunk_qa)
def multiple_chunk_qa(
single_chunk_qa: SingleChunkQa, luminous_control_model: LuminousControlModel
) -> MultipleChunkQa:
return MultipleChunkQa(single_chunk_qa, merge_answers_model=luminous_control_model)


@fixture
Expand Down
9 changes: 5 additions & 4 deletions tests/examples/qa/test_single_chunk_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def test_qa_with_logit_bias_for_no_answer(


def test_qa_highlights_will_not_become_out_of_bounds(
single_chunk_qa: SingleChunkQa,
luminous_control_model: LuminousControlModel,
) -> None:
input_text = """Zubereitung
Ein Hotdog besteht aus einem erwärmten Brühwürstchen in einem länglichen, meist weichen Weizenbrötchen, das üblicherweise getoastet oder gedämpft wird. Das Hotdogbrötchen wird zur Hälfte der Länge nach aufgeschnitten und ggf. erhitzt. Danach legt man das heiße Würstchen hinein und garniert es mit den Saucen (Ketchup, Senf, Mayonnaise usw.). Häufig werden auch noch weitere Zugaben, etwa Röstzwiebeln, Essiggurken, Sauerkraut oder Krautsalat in das Brötchen gegeben.
Expand All @@ -116,10 +116,11 @@ def test_qa_highlights_will_not_become_out_of_bounds(
Weltweit bekannt sind die Hotdog-Stände der schwedischen Möbelhauskette IKEA, an denen im Möbelhaus hinter den Kassen Hot Dogs der schwedischen Variante zum Selberbelegen mit Röstzwiebeln, Gurken und verschiedenen Soßen verkauft werden. Der Hotdogstand in der Filiale gilt weltweit als eine Art Markenzeichen von IKEA. In Deutschland wird das Gericht meist mit Frankfurter oder Wiener Würstchen zubereitet.
In den USA wird der Hotdog meist auf einem Roller Grill gegart. So bekommt die Wurst einen besonderen Grillgeschmack. Amerikanische Hotdogs werden mit speziellen Pickled Gherkins (Gurkenscheiben) und Relishes (Sweet Relish, Hot Pepper Relish oder Corn Relish), häufig mit mildem Senf (Yellow Mustard, die populärste Hotdog-Zutat) oder mit Ketchup serviert. Auch eine Garnitur aus warmem Sauerkraut ist möglich (Nathan’s Famous in New York)."""
model = LuminousControlModel("luminous-supreme-control")
qa_task = SingleChunkQa(
text_highlight=TextHighlight(model=model, granularity=None, clamp=True),
model=model,
text_highlight=TextHighlight(
model=luminous_control_model, granularity=None, clamp=True
),
model=luminous_control_model,
)
input = SingleChunkQaInput(
chunk=TextChunk(input_text),
Expand Down

0 comments on commit 75e516d

Please sign in to comment.