
feat: add average successful latency and token count to benchmark execution
NiklasKoehneckeAA authored and MerlinKallenbornAA committed Dec 18, 2024
1 parent 189c2c3 commit 4de31d5
Showing 3 changed files with 150 additions and 14 deletions.
4 changes: 2 additions & 2 deletions src/intelligence_layer/connectors/studio/studio.py
@@ -122,8 +122,8 @@ class PostBenchmarkExecution(BaseModel):
     run_end: datetime
     run_successful_count: int
     run_failed_count: int
-    run_success_avg_latency: int
-    run_success_avg_token_count: int
+    run_success_avg_latency: float
+    run_success_avg_token_count: float
     # Eval Overview
     eval_start: datetime
     eval_end: datetime
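The int-to-float change matters because the averaged metrics are generally fractional. A minimal sketch of the idea, using a hypothetical trimmed-down stand-in for PostBenchmarkExecution (the real model carries many more fields, as the hunk above shows):

# Minimal sketch only: a hypothetical, trimmed-down stand-in for PostBenchmarkExecution
# illustrating why the averaged metrics need float fields.
from pydantic import BaseModel

class ExecutionMetricsSketch(BaseModel):  # hypothetical name, not part of the library
    run_success_avg_latency: float
    run_success_avg_token_count: float

latencies = [120, 95, 101]      # per-example latencies of successful runs (illustrative)
token_counts = [100, 100, 100]  # per-example generated token counts (illustrative)

metrics = ExecutionMetricsSketch(
    run_success_avg_latency=sum(latencies) / len(latencies),  # 105.33..., not an int
    run_success_avg_token_count=sum(token_counts) / len(token_counts),
)
print(metrics.run_success_avg_latency)  # 105.33333333333333

Feeding such a value into an int-typed field would be truncated or rejected depending on the pydantic version in use, hence the switch to float.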
65 changes: 56 additions & 9 deletions src/intelligence_layer/evaluation/benchmark/studio_benchmark.py
@@ -1,3 +1,5 @@
+import inspect
+import itertools
 from collections.abc import Sequence
 from datetime import datetime
 from http import HTTPStatus
@@ -123,7 +125,30 @@ def execute(

         end = datetime.now()

-        data = PostBenchmarkExecution(
+        evaluation_lineages = list(
+            self.evaluator.evaluation_lineages(evaluation_overview.id)
+        )
+
+        run_traces = [
+            self._trace_from_lineage(lineage) for lineage in evaluation_lineages
+        ]
+        tokens_per_trace = [
+            extract_token_count_from_trace(trace) for trace in run_traces
+        ]
+        latency_per_trace = [extract_latency_from_trace(trace) for trace in run_traces]
+
+        tokens_per_successful_trace, latency_per_successful_trace = (
+            self._filter_for_succesful_runs(
+                (tokens_per_trace, latency_per_trace),
+                source_lineage_list=evaluation_lineages,
+                run_id=run_overview.id,
+            )
+        )
+
+        def average_or_zero(list: list) -> float:
+            return sum(list) / len(list) if len(list) > 0 else 0
+
+        benchmark_execution_data = PostBenchmarkExecution(
             name=name,
             description=description,
             labels=labels,
@@ -134,8 +159,8 @@ def execute(
             run_end=run_overview.end,
             run_successful_count=run_overview.successful_example_count,
             run_failed_count=run_overview.failed_example_count,
-            run_success_avg_latency=0, # TODO: Implement this
-            run_success_avg_token_count=0, # TODO: Implement this
+            run_success_avg_latency=average_or_zero(latency_per_successful_trace),
+            run_success_avg_token_count=average_or_zero(tokens_per_successful_trace),
             eval_start=evaluation_overview.start_date,
             eval_end=evaluation_overview.end_date,
             eval_successful_count=evaluation_overview.successful_evaluation_count,
@@ -146,22 +171,19 @@ def execute(
         )

         benchmark_execution_id = self.client.submit_benchmark_execution(
-            benchmark_id=self.id, data=data
+            benchmark_id=self.id, data=benchmark_execution_data
         )

-        evaluation_lineages = list(
-            self.evaluator.evaluation_lineages(evaluation_overview.id)
-        )
         trace_ids = []
-        for lineage in tqdm(evaluation_lineages, desc="Submitting traces to Studio"):
-            trace = self._trace_from_lineage(lineage)
+        for trace in tqdm(run_traces, desc="Submitting traces to Studio"):
             trace_id = self.client.submit_trace(trace)
             trace_ids.append(trace_id)

         benchmark_lineages = self._create_benchmark_lineages(
             eval_lineages=evaluation_lineages,
             trace_ids=trace_ids,
         )

         self.client.submit_benchmark_lineages(
             benchmark_lineages=benchmark_lineages,
             execution_id=benchmark_execution_id,
@@ -170,6 +192,31 @@ def execute(

         return benchmark_execution_id

+    def _filter_for_succesful_runs(
+        self,
+        lists_to_filter: tuple[list, ...],
+        source_lineage_list: list[
+            EvaluationLineage[Input, ExpectedOutput, Output, Evaluation]
+        ],
+        run_id: str,
+    ) -> tuple[list, ...]:
+        """This method assumes that lists_to_filter and source_lineage_list are all equal length."""
+        failed_example_output_ids = [
+            example_output.example_id
+            for example_output in self.run_repository.failed_example_outputs(
+                run_id=run_id, output_type=self.evaluator.output_type()
+            )
+        ]
+
+        is_successful_run = [
+            lineage.example.id not in failed_example_output_ids
+            for lineage in source_lineage_list
+        ]
+        return tuple(
+            list(itertools.compress(sublist, is_successful_run))
+            for sublist in lists_to_filter
+        )
+
     def _trace_from_lineage(
         self, eval_lineage: EvaluationLineage[Input, ExpectedOutput, Output, Evaluation]
     ) -> Sequence[ExportedSpan]:
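The heart of the change is a filter-then-average step: token counts and latencies are computed per trace, entries belonging to failed runs are dropped, and the remainder is averaged with a fallback of 0 for the empty case. A self-contained sketch of that pattern, with illustrative sample data standing in for real traces and lineages:

# Standalone sketch of the filter-then-average logic above; data and names are illustrative.
import itertools

def average_or_zero(values: list) -> float:
    # Same fallback as in the diff: an empty list averages to 0 instead of raising.
    return sum(values) / len(values) if len(values) > 0 else 0

tokens_per_trace = [100, 100, 100, 100]
latency_per_trace = [0.12, 0.95, 0.10, 0.11]
is_successful_run = [True, False, True, True]  # in the diff this comes from failed_example_outputs

successful_tokens, successful_latencies = (
    list(itertools.compress(values, is_successful_run))
    for values in (tokens_per_trace, latency_per_trace)
)

print(average_or_zero(successful_tokens))     # 100.0
print(average_or_zero(successful_latencies))  # (0.12 + 0.10 + 0.11) / 3 ≈ 0.11

itertools.compress keeps exactly the positions whose selector is truthy, which is why the parallel lists and the boolean mask must be of equal length, as the docstring of _filter_for_succesful_runs notes.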
95 changes: 92 additions & 3 deletions tests/evaluation/benchmark/test_benchmark.py
@@ -11,6 +11,7 @@
 from intelligence_layer.connectors.studio.studio import (
     BenchmarkLineage,
     GetBenchmarkResponse,
+    PostBenchmarkExecution,
     StudioClient,
     StudioExample,
 )
@@ -21,6 +22,7 @@
     type_to_schema,
 )
 from tests.evaluation.conftest import (
+    FAIL_IN_TASK_INPUT,
     DummyAggregationLogic,
     DummyEvaluationLogic,
     DummyTask,
@@ -227,6 +229,7 @@ def test_execute_benchmark(
     mock_submit_trace = cast(Mock, mock_studio_client.submit_trace)
     mock_submit_trace.return_value = str(uuid4())
     mock_submit_execution = cast(Mock, mock_studio_client.submit_benchmark_execution)
+    mock_submit_lineage = cast(Mock, mock_studio_client.submit_benchmark_lineages)

     expected_generated_tokens = 100
     mock_extract_tokens.return_value = expected_generated_tokens
@@ -255,12 +258,98 @@ def test_execute_benchmark(

     # then
     mock_submit_execution.assert_called_once()
+    uploaded_execution = cast(
+        PostBenchmarkExecution, mock_submit_execution.call_args[1]["data"]
+    )
+    assert uploaded_execution.run_success_avg_latency > 0
+    assert uploaded_execution.run_success_avg_token_count == expected_generated_tokens
+
     assert mock_submit_trace.call_count == 4
-    uploaded_lineages = cast(
-        Mock, mock_studio_client.submit_benchmark_lineages
-    ).call_args[1]["benchmark_lineages"]

+    mock_submit_lineage.assert_called_once()
+    uploaded_lineages = mock_submit_lineage.call_args[1]["benchmark_lineages"]
     for lineage in uploaded_lineages:
         lineage = cast(BenchmarkLineage, lineage)
         assert lineage.run_latency > 0
+        # this assumes that each lineage consists of traces that only have a single span
         assert lineage.run_tokens == expected_generated_tokens


+def test_execute_benchmark_on_empty_examples_uploads_example_and_calculates_correctly(
+    studio_benchmark_repository: StudioBenchmarkRepository,
+    mock_studio_client: StudioClient,
+    evaluation_logic: DummyEvaluationLogic,
+    get_benchmark_response: GetBenchmarkResponse,
+    aggregation_logic: DummyAggregationLogic,
+    task: DummyTask,
+) -> None:
+    mock_submit_trace = cast(Mock, mock_studio_client.submit_trace)
+    mock_submit_execution = cast(Mock, mock_studio_client.submit_benchmark_execution)
+
+    cast(Mock, mock_studio_client.get_benchmark).return_value = get_benchmark_response
+    cast(Mock, mock_studio_client.get_dataset_examples).return_value = []
+    benchmark = studio_benchmark_repository.get_benchmark(
+        "benchmark_id", evaluation_logic, aggregation_logic
+    )
+    assert benchmark
+
+    # when
+    benchmark.execute(
+        task,
+        name="name",
+        description="description",
+        metadata={"key": "value"},
+        labels={"label"},
+    )
+
+    # then
+    mock_submit_execution.assert_called_once()
+    uploaded_execution = cast(
+        PostBenchmarkExecution, mock_submit_execution.call_args[1]["data"]
+    )
+    assert uploaded_execution.run_success_avg_latency == 0
+    assert uploaded_execution.run_success_avg_token_count == 0
+
+    assert mock_submit_trace.call_count == 0


+def test_execute_benchmark_failing_examples_calculates_correctly(
+    studio_benchmark_repository: StudioBenchmarkRepository,
+    mock_studio_client: StudioClient,
+    evaluation_logic: DummyEvaluationLogic,
+    get_benchmark_response: GetBenchmarkResponse,
+    aggregation_logic: DummyAggregationLogic,
+    task: DummyTask,
+) -> None:
+    mock_submit_trace = cast(Mock, mock_studio_client.submit_trace)
+    mock_submit_execution = cast(Mock, mock_studio_client.submit_benchmark_execution)
+
+    cast(Mock, mock_studio_client.get_benchmark).return_value = get_benchmark_response
+    examples = [
+        StudioExample(input=FAIL_IN_TASK_INPUT, expected_output="expected_output0"),
+    ]
+    cast(Mock, mock_studio_client.get_dataset_examples).return_value = examples
+    benchmark = studio_benchmark_repository.get_benchmark(
+        "benchmark_id", evaluation_logic, aggregation_logic
+    )
+    assert benchmark
+
+    # when
+    benchmark.execute(
+        task,
+        name="name",
+        description="description",
+        metadata={"key": "value"},
+        labels={"label"},
+    )
+
+    # then
+    mock_submit_execution.assert_called_once()
+    uploaded_execution = cast(
+        PostBenchmarkExecution, mock_submit_execution.call_args[1]["data"]
+    )
+    assert uploaded_execution.run_success_avg_latency == 0
+    assert uploaded_execution.run_success_avg_token_count == 0
+    assert uploaded_execution.run_successful_count == 0
+
+    assert mock_submit_trace.call_count == 0

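The new tests assert on what the mocked StudioClient received rather than on return values: call_args[1] on a Mock is the keyword-argument dict of its most recent call. A tiny standalone illustration of that unittest.mock pattern, using a bare Mock and a made-up payload rather than the real client:

# Standalone illustration of reading keyword arguments back from a Mock,
# as the tests above do with submit_benchmark_execution.
from unittest.mock import Mock

client = Mock()
client.submit_benchmark_execution(benchmark_id="b-1", data={"run_success_avg_latency": 0.11})

client.submit_benchmark_execution.assert_called_once()
submitted = client.submit_benchmark_execution.call_args[1]["data"]  # kwargs of the call
assert submitted["run_success_avg_latency"] == 0.11

Equivalently, call_args.kwargs reads the same dictionary on Python 3.8 and later.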