Skip to content

Commit

Permalink
refactor: Change internal creation functions to include calculated token and latency values
Browse files Browse the repository at this point in the history
  • Loading branch information
MerlinKallenbornAA authored and NiklasKoehneckeAA committed Dec 18, 2024
1 parent cde364c commit 519f402
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/intelligence_layer/connectors/studio/studio.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ class GetDatasetExamplesResponse(BaseModel, Generic[Input, ExpectedOutput]):
items: Sequence[StudioExample[Input, ExpectedOutput]]


class BenchmarkLineage(BaseModel, Generic[Input, Output, ExpectedOutput, Evaluation]):
class BenchmarkLineage(BaseModel, Generic[Input, ExpectedOutput, Output, Evaluation]):
trace_id: str
input: Input
expected_output: ExpectedOutput
Expand Down
33 changes: 23 additions & 10 deletions src/intelligence_layer/evaluation/benchmark/studio_benchmark.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import inspect
import itertools
from collections.abc import Sequence
from datetime import datetime
Expand Down Expand Up @@ -181,7 +180,9 @@ def average_or_zero(list: list) -> float:

benchmark_lineages = self._create_benchmark_lineages(
eval_lineages=evaluation_lineages,
traces=run_traces,
trace_ids=trace_ids,
latencies_per_trace=latency_per_trace,
tokens_per_trace=tokens_per_trace,
)

self.client.submit_benchmark_lineages(
Expand Down Expand Up @@ -230,27 +231,39 @@ def _create_benchmark_lineages(
eval_lineages: list[
EvaluationLineage[Input, ExpectedOutput, Output, Evaluation]
],
traces: list[Sequence[ExportedSpan]],
) -> Sequence[BenchmarkLineage[Input, Output, ExpectedOutput, Evaluation]]:
trace_ids: list[str],
latencies_per_trace: list[int],
tokens_per_trace: list[int],
) -> Sequence[BenchmarkLineage[Input, ExpectedOutput, Output, Evaluation]]:
return [
self._create_benchmark_lineage(eval_lineage, trace)
for eval_lineage, trace in zip(eval_lineages, traces, strict=True)
self._create_benchmark_lineage(
eval_lineage, trace_id, run_latency, run_tokens
)
for eval_lineage, trace_id, run_latency, run_tokens in zip(
eval_lineages,
trace_ids,
latencies_per_trace,
tokens_per_trace,
strict=True,
)
]

def _create_benchmark_lineage(
self,
eval_lineage: EvaluationLineage[Input, ExpectedOutput, Output, Evaluation],
trace: Sequence[ExportedSpan],
trace_id: str,
run_latency: int,
run_tokens: int,
) -> BenchmarkLineage:
return BenchmarkLineage(
trace_id=str(trace[0].context.trace_id),
trace_id=trace_id,
input=eval_lineage.example.input,
expected_output=eval_lineage.example.expected_output,
example_metadata=eval_lineage.example.metadata,
output=eval_lineage.outputs[0].output,
evaluation=eval_lineage.evaluation.result,
run_latency=extract_latency_from_trace(trace),
run_tokens=extract_token_count_from_trace(trace),
run_latency=run_latency,
run_tokens=run_tokens,
)


Expand Down
9 changes: 8 additions & 1 deletion tests/evaluation/benchmark/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,11 @@ def test_execute_benchmark_on_empty_examples_uploads_example_and_calculates_corr
assert mock_submit_trace.call_count == 0


@patch(
"intelligence_layer.evaluation.benchmark.studio_benchmark.extract_token_count_from_trace"
)
def test_execute_benchmark_failing_examples_calculates_correctly(
mock_extract_tokens: Mock,
studio_benchmark_repository: StudioBenchmarkRepository,
mock_studio_client: StudioClient,
evaluation_logic: DummyEvaluationLogic,
Expand All @@ -332,6 +336,9 @@ def test_execute_benchmark_failing_examples_calculates_correctly(
benchmark = studio_benchmark_repository.get_benchmark(
"benchmark_id", evaluation_logic, aggregation_logic
)

expected_generated_tokens = 0
mock_extract_tokens.return_value = expected_generated_tokens + 1
assert benchmark

# when
Expand All @@ -349,7 +356,7 @@ def test_execute_benchmark_failing_examples_calculates_correctly(
PostBenchmarkExecution, mock_submit_execution.call_args[1]["data"]
)
assert uploaded_execution.run_success_avg_latency == 0
assert uploaded_execution.run_success_avg_token_count == 0
assert uploaded_execution.run_success_avg_token_count == expected_generated_tokens
assert uploaded_execution.run_successful_count == 0

assert mock_submit_trace.call_count == 0

0 comments on commit 519f402

Please sign in to comment.