diff --git a/src/intelligence_layer/connectors/studio/studio.py b/src/intelligence_layer/connectors/studio/studio.py index d1c38aff..a72ed3da 100644 --- a/src/intelligence_layer/connectors/studio/studio.py +++ b/src/intelligence_layer/connectors/studio/studio.py @@ -143,7 +143,7 @@ class GetDatasetExamplesResponse(BaseModel, Generic[Input, ExpectedOutput]): items: Sequence[StudioExample[Input, ExpectedOutput]] -class BenchmarkLineage(BaseModel, Generic[Input, Output, ExpectedOutput, Evaluation]): +class BenchmarkLineage(BaseModel, Generic[Input, ExpectedOutput, Output, Evaluation]): trace_id: str input: Input expected_output: ExpectedOutput diff --git a/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py b/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py index 3b93f85c..0b77ba6a 100644 --- a/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py +++ b/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py @@ -180,7 +180,9 @@ def average_or_zero(list: list) -> float: benchmark_lineages = self._create_benchmark_lineages( eval_lineages=evaluation_lineages, - traces=run_traces, + trace_ids=trace_ids, + latencies_per_trace=latency_per_trace, + tokens_per_trace=tokens_per_trace, ) self.client.submit_benchmark_lineages( @@ -229,27 +231,39 @@ def _create_benchmark_lineages( eval_lineages: list[ EvaluationLineage[Input, ExpectedOutput, Output, Evaluation] ], - traces: list[Sequence[ExportedSpan]], - ) -> Sequence[BenchmarkLineage[Input, Output, ExpectedOutput, Evaluation]]: + trace_ids: list[str], + latencies_per_trace: list[int], + tokens_per_trace: list[int], + ) -> Sequence[BenchmarkLineage[Input, ExpectedOutput, Output, Evaluation]]: return [ - self._create_benchmark_lineage(eval_lineage, trace) - for eval_lineage, trace in zip(eval_lineages, traces, strict=True) + self._create_benchmark_lineage( + eval_lineage, trace_id, run_latency, run_tokens + ) + for eval_lineage, trace_id, run_latency, run_tokens in zip( + eval_lineages, + trace_ids, + latencies_per_trace, + tokens_per_trace, + strict=True, + ) ] def _create_benchmark_lineage( self, eval_lineage: EvaluationLineage[Input, ExpectedOutput, Output, Evaluation], - trace: Sequence[ExportedSpan], + trace_id: str, + run_latency: int, + run_tokens: int, ) -> BenchmarkLineage: return BenchmarkLineage( - trace_id=str(trace[0].context.trace_id), + trace_id=trace_id, input=eval_lineage.example.input, expected_output=eval_lineage.example.expected_output, example_metadata=eval_lineage.example.metadata, output=eval_lineage.outputs[0].output, evaluation=eval_lineage.evaluation.result, - run_latency=extract_latency_from_trace(trace), - run_tokens=extract_token_count_from_trace(trace), + run_latency=run_latency, + run_tokens=run_tokens, ) diff --git a/tests/evaluation/benchmark/test_benchmark.py b/tests/evaluation/benchmark/test_benchmark.py index 761085ea..81afd027 100644 --- a/tests/evaluation/benchmark/test_benchmark.py +++ b/tests/evaluation/benchmark/test_benchmark.py @@ -313,7 +313,11 @@ def test_execute_benchmark_on_empty_examples_uploads_example_and_calculates_corr assert mock_submit_trace.call_count == 0 +@patch( + "intelligence_layer.evaluation.benchmark.studio_benchmark.extract_token_count_from_trace" +) def test_execute_benchmark_failing_examples_calculates_correctly( + mock_extract_tokens: Mock, studio_benchmark_repository: StudioBenchmarkRepository, mock_studio_client: StudioClient, evaluation_logic: DummyEvaluationLogic, @@ -332,6 +336,9 @@ def test_execute_benchmark_failing_examples_calculates_correctly( benchmark = studio_benchmark_repository.get_benchmark( "benchmark_id", evaluation_logic, aggregation_logic ) + + expected_generated_tokens = 0 + mock_extract_tokens.return_value = expected_generated_tokens + 1 assert benchmark # when @@ -349,7 +356,7 @@ def test_execute_benchmark_failing_examples_calculates_correctly( PostBenchmarkExecution, mock_submit_execution.call_args[1]["data"] ) assert uploaded_execution.run_success_avg_latency == 0 - assert uploaded_execution.run_success_avg_token_count == 0 + assert uploaded_execution.run_success_avg_token_count == expected_generated_tokens assert uploaded_execution.run_successful_count == 0 assert mock_submit_trace.call_count == 0