From 52c1bfe68c51dce96b72398fb1fa4b0bdbc3a0bd Mon Sep 17 00:00:00 2001
From: braf
Date: Wed, 11 Sep 2024 16:27:15 +0000
Subject: [PATCH] Fixes + all unit testing completed

---
 .../measurements/model_config_measurement.py |  16 +-
 genai-perf/genai_perf/record/record.py       |   2 +-
 genai-perf/genai_perf/utils.py               |  11 +
 .../tests/test_model_config_measurement.py   | 339 ++++++++----------
 4 files changed, 171 insertions(+), 197 deletions(-)

diff --git a/genai-perf/genai_perf/measurements/model_config_measurement.py b/genai-perf/genai_perf/measurements/model_config_measurement.py
index 575a531e..bf157b56 100644
--- a/genai-perf/genai_perf/measurements/model_config_measurement.py
+++ b/genai-perf/genai_perf/measurements/model_config_measurement.py
@@ -17,7 +17,7 @@
 from dataclasses import dataclass
 from functools import total_ordering
 from statistics import mean
-from typing import Any, Dict, List, Optional, TypeAlias
+from typing import Any, Dict, Optional, TypeAlias
 
 from genai_perf.record.record import Record
 
@@ -121,7 +121,7 @@ def _read_perf_metrics_from_checkpoint(
     ) -> Records:
         perf_metrics: Records = {}
 
-        for [tag, record_dict] in perf_metrics_dict.items():
+        for [tag, record_dict] in perf_metrics_dict.values():
             record = Record.get(tag)
             record = record.read_from_checkpoint(record_dict)  # type: ignore
             perf_metrics[tag] = record  # type: ignore
@@ -149,13 +149,11 @@ def __gt__(self, other: "ModelConfigMeasurement") -> bool:
             == ModelConfigMeasurementDefaults.SELF_IS_BETTER
         )
 
-    # TODO: OPTIMIZE
-    # Why is mypy complaining about this?
-    # def __eq__(self, other: "ModelConfigMeasurement") -> bool:
-    #     return (
-    #         self._compare_measurements(other)
-    #         == ModelConfigMeasurementDefaults.EQUALIVILENT
-    #     )
+    def __eq__(self, other: "ModelConfigMeasurement") -> bool:  # type: ignore
+        return (
+            self._compare_measurements(other)
+            == ModelConfigMeasurementDefaults.EQUALIVILENT
+        )
 
     def _compare_measurements(self, other: "ModelConfigMeasurement") -> int:
         """
diff --git a/genai-perf/genai_perf/record/record.py b/genai-perf/genai_perf/record/record.py
index e4fed617..8cb17cba 100644
--- a/genai-perf/genai_perf/record/record.py
+++ b/genai-perf/genai_perf/record/record.py
@@ -159,7 +159,7 @@ def tag(self) -> str:
             the name tag of the record type.
         """
 
-    def to_dict(self):
+    def write_to_checkpoint(self):
         return (self.tag, self.__dict__)
 
     @classmethod
diff --git a/genai-perf/genai_perf/utils.py b/genai-perf/genai_perf/utils.py
index 4b625352..0193e5e1 100644
--- a/genai-perf/genai_perf/utils.py
+++ b/genai-perf/genai_perf/utils.py
@@ -108,3 +108,14 @@ def get_enum_entry(name: str, enum: Type[Enum]) -> Optional[Enum]:
 
 def scale(value, factor):
     return value * factor
+
+
+# FIXME: OPTIMIZE
+# This will move to the checkpoint class when it's created
+def checkpoint_encoder(obj):
+    if isinstance(obj, bytes):
+        return obj.decode("utf-8")
+    elif hasattr(obj, "write_to_checkpoint"):
+        return obj.write_to_checkpoint()
+    else:
+        return obj.__dict__
diff --git a/genai-perf/tests/test_model_config_measurement.py b/genai-perf/tests/test_model_config_measurement.py
index 90d7f940..b58fd8bb 100644
--- a/genai-perf/tests/test_model_config_measurement.py
+++ b/genai-perf/tests/test_model_config_measurement.py
@@ -22,220 +22,185 @@
 )
 from genai_perf.record.types.perf_latency_p99 import PerfLatencyP99
 from genai_perf.record.types.perf_throughput import PerfThroughput
+from genai_perf.record.types.time_to_first_token_avg import TimeToFirstTokenAvg
+from genai_perf.utils import checkpoint_encoder
 
 
 class TestModelConfigMeasurement(unittest.TestCase):
+    ###########################################################################
+    # Setup & Teardown
+    ###########################################################################
     def setUp(self):
-        self.throughput_record = PerfThroughput(1000)
-        self.latency_record = PerfLatencyP99(20)
+        self.throughput_recordA = PerfThroughput(1000)
+        self.latency_recordA = PerfLatencyP99(20)
 
-        self.perf_metrics = {
-            PerfThroughput.tag: self.throughput_record,
-            PerfLatencyP99.tag: self.latency_record,
+        self.perf_metricsA = {
+            PerfThroughput.tag: self.throughput_recordA,
+            PerfLatencyP99.tag: self.latency_recordA,
         }
 
-        self.mcmA = ModelConfigMeasurement(self.perf_metrics)
+        self.mcmA = ModelConfigMeasurement(self.perf_metricsA)
 
-        # mcmB_non_gpu_metric_values = {
-        #     "perf_throughput": 2000,
-        #     "perf_latency_p99": 40,
-        #     "cpu_used_ram": 1000,
-        # }
+        self.throughput_recordB = PerfThroughput(500)
+        self.latency_recordB = PerfLatencyP99(10)
 
-        # self.mcmB = self._construct_model_config_measurement(
-        #     "modelB", self.model_specific_pa_params, mcmB_non_gpu_metric_values
-        # )
-
-        # self.mcmC = self._construct_model_config_measurement(
-        #     "modelC", self.model_specific_pa_params, {}
-        # )
+        self.perf_metricsB = {
+            PerfThroughput.tag: self.throughput_recordB,
+            PerfLatencyP99.tag: self.latency_recordB,
+        }
 
-        # self.mcmD = self._construct_model_config_measurement(
-        #     "modelD", self.model_specific_pa_params, {}
-        # )
+        self.mcmB = ModelConfigMeasurement(self.perf_metricsB)
 
     def tearDown(self):
         patch.stopall()
 
+    ###########################################################################
+    # Accessor Tests
+    ###########################################################################
     def test_basic_accessor_methods(self):
         """
         Test that values are properly initialized
         """
-
-        self.assertEqual(self.mcmA.get_perf_metrics(), self.perf_metrics)
+        self.assertEqual(self.mcmA.get_perf_metrics(), self.perf_metricsA)
 
         self.assertEqual(
-            self.mcmA.get_perf_metric(PerfLatencyP99.tag), self.latency_record
+            self.mcmA.get_perf_metric(PerfLatencyP99.tag), self.latency_recordA
         )
         self.assertEqual(
             self.mcmA.get_perf_metric_value(PerfThroughput.tag, return_value=-1),
-            self.throughput_record.value(),
+            self.throughput_recordA.value(),
         )
         self.assertEqual(
-            self.mcmA.get_perf_metric_value("metric not in dict", return_value=-1), -1
+            self.mcmA.get_perf_metric_value(TimeToFirstTokenAvg.tag, return_value=-1),
+            -1,
+        )
+
+    def test_set_metric_weighting(self):
+        """
+        Test that metric weighting is set correctly
+        """
+        # Default
+        self.assertEqual(
+            ModelConfigMeasurementDefaults.METRIC_WEIGHTING, self.mcmA._metric_weights
+        )
+
+        self.mcmA.set_metric_weighting({PerfThroughput.tag: 2, PerfLatencyP99.tag: 3})
+        expected_mw = {PerfThroughput.tag: 2 / 5, PerfLatencyP99.tag: 3 / 5}
+        self.assertEqual(expected_mw, self.mcmA._metric_weights)
+
+    def test_get_weighted_score(self):
+        """
+        Test that weighted score is returned correctly
+        """
+
+        # In the default case we are comparing throughputs, with mcmA = 1000, mcmB = 500
+        # scoreA will be positive (2/3), and scoreB will be its negative
+        scoreA = self.mcmA.get_weighted_score(self.mcmB)
+        scoreB = self.mcmB.get_weighted_score(self.mcmA)
+
+        self.assertEqual(2 / 3, scoreA)
+        self.assertEqual(-2 / 3, scoreB)
+
+        # In this case we change the objective to latency, with mcmA = 20, mcmB = 10
+        # since latency is a decreasing record (lower is better), scoreB will be positive
+        self.mcmA.set_metric_weighting({PerfLatencyP99.tag: 1})
+        self.mcmB.set_metric_weighting({PerfLatencyP99.tag: 1})
+        scoreA = self.mcmA.get_weighted_score(self.mcmB)
+        scoreB = self.mcmB.get_weighted_score(self.mcmA)
+
+        self.assertEqual(-2 / 3, scoreA)
+        self.assertEqual(2 / 3, scoreB)
+
+    ###########################################################################
+    # Checkpoint Tests
+    ###########################################################################
+    def test_checkpoint_methods(self):
+        """
+        Checks to ensure checkpoint methods work as intended
+        """
+        mcmA_json = json.dumps(self.mcmA, default=checkpoint_encoder)
+
+        mcmA_from_checkpoint = ModelConfigMeasurement.read_from_checkpoint(
+            json.loads(mcmA_json)
         )
 
-    # def test_get_metric_found(self):
-    #     """
-    #     Test that non-gpu metrics can be correctly returned
-    #     """
-    #     non_gpu_data = convert_non_gpu_metrics_to_data(self.non_gpu_metric_values)
-
-    #     self.assertEqual(self.mcmA.get_metric("perf_throughput"), non_gpu_data[0])
-    #     self.assertEqual(self.mcmA.get_metric("perf_latency_p99"), non_gpu_data[1])
-    #     self.assertEqual(self.mcmA.get_metric("cpu_used_ram"), non_gpu_data[2])
-
-    # def test_get_metric_not_found(self):
-    #     """
-    #     Test that an incorrect metric search returns None
-    #     """
-    #     self.assertEqual(self.mcmA.get_metric("XXXXX"), None)
-
-    # def test_get_metric_value_found(self):
-    #     """
-    #     Test that non-gpu metric values can be correctly returned
-    #     """
-    #     self.assertEqual(
-    #         self.mcmA.get_metric_value("perf_throughput"),
-    #         self.non_gpu_metric_values["perf_throughput"],
-    #     )
-    #     self.assertEqual(
-    #         self.mcmA.get_metric_value("perf_latency_p99"),
-    #         self.non_gpu_metric_values["perf_latency_p99"],
-    #     )
-    #     self.assertEqual(
-    #         self.mcmA.get_metric_value("cpu_used_ram"),
-    #         self.non_gpu_metric_values["cpu_used_ram"],
-    #     )
-
-    # def test_get_metric_value_not_found(self):
-    #     """
-    #     Test that an incorrect metric value search returns the correct value
-    #     """
-    #     self.assertEqual(self.mcmA.get_metric_value("XXXXX"), 0)
-    #     self.assertEqual(self.mcmA.get_metric_value("XXXXX", 100), 100)
-
-    # def test_set_metric_weighting(self):
-    #     # Default
-    #     self.assertEqual({"perf_throughput": 1}, self.mcmA._metric_weights)
-
-    #     self.mcmA.set_metric_weighting({"perf_throughput": 2, "perf_latency_p99": 3})
-    #     expected_mw = {"perf_throughput": 2 / 5, "perf_latency_p99": 3 / 5}
-    #     self.assertEqual(expected_mw, self.mcmA._metric_weights)
-
-    # def test_calculate_weighted_percentage_gain(self):
-    #     """
-    #     Test that weighted percentages are returned correctly
-    #     """
-    #     self.mcmA.set_metric_weighting({"perf_throughput": 1})
-    #     self.mcmB.set_metric_weighting({"perf_throughput": 1})
-
-    #     # throuhput: mcmA: 1000, mcmB: 2000
-    #     self.assertEqual(self.mcmA.calculate_weighted_percentage_gain(self.mcmB), -50)
-    #     self.assertEqual(self.mcmB.calculate_weighted_percentage_gain(self.mcmA), 100)
-
-    #     self.mcmA.set_metric_weighting({"perf_latency_p99": 1})
-    #     self.mcmB.set_metric_weighting({"perf_latency_p99": 1})
-
-    #     # latency: mcmA: 20, mcmB: 40
-    #     self.assertEqual(self.mcmA.calculate_weighted_percentage_gain(self.mcmB), 100)
-    #     self.assertEqual(self.mcmB.calculate_weighted_percentage_gain(self.mcmA), -50)
-
-    #     # This illustrates why we need to use score, not percentages to determine
-    #     # which model is better. In both cases we will (correctly) report that
-    #     # mcmA/B is 25% better than the other, even though they are equal
-    #     #
-    #     # mcmA has 50% worse throughput, but 100% better latency
-    #     # mcmB has 100% better latency, but 50% worse throughput
-    #     self.mcmA.set_metric_weighting({"perf_throughput": 1, "perf_latency_p99": 1})
-    #     self.mcmB.set_metric_weighting({"perf_throughput": 1, "perf_latency_p99": 1})
-    #     self.assertEqual(self.mcmA, self.mcmB)
-    #     self.assertEqual(self.mcmA.calculate_weighted_percentage_gain(self.mcmB), 25)
-    #     self.assertEqual(self.mcmB.calculate_weighted_percentage_gain(self.mcmA), 25)
-
-    # def test_is_better_than(self):
-    #     """
-    #     Test that individual metric comparison works as expected
-    #     """
-    #     self.mcmA.set_metric_weighting({"perf_throughput": 1})
-
-    #     # throughput: 1000 is not better than 2000
-    #     self.assertFalse(self.mcmA.is_better_than(self.mcmB))
-    #     self.assertLess(self.mcmA, self.mcmB)
-
-    #     self.mcmA.set_metric_weighting({"perf_latency_p99": 1})
-
-    #     # latency: 20 is better than 40
-    #     self.assertTrue(self.mcmA.is_better_than(self.mcmB))
-    #     self.assertGreater(self.mcmA, self.mcmB)
-
-    # def test_is_better_than_combo(self):
-    #     """
-    #     Test that combination metric comparison works as expected
-    #     """
-    #     # throuhput: 1000 vs. 2000 (worse), latency: 20 vs. 40 (better)
-    #     # with latency bias mcmA is better
-    #     self.mcmA.set_metric_weighting({"perf_throughput": 1, "perf_latency_p99": 3})
-
-    #     self.assertTrue(self.mcmA.is_better_than(self.mcmB))
-
-    # def test_is_better_than_empty(self):
-    #     """
-    #     Test for correct return values when comparing for/against an empty set
-    #     """
-    #     self.mcmA.set_metric_weighting({"perf_throughput": 1})
-    #     self.mcmC.set_metric_weighting({"perf_throughput": 1})
-
-    #     self.assertTrue(self.mcmA.is_better_than(self.mcmC))
-    #     self.assertFalse(self.mcmC.is_better_than(self.mcmA))
-    #     self.assertEqual(self.mcmC, self.mcmD)
-
-    # def test__eq__(self):
-    #     """
-    #     Test that individual metric equality works as expected
-    #     """
-    #     self.mcmA.set_metric_weighting({"cpu_used_ram": 10})
-
-    #     self.assertEqual(self.mcmA, self.mcmB)
-
-    # def test__eq__combo(self):
-    #     """
-    #     Test that combination metric equality works as expected
-    #     """
-    #     # throuhput: 1000 vs. 2000 (worse), latency: 20 vs. 40 (better)
-    #     # with no bias they are equal
-    #     self.mcmA.set_metric_weighting({"perf_throughput": 1, "perf_latency_p99": 1})
-
-    #     self.assertEqual(self.mcmA, self.mcmB)
-
-    # def test_from_dict(self):
-    #     """
-    #     Test to ensure class can be correctly restored from a dictionary
-    #     """
-    #     mcmA_json = json.dumps(self.mcmA, default=default_encode)
-
-    #     mcmA_from_dict = ModelConfigMeasurement.from_dict(json.loads(mcmA_json))
-
-    #     self.assertEqual(
-    #         mcmA_from_dict.model_config_name(), self.mcmA.model_config_name()
-    #     )
-    #     self.assertEqual(
-    #         mcmA_from_dict.model_specific_pa_params(),
-    #         self.mcmA.model_specific_pa_params(),
-    #     )
-    #     self.assertEqual(mcmA_from_dict.non_gpu_data(), self.mcmA.non_gpu_data())
-
-    #     # Catchall in case something new is added
-    #     self.assertEqual(mcmA_from_dict, self.mcmA)
-
-    # def _construct_model_config_measurement(
-    #     self, model_config_name, model_specific_pa_params, non_gpu_metric_values
-    # ):
-    #     non_gpu_data = convert_non_gpu_metrics_to_data(non_gpu_metric_values)
-
-    #     return ModelConfigMeasurement(
-    #         model_config_name, model_specific_pa_params, non_gpu_data
-    #     )
+        self.assertEqual(
+            mcmA_from_checkpoint.get_perf_metrics(), self.mcmA.get_perf_metrics()
+        )
+
+        # Catchall in case something new is added
+        self.assertEqual(mcmA_from_checkpoint, self.mcmA)
+
+    ###########################################################################
+    # Calculation Tests
+    ###########################################################################
+    def test_calculate_weighted_percentage_gain(self):
+        """
+        Test that weighted percentages are returned correctly
+        """
+
+        # throughput: mcmA: 1000, mcmB: 500
+        self.assertEqual(self.mcmA.calculate_weighted_percentage_gain(self.mcmB), 100)
+        self.assertEqual(self.mcmB.calculate_weighted_percentage_gain(self.mcmA), -50)
+
+        self.mcmA.set_metric_weighting({PerfLatencyP99.tag: 1})
+        self.mcmB.set_metric_weighting({PerfLatencyP99.tag: 1})
+
+        # latency: mcmA: 20, mcmB: 10
+        self.assertEqual(self.mcmA.calculate_weighted_percentage_gain(self.mcmB), -50)
+        self.assertEqual(self.mcmB.calculate_weighted_percentage_gain(self.mcmA), 100)
+
+        # This illustrates why we need to use score, not percentages, to determine
+        # which model is better. In both cases we will (correctly) report that
+        # mcmA/B is 25% better than the other, even though they are equal
+        #
+        # mcmA has 100% better throughput, but 50% worse latency
+        # mcmB has 100% better latency, but 50% worse throughput
+        self.mcmA.set_metric_weighting({PerfThroughput.tag: 1, PerfLatencyP99.tag: 1})
+        self.mcmB.set_metric_weighting({PerfThroughput.tag: 1, PerfLatencyP99.tag: 1})
+        self.assertEqual(self.mcmA, self.mcmB)
+        self.assertEqual(self.mcmA.calculate_weighted_percentage_gain(self.mcmB), 25)
+        self.assertEqual(self.mcmB.calculate_weighted_percentage_gain(self.mcmA), 25)
+
+    ###########################################################################
+    # Comparison Tests
+    ###########################################################################
+    def test_is_better_than(self):
+        """
+        Test that individual metric comparison works as expected
+        """
+        self.mcmA.set_metric_weighting({PerfThroughput.tag: 1})
+
+        # throughput: 1000 is better than 500
+        self.assertTrue(self.mcmA.is_better_than(self.mcmB))
+        self.assertGreater(self.mcmA, self.mcmB)
+
+        self.mcmA.set_metric_weighting({PerfLatencyP99.tag: 1})
+
+        # latency: 20 is worse than 10
+        self.assertFalse(self.mcmA.is_better_than(self.mcmB))
+        self.assertLess(self.mcmA, self.mcmB)
+
+    def test_is_better_than_combo(self):
+        """
+        Test that combination metric comparison works as expected
+        """
+        # throughput: 1000 vs. 500 (better), latency: 20 vs. 10 (worse)
+        # with latency bias, mcmB is better
+        self.mcmA.set_metric_weighting({PerfThroughput.tag: 1, PerfLatencyP99.tag: 3})
+
+        self.assertFalse(self.mcmA.is_better_than(self.mcmB))
+
+    def test_is_better_than_empty(self):
+        """
+        Test for correct return values when comparing for/against an empty set
+        """
+        empty_mcm0 = ModelConfigMeasurement({})
+        empty_mcm1 = ModelConfigMeasurement({})
+
+        self.assertTrue(self.mcmA.is_better_than(empty_mcm0))
+        self.assertFalse(empty_mcm0.is_better_than(self.mcmA))
+        self.assertEqual(empty_mcm0, empty_mcm1)
 
 
 if __name__ == "__main__":