Adds support for skipping profiling if the Result is found in the checkpoint (#191)

* Add support for skipping profiling if the results are found in the checkpoint

* Fix CodeQL issue

* Remove mutable default in RCM (RunConfigMeasurement)

* Changes based on PR feedback

* Fix CodeQL issue
nv-braf committed Dec 10, 2024
1 parent 4c10f69 commit de1e86b
Showing 8 changed files with 230 additions and 66 deletions.
12 changes: 12 additions & 0 deletions genai-perf/genai_perf/config/generate/genai_perf_config.py
@@ -88,6 +88,18 @@ def get_obj_args(self) -> Namespace:

return obj_args

###########################################################################
# Representation Methods
###########################################################################
def representation(self) -> str:
"""
A string representation of the GAP options which will be
used when determining if a previous (checkpointed) run can be used
"""
representation = " ".join([self.input.__str__(), self.output_tokens.__str__()])

return representation
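
The representation is an order-sensitive join of the option groups' string forms, so two runs configured identically produce identical strings. A minimal standalone sketch of the idea (toy dataclasses standing in for the real GAP option classes; not the actual implementation):

    from dataclasses import dataclass

    @dataclass
    class ToyInput:
        num_prompts: int = 50

    @dataclass
    class ToyOutputTokens:
        mean: int = -1

    def representation(inp: ToyInput, out: ToyOutputTokens) -> str:
        # Mirror of the join above: str() of each option group, space-separated
        return " ".join([str(inp), str(out)])

    # Identical options -> identical representation -> checkpoint hit
    assert representation(ToyInput(), ToyOutputTokens()) == representation(
        ToyInput(), ToyOutputTokens()
    )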

###########################################################################
# Checkpoint Methods
###########################################################################
38 changes: 37 additions & 1 deletion genai-perf/genai_perf/config/run/results.py
@@ -10,6 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from copy import deepcopy
from dataclasses import dataclass, field
from typing import List

@@ -18,11 +19,18 @@
from genai_perf.types import (
CheckpointObject,
GpuMetricObjectives,
ModelName,
ModelWeights,
PerfMetricObjectives,
RunConfigName,
)


@dataclass(frozen=True)
class ResultsDefaults:
STARTING_ID = -1


@dataclass
class Results:
"""
@@ -80,11 +88,29 @@ def get_results_failing_constraints(self) -> "Results":

return failing_results

def get_run_config_name_based_on_representation(
self, model_name: ModelName, representation: str
) -> RunConfigName:
"""
Returns the name of the RunConfig if the representation is found,
else creates a new name by incrementing the config ID
"""
max_run_config_id = ResultsDefaults.STARTING_ID
for run_config in self.run_configs:
if representation == run_config.representation():
return run_config.name
else:
max_run_config_id = max(
max_run_config_id, int(run_config.get_name_id())
)

return f"{model_name}_run_config_{max_run_config_id+1}"

###########################################################################
# Set Accessor Methods
###########################################################################
def add_run_config(self, run_config: RunConfig) -> None:
    self.run_configs.append(deepcopy(run_config))
    self.run_configs.sort(reverse=True)
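
The deepcopy here means the stored RunConfig is insulated from any later mutation of the caller's object, so checkpointed results cannot be changed by accident. In miniature:

    from copy import deepcopy

    stored = []
    cfg = {"objectives": [1]}
    stored.append(deepcopy(cfg))
    cfg["objectives"].append(2)              # caller keeps mutating its copy
    assert stored[0] == {"objectives": [1]}  # stored snapshot is unaffected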

def set_gpu_metric_objectives(
@@ -110,3 +136,13 @@ def set_model_weighting(self, model_weights: ModelWeights) -> None:
def set_constraints(self, constraints: RunConstraints) -> None:
for run_config in self.run_configs:
run_config.set_constraints(constraints)

###########################################################################
# Misc Methods
###########################################################################
def found_representation(self, representation: str) -> bool:
for run_config in self.run_configs:
if representation == run_config.representation():
return True

return False
36 changes: 33 additions & 3 deletions genai-perf/genai_perf/config/run/run_config.py
@@ -46,10 +46,10 @@ class RunConfig:
# triton_env: Dict[str, Any]
# model_run_configs: List[ModelRunConfig]

genai_perf_config: GenAIPerfConfig
perf_analyzer_config: PerfAnalyzerConfig
name: RunConfigName = ""
measurement: RunConfigMeasurement = RunConfigMeasurement()

###########################################################################
# Checkpoint Methods
@@ -88,7 +88,10 @@ def create_class_from_checkpoint(
)

run_config = RunConfig(
    name=name,
    genai_perf_config=genai_perf_config,
    perf_analyzer_config=perf_analyzer_config,
    measurement=measurement,
)

return run_config
@@ -138,6 +141,16 @@ def get_weighted_perf_metric_values(
perf_metric_name, return_value
)

def get_name_id(self) -> str:
"""
Return the unique ID assigned to a RunConfig's name;
by convention this is the final part of the name,
after the last underscore.
"""
name_fields = self.name.split("_")

return name_fields[-1]
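
In plain Python terms (using the "<model>_run_config_<id>" convention noted above):

    assert "llama3_run_config_7".split("_")[-1] == "7"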

###########################################################################
# Set Accessor Methods
###########################################################################
@@ -164,6 +177,23 @@ def add_perf_metrics(
) -> None:
self.measurement.add_perf_metrics(model_name, perf_metrics)

###########################################################################
# Representation Methods
###########################################################################
def representation(self) -> str:
"""
A string representation of the RunConfig options which will be
used when determining if a previous (checkpointed) run can be used
"""
representation = " ".join(
[
self.perf_analyzer_config.representation(),
self.genai_perf_config.representation(),
]
)

return representation
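
Note the design choice: name and measurement are deliberately excluded from the representation, which (together with the new field defaults above) lets callers build a throwaway RunConfig with no measurement purely to compute a representation. A self-contained sketch of that probe pattern (toy classes, not the real ones):

    from dataclasses import dataclass, field

    @dataclass
    class ToyOptions:
        batch_size: int = 1

        def representation(self) -> str:
            return f"batch_size={self.batch_size}"

    @dataclass
    class ToyRunConfig:
        options: ToyOptions
        name: str = ""                                   # default allows a nameless probe
        measurement: dict = field(default_factory=dict)  # ...with no measurement yet

        def representation(self) -> str:
            # name/measurement intentionally excluded: equal options -> equal string
            return self.options.representation()

    seen = {ToyRunConfig(ToyOptions()).representation()}
    probe = ToyRunConfig(ToyOptions())
    assert probe.representation() in seen  # this config was already profiled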

###########################################################################
# Constraint Methods
###########################################################################
4 changes: 2 additions & 2 deletions genai-perf/genai_perf/measurements/run_config_measurement.py
@@ -70,7 +70,7 @@ class RunConfigMeasurement:

def __init__(
self,
gpu_metrics: Optional[GpuRecords] = None,
run_constraints: Optional[RunConstraints] = None,
):
"""
Expand All @@ -82,7 +82,7 @@ def __init__(
A set of constraints (set by the user) used to determine if
this is a valid measurement
"""
self._gpu_metrics = gpu_metrics if gpu_metrics else {}
self._gpu_metric_objectives: Optional[GpuMetricObjectives] = (
RunConfigMeasurementDefaults.METRIC_OBJECTIVE
)
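
This is the "mutable default" fix called out in the commit message: a dict default evaluated once in the signature would be shared by every instance, so the default moves to None and a fresh dict is created in the body. The pitfall in miniature (generic Python, not GAP code):

    def bad(metrics={}):     # one dict object, shared across all calls
        metrics["count"] = metrics.get("count", 0) + 1
        return metrics

    def good(metrics=None):  # fresh dict per call, as in the fix above
        metrics = metrics if metrics else {}
        metrics["count"] = metrics.get("count", 0) + 1
        return metrics

    assert bad() is bad()        # same object every time
    assert good() is not good()  # independent objects
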
122 changes: 69 additions & 53 deletions genai-perf/genai_perf/subcommand/analyze.py
@@ -90,63 +90,79 @@ def analyze_handler(args: Namespace) -> None:
obj_args = perf_analyzer_config.get_obj_args()

#
# Check if this configuration has already been profiled (is in the checkpoint)
representation = RunConfig(
    genai_perf_config=genai_perf_config,
    perf_analyzer_config=perf_analyzer_config,
).representation()

run_config_found = results.found_representation(representation)
run_config_name = results.get_run_config_name_based_on_representation(
    model_name, representation
)

if not run_config_found:
    #
    # Create Input/Artifacts
    input_config_options = create_config_options(obj_args)
    create_artifacts_dirs(obj_args)
    tokenizer = get_tokenizer(
        obj_args.tokenizer,
        obj_args.tokenizer_trust_remote_code,
        obj_args.tokenizer_revision,
    )
    generate_inputs(input_config_options)

    #
    # Run PA
    run_perf_analyzer(
        args=obj_args,
        perf_analyzer_config=perf_analyzer_config,
        telemetry_data_collector=telemetry_data_collector,
    )

    #
    # Extract Perf Metrics
    infer_mode, load_level = _determine_infer_mode_and_load_level(
        obj_args, objectives, model_name
    )
    data_parser = calculate_metrics(obj_args, tokenizer)
    perf_stats = data_parser.get_statistics(infer_mode, load_level)
    perf_metrics = perf_stats.create_records()

    #
    # Extract Telemetry Metrics
    # FIXME: Once I'm able to collect telemetry records will need
    # to write a method to hook this up
    # telemetry_stats = (
    #     telemetry_data_collector.get_statistics()
    #     if telemetry_data_collector
    #     else None
    # )
    gpu_metrics: GpuRecords = {}

    #
    # Create RunConfigMeasurement
    run_config_measurement = RunConfigMeasurement(gpu_metrics)
    run_config_measurement.add_perf_metrics(model_name, perf_metrics)

    #
    # Create RunConfig
    run_config = RunConfig(
        name=run_config_name,
        genai_perf_config=genai_perf_config,
        perf_analyzer_config=perf_analyzer_config,
        measurement=run_config_measurement,
    )

    #
    # Add to results and write checkpoint
    results.add_run_config(run_config)
    checkpoint.create_checkpoint_object()
else:
    logger.info(
        f"{run_config_name} found in checkpoint - skipping profiling..."
    )
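
The net effect: the representation is computed before any expensive work, and input generation, the Perf Analyzer run, and metric extraction only happen on a checkpoint miss. A runnable toy analogue of this memoization (stand-in functions, not the real subcommand):

    from typing import Dict

    checkpoint: Dict[str, float] = {}  # representation -> measurement

    def profile(representation: str) -> float:
        print(f"profiling {representation}...")  # the expensive step
        return 42.0

    def analyze(representation: str) -> float:
        if representation not in checkpoint:  # cf. results.found_representation()
            checkpoint[representation] = profile(representation)
        else:
            print(f"{representation} found in checkpoint - skipping profiling...")
        return checkpoint[representation]

    analyze("concurrency=4")  # profiles
    analyze("concurrency=4")  # skips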


def _setup_config(args: Namespace) -> ConfigCommand:
14 changes: 14 additions & 0 deletions genai-perf/tests/test_genai_perf_config.py
@@ -74,6 +74,20 @@ def test_default_config_and_objective_capture(self):
expected_output_tokens_config, self._default_genai_perf_config.output_tokens
)

###########################################################################
# Test Representation
###########################################################################
def test_representation(self):
"""
Test that the representation is created correctly
"""
expected_representation = " ".join(
[ConfigInput(num_prompts=50).__str__(), ConfigOutputTokens().__str__()]
)
representation = self._default_genai_perf_config.representation()

self.assertEqual(expected_representation, representation)

###########################################################################
# Checkpoint Tests
###########################################################################