Commit
use gpu metrics from PA (#520)
* support and use new CLI options

* Updates to use new gpu metrics

* working unit tests

* Update FreeMemory support

* Review feedback

* fix tests

* add fixme
tgerdesnv authored and mc-nv committed Sep 12, 2022
1 parent c157d15 commit 7080291
Showing 6 changed files with 181 additions and 105 deletions.
1 change: 1 addition & 0 deletions model_analyzer/constants.py
@@ -49,6 +49,7 @@
INTERVAL_SLEEP_TIME = 1
PERF_ANALYZER_MEASUREMENT_WINDOW = 5000
PERF_ANALYZER_MINIMUM_REQUEST_COUNT = 50
SECONDS_TO_MILLISECONDS_MULTIPLIER = 1000

# Triton Server
SERVER_OUTPUT_TIMEOUT_SECS = 5
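Note: SECONDS_TO_MILLISECONDS_MULTIPLIER exists because Model Analyzer configures its monitoring interval in seconds while perf_analyzer's new --metrics-interval option takes milliseconds (see perf_config.py below). A minimal sketch of the conversion, with an assumed example interval:

monitoring_interval = 0.01  # seconds; example value, not necessarily the project default
metrics_interval = monitoring_interval * SECONDS_TO_MILLISECONDS_MULTIPLIER
# metrics_interval == 10.0, the value handed to --metrics-interval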
93 changes: 64 additions & 29 deletions model_analyzer/perf_analyzer/perf_analyzer.py
@@ -38,7 +38,7 @@
from model_analyzer.record.types.gpu_utilization import GPUUtilization
from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage
from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory
from model_analyzer.record.types.gpu_total_memory import GPUTotalMemory
from model_analyzer.record.types.gpu_free_memory import GPUFreeMemory

from model_analyzer.constants import \
INTERVAL_SLEEP_TIME, LOGGER_NAME, MEASUREMENT_REQUEST_COUNT_STEP, \
@@ -86,10 +86,10 @@ class PerfAnalyzer:
]

gpu_metric_table = [
["gpu_utilization", "Avg GPU Utilizations", GPUUtilization],
["gpu_power_usage", "Avg GPU Power Usages", GPUPowerUsage],
["gpu_used_memory", "Max GPU Memory Usages", GPUUsedMemory],
["gpu_total_memory", "Total GPU Memory Usages", GPUTotalMemory]
["gpu_utilization", "Avg GPU Utilization", GPUUtilization, "0.01"],
["gpu_power_usage", "Avg GPU Power Usage", GPUPowerUsage, "1"],
["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"],
["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"]
]
#yapf: enable
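Note: each gpu_metric_table row now carries a fourth element, a reduction factor that the parsing code below divides into the raw CSV value, presumably to convert perf_analyzer's units into the ones Model Analyzer records (0.01 turns a 0-1 utilization fraction into a percentage, 1000000 turns bytes into megabytes). A rough sketch of how one row would be applied; the index constants and helper are illustrative assumptions, not the Model Analyzer API:

# Illustrative only: assumed column positions within a gpu_metric_table row.
CSV_HEADER, RECORD_CLASS, REDUCTION_FACTOR = 1, 2, 3

def record_from_row(metric_row, raw_value, uuid):
    # Divide the raw CSV value by the row's reduction factor, then build
    # the record type named in the row (e.g. GPUUsedMemory).
    value = float(raw_value) / float(metric_row[REDUCTION_FACTOR])
    return metric_row[RECORD_CLASS](value=value, device_uuid=uuid)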

Expand Down Expand Up @@ -133,6 +133,7 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util):
self._timeout = timeout
self._output = ""
self._perf_records = {}
self._gpu_records = []
self._max_cpu_util = max_cpu_util

def run(self, metrics, env=None):
@@ -183,11 +184,11 @@ def run(self, metrics, env=None):

return self.PA_SUCCESS

def get_records(self):
def get_perf_records(self):
"""
Returns
-------
The records from the last perf_analyzer run
The perf records from the last perf_analyzer run
"""

if self._perf_records:
@@ -196,6 +197,15 @@ def get_records(self):
"Attempted to get perf_analyzer results"
"without calling run first.")

def get_gpu_records(self):
"""
Returns
-------
The gpu records from the last perf_analyzer run
"""

return self._gpu_records

def output(self):
"""
Returns
@@ -331,7 +341,16 @@ def _get_process_output(self):
self._cmd_log.seek(0)
tmp_output = self._cmd_log.read()
self._cmd_log.close()
return tmp_output.decode('utf-8')

# PA has occasionally output non-UTF-8 bytes which would cause MA
# to assert. In that case, just ignore the result instead of asserting
result = ""
try:
result = tmp_output.decode('utf-8')
except:
pass

return result
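Note: the guard above trades a hard failure for an empty result. A standalone sketch of the same behavior (the commit uses a bare except; UnicodeDecodeError is the specific case its comment describes):

def safe_decode(raw: bytes) -> str:
    # Return the decoded output, or an empty string for non-UTF-8 bytes.
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        return ""

assert safe_decode(b'latency: 42') == 'latency: 42'
assert safe_decode(b'\xff\xfe garbage') == ''  # invalid UTF-8 is discarded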

def _auto_adjust_parameters(self, process):
"""
@@ -419,28 +438,17 @@ def _parse_outputs(self, metrics):

for row in csv_reader:
self._perf_records[perf_config[
'model-name']] = self._extract_metrics_from_row(
'model-name']] = self._extract_perf_records_from_row(
metrics, row)
self._gpu_records = self._extract_gpu_records_from_row(
metrics, row)

for perf_config in [
mrc.perf_config() for mrc in self._config.model_run_configs()
]:
os.remove(perf_config['latency-report-file'])

def _extract_metrics_from_row(self, requested_metrics: List[Record],
row_metrics: Dict[str, str]) -> List[Record]:
"""
Extracts the requested metrics from the CSV's row and creates a list of Records
"""
perf_records = self._create_records_from_perf_metrics(
requested_metrics, row_metrics)

gpu_records = self._create_records_from_gpu_metrics(
requested_metrics, row_metrics)

return perf_records + gpu_records

def _create_records_from_perf_metrics(
def _extract_perf_records_from_row(
self, requested_metrics: List[Record],
row_metrics: Dict[str, str]) -> List[Record]:
perf_records: List[Record] = []
@@ -459,7 +467,7 @@ def _create_records_from_perf_metrics(

return perf_records

def _create_records_from_gpu_metrics(
def _extract_gpu_records_from_row(
self, requested_metrics: List[Record],
row_metrics: Dict[str, str]) -> List[Record]:
# GPU metrics have the following format: UUID0:value0;UUID1:value1;...
@@ -484,14 +492,41 @@ def _create_records_from_gpu_metrics(
for gpu_metric_string_tuple in gpu_metric_string_tuples:
gpu_metric_tuple = gpu_metric_string_tuple.split(':')

gpu_records.append(gpu_metric[PerfAnalyzer.RECORD_CLASS](
value=float(
gpu_metric_tuple[PerfAnalyzer.GPU_METRIC_VALUE]),
device_uuid=gpu_metric_tuple[
PerfAnalyzer.GPU_METRIC_UUID])) # type: ignore
uuid = gpu_metric_tuple[PerfAnalyzer.GPU_METRIC_UUID]
tmp_value = float(
gpu_metric_tuple[PerfAnalyzer.GPU_METRIC_VALUE])
reduction_factor = float(
str(gpu_metric[PerfAnalyzer.REDUCTION_FACTOR]))
value = tmp_value / reduction_factor

record = gpu_metric[PerfAnalyzer.RECORD_CLASS](
value=value, device_uuid=uuid) # type: ignore

gpu_records.append(record)

self._cleanup_gpu_records(gpu_records)
return gpu_records
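Note: a self-contained sketch of the per-cell parsing above, assuming the UUID0:value0;UUID1:value1;... layout described in the code comment; the helper name, the example UUIDs, and the dict return type are illustrative, not the Model Analyzer API:

def parse_gpu_cell(cell: str, reduction_factor: float) -> dict:
    # Split the semicolon-separated UUID:value pairs and scale each value.
    values = {}
    for pair in cell.split(';'):
        uuid, raw = pair.split(':')
        values[uuid] = float(raw) / reduction_factor
    return values

# A used-memory cell reported in bytes, reduced by the table's 1000000 factor:
parse_gpu_cell('GPU-aaa:1073741824;GPU-bbb:536870912', 1000000.0)
# -> {'GPU-aaa': 1073.741824, 'GPU-bbb': 536.870912}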

def _cleanup_gpu_records(self, gpu_records):
# Recalculate GPUFreeMemory by removing the value of the associated GPUUsedMemory
# Remove any GPUFreeMemory records that don't have a matching GPUUsedMemory
indexes_to_remove = []
for i, record in enumerate(gpu_records):
if type(record) == GPUFreeMemory:
# Find matching UUID UsedMemory
found = False
for other_record in gpu_records:
if type(other_record) == GPUUsedMemory \
and record.device_uuid() == other_record.device_uuid():
found = True
record._value = record.value() - other_record.value()
break
if not found:
indexes_to_remove.append(i)
for i in reversed(indexes_to_remove):
del gpu_records[i]
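Note: this cleanup pass exists because the gpu_free_memory rows actually carry totals (their CSV header in the table above is "Total GPU Memory"), so free memory is derived as total minus used per UUID, and totals with no matching used-memory record are dropped. The same logic restated with plain dicts, purely for illustration:

totals = {'GPU-aaa': 16384.0, 'GPU-bbb': 16384.0}  # parsed GPUFreeMemory values
used = {'GPU-aaa': 4096.0}                         # parsed GPUUsedMemory values
free = {u: totals[u] - used[u] for u in totals if u in used}
# -> {'GPU-aaa': 12288.0}; GPU-bbb has no used record and is removed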

def _is_metric_requested_and_in_row(self, metric: List[object],
requested_metrics: List[Record],
row_metrics: Dict[str, str]) -> bool:
15 changes: 13 additions & 2 deletions model_analyzer/perf_analyzer/perf_config.py
@@ -15,6 +15,7 @@
from model_analyzer.model_analyzer_exceptions \
import TritonModelAnalyzerException
from model_analyzer.config.input.config_defaults import DEFAULT_MEASUREMENT_MODE
from model_analyzer.constants import SECONDS_TO_MILLISECONDS_MULTIPLIER


class PerfAnalyzerConfig:
@@ -39,7 +40,8 @@ class PerfAnalyzerConfig:
'ssl-https-verify-host', 'ssl-https-ca-certificates-file',
'ssl-https-client-certificate-type',
'ssl-https-client-certificate-file', 'ssl-https-private-key-type',
'ssl-https-private-key-file'
'ssl-https-private-key-file', 'collect-metrics', 'metrics-url',
'metrics-interval'
]

input_to_options = [
@@ -52,7 +54,8 @@ class PerfAnalyzerConfig:
additive_args = ['input-data', 'shape']

boolean_args = [
'streaming', 'async', 'sync', 'binary-search', 'ssl-grpc-use-ssl'
'streaming', 'async', 'sync', 'binary-search', 'ssl-grpc-use-ssl',
'collect-metrics'
]

def __init__(self):
@@ -166,6 +169,14 @@ def update_config_from_profile_config(self, model_name, profile_config):
'protocol': profile_config.client_protocol,
'url': url
})

metrics_interval = profile_config.monitoring_interval * SECONDS_TO_MILLISECONDS_MULTIPLIER
params.update({
'collect-metrics': 'True',
'metrics-url': profile_config.triton_metrics_url,
'metrics-interval': metrics_interval
})

self.update_config(params)
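Note: with this block every profile run asks perf_analyzer itself to scrape Triton's metrics endpoint. A hedged sketch of the params built above, with example values (the real URL and interval come from the profile config, not shown in this diff):

params = {
    'collect-metrics': 'True',
    'metrics-url': 'http://localhost:8002/metrics',  # example assumption
    'metrics-interval': 1.0 * SECONDS_TO_MILLISECONDS_MULTIPLIER,  # 1 s, assumed
}
# Since 'collect-metrics' is listed in boolean_args, this is presumably
# rendered on the command line roughly as:
#   --collect-metrics --metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0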

@classmethod
35 changes: 23 additions & 12 deletions model_analyzer/record/metrics_manager.py
@@ -142,8 +142,7 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False):
gpu_metrics, perf_metrics, cpu_metrics = [], [], []
# Separates metrics and objectives into related lists
for metric in MetricsManager.get_metric_types(metric_tags):
if metric in DCGMMonitor.model_analyzer_to_dcgm_field or metric in RemoteMonitor.gpu_metrics.values():
if metric in PerfAnalyzer.get_gpu_metrics():
gpu_metrics.append(metric)
elif metric in PerfAnalyzer.get_perf_metrics():
perf_metrics.append(metric)
@@ -223,17 +222,16 @@ def profile_models(self, run_config):

self._start_monitors(cpu_only=cpu_only)

perf_analyzer_metrics = self._run_perf_analyzer(run_config,
perf_output_writer)
perf_analyzer_metrics, model_gpu_metrics = self._run_perf_analyzer(
run_config, perf_output_writer)

if not perf_analyzer_metrics:
self._stop_monitors(cpu_only=cpu_only)
self._destroy_monitors(cpu_only=cpu_only)
return None

# Get metrics for model inference and combine metrics that do not have GPU UUID
model_gpu_metrics = {}
if not cpu_only:
if not cpu_only and not model_gpu_metrics:
model_gpu_metrics = self._get_gpu_inference_metrics()
model_cpu_metrics = self._get_cpu_inference_metrics()
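Note: a condensed sketch of the new precedence in profile_models (simplified, not the verbatim code): perf_analyzer-collected GPU metrics win, and the monitor-based path only fills in when none came back:

perf_metrics, gpu_metrics = self._run_perf_analyzer(run_config, writer)
if not perf_metrics:
    return None  # (None, None) signals a failed run; monitors are stopped first
if not cpu_only and not gpu_metrics:
    gpu_metrics = self._get_gpu_inference_metrics()  # DCGM/remote fallback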

@@ -369,6 +367,7 @@ def _start_monitors(self, cpu_only=False):
Start any metrics monitors
"""

self._gpu_monitor = None
if not cpu_only:
try:
if self._config.use_local_gpu_monitor:
@@ -446,7 +445,8 @@ def _run_perf_analyzer(self, run_config, perf_output_writer):
timeout=self._config.perf_analyzer_timeout,
max_cpu_util=self._config.perf_analyzer_cpu_util)

status = perf_analyzer.run(self._perf_metrics, env=perf_analyzer_env)
metrics_to_gather = self._perf_metrics + self._gpu_metrics
status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env)

if perf_output_writer:
perf_output_writer.write(
@@ -459,16 +459,23 @@ def _run_perf_analyzer(self, run_config, perf_output_writer):

# PerfAnalyzer run was not successful
if status == 1:
return None
return (None, None)

perf_records = perf_analyzer.get_perf_records()
gpu_records = perf_analyzer.get_gpu_records()

per_model_perf_records = perf_analyzer.get_records()
aggregated_perf_records = self._aggregate_perf_records(perf_records)
aggregated_gpu_records = self._aggregate_gpu_records(gpu_records)

for (model, perf_records) in per_model_perf_records.items():
return aggregated_perf_records, aggregated_gpu_records

def _aggregate_perf_records(self, perf_records):
per_model_perf_records = {}
for (model, records) in perf_records.items():
perf_record_aggregator = RecordAggregator()
perf_record_aggregator.insert_all(perf_records)
perf_record_aggregator.insert_all(records)

per_model_perf_records[model] = perf_record_aggregator.aggregate()

return per_model_perf_records
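Note: both aggregation helpers follow the same RecordAggregator pattern; only insert_all() and aggregate() are assumed here, since those are the calls visible in this diff:

aggregator = RecordAggregator()
aggregator.insert_all(records)    # records: a list of Record objects
summary = aggregator.aggregate()  # collapses them, presumably per record type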

def _get_gpu_inference_metrics(self):
@@ -485,6 +492,10 @@ def _get_gpu_inference_metrics(self):
# Stop and destroy DCGM monitor
gpu_records = self._gpu_monitor.stop_recording_metrics()

gpu_metrics = self._aggregate_gpu_records(gpu_records)
return gpu_metrics

def _aggregate_gpu_records(self, gpu_records):
# Insert all records into aggregator and get aggregated DCGM records
gpu_record_aggregator = RecordAggregator()
gpu_record_aggregator.insert_all(gpu_records)
9 changes: 7 additions & 2 deletions tests/common/test_utils.py
@@ -25,11 +25,12 @@
from model_analyzer.record.metrics_manager import MetricsManager
from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig
from model_analyzer.state.analyzer_state_manager import AnalyzerStateManager

from model_analyzer.constants import SECONDS_TO_MILLISECONDS_MULTIPLIER
from model_analyzer.config.input.config_defaults import \
DEFAULT_BATCH_SIZES, DEFAULT_TRITON_LAUNCH_MODE, DEFAULT_CLIENT_PROTOCOL, \
DEFAULT_MEASUREMENT_MODE, DEFAULT_TRITON_GRPC_ENDPOINT, DEFAULT_TRITON_HTTP_ENDPOINT, \
DEFAULT_TRITON_INSTALL_PATH, DEFAULT_OUTPUT_MODEL_REPOSITORY
DEFAULT_TRITON_INSTALL_PATH, DEFAULT_OUTPUT_MODEL_REPOSITORY, DEFAULT_TRITON_METRICS_URL, \
DEFAULT_MONITORING_INTERVAL

import os

@@ -221,6 +222,10 @@ def construct_perf_analyzer_config(model_name='my-model',
pa_config._args['triton-server-directory'] = DEFAULT_TRITON_INSTALL_PATH
pa_config._args['model-repository'] = DEFAULT_OUTPUT_MODEL_REPOSITORY
else:
pa_config._args['collect-metrics'] = 'True'
pa_config._args['metrics-url'] = DEFAULT_TRITON_METRICS_URL
pa_config._args['metrics-interval'] = SECONDS_TO_MILLISECONDS_MULTIPLIER * DEFAULT_MONITORING_INTERVAL
pa_config._options['-i'] = client_protocol
if client_protocol == 'http':
pa_config._options['-u'] = DEFAULT_TRITON_HTTP_ENDPOINT
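Note: the test helper now mirrors the production behavior, adding the three metrics arguments on this branch (the guarding condition sits above this hunk, so tying it to the launch mode is an assumption). Illustrative usage only:

expected = construct_perf_analyzer_config(model_name='my-model')
# expected._args should now carry 'collect-metrics', 'metrics-url' and
# 'metrics-interval', with values taken from config_defaults.py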