Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Update DCGM version to 3.2.6 #785

Merged
merged 14 commits into from
Nov 6, 2023
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

exclude: monitor/dcgm/
repos:
- repo: https://github.com/timothycrosley/isort
rev: 5.12.0
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ ARG BASE_IMAGE
ARG TRITONSDK_BASE_IMAGE

# DCGM version to install for Model Analyzer
ENV DCGM_VERSION=2.4.7
ENV DCGM_VERSION=3.2.6

# Ensure apt-get won't prompt for selecting options
ENV DEBIAN_FRONTEND=noninteractive
Expand Down
7 changes: 4 additions & 3 deletions model_analyzer/device/gpu_device_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,10 @@ def init_all_devices(self, dcgmPath=None):
device_atrributes = dcgm_agent.dcgmGetDeviceAttributes(
dcgm_handle, device_id
).identifiers
pci_bus_id = device_atrributes.pciBusId.decode("utf-8").upper()
device_uuid = str(device_atrributes.uuid, encoding="utf-8")
device_name = str(device_atrributes.deviceName, encoding="utf-8")
pci_bus_id = device_atrributes.pciBusId
device_uuid = device_atrributes.uuid
device_name = device_atrributes.deviceName

gpu_device = GPUDevice(device_name, device_id, pci_bus_id, device_uuid)

self._devices.append(gpu_device)
Expand Down
191 changes: 191 additions & 0 deletions model_analyzer/monitor/dcgm/DcgmDiag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent


class DcgmDiag:
    """Builds the run-diagnostic request structure and submits it to DCGM.

    The request is held in ``self.runDiagInfo`` (a ``dcgm_structs`` structure)
    and is filled in by the constructor and the ``Set*`` / ``Add*`` methods.
    ``Execute`` passes it to ``dcgm_agent.dcgmActionValidate_v2``.
    """

    # Maps version codes to simple version values for range comparisons
    _versionMap = {dcgm_structs.dcgmRunDiag_version: 5}

    # testNamesStr values that select a numbered validation level instead of
    # a list of named tests. An empty string defaults to a level 1 test.
    _validateLevels = {'': 1, '1': 1, '2': 2, '3': 3, '4': 4}

    def __init__(self,
                 gpuIds=None,
                 testNamesStr='',
                 paramsStr='',
                 verbose=True,
                 version=dcgm_structs.dcgmRunDiag_version):
        """Create the request structure and populate it from the arguments.

        gpuIds       - Optional iterable of GPU ids; stored comma-separated in gpuList
        testNamesStr - '' or '1'-'4' to select a validation level, otherwise a
                       comma-separated list of test names
        paramsStr    - Semicolon-separated list of test parameters
        verbose      - Passed to SetVerbose()
        version      - dcgmRunDiag version code; must be a key of _versionMap

        Raises ValueError for an unknown version, a numeric test name outside
        1-4, or too many test names / parameters.
        """
        # Make sure version is valid
        if version not in DcgmDiag._versionMap:
            # One-element tuple so a tuple-valued argument cannot break formatting
            raise ValueError("'%s' is not a valid version for dcgmRunDiag." %
                             (version,))
        self.version = version

        if self.version == dcgm_structs.dcgmRunDiag_version7:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
        else:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_t()

        self.numTests = 0
        self.numParams = 0
        self.SetVerbose(verbose)

        if testNamesStr in DcgmDiag._validateLevels:
            self.runDiagInfo.validate = DcgmDiag._validateLevels[testNamesStr]
        else:
            # Make sure no number other than 1-4 was submitted
            if testNamesStr.isdigit():
                raise ValueError("'%s' is not a valid test name." %
                                 testNamesStr)

            # Copy to the testNames portion of the object
            names = testNamesStr.split(',')
            if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES:
                err = 'DcgmDiag cannot initialize: %d test names were specified exceeding the limit of %d.' %\
                    (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES)
                raise ValueError(err)

            for testName in names:
                self.AddTest(testName)

        if paramsStr != '':
            params = paramsStr.split(';')
            if len(params) >= dcgm_structs.DCGM_MAX_TEST_PARMS:
                err = 'DcgmDiag cannot initialize: %d parameters were specified, exceeding the limit of %d.' %\
                    (len(params), dcgm_structs.DCGM_MAX_TEST_PARMS)
                raise ValueError(err)

            for param in params:
                self.AddParameter(param)

        if gpuIds:
            # Build the comma-separated list once instead of reading the
            # structure field back and re-assigning it for every GPU.
            self.runDiagInfo.gpuList = ",".join(str(gpu) for gpu in gpuIds)

    def SetVerbose(self, val):
        """Set the verbose run flag when val equals True, clear it otherwise."""
        # Deliberately an equality test: only True (or a value equal to it)
        # enables the flag; any other value clears it.
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_VERBOSE

    def UseFakeGpus(self):
        """Copy the current gpuList into fakeGpuList."""
        self.runDiagInfo.fakeGpuList = self.runDiagInfo.gpuList

    def GetStruct(self):
        """Return the underlying request structure."""
        return self.runDiagInfo

    def AddParameter(self, parameterStr):
        """Append one test parameter string to the request.

        Raises ValueError if the string is DCGM_MAX_TEST_PARMS_LEN characters
        or longer.
        """
        if len(parameterStr) >= dcgm_structs.DCGM_MAX_TEST_PARMS_LEN:
            err = 'DcgmDiag cannot add parameter \'%s\' because it exceeds max length %d.' % \
                (parameterStr, dcgm_structs.DCGM_MAX_TEST_PARMS_LEN)
            raise ValueError(err)

        # Written one character code at a time into the next free slot
        for index, char in enumerate(parameterStr):
            self.runDiagInfo.testParms[self.numParams][index] = ord(char)

        self.numParams += 1

    def AddTest(self, testNameStr):
        """Append one test name to the request.

        Raises ValueError if the name is DCGM_MAX_TEST_NAMES_LEN characters
        or longer.
        """
        if len(testNameStr) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
            err = 'DcgmDiag cannot add test name \'%s\' because it exceeds max length %d.' % \
                (testNameStr, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN)
            raise ValueError(err)

        # Written one character code at a time into the next free slot
        for index, char in enumerate(testNameStr):
            self.runDiagInfo.testNames[self.numTests][index] = ord(char)

        self.numTests += 1

    def SetStatsOnFail(self, val):
        """Set the stats-on-fail run flag when val equals True.

        The flag is never cleared by this method.
        """
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_STATSONFAIL

    def SetThrottleMask(self, value):
        """Store str(value) as the throttle mask.

        Raises ValueError if this object's version maps below 3, or if value
        is a string of DCGM_THROTTLE_MASK_LEN characters or longer.
        """
        if DcgmDiag._versionMap[self.version] < 3:
            raise ValueError(
                "Throttle mask requires minimum version 3 for dcgmRunDiag.")
        if isinstance(
                value,
                str) and len(value) >= dcgm_structs.DCGM_THROTTLE_MASK_LEN:
            raise ValueError("Throttle mask value '%s' exceeds max length %d." %
                             (value, dcgm_structs.DCGM_THROTTLE_MASK_LEN - 1))

        self.runDiagInfo.throttleMask = str(value)

    def SetFailEarly(self, enable=True, checkInterval=5):
        """Enable or disable the fail-early run flag.

        When enabling, checkInterval is also stored in failCheckInterval.
        Raises ValueError if this object's version maps below 5 or if
        checkInterval is not an int.
        """
        if DcgmDiag._versionMap[self.version] < 5:
            raise ValueError(
                "Fail early requires minimum version 5 for dcgmRunDiag.")
        if not isinstance(checkInterval, int):
            # One-element tuple so a tuple-valued argument cannot break formatting
            raise ValueError("Invalid checkInterval value: %s" %
                             (checkInterval,))

        if enable:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY
            self.runDiagInfo.failCheckInterval = checkInterval
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY

    def Execute(self, handle):
        """Submit the request via dcgmActionValidate_v2 and return its result."""
        return dcgm_agent.dcgmActionValidate_v2(handle, self.runDiagInfo,
                                                self.version)

    def SetStatsPath(self, statsPath):
        """Store statsPath in the request.

        Raises ValueError if it is DCGM_PATH_LEN characters or longer.
        """
        if len(statsPath) >= dcgm_structs.DCGM_PATH_LEN:
            err = "DcgmDiag cannot set statsPath '%s' because it exceeds max length %d." % \
                (statsPath, dcgm_structs.DCGM_PATH_LEN)
            raise ValueError(err)

        self.runDiagInfo.statsPath = statsPath

    def SetConfigFileContents(self, configFileContents):
        """Store configFileContents in the request.

        Raises ValueError if it is DCGM_MAX_CONFIG_FILE_LEN characters or longer.
        """
        if len(configFileContents) >= dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN:
            err = "Dcgm Diag cannot set config file contents to '%s' because it exceeds max length %d." \
                % (configFileContents, dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN)
            raise ValueError(err)

        self.runDiagInfo.configFileContents = configFileContents

    def SetDebugLogFile(self, logFileName):
        """Store logFileName as the debug log file.

        Raises ValueError if it is DCGM_FILE_LEN characters or longer.
        """
        if len(logFileName) >= dcgm_structs.DCGM_FILE_LEN:
            raise ValueError("Cannot set debug file to '%s' because it exceeds max length %d."\
                % (logFileName, dcgm_structs.DCGM_FILE_LEN))

        self.runDiagInfo.debugLogFile = logFileName

    def SetDebugLevel(self, debugLevel):
        """Store debugLevel in the request.

        Raises ValueError if debugLevel is outside 0-5 inclusive.
        """
        if debugLevel < 0 or debugLevel > 5:
            # The rejected value is now interpolated; previously the message
            # was raised with a literal, unformatted "%d".
            raise ValueError(
                "Cannot set debug level to %d. Debug Level must be a value from 0-5 inclusive."
                % (debugLevel,))

        self.runDiagInfo.debugLevel = debugLevel
83 changes: 83 additions & 0 deletions model_analyzer/monitor/dcgm/DcgmFieldGroup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
'''
Class for managing a group of field IDs in the host engine.
'''


class DcgmFieldGroup:
    '''
    Wrapper around a single field group in the host engine.

    Constructor arguments:

    dcgmHandle - DcgmHandle() instance to use for communicating with the host engine
    name - Name of the field group to use within DCGM. This must be unique
    fieldIds - Fields that are part of this group
    fieldGroupId - If provided, this is used to initialize the object from an existing field group ID
    '''

    def __init__(self, dcgmHandle, name="", fieldIds=None, fieldGroupId=None):
        if not fieldIds:
            fieldIds = []

        self.name = name
        self.fieldIds = fieldIds
        self._dcgmHandle = dcgmHandle
        self.wasCreated = False
        # Assigned before any host engine call so the destructor doesn't fail
        # if one of the calls below raises.
        self.fieldGroupId = fieldGroupId

        if fieldGroupId is None:
            # No ID supplied: create a new field group and remember we own it
            self.fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(
                self._dcgmHandle.handle, fieldIds, name)
            self.wasCreated = True
        else:
            # The field group already exists. Fetch live info
            liveInfo = dcgm_agent.dcgmFieldGroupGetInfo(
                self._dcgmHandle.handle, self.fieldGroupId)
            self.name = liveInfo.fieldGroupName
            self.fieldIds = liveInfo.fieldIds

    def Delete(self):
        '''
        Remove this field group from DCGM. This object can no longer be passed to other APIs after this call.
        '''
        # Only groups this object created, and has not yet deleted, are destroyed
        if not self.wasCreated or self.fieldGroupId is None:
            return

        try:
            # The inner try stays nested so that a failure while evaluating
            # its except expressions is still handled by the outer try.
            try:
                dcgm_agent.dcgmFieldGroupDestroy(self._dcgmHandle.handle,
                                                 self.fieldGroupId)
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_NO_DATA):
                # someone may have deleted the group under us. That's ok.
                pass
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
                # We lost our connection, but we're destructing this object anyway.
                pass
        except (AttributeError, TypeError):
            # When we're cleaning up at the end, dcgm_agent and dcgm_structs have been unloaded and we'll
            # get an AttributeError: "'NoneType' object has no 'dcgmExceptionClass'" or a
            # TypeError: "'NoneType' object is not callable'". Ignore these
            pass

        self.fieldGroupId = None
        self._dcgmHandle = None

    def __del__(self):
        # Destructor: release the field group if this object created it
        self.Delete()
Loading
Loading