Skip to content

Commit

Permalink
Update DCGM version to 3.2.6 (#785)
Browse files Browse the repository at this point in the history
* Update DCGM version (#692)

* Update library version

* Added new DCGM files

* Added new DCGM files

* Update package references

* Attempt at DCGM fix

* Fixing uuid and name

* Making PCI ID an int

* Printing device types

* Next attempt

* Fixing copyright issues

* Update DCGM version

* Ignore pre-commit hooks for DCGM

* Fix pre-commit

---------

Co-authored-by: Misha Chornyi <[email protected]>
Co-authored-by: Misha Chornyi <[email protected]>
Co-authored-by: Brian Raf <[email protected]>
  • Loading branch information
4 people authored Nov 6, 2023
1 parent f15427e commit 167278f
Show file tree
Hide file tree
Showing 28 changed files with 7,424 additions and 2,376 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

exclude: monitor/dcgm/
repos:
- repo: https://github.com/timothycrosley/isort
rev: 5.12.0
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ ARG BASE_IMAGE
ARG TRITONSDK_BASE_IMAGE

# DCGM version to install for Model Analyzer
ENV DCGM_VERSION=2.4.7
ENV DCGM_VERSION=3.2.6

# Ensure apt-get won't prompt for selecting options
ENV DEBIAN_FRONTEND=noninteractive
Expand Down
7 changes: 4 additions & 3 deletions model_analyzer/device/gpu_device_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,10 @@ def init_all_devices(self, dcgmPath=None):
device_atrributes = dcgm_agent.dcgmGetDeviceAttributes(
dcgm_handle, device_id
).identifiers
pci_bus_id = device_atrributes.pciBusId.decode("utf-8").upper()
device_uuid = str(device_atrributes.uuid, encoding="utf-8")
device_name = str(device_atrributes.deviceName, encoding="utf-8")
pci_bus_id = device_atrributes.pciBusId
device_uuid = device_atrributes.uuid
device_name = device_atrributes.deviceName

gpu_device = GPUDevice(device_name, device_id, pci_bus_id, device_uuid)

self._devices.append(gpu_device)
Expand Down
191 changes: 191 additions & 0 deletions model_analyzer/monitor/dcgm/DcgmDiag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent


class DcgmDiag:
    """Builder for the run-diagnostic request structure handed to DCGM.

    An instance owns a dcgm_structs run-diag struct (``runDiagInfo``), fills
    it in through the setters below and submits it with ``Execute``.
    """

    # Maps version codes to simple version values for range comparisons
    _versionMap = {dcgm_structs.dcgmRunDiag_version: 5}

    def __init__(self,
                 gpuIds=None,
                 testNamesStr='',
                 paramsStr='',
                 verbose=True,
                 version=dcgm_structs.dcgmRunDiag_version):
        """Create and populate the run-diag struct.

        gpuIds       - Optional collection of GPU ids; stored comma-separated
                       in the struct's gpuList field
        testNamesStr - '' or '1'..'4' to select a validation level ('' means
                       level 1); anything else is treated as a
                       comma-separated list of test names
        paramsStr    - Semicolon-separated test parameters, '' for none
        verbose      - Passed to SetVerbose()
        version      - Struct version code; must be a key of _versionMap

        Raises ValueError for an unknown version, a numeric testNamesStr
        other than 1-4, or too many / too long test names or parameters.
        """
        # Make sure version is valid
        if version not in DcgmDiag._versionMap:
            raise ValueError("'%s' is not a valid version for dcgmRunDiag." %
                             version)
        self.version = version

        if self.version == dcgm_structs.dcgmRunDiag_version7:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
        else:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_t()

        # Next free slot in runDiagInfo.testNames / runDiagInfo.testParms
        self.numTests = 0
        self.numParams = 0
        self.SetVerbose(verbose)

        if testNamesStr == '':
            # default to a level 1 test
            self.runDiagInfo.validate = 1
        elif testNamesStr in ('1', '2', '3', '4'):
            self.runDiagInfo.validate = int(testNamesStr)
        else:
            # Make sure no number other than 1-4 was submitted
            if testNamesStr.isdigit():
                raise ValueError("'%s' is not a valid test name." %
                                 testNamesStr)

            # Copy to the testNames portion of the object
            names = testNamesStr.split(',')
            if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES:
                err = 'DcgmDiag cannot initialize: %d test names were specified exceeding the limit of %d.' %\
                      (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES)
                raise ValueError(err)

            for testName in names:
                self.AddTest(testName)

        if paramsStr != '':
            params = paramsStr.split(';')
            # NOTE(review): this limit check uses >= while the test-name check
            # above uses > -- confirm which bound is intended.
            if len(params) >= dcgm_structs.DCGM_MAX_TEST_PARMS:
                err = 'DcgmDiag cannot initialize: %d parameters were specified, exceeding the limit of %d.' %\
                      (len(params), dcgm_structs.DCGM_MAX_TEST_PARMS)
                raise ValueError(err)

            for param in params:
                self.AddParameter(param)

        if gpuIds:
            # Build the comma-separated list once instead of repeatedly
            # reading the struct field back and re-assigning it.
            self.runDiagInfo.gpuList = ','.join(str(gpu) for gpu in gpuIds)

    def SetVerbose(self, val):
        """Set the verbose run flag when val == True, clear it otherwise."""
        # The strict comparison is kept on purpose: only True (or a value
        # equal to it, such as 1) turns the flag on.
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_VERBOSE

    def UseFakeGpus(self):
        """Copy the current gpuList into the struct's fakeGpuList field."""
        self.runDiagInfo.fakeGpuList = self.runDiagInfo.gpuList

    def GetStruct(self):
        """Return the underlying run-diag struct."""
        return self.runDiagInfo

    def AddParameter(self, parameterStr):
        """Append one parameter string to runDiagInfo.testParms.

        Raises ValueError when the string does not fit within
        DCGM_MAX_TEST_PARMS_LEN.
        """
        if len(parameterStr) >= dcgm_structs.DCGM_MAX_TEST_PARMS_LEN:
            err = 'DcgmDiag cannot add parameter \'%s\' because it exceeds max length %d.' % \
                  (parameterStr, dcgm_structs.DCGM_MAX_TEST_PARMS_LEN)
            raise ValueError(err)

        # Copy character by character into the next free slot
        slot = self.runDiagInfo.testParms[self.numParams]
        for index, c in enumerate(parameterStr):
            slot[index] = ord(c)

        self.numParams += 1

    def AddTest(self, testNameStr):
        """Append one test name to runDiagInfo.testNames.

        Raises ValueError when the name does not fit within
        DCGM_MAX_TEST_NAMES_LEN.
        """
        if len(testNameStr) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
            err = 'DcgmDiag cannot add test name \'%s\' because it exceeds max length %d.' % \
                  (testNameStr, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN)
            raise ValueError(err)

        # Copy character by character into the next free slot
        slot = self.runDiagInfo.testNames[self.numTests]
        for index, c in enumerate(testNameStr):
            slot[index] = ord(c)

        self.numTests += 1

    def SetStatsOnFail(self, val):
        """Set the stats-on-fail run flag when val == True.

        Any other value leaves the flag unchanged; this method never clears it.
        """
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_STATSONFAIL

    def SetThrottleMask(self, value):
        """Store str(value) in the struct's throttleMask field.

        Raises ValueError when the struct version is below 3, or when value
        is a string that does not fit within DCGM_THROTTLE_MASK_LEN.
        """
        if DcgmDiag._versionMap[self.version] < 3:
            raise ValueError(
                "Throttle mask requires minimum version 3 for dcgmRunDiag.")
        if isinstance(
                value,
                str) and len(value) >= dcgm_structs.DCGM_THROTTLE_MASK_LEN:
            raise ValueError("Throttle mask value '%s' exceeds max length %d." %
                             (value, dcgm_structs.DCGM_THROTTLE_MASK_LEN - 1))

        self.runDiagInfo.throttleMask = str(value)

    def SetFailEarly(self, enable=True, checkInterval=5):
        """Enable or disable the fail-early run flag.

        enable        - Truthy sets the flag and stores checkInterval in
                        failCheckInterval; falsy only clears the flag
        checkInterval - Must be an int

        Raises ValueError when the struct version is below 5 or when
        checkInterval is not an int.
        """
        if DcgmDiag._versionMap[self.version] < 5:
            raise ValueError(
                "Fail early requires minimum version 5 for dcgmRunDiag.")
        if not isinstance(checkInterval, int):
            raise ValueError("Invalid checkInterval value: %s" % checkInterval)

        if enable:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY
            self.runDiagInfo.failCheckInterval = checkInterval
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY

    def Execute(self, handle):
        """Submit the struct via dcgm_agent.dcgmActionValidate_v2 and return its result."""
        return dcgm_agent.dcgmActionValidate_v2(handle, self.runDiagInfo,
                                                self.version)

    def SetStatsPath(self, statsPath):
        """Store statsPath in the struct; ValueError if it does not fit within DCGM_PATH_LEN."""
        if len(statsPath) >= dcgm_structs.DCGM_PATH_LEN:
            err = "DcgmDiag cannot set statsPath '%s' because it exceeds max length %d." % \
                  (statsPath, dcgm_structs.DCGM_PATH_LEN)
            raise ValueError(err)

        self.runDiagInfo.statsPath = statsPath

    def SetConfigFileContents(self, configFileContents):
        """Store the config file contents in the struct.

        Raises ValueError when the contents do not fit within
        DCGM_MAX_CONFIG_FILE_LEN.
        """
        if len(configFileContents) >= dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN:
            err = "Dcgm Diag cannot set config file contents to '%s' because it exceeds max length %d." \
                  % (configFileContents, dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN)
            raise ValueError(err)

        self.runDiagInfo.configFileContents = configFileContents

    def SetDebugLogFile(self, logFileName):
        """Store the debug log file name; ValueError if it does not fit within DCGM_FILE_LEN."""
        if len(logFileName) >= dcgm_structs.DCGM_FILE_LEN:
            raise ValueError("Cannot set debug file to '%s' because it exceeds max length %d."\
                             % (logFileName, dcgm_structs.DCGM_FILE_LEN))

        self.runDiagInfo.debugLogFile = logFileName

    def SetDebugLevel(self, debugLevel):
        """Store debugLevel in the struct; ValueError unless 0 <= debugLevel <= 5."""
        if debugLevel < 0 or debugLevel > 5:
            # The rejected value is interpolated here; previously the %d
            # placeholder was left unformatted in the message.
            raise ValueError(
                "Cannot set debug level to %d. Debug Level must be a value from 0-5 inclusive."
                % debugLevel)

        self.runDiagInfo.debugLevel = debugLevel
83 changes: 83 additions & 0 deletions model_analyzer/monitor/dcgm/DcgmFieldGroup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
'''
Class for managing a group of field IDs in the host engine.
'''


class DcgmFieldGroup:
    '''
    Handle for a named group of field IDs in the host engine.

    Constructor arguments:
    dcgmHandle   - DcgmHandle() instance to use for communicating with the host engine
    name         - Name of the field group to use within DCGM. This must be unique
    fieldIds     - Fields that are part of this group
    fieldGroupId - If provided, this is used to initialize the object from an existing field group ID
    '''

    def __init__(self, dcgmHandle, name="", fieldIds=None, fieldGroupId=None):
        if not fieldIds:
            fieldIds = []
        self.name = name
        self.fieldIds = fieldIds
        self._dcgmHandle = dcgmHandle
        self.wasCreated = False

        if fieldGroupId is None:
            # Assign first so the destructor doesn't fail if the create call raises
            self.fieldGroupId = None
            self.fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(
                self._dcgmHandle.handle, fieldIds, name)
            self.wasCreated = True
            return

        # The caller passed an ID, so the field group already exists. Fetch live info
        self.fieldGroupId = fieldGroupId
        info = dcgm_agent.dcgmFieldGroupGetInfo(self._dcgmHandle.handle,
                                                self.fieldGroupId)
        self.name = info.fieldGroupName
        self.fieldIds = info.fieldIds

    def Delete(self):
        '''
        Remove this field group from DCGM. This object can no longer be passed to other APIs after this call.

        Nothing happens unless this object created the group and still holds its ID.
        '''
        if not self.wasCreated or self.fieldGroupId is None:
            return

        # The two try levels are deliberate: evaluating the inner except
        # expressions can itself raise once the dcgm modules are unloaded,
        # and only an enclosing try can catch that.
        try:
            try:
                dcgm_agent.dcgmFieldGroupDestroy(self._dcgmHandle.handle,
                                                 self.fieldGroupId)
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_NO_DATA):
                # Someone may have deleted the group under us. That's ok.
                pass
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
                # We lost our connection, but we're destructing this object anyway.
                pass
        except AttributeError:
            # During final cleanup dcgm_agent and dcgm_structs have been unloaded and we'll
            # get an AttributeError: "'NoneType' object has no 'dcgmExceptionClass'" Ignore this
            pass
        except TypeError:
            # During final cleanup dcgm_agent and dcgm_structs have been unloaded and we might
            # get a TypeError: "'NoneType' object is not callable'" Ignore this
            pass

        self.fieldGroupId = None
        self._dcgmHandle = None

    def __del__(self):
        '''Destructor: delegates to Delete().'''
        self.Delete()
Loading

0 comments on commit 167278f

Please sign in to comment.