-
Notifications
You must be signed in to change notification settings - Fork 76
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Update DCGM version (#692) * Update library version * Added new DCGM files * Added new DCGM files * Update package references * Attempt at DCGM fix * Fixing uuid and name * Making PCI ID an int * Printing device types * Next attempt * Fixing copyright issues * Update DCGM version * Ignore pre-commit hooks for DCGM * Fix pre-commit --------- Co-authored-by: Misha Chornyi <[email protected]> Co-authored-by: Misha Chornyi <[email protected]> Co-authored-by: Brian Raf <[email protected]>
- Loading branch information
1 parent
f15427e
commit 167278f
Showing
28 changed files
with
7,424 additions
and
2,376 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs | ||
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent | ||
|
||
|
||
class DcgmDiag:
    """Builder and runner for a DCGM diagnostic (dcgmRunDiag) request.

    The instance owns a run-diag structure (``self.runDiagInfo``) that the
    various ``Set*``/``Add*`` methods fill in, and that ``Execute`` hands to
    ``dcgm_agent.dcgmActionValidate_v2``.
    """

    # Maps version codes to simple version values for range comparisons
    _versionMap = {dcgm_structs.dcgmRunDiag_version: 5}

    # Test-name strings that select a numeric validation level instead of
    # naming individual tests.
    _VALIDATE_LEVELS = ('1', '2', '3', '4')

    def __init__(self,
                 gpuIds=None,
                 testNamesStr='',
                 paramsStr='',
                 verbose=True,
                 version=dcgm_structs.dcgmRunDiag_version):
        """Populate the run-diag structure from the given options.

        gpuIds       - iterable of GPU ids; stored as a comma-separated string
        testNamesStr - '' or '1'..'4' to pick a validation level ('' means 1),
                       otherwise a comma-separated list of test names
        paramsStr    - semicolon-separated list of test parameters
        verbose      - forwarded to SetVerbose()
        version      - run-diag structure version; must be a key of _versionMap

        Raises ValueError for an unknown version, a numeric test name outside
        1-4, or too many / too long test names or parameters.
        """
        # Make sure version is valid
        if version not in DcgmDiag._versionMap:
            raise ValueError("'%s' is not a valid version for dcgmRunDiag." %
                             version)
        self.version = version

        if self.version == dcgm_structs.dcgmRunDiag_version7:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
        else:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_t()

        self.numTests = 0
        self.numParams = 0
        self.SetVerbose(verbose)

        if testNamesStr == '':
            # default to a level 1 test
            self.runDiagInfo.validate = 1
        elif testNamesStr in DcgmDiag._VALIDATE_LEVELS:
            self.runDiagInfo.validate = int(testNamesStr)
        else:
            # Make sure no number other than 1-4 was submitted
            if testNamesStr.isdigit():
                raise ValueError("'%s' is not a valid test name." %
                                 testNamesStr)

            # Copy to the testNames portion of the object
            names = testNamesStr.split(',')
            if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES:
                err = 'DcgmDiag cannot initialize: %d test names were specified exceeding the limit of %d.' %\
                    (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES)
                raise ValueError(err)

            for testName in names:
                self.AddTest(testName)

        if paramsStr != '':
            params = paramsStr.split(';')
            # NOTE(review): this limit uses >= while the test-name limit above
            # uses >; kept as-is because the intent cannot be confirmed here.
            if len(params) >= dcgm_structs.DCGM_MAX_TEST_PARMS:
                err = 'DcgmDiag cannot initialize: %d parameters were specified, exceeding the limit of %d.' %\
                    (len(params), dcgm_structs.DCGM_MAX_TEST_PARMS)
                raise ValueError(err)

            for param in params:
                self.AddParameter(param)

        if gpuIds:
            self.runDiagInfo.gpuList = ",".join(str(gpu) for gpu in gpuIds)

    def SetVerbose(self, val):
        """Set the verbose run flag when val equals True, otherwise clear it."""
        # Equality (not truthiness) is deliberate: it keeps the historical
        # behavior where only True/1 enable the flag.
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_VERBOSE

    def UseFakeGpus(self):
        """Copy the current gpuList into fakeGpuList."""
        self.runDiagInfo.fakeGpuList = self.runDiagInfo.gpuList

    def GetStruct(self):
        """Return the underlying run-diag structure."""
        return self.runDiagInfo

    def AddParameter(self, parameterStr):
        """Append one test parameter string to the structure.

        Raises ValueError if the string is DCGM_MAX_TEST_PARMS_LEN characters
        or longer.
        """
        if len(parameterStr) >= dcgm_structs.DCGM_MAX_TEST_PARMS_LEN:
            err = 'DcgmDiag cannot add parameter \'%s\' because it exceeds max length %d.' % \
                (parameterStr, dcgm_structs.DCGM_MAX_TEST_PARMS_LEN)
            raise ValueError(err)

        # Copy character by character into the next free slot.
        for index, c in enumerate(parameterStr):
            self.runDiagInfo.testParms[self.numParams][index] = ord(c)

        self.numParams += 1

    def AddTest(self, testNameStr):
        """Append one test name to the structure.

        Raises ValueError if the name is DCGM_MAX_TEST_NAMES_LEN characters
        or longer.
        """
        if len(testNameStr) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
            err = 'DcgmDiag cannot add test name \'%s\' because it exceeds max length %d.' % \
                (testNameStr, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN)
            raise ValueError(err)

        # Copy character by character into the next free slot.
        for index, c in enumerate(testNameStr):
            self.runDiagInfo.testNames[self.numTests][index] = ord(c)

        self.numTests += 1

    def SetStatsOnFail(self, val):
        """Set the stats-on-fail run flag when val equals True.

        A non-True value leaves the flag untouched (it is never cleared here).
        """
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_STATSONFAIL

    def SetThrottleMask(self, value):
        """Store str(value) as the throttle mask.

        Raises ValueError if the structure version is below 3, or if value is
        a string of DCGM_THROTTLE_MASK_LEN characters or more.
        """
        if DcgmDiag._versionMap[self.version] < 3:
            raise ValueError(
                "Throttle mask requires minimum version 3 for dcgmRunDiag.")
        if isinstance(
                value,
                str) and len(value) >= dcgm_structs.DCGM_THROTTLE_MASK_LEN:
            raise ValueError("Throttle mask value '%s' exceeds max length %d." %
                             (value, dcgm_structs.DCGM_THROTTLE_MASK_LEN - 1))

        self.runDiagInfo.throttleMask = str(value)

    def SetFailEarly(self, enable=True, checkInterval=5):
        """Enable or disable the fail-early run flag.

        When enabling, checkInterval is stored in failCheckInterval.
        Raises ValueError if the structure version is below 5 or
        checkInterval is not an int.
        """
        if DcgmDiag._versionMap[self.version] < 5:
            raise ValueError(
                "Fail early requires minimum version 5 for dcgmRunDiag.")
        if not isinstance(checkInterval, int):
            raise ValueError("Invalid checkInterval value: %s" % checkInterval)

        if enable:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY
            self.runDiagInfo.failCheckInterval = checkInterval
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY

    def Execute(self, handle):
        """Run the diagnostic via dcgmActionValidate_v2 and return its result."""
        return dcgm_agent.dcgmActionValidate_v2(handle, self.runDiagInfo,
                                                self.version)

    def SetStatsPath(self, statsPath):
        """Store statsPath; raises ValueError if it is DCGM_PATH_LEN or longer."""
        if len(statsPath) >= dcgm_structs.DCGM_PATH_LEN:
            err = "DcgmDiag cannot set statsPath '%s' because it exceeds max length %d." % \
                (statsPath, dcgm_structs.DCGM_PATH_LEN)
            raise ValueError(err)

        self.runDiagInfo.statsPath = statsPath

    def SetConfigFileContents(self, configFileContents):
        """Store the config file contents.

        Raises ValueError if the contents are DCGM_MAX_CONFIG_FILE_LEN
        characters or longer.
        """
        if len(configFileContents) >= dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN:
            err = "Dcgm Diag cannot set config file contents to '%s' because it exceeds max length %d." \
                % (configFileContents, dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN)
            raise ValueError(err)

        self.runDiagInfo.configFileContents = configFileContents

    def SetDebugLogFile(self, logFileName):
        """Store the debug log file name.

        Raises ValueError if the name is DCGM_FILE_LEN characters or longer.
        """
        if len(logFileName) >= dcgm_structs.DCGM_FILE_LEN:
            raise ValueError("Cannot set debug file to '%s' because it exceeds max length %d."\
                % (logFileName, dcgm_structs.DCGM_FILE_LEN))

        self.runDiagInfo.debugLogFile = logFileName

    def SetDebugLevel(self, debugLevel):
        """Store the debug level; raises ValueError unless 0 <= debugLevel <= 5."""
        if debugLevel < 0 or debugLevel > 5:
            # The rejected value is interpolated so the message reports it
            # (previously the format placeholder was emitted verbatim).
            raise ValueError(
                "Cannot set debug level to %s. Debug Level must be a value from 0-5 inclusive."
                % debugLevel)

        self.runDiagInfo.debugLevel = debugLevel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent | ||
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs | ||
''' | ||
Class for managing a group of field IDs in the host engine. | ||
''' | ||
|
||
|
||
class DcgmFieldGroup:
    '''
    Class for managing a group of field IDs in the host engine.
    '''

    def __init__(self, dcgmHandle, name="", fieldIds=None, fieldGroupId=None):
        '''
        Constructor

        dcgmHandle - DcgmHandle() instance to use for communicating with the host engine
        name - Name of the field group to use within DCGM. This must be unique
        fieldIds - Fields that are part of this group
        fieldGroupId - If provided, this is used to initialize the object from an existing field group ID
        '''
        fieldIds = fieldIds or []
        self.name = name
        self.fieldIds = fieldIds
        self._dcgmHandle = dcgmHandle
        # Only groups created by this object are destroyed in Delete()
        self.wasCreated = False

        #If the user passed in an ID, the field group already exists. Fetch live info
        if fieldGroupId is not None:
            self.fieldGroupId = fieldGroupId
            fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(
                self._dcgmHandle.handle, self.fieldGroupId)
            self.name = fieldGroupInfo.fieldGroupName
            self.fieldIds = fieldGroupInfo.fieldIds
        else:
            self.fieldGroupId = None  #Assign here so the destructor doesn't fail if the call below fails
            self.fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(
                self._dcgmHandle.handle, fieldIds, name)
            self.wasCreated = True

    def Delete(self):
        '''
        Remove this field group from DCGM. This object can no longer be passed to other APIs after this call.

        The group is only destroyed in the host engine if this object created it.
        '''
        if self.wasCreated and self.fieldGroupId is not None:
            try:
                dcgm_agent.dcgmFieldGroupDestroy(self._dcgmHandle.handle,
                                                 self.fieldGroupId)
            except (AttributeError, TypeError):
                # When we're cleaning up at the end, dcgm_agent and dcgm_structs may have been unloaded and
                # we get "'NoneType' object has no attribute ..." or "'NoneType' object is not callable".
                # This handler must come first: the except headers below dereference dcgm_structs, and an
                # exception raised while evaluating an except header aborts the handler search entirely.
                pass
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_NO_DATA):
                # someone may have deleted the group under us. That's ok.
                pass
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
                # We lost our connection, but we're destructing this object anyway.
                pass
        self.fieldGroupId = None
        self._dcgmHandle = None

    #Destructor
    def __del__(self):
        self.Delete()
Oops, something went wrong.