From 167278f7dffb47ff38989fb6bb3f66d21d186a7a Mon Sep 17 00:00:00 2001 From: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> Date: Mon, 6 Nov 2023 12:11:41 -0800 Subject: [PATCH] Update DCGM version to 3.2.6 (#785) * Update DCGM version (#692) * Update library version * Added new DCGM files * Added new DCGM files * Update package references * Attempt at DCGM fix * Fixing uuid and name * Making PCI ID an int * Printing device types * Next attempt * Fixing copyright issues * Update DCGM version * Ignore pre-commit hooks for DCGM * Fix pre-commit --------- Co-authored-by: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Co-authored-by: Misha Chornyi Co-authored-by: Brian Raf --- .pre-commit-config.yaml | 1 + Dockerfile | 2 +- model_analyzer/device/gpu_device_factory.py | 7 +- model_analyzer/monitor/dcgm/DcgmDiag.py | 191 ++ model_analyzer/monitor/dcgm/DcgmFieldGroup.py | 83 + model_analyzer/monitor/dcgm/DcgmGroup.py | 815 ++++++ model_analyzer/monitor/dcgm/DcgmHandle.py | 141 ++ model_analyzer/monitor/dcgm/DcgmJsonReader.py | 69 + model_analyzer/monitor/dcgm/DcgmReader.py | 623 +++++ model_analyzer/monitor/dcgm/DcgmStatus.py | 57 + model_analyzer/monitor/dcgm/DcgmSystem.py | 412 +++ .../monitor/dcgm/common/__init__.py | 13 + .../dcgm/common/dcgm_client_cli_parser.py | 194 ++ .../monitor/dcgm/common/dcgm_client_main.py | 86 + model_analyzer/monitor/dcgm/dcgm_agent.py | 443 ++-- .../monitor/dcgm/dcgm_collectd_plugin.py | 369 +++ model_analyzer/monitor/dcgm/dcgm_errors.py | 395 +++ .../monitor/dcgm/dcgm_field_helpers.py | 642 ++--- model_analyzer/monitor/dcgm/dcgm_fields.py | 1138 ++++----- .../monitor/dcgm/dcgm_fields_collectd.py | 671 +++++ .../monitor/dcgm/dcgm_fields_internal.py | 29 + model_analyzer/monitor/dcgm/dcgm_fluentd.py | 45 + .../monitor/dcgm/dcgm_prometheus.py | 326 +++ model_analyzer/monitor/dcgm/dcgm_structs.py | 2208 ++++++++--------- model_analyzer/monitor/dcgm/dcgm_telegraf.py | 65 + model_analyzer/monitor/dcgm/dcgmvalue.py | 155 ++ .../monitor/dcgm/denylist_recommendations.py | 573 +++++ model_analyzer/monitor/dcgm/pydcgm.py | 47 + 28 files changed, 7424 insertions(+), 2376 deletions(-) create mode 100644 model_analyzer/monitor/dcgm/DcgmDiag.py create mode 100644 model_analyzer/monitor/dcgm/DcgmFieldGroup.py create mode 100644 model_analyzer/monitor/dcgm/DcgmGroup.py create mode 100644 model_analyzer/monitor/dcgm/DcgmHandle.py create mode 100644 model_analyzer/monitor/dcgm/DcgmJsonReader.py create mode 100644 model_analyzer/monitor/dcgm/DcgmReader.py create mode 100644 model_analyzer/monitor/dcgm/DcgmStatus.py create mode 100644 model_analyzer/monitor/dcgm/DcgmSystem.py create mode 100644 model_analyzer/monitor/dcgm/common/__init__.py create mode 100644 model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py create mode 100644 model_analyzer/monitor/dcgm/common/dcgm_client_main.py create mode 100644 model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py create mode 100644 model_analyzer/monitor/dcgm/dcgm_errors.py create mode 100644 model_analyzer/monitor/dcgm/dcgm_fields_collectd.py create mode 100644 model_analyzer/monitor/dcgm/dcgm_fields_internal.py create mode 100644 model_analyzer/monitor/dcgm/dcgm_fluentd.py create mode 100644 model_analyzer/monitor/dcgm/dcgm_prometheus.py create mode 100644 model_analyzer/monitor/dcgm/dcgm_telegraf.py create mode 100644 model_analyzer/monitor/dcgm/dcgmvalue.py create mode 100644 model_analyzer/monitor/dcgm/denylist_recommendations.py create mode 100644 model_analyzer/monitor/dcgm/pydcgm.py diff --git 
a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ab70f9450..d334e556f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,6 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +exclude: monitor/dcgm/ repos: - repo: https://github.com/timothycrosley/isort rev: 5.12.0 diff --git a/Dockerfile b/Dockerfile index 93fff3753..802cb93b5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,7 +27,7 @@ ARG BASE_IMAGE ARG TRITONSDK_BASE_IMAGE # DCGM version to install for Model Analyzer -ENV DCGM_VERSION=2.4.7 +ENV DCGM_VERSION=3.2.6 # Ensure apt-get won't prompt for selecting options ENV DEBIAN_FRONTEND=noninteractive diff --git a/model_analyzer/device/gpu_device_factory.py b/model_analyzer/device/gpu_device_factory.py index f28e36b3e..03f76115f 100755 --- a/model_analyzer/device/gpu_device_factory.py +++ b/model_analyzer/device/gpu_device_factory.py @@ -66,9 +66,10 @@ def init_all_devices(self, dcgmPath=None): device_atrributes = dcgm_agent.dcgmGetDeviceAttributes( dcgm_handle, device_id ).identifiers - pci_bus_id = device_atrributes.pciBusId.decode("utf-8").upper() - device_uuid = str(device_atrributes.uuid, encoding="utf-8") - device_name = str(device_atrributes.deviceName, encoding="utf-8") + pci_bus_id = device_atrributes.pciBusId + device_uuid = device_atrributes.uuid + device_name = device_atrributes.deviceName + gpu_device = GPUDevice(device_name, device_id, pci_bus_id, device_uuid) self._devices.append(gpu_device) diff --git a/model_analyzer/monitor/dcgm/DcgmDiag.py b/model_analyzer/monitor/dcgm/DcgmDiag.py new file mode 100644 index 000000000..e9178895c --- /dev/null +++ b/model_analyzer/monitor/dcgm/DcgmDiag.py @@ -0,0 +1,191 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent + + +class DcgmDiag: + + # Maps version codes to simple version values for range comparisons + _versionMap = {dcgm_structs.dcgmRunDiag_version: 5} + + def __init__(self, + gpuIds=None, + testNamesStr='', + paramsStr='', + verbose=True, + version=dcgm_structs.dcgmRunDiag_version): + # Make sure version is valid + if version not in DcgmDiag._versionMap: + raise ValueError("'%s' is not a valid version for dcgmRunDiag." 
% + version) + self.version = version + + if self.version == dcgm_structs.dcgmRunDiag_version7: + self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7() + else: + self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_t() + + self.numTests = 0 + self.numParams = 0 + self.SetVerbose(verbose) + if testNamesStr == '': + # default to a level 1 test + self.runDiagInfo.validate = 1 + elif testNamesStr == '1': + self.runDiagInfo.validate = 1 + elif testNamesStr == '2': + self.runDiagInfo.validate = 2 + elif testNamesStr == '3': + self.runDiagInfo.validate = 3 + elif testNamesStr == '4': + self.runDiagInfo.validate = 4 + else: + # Make sure no number other that 1-4 were submitted + if testNamesStr.isdigit(): + raise ValueError("'%s' is not a valid test name." % + testNamesStr) + + # Copy to the testNames portion of the object + names = testNamesStr.split(',') + if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES: + err = 'DcgmDiag cannot initialize: %d test names were specified exceeding the limit of %d.' %\ + (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES) + raise ValueError(err) + + for testName in names: + self.AddTest(testName) + + if paramsStr != '': + params = paramsStr.split(';') + if len(params) >= dcgm_structs.DCGM_MAX_TEST_PARMS: + err = 'DcgmDiag cannot initialize: %d parameters were specified, exceeding the limit of %d.' %\ + (len(params), dcgm_structs.DCGM_MAX_TEST_PARMS) + raise ValueError(err) + + for param in params: + self.AddParameter(param) + + if gpuIds: + first = True + for gpu in gpuIds: + if first: + self.runDiagInfo.gpuList = str(gpu) + first = False + else: + self.runDiagInfo.gpuList = "%s,%s" % ( + self.runDiagInfo.gpuList, str(gpu)) + + def SetVerbose(self, val): + if val == True: + self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_VERBOSE + else: + self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_VERBOSE + + def UseFakeGpus(self): + self.runDiagInfo.fakeGpuList = self.runDiagInfo.gpuList + + def GetStruct(self): + return self.runDiagInfo + + def AddParameter(self, parameterStr): + if len(parameterStr) >= dcgm_structs.DCGM_MAX_TEST_PARMS_LEN: + err = 'DcgmDiag cannot add parameter \'%s\' because it exceeds max length %d.' % \ + (parameterStr, dcgm_structs.DCGM_MAX_TEST_PARMS_LEN) + raise ValueError(err) + + index = 0 + for c in parameterStr: + self.runDiagInfo.testParms[self.numParams][index] = ord(c) + index += 1 + + self.numParams += 1 + + def AddTest(self, testNameStr): + if len(testNameStr) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN: + err = 'DcgmDiag cannot add test name \'%s\' because it exceeds max length %d.' % \ + (testNameStr, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN) + raise ValueError(err) + + index = 0 + for c in testNameStr: + self.runDiagInfo.testNames[self.numTests][index] = ord(c) + index += 1 + + self.numTests += 1 + + def SetStatsOnFail(self, val): + if val == True: + self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_STATSONFAIL + + def SetThrottleMask(self, value): + if DcgmDiag._versionMap[self.version] < 3: + raise ValueError( + "Throttle mask requires minimum version 3 for dcgmRunDiag.") + if isinstance( + value, + str) and len(value) >= dcgm_structs.DCGM_THROTTLE_MASK_LEN: + raise ValueError("Throttle mask value '%s' exceeds max length %d." 
% + (value, dcgm_structs.DCGM_THROTTLE_MASK_LEN - 1)) + + self.runDiagInfo.throttleMask = str(value) + + def SetFailEarly(self, enable=True, checkInterval=5): + if DcgmDiag._versionMap[self.version] < 5: + raise ValueError( + "Fail early requires minimum version 5 for dcgmRunDiag.") + if not isinstance(checkInterval, int): + raise ValueError("Invalid checkInterval value: %s" % checkInterval) + + if enable: + self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY + self.runDiagInfo.failCheckInterval = checkInterval + else: + self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY + + def Execute(self, handle): + return dcgm_agent.dcgmActionValidate_v2(handle, self.runDiagInfo, + self.version) + + def SetStatsPath(self, statsPath): + if len(statsPath) >= dcgm_structs.DCGM_PATH_LEN: + err = "DcgmDiag cannot set statsPath '%s' because it exceeds max length %d." % \ + (statsPath, dcgm_structs.DCGM_PATH_LEN) + raise ValueError(err) + + self.runDiagInfo.statsPath = statsPath + + def SetConfigFileContents(self, configFileContents): + if len(configFileContents) >= dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN: + err = "Dcgm Diag cannot set config file contents to '%s' because it exceeds max length %d." \ + % (configFileContents, dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN) + raise ValueError(err) + + self.runDiagInfo.configFileContents = configFileContents + + def SetDebugLogFile(self, logFileName): + if len(logFileName) >= dcgm_structs.DCGM_FILE_LEN: + raise ValueError("Cannot set debug file to '%s' because it exceeds max length %d."\ + % (logFileName, dcgm_structs.DCGM_FILE_LEN)) + + self.runDiagInfo.debugLogFile = logFileName + + def SetDebugLevel(self, debugLevel): + if debugLevel < 0 or debugLevel > 5: + raise ValueError( + "Cannot set debug level to %d. Debug Level must be a value from 0-5 inclusive." + ) + + self.runDiagInfo.debugLevel = debugLevel diff --git a/model_analyzer/monitor/dcgm/DcgmFieldGroup.py b/model_analyzer/monitor/dcgm/DcgmFieldGroup.py new file mode 100644 index 000000000..bcbe37035 --- /dev/null +++ b/model_analyzer/monitor/dcgm/DcgmFieldGroup.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +''' +Class for managing a group of field IDs in the host engine. +''' + + +class DcgmFieldGroup: + ''' + Constructor + + dcgmHandle - DcgmHandle() instance to use for communicating with the host engine + name - Name of the field group to use within DCGM. 
This must be unique + fieldIds - Fields that are part of this group + fieldGroupId - If provided, this is used to initialize the object from an existing field group ID + ''' + + def __init__(self, dcgmHandle, name="", fieldIds=None, fieldGroupId=None): + fieldIds = fieldIds or [] + self.name = name + self.fieldIds = fieldIds + self._dcgmHandle = dcgmHandle + self.wasCreated = False + + #If the user passed in an ID, the field group already exists. Fetch live info + if fieldGroupId is not None: + self.fieldGroupId = fieldGroupId + fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo( + self._dcgmHandle.handle, self.fieldGroupId) + self.name = fieldGroupInfo.fieldGroupName + self.fieldIds = fieldGroupInfo.fieldIds + else: + self.fieldGroupId = None #Assign here so the destructor doesn't fail if the call below fails + self.fieldGroupId = dcgm_agent.dcgmFieldGroupCreate( + self._dcgmHandle.handle, fieldIds, name) + self.wasCreated = True + + ''' + Remove this field group from DCGM. This object can no longer be passed to other APIs after this call. + ''' + + def Delete(self): + if self.wasCreated and self.fieldGroupId is not None: + try: + try: + dcgm_agent.dcgmFieldGroupDestroy(self._dcgmHandle.handle, + self.fieldGroupId) + except dcgm_structs.dcgmExceptionClass( + dcgm_structs.DCGM_ST_NO_DATA): + # someone may have deleted the group under us. That's ok. + pass + except dcgm_structs.dcgmExceptionClass( + dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID): + # We lost our connection, but we're destructing this object anyway. + pass + except AttributeError as ae: + # When we're cleaning up at the end, dcgm_agent and dcgm_structs have been unloaded and we'll + # get an AttributeError: "'NoneType' object has no 'dcgmExceptionClass'" Ignore this + pass + except TypeError as te: + # When we're cleaning up at the end, dcgm_agent and dcgm_structs have been unloaded and we might + # get a TypeError: "'NoneType' object is not callable'" Ignore this + pass + self.fieldGroupId = None + self._dcgmHandle = None + + #Destructor + def __del__(self): + self.Delete() diff --git a/model_analyzer/monitor/dcgm/DcgmGroup.py b/model_analyzer/monitor/dcgm/DcgmGroup.py new file mode 100644 index 000000000..834e102db --- /dev/null +++ b/model_analyzer/monitor/dcgm/DcgmGroup.py @@ -0,0 +1,815 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import model_analyzer.monitor.dcgm.pydcgm as pydcgm +import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +import model_analyzer.monitor.dcgm.dcgm_field_helpers as dcgm_field_helpers +from model_analyzer.monitor.dcgm.DcgmHandle import DcgmHandle + + +class DcgmGroupConfig: + + def __init__(self, dcgmHandle, groupId, dcgmGroup): + self._dcgmHandle = dcgmHandle + self._groupId = groupId + self._dcgmGroup = dcgmGroup + + ''' + Set configuration for this group + + config should be an instance of dcgm_structs.c_dcgmDeviceConfig_v1 + + Will throw an exception on error + ''' + + def Set(self, config): + status = pydcgm.DcgmStatus() + ret = dcgm_structs.DCGM_ST_OK + + try: + ret = dcgm_agent.dcgmConfigSet(self._dcgmHandle.handle, + self._groupId, config, status.handle) + except dcgm_structs.DCGMError as e: + pass + + #Throw specific errors before return error + status.ThrowExceptionOnErrors() + #Throw an appropriate exception on error + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Get configuration for this group + + configType is a DCGM_CONFIG_? constant + + Returns an array of dcgm_structs.c_dcgmDeviceConfig_v1 objects + Throws an exception on error + ''' + + def Get(self, configType): + status = pydcgm.DcgmStatus() + + gpuIds = self._dcgmGroup.GetGpuIds() + configList = dcgm_agent.dcgmConfigGet(self._dcgmHandle.handle, + self._groupId, configType, + len(gpuIds), status.handle) + #Throw specific errors before return error + status.ThrowExceptionOnErrors() + return configList + + ''' + Enforce the configuration that has been set with Set() + + Throws an exception on error + ''' + + def Enforce(self): + status = pydcgm.DcgmStatus() + ret = dcgm_structs.DCGM_ST_OK + try: + ret = dcgm_agent.dcgmConfigEnforce(self._dcgmHandle.handle, + self._groupId, status.handle) + except dcgm_structs.DCGMError as e: + pass + + #Throw specific errors before return error + status.ThrowExceptionOnErrors() + #Throw an appropriate exception on error + dcgm_structs._dcgmCheckReturn(ret) + + +class DcgmGroupSamples: + + def __init__(self, dcgmHandle, groupId, dcgmGroup): + self._dcgmHandle = dcgmHandle + self._groupId = groupId + self._dcgmGroup = dcgmGroup + + ''' + Tell DCGM to start recording samples for the given field group + + fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch. + updateFreq: How often to update these fields in usec + maxKeepAge: How long to keep data for these fields in seconds + maxKeepSamples: Maximum number of samples to keep per field. 0=no limit + + Once the field collection is watched, it will update whenever the next update + loop occurs. If you want to query these values immediately, use + handle.UpdateAllFields(True) to make sure that the fields have updated at least once. + ''' + + def WatchFields(self, fieldGroup, updateFreq, maxKeepAge, maxKeepSamples): + ret = dcgm_agent.dcgmWatchFields(self._dcgmHandle.handle, self._groupId, + fieldGroup.fieldGroupId, updateFreq, + maxKeepAge, maxKeepSamples) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + tell DCGM to stop recording samples for a given field group + + fieldGroup: DcgmFieldGroup() instance tracking the fields we want to unwatch. 
+    '''
+
+    def UnwatchFields(self, fieldGroup):
+        ret = dcgm_agent.dcgmUnwatchFields(self._dcgmHandle.handle,
+                                           self._groupId,
+                                           fieldGroup.fieldGroupId)
+        dcgm_structs._dcgmCheckReturn(ret)
+
+    '''
+    Get the most recent values for each field in a field collection
+
+    fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
+
+    Returns DcgmFieldValueCollection object. Use its .values[gpuId][fieldId][0].value to access values
+    '''
+
+    def GetLatest(self, fieldGroup):
+        dfvc = dcgm_field_helpers.DcgmFieldValueCollection(
+            self._dcgmHandle.handle, self._groupId)
+        dfvc.GetLatestValues(fieldGroup)
+        return dfvc
+
+    '''
+    Get the most recent values for each field in a field collection
+
+    fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
+
+    Returns DcgmFieldValueEntityCollection object. Use its .values[entityGroupId][entityId][fieldId][0].value to access values
+    '''
+
+    def GetLatest_v2(self, fieldGroup):
+        dfvec = dcgm_field_helpers.DcgmFieldValueEntityCollection(
+            self._dcgmHandle.handle, self._groupId)
+        dfvec.GetLatestValues(fieldGroup)
+        return dfvec
+
+    '''
+    Get the new values for each field in a field collection since the last
+    collection.
+
+    dfvc: DcgmFieldValueCollection() instance. Will return a
+          DcgmFieldValueCollection with values since the one passed in.
+          Pass None for the first call to get one for subsequent calls.
+          On subsequent calls, pass what was returned.
+    fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
+
+    Returns DcgmFieldValueCollection object. Use its .values[gpuId][fieldId][*].value to access values
+    '''
+
+    def GetAllSinceLastCall(self, dfvc, fieldGroup):
+        if dfvc == None:
+            dfvc = dcgm_field_helpers.DcgmFieldValueCollection(
+                self._dcgmHandle.handle, self._groupId)
+            dfvc.GetLatestValues(fieldGroup)
+        else:
+            # We used to expect at least one value (GetLatestValues), so this
+            # ensures we provide one at the risk of repetition. This should not
+            # happen if we call this function infrequently enough (slower than
+            # the sampling rate).
+            dfvc.GetAllSinceLastCall(fieldGroup)
+            if len(dfvc.values) == 0:
+                dfvc.GetLatestValues(fieldGroup)
+        return dfvc
+
+    '''
+    Gets more values for each field in a field entity collection
+
+    dfvec: DcgmFieldValueEntityCollection() instance. Will return a
+           DcgmFieldValueEntityCollection with values since the one passed
+           in. Pass None for the first call to get one for subsequent
+           calls. On subsequent calls, pass what was returned.
+
+    fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
+
+    Returns DcgmFieldValueEntityCollection object. Use its .values[entityGroupId][entityId][fieldId][*].value to access values
+    '''
+
+    def GetAllSinceLastCall_v2(self, dfvec, fieldGroup):
+        if dfvec == None:
+            dfvec = dcgm_field_helpers.DcgmFieldValueEntityCollection(
+                self._dcgmHandle.handle, self._groupId)
+            dfvec.GetLastestValues_v2(fieldGroup)
+        else:
+            dfvec.GetAllSinceLastCall_v2(fieldGroup)
+            # We used to expect at least one value (GetLatestValues), so this
+            # ensures we provide one at the risk of repetition. This should not
+            # happen if we call this function infrequently enough (slower than
+            # the sampling rate).
+            if len(dfvec.values) == 0:
+                dfvec.GetLatestValues_v2(fieldGroup)
+
+        return dfvec
+
+    '''
+    Convenience alias for DcgmHandle.UpdateAllFields(). All fields on the system will be updated, not
+    just this group's.
+ ''' + + def UpdateAllFields(self, waitForUpdate): + self._dcgmHandle.UpdateAllFields(waitForUpdate) + + +class DcgmGroupHealth: + + def __init__(self, dcgmHandle, groupId, dcgmGroup): + self._dcgmHandle = dcgmHandle + self._groupId = groupId + self._dcgmGroup = dcgmGroup + + ''' + Enable health checks for this group + + systems: A bitmask of dcgm_structs.DCGM_HEALTH_WATCH_? definitions of which health checks to enable + updateInterval: How often DCGM should request new health data from the driver in usec + maxKeepAge: How long DCGM should keep health data around once it has been retrieved from the driver in seconds + ''' + + def Set(self, systems, updateInterval=None, maxKeepAge=None): + if updateInterval is None or maxKeepAge is None: + ret = dcgm_agent.dcgmHealthSet(self._dcgmHandle.handle, + self._groupId, systems) + else: + ret = dcgm_agent.dcgmHealthSet_v2(self._dcgmHandle.handle, + self._groupId, systems, + updateInterval, maxKeepAge) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Retrieve the current state of the DCGM health check system + + Returns a bitmask of dcgm_structs.DCGM_HEALTH_WATCH_? definitions of which health checks are currently enabled + ''' + + def Get(self): + systems = dcgm_agent.dcgmHealthGet(self._dcgmHandle.handle, + self._groupId) + return systems + + ''' + Check the configured watches for any errors/failures/warnings that have occurred + since the last time this check was invoked. On the first call, stateful information + about all of the enabled watches within a group is created but no error results are + provided. On subsequent calls, any error information will be returned. + + @param version IN: Allows the caller to use an older version of this request. Should be + dcgm_structs.dcgmHealthResponse_version4 + + Returns a dcgm_structs.c_dcgmHealthResponse_* object that contains results for each GPU/entity + ''' + + def Check(self, version=dcgm_structs.dcgmHealthResponse_version4): + resp = dcgm_agent.dcgmHealthCheck(self._dcgmHandle.handle, + self._groupId, version) + return resp + + +class DcgmGroupPolicy: + + def __init__(self, dcgmHandle, groupId, dcgmGroup): + self._dcgmHandle = dcgmHandle + self._groupId = groupId + self._dcgmGroup = dcgmGroup + + ''' + Get the current violation policy inside the policy manager. Given a groupId, a number of + policy structures are retrieved. + + @param statusHandle IN/OUT: pydcgm.DcgmStatus for the resulting status of the operation. Pass it as None + if the detailed error information for the operation is not needed (default). + + Returns a list of dcgm_structs.c_dcgmPolicy_v1 with the same length as the number of GPUs in the group. + The index of an entry corresponds to a given GPU ID in the group. Throws an exception on error. + ''' + + def Get(self, statusHandle=None): + if statusHandle: + statusHandle = statusHandle.handle + count = len(self._dcgmGroup.GetGpuIds()) + if count <= 0: + raise pydcgm.DcgmException( + "This group has no GPUs, cannot retrieve policies") + return dcgm_agent.dcgmPolicyGet(self._dcgmHandle.handle, self._groupId, + count, statusHandle) + + ''' + Set the current violation policy inside the policy manager. Given the conditions within "policy", + if a violation has occurred, subsequent action(s) may be performed to either + report or contain the failure. + + This API is only supported on Tesla GPUs and will throw DCGMError_NotSupported if called on non-Tesla GPUs. 
+ + @param policy IN: dcgm_structs.c_dcgmPolicy_v1 that will be applied to all GPUs in the group + + @param statusHandle IN/OUT: pydcgm.DcgmStatus for the resulting status for the operation. Pass it as + None if the detailed error information for the operation is not needed (default). + + Returns Nothing. Throws an exception on error + ''' + + def Set(self, policy, statusHandle=None): + if statusHandle: + statusHandle = statusHandle.handle + dcgm_agent.dcgmPolicySet(self._dcgmHandle.handle, self._groupId, policy, + statusHandle) + + ''' + Register a function to be called when a specific policy condition (see dcgm_structs.c_dcgmPolicy_v1.condition) + has been violated. This callback(s) will be called automatically when in DCGM_OPERATION_MODE_AUTO mode and only after + DcgmPolicy.Trigger when in DCGM_OPERATION_MODE_MANUAL mode. + All callbacks are made within a separate thread. + + This API is only supported on Tesla GPUs and will throw DCGMError_NotSupported if called on non-Tesla GPUs. + + @param condition IN: The set of conditions specified as an OR'd list + (see dcgm_structs.DCGM_POLICY_COND_*) + for which to register a callback function + + @param beginCallback IN: A function that should be called should a violation occur. This + function will be called prior to any actions specified by the policy are taken. + + @param finishCallback IN: A reference to a function that should be called should a violation occur. + This function will be called after any action specified by the policy are completed. + + At least one callback must be provided that is not None. + + Returns Nothing. Throws an exception on error. + ''' + + def Register(self, condition, beginCallback=None, finishCallback=None): + if beginCallback is None and finishCallback is None: + raise pydcgm.DcgmException( + "At least 1 callback must be provided to register that is not None" + ) + dcgm_agent.dcgmPolicyRegister(self._dcgmHandle.handle, self._groupId, + condition, beginCallback, finishCallback) + + ''' + Unregister a function to be called for a specific policy condition (see dcgm_structs.c_dcgmPolicy_v1.condition) . + This function will unregister all callbacks for a given condition. + + @param condition IN: The set of conditions specified as an OR'd list + (see dcgm_structs.DCGM_POLICY_COND_*) + for which to unregister a callback function + + Returns Nothing. Throws an exception on error. + ''' + + def Unregister(self, condition): + dcgm_agent.dcgmPolicyUnregister(self._dcgmHandle.handle, self._groupId, + condition) + + ''' + Inform the policy manager loop to perform an iteration and trigger the callbacks of any + registered functions. Callback functions will be called from a separate thread as the calling function. + + Note: The GPU monitoring and management agent must call this method periodically if the operation + mode is set to manual mode (DCGM_OPERATION_MODE_MANUAL) during initialization + (\ref DcgmHandle.__init__). + + Returns Nothing. Throws an exception if there is a generic error that the + policy manager was unable to perform another iteration. 
+ ''' + + def Trigger(self): + dcgm_agent.dcgmPolicyTrigger(self._dcgmHandle.handle) + + +class DcgmGroupDiscovery: + + def __init__(self, dcgmHandle, groupId, dcgmGroup): + self._dcgmHandle = dcgmHandle + self._groupId = groupId + self._dcgmGroup = dcgmGroup + + ''' + Get the topology for this group + + Returns a c_dcgmGroupTopology_v1 object representing the topology for this group + ''' + + def GetTopology(self): + return dcgm_agent.dcgmGetGroupTopology(self._dcgmHandle.handle, + self._groupId) + + +class DcgmGroupStats: + + def __init__(self, dcgmHandle, groupId, dcgmGroup): + self._dcgmHandle = dcgmHandle + self._groupId = groupId + self._dcgmGroup = dcgmGroup + + ''' + Tell DCGM to start recording samples for fields returned from GetPidInfo() + + updateFreq: How often to update these fields in usec + maxKeepAge: How long to keep data for these fields in seconds + maxKeepSamples: Maximum number of samples to keep per field. 0=no limit + + Once the field collection is watched, it will update whenever the next update + loop occurs. If you want to query these values immediately, use + handle.UpdateAllFields(True) to make sure that the fields have updated at least once. + ''' + + def WatchPidFields(self, updateFreq, maxKeepAge, maxKeepSamples): + ret = dcgm_agent.dcgmWatchPidFields(self._dcgmHandle.handle, + self._groupId, updateFreq, + maxKeepAge, maxKeepSamples) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Get process stats for a given PID on this GPU group + + You must call WatchPidFields() before this query for this method to return any results + + Returns a dcgm_structs.c_dcgmPidInfo_v2 structure + ''' + + def GetPidInfo(self, pid): + return dcgm_agent.dcgmGetPidInfo(self._dcgmHandle.handle, self._groupId, + pid) + + ''' + Tell DCGM to start recording samples for fields returned from GetJobStats() + + updateFreq: How often to update these fields in usec + maxKeepAge: How long to keep data for these fields in seconds + maxKeepSamples: Maximum number of samples to keep per field. 0=no limit + + Once the fields are watched, they will update whenever the next update + loop occurs. If you want to query these values immediately, use + handle.UpdateAllFields(True) to make sure that the fields have updated at least once. + ''' + + def WatchJobFields(self, updateFreq, maxKeepAge, maxKeepSamples): + ret = dcgm_agent.dcgmWatchJobFields(self._dcgmHandle.handle, + self._groupId, updateFreq, + maxKeepAge, maxKeepSamples) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Start collecting stats for a named job for this GPU group + + Calling this will tell DCGM to start tracking stats for the given jobId. Stats tracking + will end when StopJobStats() is called + + You must call WatchJobFields() before this call to tell DCGM to start sampling the fields + that are returned from GetJobStats(). + + jobId is a unique string identifier for this job. An exception will be thrown if this is not unique + + Returns Nothing (Will throw exception on error) + ''' + + def StartJobStats(self, jobId): + ret = dcgm_agent.dcgmJobStartStats(self._dcgmHandle.handle, + self._groupId, jobId) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Stop collecting stats for a named job + + Calling this will tell DCGM to stop collecting stats for a job that was previously started + with StartJobStats(). + + jobId is the unique string that was passed as jobId to StartJobStats. 
+ + Returns Nothing (Will throw exception on error) + ''' + + def StopJobStats(self, jobId): + ret = dcgm_agent.dcgmJobStopStats(self._dcgmHandle.handle, jobId) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Get stats for a job that was started with StartJobStats. If StopJobStats has not been called yet, + this will get stats from when the job started until now. If StopJob was called prior to + this, the returned Stats will go from when StartJobStats was called to when StopJobStats was called. + + jobId is the unique string that was passed as jobId to StartJobStats and StopJobStats + + Returns a dcgm_structs.c_dcgmJobInfo_v3 structure. Throws an exception on error + ''' + + def GetJobStats(self, jobId): + ret = dcgm_agent.dcgmJobGetStats(self._dcgmHandle.handle, jobId) + return ret + + ''' + This API tells DCGM to stop tracking the job given by jobId. After this call, you will no longer + be able to call GetJobStats() on this jobId. However, you will be able to reuse jobId after + this call. + + jobId is the unique string that was passed as jobId to StartJobStats and StopJobStats + + Returns Nothing (Will throw exception on error) + ''' + + def RemoveJob(self, jobId): + ret = dcgm_agent.dcgmJobRemove(self._dcgmHandle.handle, jobId) + return ret + + ''' + This API tells DCGM to stop tracking all jobs. After this call, you will no longer + be able to call dcgmJobGetStats() any jobs until you call StartJobStats() again. + You will be able to reuse any previously-used jobIds after this call. + + Returns Nothing (Will throw exception on error) + ''' + + def RemoveAllJobs(self): + ret = dcgm_agent.dcgmJobRemoveAll(self._dcgmHandle.handle) + return ret + + +class DcgmGroupAction: + + def __init__(self, dcgmHandle, groupId, dcgmGroup): + self._dcgmHandle = dcgmHandle + self._groupId = groupId + self._dcgmGroup = dcgmGroup + + ''' + Inform the action manager to perform a manual validation of a group of GPUs on the system + + validate is what sort of validation to do. See dcgm_structs.DCGM_POLICY_VALID_* defines. + + Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance + ''' + + def Validate(self, validate): + runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7() + runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7 + runDiagInfo.validate = validate + runDiagInfo.groupId = self._groupId + + ret = dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle, + runDiagInfo) + return ret + + ''' + Run a diagnostic on this group of GPUs. + + diagLevel is the level of diagnostic desired. See dcgm_structs.DCGM_DIAG_LVL_* constants. + + Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance + ''' + + def RunDiagnostic(self, diagLevel): + ret = dcgm_agent.dcgmRunDiagnostic(self._dcgmHandle.handle, + self._groupId, diagLevel) + return ret + + ''' + Run a specific diagnostic test on this group of GPUs. + testName is the name of the specific test that should be invoked. 
+ Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance + ''' + + def RunSpecificTest(self, testName): + runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7() + runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7 + for i in range(len(testName)): + runDiagInfo.testNames[0][i] = testName[i] + runDiagInfo.groupId = self._groupId + runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_NONE + response = dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle, + runDiagInfo) + return response + + +class DcgmGroupProfiling: + + def __init__(self, dcgmHandle, groupId, dcgmGroup): + """ + + Parameters + ---------- + dcgmHandle : DcgmHandle + groupId : int + dcgmGroup : DcgmGroup + """ + self._dcgmHandle = dcgmHandle + self._groupId = groupId + self._dcgmGroup = dcgmGroup + + def GetSupportedMetricGroups(self): + """ + Get a list of the profiling metric groups available for this group of entities + + :return: dcgm_structs.c_dcgmProfGetMetricGroups_v3 + :throws: dcgm_structs.DCGMError on error + """ + gpuIds = self._dcgmGroup.GetGpuIds() + if len(gpuIds) < 1: + raise dcgm_structs.DCGMError_ProfilingNotSupported + + ret = dcgm_agent.dcgmProfGetSupportedMetricGroups( + self._dcgmHandle.handle, gpuIds[0]) + return ret + + +class DcgmGroup: + ''' + Constructor. + + Either groupId OR groupName must be provided as a parameter. + This will set which GPU group this object is bound to + + groupId=DCGM_GROUP_ALL_GPUS creates a group with all GPUs. Passing an existing groupId will + not create an additional group. + If groupName is provided, an empty group (No GPUs) of name groupName will be created. This group + will be destroyed when this object goes out of scope or is deleted with del(). + groupType is the type of group to create. See dcgm_structs.DCGM_GROUP_? constants. + ''' + + def __init__(self, + dcgmHandle, + groupId=None, + groupName=None, + groupType=dcgm_structs.DCGM_GROUP_EMPTY): + self._dcgmHandle = dcgmHandle + + if groupId is None and groupName is None: + raise pydcgm.DcgmException( + "Either groupId or groupName is required") + + if groupId is not None: + self._groupId = groupId + else: + self._groupId = dcgm_agent.dcgmGroupCreate(self._dcgmHandle.handle, + groupType, groupName) + + #Create namespace classes + self.config = DcgmGroupConfig(self._dcgmHandle, self._groupId, self) + self.samples = DcgmGroupSamples(self._dcgmHandle, self._groupId, self) + self.health = DcgmGroupHealth(self._dcgmHandle, self._groupId, self) + self.policy = DcgmGroupPolicy(self._dcgmHandle, self._groupId, self) + self.discovery = DcgmGroupDiscovery(self._dcgmHandle, self._groupId, + self) + self.stats = DcgmGroupStats(self._dcgmHandle, self._groupId, self) + self.action = DcgmGroupAction(self._dcgmHandle, self._groupId, self) + self.profiling = DcgmGroupProfiling(self._dcgmHandle, self._groupId, + self) + + ''' + Remove this group from DCGM. This object will no longer be valid after this call. 
+ ''' + + def Delete(self): + del self.config + self.config = None + del self.samples + self.samples = None + del self.health + self.health = None + del self.policy + self.policy = None + del self.discovery + self.discovery = None + del self.stats + self.stats = None + del self.action + self.action = None + del self.profiling + self.profiling = None + + #Delete the group we created if we're not using the special all-GPU group + if self._groupId is not None and not self._IsGroupIdStatic(): + ret = dcgm_agent.dcgmGroupDestroy(self._dcgmHandle.handle, + self._groupId) + dcgm_structs._dcgmCheckReturn(ret) + + self._groupId = None + + ''' + Private method to determine if our groupId is a predefined one + ''' + + def _IsGroupIdStatic(self): + if self._groupId == dcgm_structs.DCGM_GROUP_ALL_GPUS or \ + self._groupId == dcgm_structs.DCGM_GROUP_ALL_NVSWITCHES: + return True + return False + + ''' + Add a GPU to this group + + gpuId is the GPU ID to add to our group + + Returns Nothing. Throws an exception on error + ''' + + def AddGpu(self, gpuId): + if self._IsGroupIdStatic(): + raise pydcgm.DcgmException("Can't add a GPU to a static group") + + ret = dcgm_agent.dcgmGroupAddDevice(self._dcgmHandle.handle, + self._groupId, gpuId) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Add an entity to this group + + entityGroupId is DCGM_FE_? constant of the entity group this entity belongs to + entityId is the entity to add to this group + + Returns Nothing. Throws an exception on error + ''' + + def AddEntity(self, entityGroupId, entityId): + if self._IsGroupIdStatic(): + raise pydcgm.DcgmException("Can't add an entity to a static group") + + ret = dcgm_agent.dcgmGroupAddEntity(self._dcgmHandle.handle, + self._groupId, entityGroupId, + entityId) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Remove a GPU from this group + + gpuId is the GPU ID to remove from our group + + Returns Nothing. Throws an exception on error + ''' + + def RemoveGpu(self, gpuId): + if self._IsGroupIdStatic(): + raise pydcgm.DcgmException("Can't remove a GPU from a static group") + + ret = dcgm_agent.dcgmGroupRemoveDevice(self._dcgmHandle.handle, + self._groupId, gpuId) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Remove an entity from this group + + entityGroupId is DCGM_FE_? constant of the entity group this entity belongs to + entityId is the entity to remove from this group + + Returns Nothing. Throws an exception on error + ''' + + def RemoveEntity(self, entityGroupId, entityId): + if self._IsGroupIdStatic(): + raise pydcgm.DcgmException( + "Can't remove an entity from a static group") + + ret = dcgm_agent.dcgmGroupRemoveEntity(self._dcgmHandle.handle, + self._groupId, entityGroupId, + entityId) + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Get an array of GPU ids that are part of this group + + Note: this ignores non-GPU members of the group + + Returns a list of GPU ids. Throws an exception on error + ''' + + def GetGpuIds(self): + groupInfo = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle, + self._groupId) + groupGpuIds = [] + for i in range(groupInfo.count): + if groupInfo.entityList[i].entityGroupId != dcgm_fields.DCGM_FE_GPU: + continue + groupGpuIds.append(groupInfo.entityList[i].entityId) + return groupGpuIds + + ''' + Get an array of entities that are part of this group + + Returns a list of c_dcgmGroupEntityPair_t structs. 
Throws an exception on error + ''' + + def GetEntities(self): + groupInfo = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle, + self._groupId) + entities = groupInfo.entityList[0:groupInfo.count] + return entities + + ''' + Get the groupId of this object + + Returns our groupId + ''' + + def GetId(self): + return self._groupId diff --git a/model_analyzer/monitor/dcgm/DcgmHandle.py b/model_analyzer/monitor/dcgm/DcgmHandle.py new file mode 100644 index 000000000..0234318ed --- /dev/null +++ b/model_analyzer/monitor/dcgm/DcgmHandle.py @@ -0,0 +1,141 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import model_analyzer.monitor.dcgm.pydcgm as pydcgm +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent + + +class DcgmHandle: + ''' + Class to encapsulate a handle to DCGM and global methods to control + query the host engine + ''' + + def __init__(self, + handle=None, + ipAddress=None, + opMode=dcgm_structs.DCGM_OPERATION_MODE_AUTO, + persistAfterDisconnect=False, + unixSocketPath=None, + timeoutMs=0): + ''' + Constructor + + handle is an existing handle from dcgmInit(). Pass None if you want this object to handle DCGM initialization for you + ipAddress is the host to connect to. None = start embedded host engine + opMode is a dcgm_structs.DCGM_OPERATION_MODE_* constant for how the host engine should run (embedded mode only) + persistAfterDisconnect (TCP-IP connections only) is whether the host engine should persist all of our watches + after we disconnect. 1=persist our watches. 0=clean up after our connection + unixSocketPath is a path to a path on the local filesystem that is a unix socket that the host engine is listening on. + This option is mutually exclusive with ipAddress + timeoutMs is how long to wait for TCP/IP or Unix domain connections to establish in ms. 0=Default timeout (5000ms) + ''' + self._handleCreated = False + self._persistAfterDisconnect = persistAfterDisconnect + + if handle is not None: + self.handle = handle + return + + self._ipAddress = ipAddress + + #Can't provide both unix socket and ip address + if ipAddress is not None and unixSocketPath is not None: + raise dcgm_structs.dcgmExceptionClass(dcgm_structs.DCGM_ST_BADPARAM) + + #Initialize the DCGM client library + dcgm_structs._dcgmInit() + dcgm_agent.dcgmInit( + ) #Not harmful to call this multiple times in a process + + #If neither ipAddress nor unixSocketPath are present, start an embedded host engine + if ipAddress is None and unixSocketPath is None: + self.handle = dcgm_agent.dcgmStartEmbedded(opMode) + self.isEmbedded = True + self._handleCreated = True + return + + #Set up connection parameters. 
We're connecting to something + connectParams = dcgm_structs.c_dcgmConnectV2Params_v2() + connectParams.version = dcgm_structs.c_dcgmConnectV2Params_version + connectParams.timeoutMs = timeoutMs + if self._persistAfterDisconnect: + connectParams.persistAfterDisconnect = 1 + else: + connectParams.persistAfterDisconnect = 0 + + if ipAddress is not None: + connectToAddress = ipAddress + connectParams.addressIsUnixSocket = 0 + else: + connectToAddress = unixSocketPath + connectParams.addressIsUnixSocket = 1 + + self.handle = dcgm_agent.dcgmConnect_v2(connectToAddress, connectParams) + self.isEmbedded = False + self._handleCreated = True + + def __del__(self): + ''' + Destructor + ''' + if self._handleCreated: + self.Shutdown() + + def GetSystem(self): + ''' + Get a DcgmSystem instance for this handle + ''' + return pydcgm.DcgmSystem(self) + + def __StopDcgm__(self): + ''' + Shuts down either the hostengine or the embedded server + ''' + if self.isEmbedded: + dcgm_agent.dcgmStopEmbedded(self.handle) + else: + dcgm_agent.dcgmDisconnect(self.handle) + + def Shutdown(self): + ''' + Shutdown DCGM hostengine + ''' + if not self._handleCreated: + return + + try: + self.__StopDcgm__() + except AttributeError as e: + # Due to multi-threading, sometimes this is called after the modules have been unloaded, making + # dcgm_agent effectively NoneType and resulting in this error being thrown. + pass + + self._handleCreated = False + self.handle = None + + @staticmethod + def Unload(): + ''' + Unload DCGM, removing any memory it is pointing at. Use this if you really + want DCGM gone from your process. Shutdown() only closes the connection/embedded host engine + that was create in __init__(). + ''' + dcgm_agent.dcgmShutdown() + + def GetIpAddress(self): + ''' + Returns the IP address associated with this handle. None=embedded connection + ''' + return self._ipAddress diff --git a/model_analyzer/monitor/dcgm/DcgmJsonReader.py b/model_analyzer/monitor/dcgm/DcgmJsonReader.py new file mode 100644 index 000000000..9c2ce187e --- /dev/null +++ b/model_analyzer/monitor/dcgm/DcgmJsonReader.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader +from json import dumps as toJson +from os import environ +from socket import socket, AF_INET, SOCK_DGRAM +from time import sleep +import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +import logging + + +class DcgmJsonReader(DcgmReader): + + ########################################################################### + def ConvertFieldIdToTag(self, fieldId): + return self.m_fieldIdToInfo[fieldId].tag + + ########################################################################### + def PrepareJson(self, gpuId, obj): + ''' + Receive an object with measurements turn it into an equivalent JSON. We + add the GPU UUID first. 
+ ''' + uuid = self.m_gpuIdToUUId[gpuId] + # This mutates the original object, but it shouldn't be a problem here + obj['gpu_uuid'] = uuid + return toJson(obj) + + ########################################################################### + def CustomDataHandler(self, fvs): + for gpuId in list(fvs.keys()): + # We don't need the keys because each value has a `fieldId` + # So just get the values + gpuData = list(fvs[gpuId].values()) + + # Get the values from FV (which is a list of values) + valuesListOfLists = [datum.values for datum in gpuData] + + # We only want the last measurement + lastValueList = [l[-1] for l in valuesListOfLists] + + # Turn FV into a conventional Python Object which can be converted to JSON + outObject = { + self.ConvertFieldIdToTag(i.fieldId): i.value + for i in lastValueList + } + outJson = self.PrepareJson(gpuId, outObject) + + self.CustomJsonHandler(outJson) + + ########################################################################### + def CustomJsonHandler(self, outJson): + ''' + This method should be overriden by subclasses to handle the JSON objects + received. + ''' + logging.warning('CustomJsonHandler has not been overriden') + logging.info(outJson) diff --git a/model_analyzer/monitor/dcgm/DcgmReader.py b/model_analyzer/monitor/dcgm/DcgmReader.py new file mode 100644 index 000000000..2c32a1f91 --- /dev/null +++ b/model_analyzer/monitor/dcgm/DcgmReader.py @@ -0,0 +1,623 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import subprocess +import signal, os +import model_analyzer.monitor.dcgm.pydcgm as pydcgm +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +import threading +import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +import sys +import logging + +defaultFieldIds = [ + dcgm_fields.DCGM_FI_DEV_POWER_USAGE, dcgm_fields.DCGM_FI_DEV_GPU_TEMP, + dcgm_fields.DCGM_FI_DEV_SM_CLOCK, dcgm_fields.DCGM_FI_DEV_GPU_UTIL, + dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING, + dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, + dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, dcgm_fields.DCGM_FI_DEV_FB_TOTAL, + dcgm_fields.DCGM_FI_DEV_FB_FREE, dcgm_fields.DCGM_FI_DEV_FB_USED, + dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER, + dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION, + dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION, + dcgm_fields.DCGM_FI_DEV_XID_ERRORS, + dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_MEM_CLOCK, dcgm_fields.DCGM_FI_DEV_MEMORY_TEMP, + dcgm_fields.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, + dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL, + dcgm_fields.DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, + dcgm_fields.DCGM_FI_DEV_PCIE_TX_THROUGHPUT, + dcgm_fields.DCGM_FI_DEV_PCIE_RX_THROUGHPUT +] + + +def entity_group_id_to_string(entityGroupId): + if entityGroupId == dcgm_fields.DCGM_FE_GPU: + return 'GPU' + elif entityGroupId == dcgm_fields.DCGM_FE_VGPU: + return 'VGPU' + elif entityGroupId == dcgm_fields.DCGM_FE_SWITCH: + return 'NVSWITCH' + elif entityGroupId == dcgm_fields.DCGM_FE_GPU_I: + return 'GPU INSTANCE' + elif entityGroupId == dcgm_fields.DCGM_FE_GPU_CI: + return 'COMPUTE INSTANCE' + elif entityGroupId == dcgm_fields.DCGM_FE_LINK: + return 'LINK' + else: + return '' + + +class DcgmReader(object): + ########################################################################### + ''' + This function can be implemented as a callback in the class that inherits from DcgmReader + to handle each field individually. + By default, it passes a string with the gpu, field tag, and value to LogInfo() + @params: + gpuId : the id of the GPU this field is reporting on + fieldId : the id of the field (ignored by default, may be useful for children) + fieldTag : the string representation of the field id + val : the value class that comes from DCGM (v.value is the value for the field) + ''' + + def CustomFieldHandler(self, gpuId, fieldId, fieldTag, val): + print("GPU %s field %s=%s" % (str(gpuId), fieldTag, str(val.value))) + + ########################################################################### + ''' + This function can be implemented as a callback in the class that inherits from DcgmReader + to handle each field individually. 
+ By default, it passes a string with the gpu, field tag, and value to LogInfo() + @params: + entityGroupId : the type of entity this field is reporting on + entityId : the id of the entity this field is reporting on + fieldId : the id of the field (ignored by default, may be useful for children) + fieldTag : the string representation of the field id + val : the value class that comes from DCGM (v.value is the value for the field) + ''' + + def CustomFieldHandler_v2(self, entityGroupId, entityId, fieldId, fieldTag, + val): + print("%s %s field %s=%s" % (entity_group_id_to_string(entityGroupId), + str(entityId), fieldTag, str(val.value))) + + ########################################################################### + ''' + This function can be implemented as a callback in the class that inherits from DcgmReader + to handle all of the data queried from DCGM. + By default, it will simply print the field tags and values for each GPU + @params: + fvs : Data in the format entityGroupId -> entityId -> values (dictionary of dictionaries) + ''' + + def CustomDataHandler_v2(self, fvs): + for entityGroupId in list(fvs.keys()): + entityGroup = fvs[entityGroupId] + + for entityId in list(entityGroup.keys()): + entityFv = entityGroup[entityId] + for fieldId in list(entityFv.keys()): + if fieldId in self.m_dcgmIgnoreFields: + continue + + val = entityFv[fieldId][-1] + + if val.isBlank: + continue + + fieldTag = self.m_fieldIdToInfo[fieldId].tag + + self.CustomFieldHandler_v2(entityGroupId, entityId, fieldId, + fieldTag, val) + + ########################################################################### + ''' + This function can be implemented as a callback in the class that inherits from DcgmReader + to handle all of the data queried from DCGM. + By default, it will simply print the field tags and values for each GPU + @params: + fvs : Dictionary with gpuID as key and values as Value + ''' + + def CustomDataHandler(self, fvs): + for gpuId in list(fvs.keys()): + gpuFv = fvs[gpuId] + + for fieldId in list(gpuFv.keys()): + if fieldId in self.m_dcgmIgnoreFields: + continue + + val = gpuFv[fieldId][-1] + + if val.isBlank: + continue + + fieldTag = self.m_fieldIdToInfo[fieldId].tag + + self.CustomFieldHandler(gpuId, fieldId, fieldTag, val) + + ########################################################################### + def SetupGpuIdUUIdMappings(self): + ''' + Populate the m_gpuIdToUUId map + ''' + + gpuIds = self.m_dcgmGroup.GetGpuIds() + for gpuId in gpuIds: + gpuInfo = self.m_dcgmSystem.discovery.GetGpuAttributes(gpuId) + self.m_gpuIdToUUId[gpuId] = gpuInfo.identifiers.uuid + + ########################################################################### + ''' + Constructor + @params: + hostname : Address:port of the host to connect. Defaults to localhost + fieldIds : List of the field ids to publish. If it isn't specified, our default list is used. + updateFrequency : Frequency of update in microseconds. Defauls to 10 seconds or 10000000 microseconds + maxKeepAge : Max time to keep data from NVML, in seconds. Default is 3600.0 (1 hour) + ignoreList : List of the field ids we want to query but not publish. + gpuIds : List of GPU IDs to monitor. If not provided, DcgmReader will monitor all GPUs on the system + fieldIntervalMap: Map of intervals to list of field numbers to monitor. Takes precedence over fieldIds and updateFrequency if not None. 
+ ''' + + def __init__(self, + hostname='localhost', + fieldIds=None, + updateFrequency=10000000, + maxKeepAge=3600.0, + ignoreList=None, + fieldGroupName='dcgm_fieldgroupData', + gpuIds=None, + entities=None, + fieldIntervalMap=None): + fieldIds = fieldIds or defaultFieldIds + ignoreList = ignoreList or [] + self.m_dcgmHostName = hostname + self.m_updateFreq = updateFrequency # default / redundant + + self.m_fieldGroupName = fieldGroupName + self.m_publishFields = {} + + if fieldIntervalMap is not None: + self.m_publishFields = fieldIntervalMap + else: + self.m_publishFields[self.m_updateFreq] = fieldIds + + self.m_requestedGpuIds = gpuIds + self.m_requestedEntities = entities + + self.m_dcgmIgnoreFields = ignoreList #Fields not to publish + self.m_maxKeepAge = maxKeepAge + self.m_dcgmHandle = None + self.m_dcgmSystem = None + self.m_dcgmGroup = None + self.m_closeHandle = False + + self.m_gpuIdToBusId = {} #GpuID => PCI-E busId string + self.m_gpuIdToUUId = {} # FieldId => dcgm_fields.dcgm_field_meta_t + self.m_fieldIdToInfo = {} #FieldId => dcgm_fields.dcgm_field_meta_t + self.m_lock = threading.Lock( + ) #DCGM connection start-up/shutdown is not thread safe. Just lock pessimistically + self.m_debug = False + + # For GetAllSinceLastCall* calls. We cache the value for these objects + # after first retrieval, so initializing them to None lets us know if + # we've made a first retrieval. The first retrieval is based on a + # "since" timestamp of 0, so it gets data in which we are not + # interested in. The second retrieval gets data since the first one, in + # which we ARE interested. The practical upshot of this is that actual + # reporting of data is delayed one collectd sampling interval -- as if + # the sampling was actually started one collectd sampling interval + # later. We expect this is not an issue. + self.fvs = None + self.dfvc = None + self.dfvec = None + + ########################################################################### + ''' + Define what should happen to this object at the beginning of a with + block. In this case, nothing more is needed since the constructor should've + been called. + ''' + + def __enter__(self): + return self + + ########################################################################### + ''' + Define the cleanup + ''' + + def __exit__(self, type, value, traceback): + self.Shutdown() + + ########################################################################### + ''' + This function intializes DCGM from the specified directory and connects to + the host engine. + ''' + + def InitWrapped(self, path=None): + dcgm_structs._dcgmInit(libDcgmPath=path) + self.Reconnect() + + ########################################################################### + ''' + This function tries to connect to hostengine and calls initwrapped to initialize + the dcgm. + ''' + + def Init(self, libpath=None): + with self.m_lock: + try: + self.InitWrapped(path=libpath) + except dcgm_structs.dcgmExceptionClass( + dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID): + self.LogError("Can't connect to nv-hostengine. Is it down?") + self.SetDisconnected() + + ########################################################################### + ''' + Delete the DCGM group, DCGM system and DCGM handle and clear the attributes + on shutdown. 
+ ''' + + def SetDisconnected(self): + #Force destructors since DCGM currently doesn't support more than one client connection per process + if self.m_dcgmGroup is not None: + del (self.m_dcgmGroup) + self.m_dcgmGroup = None + if self.m_dcgmSystem is not None: + del (self.m_dcgmSystem) + self.m_dcgmSystem = None + if self.m_dcgmHandle is not None: + del (self.m_dcgmHandle) + self.m_dcgmHandle = None + + ########################################################################## + ''' + This function calls the SetDisconnected function which disconnects from + DCGM and clears DCGM handle and DCGM group. + ''' + + def Shutdown(self): + with self.m_lock: + if self.m_closeHandle == True: + self.SetDisconnected() + + ############################################################################ + ''' + Turns debugging output on + ''' + + def AddDebugOutput(self): + self.m_debug = True + + ############################################################################ + ''' + ''' + + def InitializeFromHandle(self): + self.m_dcgmSystem = self.m_dcgmHandle.GetSystem() + + if not self.m_requestedGpuIds and not self.m_requestedEntities: + self.m_dcgmGroup = self.m_dcgmSystem.GetDefaultGroup() + else: + groupName = "dcgmreader_%d" % os.getpid() + + if self.m_requestedGpuIds: + self.m_dcgmGroup = self.m_dcgmSystem.GetGroupWithGpuIds( + groupName, self.m_requestedGpuIds) + if self.m_requestedEntities: + for entity in self.m_requestedEntities: + self.m_dcgmGroup.AddEntity(entity.entityGroupId, + entity.entityId) + else: + self.m_dcgmGroup = self.m_dcgmSystem.GetGroupWithEntities( + groupName, self.m_requestedEntities) + + self.SetupGpuIdBusMappings() + self.SetupGpuIdUUIdMappings() + self.GetFieldMetadata() + self.AddFieldWatches() + + ############################################################################ + ''' + Has DcgmReader use but not own a handle. Currently for the unit tests. + ''' + + def SetHandle(self, handle): + self.m_dcgmHandle = pydcgm.DcgmHandle(handle) + self.InitializeFromHandle() + + ############################################################################ + ''' + Reconnect function checks if connection handle is present. If the handle is + none, it creates the handle and gets the default DCGM group. It then maps + gpuIds to BusID, set the meta data of the field ids and adds watches to the + field Ids mentioned in the idToWatch list. + ''' + + def Reconnect(self): + if self.m_dcgmHandle is not None: + return + + self.LogDebug("Connection handle is None. Trying to reconnect") + + self.m_dcgmHandle = pydcgm.DcgmHandle( + None, self.m_dcgmHostName, dcgm_structs.DCGM_OPERATION_MODE_AUTO) + self.m_closeHandle = True + + self.LogDebug("Connected to nv-hostengine") + + self.InitializeFromHandle() + + ########################################################################### + ''' + Populate the g_gpuIdToBusId map. This map contains mapping from + gpuID to the BusID. + ''' + + def SetupGpuIdBusMappings(self): + self.m_gpuIdToBusId = {} + + gpuIds = self.m_dcgmGroup.GetGpuIds() + for gpuId in gpuIds: + gpuInfo = self.m_dcgmSystem.discovery.GetGpuAttributes(gpuId) + self.m_gpuIdToBusId[gpuId] = gpuInfo.identifiers.pciBusId + + ########################################################################### + ''' + Add watches to the fields which are passed in init function in idToWatch + list. It also updates the field values for the first time. + ''' + + def AddFieldWatches(self): + maxKeepSamples = 0 #No limit. 
Handled by m_maxKeepAge + for interval, fieldGroup in self.m_fieldGroups.items(): + self.LogDebug("AddWatchFields: interval = " + str(interval) + "\n") + self.m_dcgmGroup.samples.WatchFields(fieldGroup, interval, + self.m_maxKeepAge, + maxKeepSamples) + self.m_dcgmSystem.UpdateAllFields(1) + self.LogDebug("AddWatchFields exit\n") + + ########################################################################### + ''' + If the groupID already exists, we delete that group and create a new fieldgroup with + the fields mentioned in idToWatch. Then information of each field is acquired from its id. + ''' + + def GetFieldMetadata(self): + self.m_fieldIdToInfo = {} + self.m_fieldGroups = {} + self.m_fieldGroup = None + allFieldIds = [] + + # Initialize groups for all field intervals. + self.LogDebug("GetFieldMetaData:\n") + + intervalIndex = 0 + for interval, fieldIds in self.m_publishFields.items(): + self.LogDebug("sampling interval = " + str(interval) + ":\n") + for fieldId in fieldIds: + self.LogDebug(" fieldId: " + str(fieldId) + "\n") + + intervalIndex += 1 + fieldGroupName = self.m_fieldGroupName + "_" + str(intervalIndex) + findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName( + fieldGroupName) + self.LogDebug("fieldGroupName: " + fieldGroupName + "\n") + + # Remove our field group if it exists already + if findByNameId is not None: + self.LogDebug("fieldGroupId: " + findByNameId + "\n") + delFieldGroup = pydcgm.DcgmFieldGroup( + dcgmHandle=self.m_dcgmHandle, fieldGroupId=findByNameId) + delFieldGroup.Delete() + del (delFieldGroup) + + self.m_fieldGroups[interval] = pydcgm.DcgmFieldGroup( + self.m_dcgmHandle, fieldGroupName, fieldIds) + + for fieldId in fieldIds: + if fieldId not in allFieldIds: + allFieldIds += [fieldId] + + self.m_fieldIdToInfo[ + fieldId] = self.m_dcgmSystem.fields.GetFieldById(fieldId) + if self.m_fieldIdToInfo[fieldId] == 0 or self.m_fieldIdToInfo[ + fieldId] == None: + self.LogError( + "Cannot get field tag for field id %d. Please check dcgm_fields to see if it is valid." + % (fieldId)) + raise dcgm_structs.DCGMError( + dcgm_structs.DCGM_ST_UNKNOWN_FIELD) + # Initialize a field group of ALL fields. + fieldGroupName = self.m_fieldGroupName + findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(fieldGroupName) + + # Remove our field group if it exists already + if findByNameId is not None: + delFieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle=self.m_dcgmHandle, + fieldGroupId=findByNameId) + delFieldGroup.Delete() + del (delFieldGroup) + + self.m_fieldGroup = pydcgm.DcgmFieldGroup(self.m_dcgmHandle, + fieldGroupName, allFieldIds) + + ########################################################################### + ''' + This function attempts to connect to DCGM and calls the implemented + CustomDataHandler in the child class with field values. + @params: + self.m_dcgmGroup.samples.GetLatest(self.m_fieldGroup).values : The field + values for each field. This dictionary contains fieldInfo for each field id + requested to be watched. + ''' + + def Process(self): + with self.m_lock: + try: + self.Reconnect() + + # The first call just clears the collection set. 
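+                # When only GPUs were requested, values arrive keyed by gpuId
+                # (self.dfvc) and are handed to CustomDataHandler(); when
+                # explicit entities were requested, values arrive keyed by
+                # entityGroupId -> entityId (self.dfvec) and are handed to
+                # CustomDataHandler_v2().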
+ + if not self.m_requestedEntities: + self.dfvc = self.m_dcgmGroup.samples.GetAllSinceLastCall( + self.dfvc, self.m_fieldGroup) + self.CustomDataHandler(self.dfvc.values) + self.dfvc.EmptyValues() + else: + self.dfvec = self.m_dcgmGroup.samples.GetAllSinceLastCall_v2( + self.dfvec, self.m_fieldGroup) + self.CustomDataHandler_v2(self.dfvec.values) + self.dfvec.EmptyValues() + except dcgm_structs.dcgmExceptionClass( + dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID): + self.LogError("Can't connect to nv-hostengine. Is it down?") + self.SetDisconnected() + + ########################################################################### + def LogInfo(self, msg): + logging.info(msg) + + ########################################################################### + def LogDebug(self, msg): + logging.debug(msg) + + ########################################################################### + def LogError(self, msg): + logging.error(msg) + + ########################################################################### + ''' + This function gets each value as a dictionary of dictionaries. The dictionary + returned is each gpu id mapped to a dictionary of it's field values. Each + field value dictionary is the field name mapped to the value or the field + id mapped to value depending on the parameter mapById. + ''' + + def GetLatestGpuValuesAsDict(self, mapById): + systemDictionary = {} + + with self.m_lock: + try: + self.Reconnect() + fvs = self.m_dcgmGroup.samples.GetLatest( + self.m_fieldGroup).values + for gpuId in list(fvs.keys()): + systemDictionary[gpuId] = { + } # initialize the gpu's dictionary + gpuFv = fvs[gpuId] + + for fieldId in list(gpuFv.keys()): + val = gpuFv[fieldId][-1] + + if val.isBlank: + continue + + if mapById == False: + fieldTag = self.m_fieldIdToInfo[fieldId].tag + systemDictionary[gpuId][ + fieldTag] = val.value if isinstance( + val.value, bytes) else val.value + else: + systemDictionary[gpuId][ + fieldId] = val.value if isinstance( + val.value, bytes) else val.value + except dcgm_structs.dcgmExceptionClass( + dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID): + self.LogError( + "Can't connection to nv-hostengine. Please verify that it is running." + ) + self.SetDisconnected() + + return systemDictionary + + ########################################################################### + ''' + This function gets value as a dictionary of dictionaries of lists. The + dictionary returned is each gpu id mapped to a dictionary of it's field + value lists. Each field value dictionary is the field name mapped to the + list of values or the field id mapped to list of values depending on the + parameter mapById. The list of values are the values for each field since + the last retrieval. 
+ ''' + + def GetAllGpuValuesAsDictSinceLastCall(self, mapById): + systemDictionary = {} + + with self.m_lock: + try: + self.Reconnect() + report = self.fvs is not None + self.fvs = self.m_dcgmGroup.samples.GetAllSinceLastCall( + self.fvs, self.m_fieldGroup) + if report: + for gpuId in list(self.fvs.values.keys()): + systemDictionary[gpuId] = { + } # initialize the gpu's dictionary + gpuFv = self.fvs.values[gpuId] + + for fieldId in list(gpuFv.keys()): + for val in gpuFv[fieldId]: + if val.isBlank: + continue + + if mapById == False: + fieldTag = self.m_fieldIdToInfo[fieldId].tag + if not fieldTag in systemDictionary[gpuId]: + systemDictionary[gpuId][fieldTag] = [] + + systemDictionary[gpuId][fieldTag].append( + val) + else: + if not fieldId in systemDictionary[gpuId]: + systemDictionary[gpuId][fieldId] = [] + systemDictionary[gpuId][fieldId].append(val) + except dcgm_structs.dcgmExceptionClass( + dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID): + self.LogError( + "Can't connection to nv-hostengine. Please verify that it is running." + ) + self.SetDisconnected() + + if self.fvs is not None: + self.fvs.EmptyValues() + + return systemDictionary + + ########################################################################### + def GetLatestGpuValuesAsFieldIdDict(self): + return self.GetLatestGpuValuesAsDict(True) + + ########################################################################### + def GetLatestGpuValuesAsFieldNameDict(self): + return self.GetLatestGpuValuesAsDict(False) + + ########################################################################### + def GetAllGpuValuesAsFieldIdDictSinceLastCall(self): + return self.GetAllGpuValuesAsDictSinceLastCall(True) + + ########################################################################### + def GetAllGpuValuesAsFieldNameDictSinceLastCall(self): + return self.GetAllGpuValuesAsDictSinceLastCall(False) diff --git a/model_analyzer/monitor/dcgm/DcgmStatus.py b/model_analyzer/monitor/dcgm/DcgmStatus.py new file mode 100644 index 000000000..f0a5e3a7d --- /dev/null +++ b/model_analyzer/monitor/dcgm/DcgmStatus.py @@ -0,0 +1,57 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
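As a point of reference, here is a minimal sketch of how the DcgmReader class added above is typically consumed: a client subclasses it, overrides one of the Custom*Handler callbacks, and drives it with Process(). The subclass name is hypothetical and DCGM_FI_DEV_GPU_TEMP is assumed to be exported by dcgm_fields; this is an illustration, not part of the patch.

    import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
    from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader

    class TemperaturePrinter(DcgmReader):
        # Called by CustomDataHandler() for every non-blank value.
        def CustomFieldHandler(self, gpuId, fieldId, fieldTag, val):
            print("GPU %d %s=%s" % (gpuId, fieldTag, val.value))

    # updateFrequency is in microseconds (1000000 == 1 second).
    reader = TemperaturePrinter(fieldIds=[dcgm_fields.DCGM_FI_DEV_GPU_TEMP],
                                updateFrequency=1000000)
    try:
        reader.Process()   # connects lazily via Reconnect() on first use
    finally:
        reader.Shutdown()
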
+ +import model_analyzer.monitor.dcgm.pydcgm as pydcgm +import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs + + +class DcgmStatus: + + def __init__(self): + self.handle = dcgm_agent.dcgmStatusCreate() + self.errors = [] + + def __del__(self): + dcgm_agent.dcgmStatusDestroy(self.handle) + + ''' + Take any errors stored in our handle and update self.errors with them + ''' + + def UpdateErrors(self): + errorCount = dcgm_agent.dcgmStatusGetCount(self.handle) + if errorCount < 1: + return + + for i in range(errorCount): + self.errors.append(dcgm_agent.dcgmStatusPopError(self.handle)) + + ''' + Throw an exception if any errors are stored in our status handle + + The exception text will contain all of the errors + ''' + + def ThrowExceptionOnErrors(self): + #Make sure we've captured all errors before looking at them + self.UpdateErrors() + + if len(self.errors) < 1: + return + + errorString = "Errors: " + for value in self.errors: + errorString += "\"%s\"" % value + raise dcgm_structs.DCGMError(value.status) diff --git a/model_analyzer/monitor/dcgm/DcgmSystem.py b/model_analyzer/monitor/dcgm/DcgmSystem.py new file mode 100644 index 000000000..6df2759f7 --- /dev/null +++ b/model_analyzer/monitor/dcgm/DcgmSystem.py @@ -0,0 +1,412 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import model_analyzer.monitor.dcgm.pydcgm as pydcgm +import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +import ctypes + + +class DcgmSystemDiscovery: + ''' + Constructor + ''' + + def __init__(self, dcgmHandle): + self._dcgmHandle = dcgmHandle + + ''' + Get all IDs of the GPUs that DCGM knows about. To get only GPUs that DCGM support, + use GetAllSupportedGpuIds(). + + Returns an array of GPU IDs. Each of these can be passed to DcgmGroup::AddGpu() + ''' + + def GetAllGpuIds(self): + gpuIds = dcgm_agent.dcgmGetAllDevices(self._dcgmHandle.handle) + return gpuIds + + ''' + Get all of IDs of the GPUs that DCGM supports. This will exclude unsupported + GPUs + + Returns an array of GPU IDs. Each of these can be passed to DcgmGroup::AddGpu() + ''' + + def GetAllSupportedGpuIds(self): + gpuIds = dcgm_agent.dcgmGetAllSupportedDevices(self._dcgmHandle.handle) + return gpuIds + + ''' + Get some basic GPU attributes for a given GPU ID. 
+ + Returns a dcgm_structs.c_dcgmDeviceAttributes_v3() object for the given GPU + ''' + + def GetGpuAttributes(self, gpuId): + return dcgm_agent.dcgmGetDeviceAttributes(self._dcgmHandle.handle, + gpuId) + + ''' + Get topology information for a given GPU ID + + Returns a dcgm_structs.c_dcgmDeviceTopology_v1 structure representing the topology for the given GPU + ''' + + def GetGpuTopology(self, gpuId): + return dcgm_agent.dcgmGetDeviceTopology(self._dcgmHandle.handle, gpuId) + + ''' + Get all entityIds of the entities that DCGM knows about. + + entityGroupId IN: DCGM_FE_? constant of the entity group to fetch the entities of + onlyActive IN: Boolean as to whether to fetch entities that are supported by DCGM (True) + or all entity IDs (False) + + Returns an array of entity IDs. Each of these can be passed to DcgmGroup::AddEntity() + ''' + + def GetEntityGroupEntities(self, entityGroupId, onlySupported): + flags = 0 + if onlySupported: + flags |= dcgm_structs.DCGM_GEGE_FLAG_ONLY_SUPPORTED + entityIds = dcgm_agent.dcgmGetEntityGroupEntities( + self._dcgmHandle.handle, entityGroupId, flags) + return entityIds + + ''' + Get the status of all of the NvLink links in the system. + + Returns a dcgm_structs.c_dcgmNvLinkStatus_v3 object. + ''' + + def GetNvLinkLinkStatus(self): + return dcgm_agent.dcgmGetNvLinkLinkStatus(self._dcgmHandle.handle) + + ''' + From a bitmask of input gpu ids, return a bitmask of numGpus GPUs which identifies the topologically + closest GPUs to use for a single job. DCGM will consider CPU affinities and NVLink connection speeds + to determine the closest. + hintFlags can instruct DCGM to consider GPU health or not. By default, unhealthy GPUs are excluded from + consideration. + ''' + + def SelectGpusByTopology(self, inputGpuIds, numGpus, hintFlags): + return dcgm_agent.dcgmSelectGpusByTopology(self._dcgmHandle.handle, + inputGpuIds, numGpus, + hintFlags) + + +class DcgmSystemIntrospect: + ''' + Class to access the system-wide introspection modules of DCGM + ''' + + def __init__(self, dcgmHandle): + self._handle = dcgmHandle + self.memory = DcgmSystemIntrospectMemory(dcgmHandle) + self.cpuUtil = DcgmSystemIntrospectCpuUtil(dcgmHandle) + + def UpdateAll(self, waitForUpdate=True): + dcgm_agent.dcgmIntrospectUpdateAll(self._handle.handle, waitForUpdate) + + +class DcgmSystemIntrospectMemory: + ''' + Class to access information about the memory usage of DCGM itself + ''' + + def __init__(self, dcgmHandle): + self._dcgmHandle = dcgmHandle + + def GetForHostengine(self, waitIfNoData=True): + ''' + Retrieve the total amount of virtual memory that the hostengine process is currently using. + This measurement represents both the resident set size (what is currently in RAM) and + the swapped memory that belongs to the process. + + waitIfNoData: wait for metadata to be updated if it's not available + + Returns a dcgm_structs.c_dcgmIntrospectMemory_v1 object + Raises an exception for DCGM_ST_NO_DATA if no data is available yet and \ref waitIfNoData is False + ''' + return dcgm_agent.dcgmIntrospectGetHostengineMemoryUsage( + self._dcgmHandle.handle, waitIfNoData) + + +class DcgmSystemIntrospectCpuUtil: + ''' + Class to access information about the CPU Utilization of DCGM + ''' + + def __init__(self, dcgmHandle): + self._dcgmHandle = dcgmHandle + + def GetForHostengine(self, waitIfNoData=True): + ''' + Get the current CPU Utilization of the hostengine process. 
+ + waitIfNoData: wait for metadata to be updated if it's not available + + Returns a dcgm_structs.c_dcgmIntrospectCpuUtil_v1 object + Raises an exception for DCGM_ST_NO_DATA if no data is available yet and \ref waitIfNoData is False + ''' + return dcgm_agent.dcgmIntrospectGetHostengineCpuUtilization( + self._dcgmHandle.handle, waitIfNoData) + + +''' +Class to encapsulate DCGM field-metadata requests +''' + + +class DcgmSystemFields: + + def GetFieldById(self, fieldId): + ''' + Get a field's metadata by its dcgm_fields.DCGM_FI_* field ID + + fieldId: dcgm_fields.DCGM_FI_* field ID of the field + + Returns a dcgm_fields.c_dcgm_field_meta_t struct on success or None on error. + ''' + return dcgm_fields.DcgmFieldGetById(fieldId) + + def GetFieldByTag(self, tag): + ''' + Get a field's metadata by its tag name. Ex: 'brand' + + tag: Tag name of the field + + Returns a dcgm_fields.c_dcgm_field_meta_t struct on success or None on error. + ''' + return dcgm_fields.DcgmFieldGetByTag(tag) + + +''' +Class to encapsulate DCGM module management and introspection +''' + + +class DcgmSystemModules: + ''' + Constructor + ''' + + def __init__(self, dcgmHandle): + self._dcgmHandle = dcgmHandle + + ''' + Denylist a module from being loaded by DCGM. + + moduleId a dcgm_structs.dcgmModuleId* ID of the module to denylist + + Returns: Nothing. + Raises a DCGM_ST_IN_USE exception if the module was already loaded + ''' + + def Denylist(self, moduleId): + dcgm_agent.dcgmModuleDenylist(self._dcgmHandle.handle, moduleId) + + ''' + Get the statuses of all of the modules in DCGM + + Returns: a dcgm_structs.c_dcgmModuleGetStatuses_v1 structure. + ''' + + def GetStatuses(self): + return dcgm_agent.dcgmModuleGetStatuses(self._dcgmHandle.handle) + + +''' +Class to encapsulate DCGM profiling +''' + + +class DcgmSystemProfiling: + ''' + Constructor + ''' + + def __init__(self, dcgmHandle): + self._dcgmHandle = dcgmHandle + + ''' + Pause profiling activities in DCGM. This should be used when you are monitoring profiling fields + from DCGM but want to be able to still run developer tools like nvprof, nsight systems, and nsight compute. + Profiling fields start with DCGM_PROF_ and are in the field ID range 1001-1012. + + Call this API before you launch one of those tools and Resume() after the tool has completed. + + DCGM will save BLANK values while profiling is paused. + Calling this while profiling activities are already paused is fine and will be treated as a no-op. + ''' + + def Pause(self): + return dcgm_agent.dcgmProfPause(self._dcgmHandle.handle) + + ''' + Resume profiling activities in DCGM that were previously paused with Pause(). + + Call this API after you have completed running other NVIDIA developer tools to reenable DCGM + profiling metrics. + + DCGM will save BLANK values while profiling is paused. + + Calling this while profiling activities have already been resumed is fine and will be treated as a no-op. + ''' + + def Resume(self): + return dcgm_agent.dcgmProfResume(self._dcgmHandle.handle) + + +''' +Class to encapsulate global DCGM methods. 
These apply to a single DcgmHandle, provided to the constructor +''' + + +class DcgmSystem: + ''' + Constructor + + dcgmHandle is a pydcgm.DcgmHandle instance of the connection that will be used by all methods of this class + ''' + + def __init__(self, dcgmHandle): + self._dcgmHandle = dcgmHandle + + #Child classes + self.discovery = DcgmSystemDiscovery(self._dcgmHandle) + self.introspect = DcgmSystemIntrospect(self._dcgmHandle) + self.fields = DcgmSystemFields() + self.modules = DcgmSystemModules(self._dcgmHandle) + self.profiling = DcgmSystemProfiling(self._dcgmHandle) + + ''' + Request that the host engine perform a field value update cycle. If the host + engine was starting in DCGM_OPERATION_MODE_MANUAL, calling this method is + the only way that field values will be updated. + + Note that performing a field value update cycle does not update every field. + It only update fields that are newly watched or fields that haven't updated + in enough time to warrant updating again, based on their update frequency. + + waitForUpdate specifies whether this function call should block until the + field value update loop is complete or not. Use True if you intend to query + values immediately after calling this. + ''' + + def UpdateAllFields(self, waitForUpdate): + ret = dcgm_agent.dcgmUpdateAllFields(self._dcgmHandle.handle, + waitForUpdate) + #Throw an exception on error + dcgm_structs._dcgmCheckReturn(ret) + + ''' + Get a DcgmGroup instance for the default all-GPUs group. This object is used to + perform operations on a group of GPUs. See DcgmGroup.py for details. + + AddGpu() and RemoveGpu() operations are not allowed on the default group + ''' + + def GetDefaultGroup(self): + return pydcgm.DcgmGroup(self._dcgmHandle, + groupId=dcgm_structs.DCGM_GROUP_ALL_GPUS) + + ''' + Get an instance of DcgmGroup with no GPUs. Call AddGpu() on the returned + object with GPU IDs from GetAllGpuIds() before performing actions on + the returned DcgmGroup instance. + + groupName is the name of the group to create in the host engine. This name must be + unique. + + Note: The group will be deleted from the host engine when the returned object goes out of scope + ''' + + def GetEmptyGroup(self, groupName): + return pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName) + + ''' + Get an instance of DcgmGroup populated with the gpuIds provided + + groupName is the name of the group to create in the host engine. This name must be + unique. + gpuIds is the list of GPU IDs to add to the group + + Note: The group will be deleted from the host engine when the returned object goes out of scope + ''' + + def GetGroupWithGpuIds(self, groupName, gpuIds): + newGroup = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName) + for gpuId in gpuIds: + newGroup.AddGpu(gpuId) + return newGroup + + ''' + Get an instance of DcgmGroup populated with the provided entities + + groupName is the name of the group to create in the host engine. This name must be + unique. + entities is the list of entity pairs (type and id) to add to the group + + Note: The group will be deleted from the host engine when the returned object goes out of scope + ''' + + def GetGroupWithEntities(self, groupName, entities): + group = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName) + for entity in entities: + group.AddEntity(entity.entityGroupId, entity.entityId) + + return group + + ''' + Get ids of all DcgmGroups of GPUs. This returns a list containing the ids of the DcgmGroups. 
+ ''' + + def GetAllGroupIds(self): + return dcgm_agent.dcgmGroupGetAllIds(self._dcgmHandle.handle) + + ''' + Get all all of the field groups in the system + ''' + + def GetAllFieldGroups(self): + return dcgm_agent.dcgmFieldGroupGetAll(self._dcgmHandle.handle) + + ''' + Get a field group's id by its name. + + Returns: Field group ID if found + None if not found + ''' + + def GetFieldGroupIdByName(self, name): + allGroups = self.GetAllFieldGroups() + for i in range(0, allGroups.numFieldGroups): + if allGroups.fieldGroups[i].fieldGroupName == name: + return ctypes.c_void_p(allGroups.fieldGroups[i].fieldGroupId) + + return None + + def PauseTelemetryForDiag(self): + """Pause DCGM modules from updating field values.""" + import dcgm_agent_internal + dcgm_agent_internal.dcgmPauseTelemetryForDiag(self._dcgmHandle.handle) + + def ResumeTelemetryForDiag(self): + """Resume previously paused DCGM modules so that they can update field values.""" + import dcgm_agent_internal + dcgm_agent_internal.dcgmResumeTelemetryForDiag(self._dcgmHandle.handle) diff --git a/model_analyzer/monitor/dcgm/common/__init__.py b/model_analyzer/monitor/dcgm/common/__init__.py new file mode 100644 index 000000000..236f66016 --- /dev/null +++ b/model_analyzer/monitor/dcgm/common/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py b/model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py new file mode 100644 index 000000000..401dcee05 --- /dev/null +++ b/model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py @@ -0,0 +1,194 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from os import environ +import argparse +import logging +import sys + + +############################################################################### +def create_parser( + publish_port=8000, + interval=10, + name='the monitoring tool', # Replace with 'prometheus', 'telegraf', etc. + field_ids=None, + log_file=None, + log_level='INFO', + dcgm_hostname=environ.get('DCGM_HOSTNAME') or 'localhost', +): + ''' + Create a parser that defaults to sane parameters. + + The default parameters can be overridden through keyword arguments. 
+ + Note: if DCGM_HOSTNAME is set as an environment variable, it is used as + the default instead of localhost + ''' + + parser = argparse.ArgumentParser() + parser.add_argument( + '-p', + '--publish-port', + dest='publish_port', + type=int, + default=publish_port, + help='TCP port that the client should publish to. Default={}.'.format( + publish_port)) + parser.add_argument( + '-i', + '--interval', + dest='interval', + type=int, + default=interval, + help= + 'How often the client should retrieve new values from DCGM in seconds. Default={}.' + .format(interval)) + parser.add_argument( + '-f', + '--field-ids', + dest='field_ids', + type=str, + default=field_ids, + help= + 'Comma-separated list of field IDs that should be retrieved from DCGM. ' + + + 'The full list of available field IDs can be obtained from dcgm_fields.h, dcgm_fields.py, ' + + 'or running \'dcgmi dmon -l\'.') + parser.add_argument( + '--log-file', + dest='logfile', + type=str, + default=log_file, + help= + 'A path to a log file for recording what information is being sent to {}' + .format(name)) + parser.add_argument( + '--log-level', + dest='loglevel', + type=str, + default=log_level, + help= + 'Specify a log level to use for logging.\n\tCRITICAL (0) - log only critical errors that drastically affect execution' + + + '\n\tERROR (1) - Log any error in execution\n\tWARNING (2) - Log all warnings and errors that occur' + + + '\n\tINFO (3) - Log informational messages about program execution in addition to warnings and errors' + + + '\n\tDEBUG (4) - Log debugging information in addition to all information about execution' + + '\nDefault: {}'.format(log_level)) + + group = parser.add_mutually_exclusive_group() + group.add_argument( + '-n', + '--hostname', + dest='hostname', + type=str, + default=dcgm_hostname, + help= + 'IP/hostname where the client should query DCGM for values. Default={} (all interfaces).' + .format(dcgm_hostname)) + group.add_argument( + '-e', + '--embedded', + dest='embedded', + action='store_true', + help= + 'Launch DCGM from within this process instead of connecting to nv-hostengine.' + ) + + return parser + + +def add_custom_argument(parser, *args, **kwargs): + parser.add_argument(*args, **kwargs) + + +############################################################################### +def add_target_host_argument(name, parser, default_target='localhost'): + parser.add_argument( + '-t', + '--publish-hostname', + dest='publish_hostname', + type=str, + default=default_target, + help='The hostname at which the client will publish the readings to {}'. + format(name)) + + +############################################################################### +def run_parser(parser): + ''' + Run a parser created using create_parser + ''' + return parser.parse_args() + + +############################################################################### +def get_field_ids(args): + # This indicates the user supplied a string, so we should override the + # default + if isinstance(args.field_ids, str): + tokens = args.field_ids.split(",") + field_ids = [int(token) for token in tokens] + return field_ids + # The default object should already be an array of ints. 
Just return it + else: + return args.field_ids + + +############################################################################### +def get_log_level(args): + levelStr = args.loglevel.upper() + if levelStr == '0' or levelStr == 'CRITICAL': + numeric_log_level = logging.CRITICAL + elif levelStr == '1' or levelStr == 'ERROR': + numeric_log_level = logging.ERROR + elif levelStr == '2' or levelStr == 'WARNING': + numeric_log_level = logging.WARNING + elif levelStr == '3' or levelStr == 'INFO': + numeric_log_level = logging.INFO + elif levelStr == '4' or levelStr == 'DEBUG': + numeric_log_level = logging.DEBUG + else: + print("Could not understand the specified --log-level '%s'" % + (args.loglevel)) + args.print_help() + sys.exit(2) + return numeric_log_level + + +############################################################################### +def parse_command_line(name, default_port, add_target_host=False): + # Fields we accept raw from the CLI + FIELDS_AS_IS = ['publish_port', 'interval', 'logfile', 'publish_hostname'] + + parser = create_parser( + name=name, + publish_port=default_port, + ) + + if add_target_host: + add_target_host_argument(name, parser) + + args = run_parser(parser) + field_ids = get_field_ids(args) + log_level = get_log_level(args) + + args_as_dict = vars(args) + settings = {i: args_as_dict[i] for i in FIELDS_AS_IS} + settings['dcgm_hostname'] = None if args.embedded else args.hostname + settings['field_ids'] = field_ids + settings['log_level'] = log_level + + return settings diff --git a/model_analyzer/monitor/dcgm/common/dcgm_client_main.py b/model_analyzer/monitor/dcgm/common/dcgm_client_main.py new file mode 100644 index 000000000..54cd04673 --- /dev/null +++ b/model_analyzer/monitor/dcgm/common/dcgm_client_main.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from time import sleep +from . import dcgm_client_cli_parser as cli +import signal + + +############################################################################### +def exit_handler(signum, frame): + # The Prometheus client does something smarter but more complex + # Here we just exit + exit() + + +############################################################################### +def initialize_signal_handlers(): + signal.signal(signal.SIGINT, exit_handler) + signal.signal(signal.SIGTERM, exit_handler) + + +############################################################################### +def main(DRConstructor, name, default_port, add_target_host=False): + ''' + This main function should work for most DCGM clients. It creates a + DcgmReader object using DRConstructor and enters a loop that queries DCGM + for data + + Arguments + --------- + DRConstructor: A constructor for a DcgmReader. 
The constructor must + accept the following keyword arguments: + - hostname: DCGM hostname + - publish_port: port on which the data is published + In some cases, the constructor will also need to accept: + - publish_hostname: hostname the data is published to + - field_ids: field ids to query and publish + name: The name of the client. This is displayed to the user + default_port: Default port to publish to + + Keyword arguments + ----------------- + add_target_host: Boolean that indicates whether this client accepts a + publish hostname + + ''' + + initialize_signal_handlers() + settings = cli.parse_command_line( + name, + default_port, + add_target_host=add_target_host, + ) + + # Create a dictionary for the arguments because field_ids might not be + # provided (if it's None) when we want to use the default in DcgmReader + dr_args = { + 'hostname': settings['dcgm_hostname'], + 'publish_port': settings['publish_port'], + } + + # publish_hostname is only available if we add the target_host parameter + if add_target_host: + dr_args['publish_hostname'] = settings['publish_hostname'] + + if settings['field_ids']: + dr_args['fieldIds'] = settings['field_ids'] + + dr = DRConstructor(**dr_args) + + try: + while True: + dr.Process() + sleep(settings['interval']) + except KeyboardInterrupt: + print('Caught CTRL-C. Exiting') diff --git a/model_analyzer/monitor/dcgm/dcgm_agent.py b/model_analyzer/monitor/dcgm/dcgm_agent.py index 809b57f66..320db76d2 100755 --- a/model_analyzer/monitor/dcgm/dcgm_agent.py +++ b/model_analyzer/monitor/dcgm/dcgm_agent.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 - -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,31 +11,61 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
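The dcgm_agent.py changes below decorate every binding with ensure_byte_strings(), which re-encodes any str argument as UTF-8 bytes before the underlying ctypes call is made. A standalone sketch of that conversion, using a simplified stand-in decorator named to_bytes_demo (illustrative only, not the patch's implementation):

    import functools

    def to_bytes_demo(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Re-encode str positional and keyword arguments as UTF-8 bytes,
            # mirroring what ensure_byte_strings() does for the DCGM bindings.
            args = tuple(bytes(a, 'utf-8') if isinstance(a, str) else a
                         for a in args)
            kwargs = {k: bytes(v, 'utf-8') if isinstance(v, str) else v
                      for k, v in kwargs.items()}
            return fn(*args, **kwargs)
        return wrapper

    @to_bytes_demo
    def show(group_name):
        return group_name

    assert show("mygroup") == b"mygroup"  # str arguments arrive as bytes
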
+## +# Python bindings for the internal API of DCGM library (dcgm_agent.h) +## -from ctypes import ( - CFUNCTYPE, - POINTER, - byref, - c_double, - c_int, - c_int32, - c_int64, - c_uint, - c_uint16, - c_uint32, - c_uint64, - c_void_p, - py_object, -) - -import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +from ctypes import * +import functools + + +def ensure_byte_strings(): + """ + Ensures that we don't call C APIs with unicode strings in the arguments + every unicode args gets converted to UTF-8 before the function is called + """ + + def convert_result_from_bytes(result): + if isinstance(result, bytes): + return result.decode('utf-8') + if isinstance(result, list): + return list(map(convert_result_from_bytes, result)) + if isinstance(result, tuple): + return tuple(map(convert_result_from_bytes, result)) + return result + + def decorator(fn): + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + newargs = [] + newkwargs = {} + for arg in args: + if isinstance(arg, str): + newargs.append(bytes(arg, 'utf-8')) + else: + newargs.append(arg) + for k, v in kwargs.items(): + if isinstance(v, str): + newkwargs[k] = bytes(v, 'utf-8') + else: + newkwargs[k] = v + newargs = tuple(newargs) + return fn(*newargs, **newkwargs) + + return wrapper + + return decorator + # Provides access to functions from dcgm_agent_internal dcgmFP = dcgm_structs._dcgmGetFunctionPointer # This method is used to initialize DCGM +@ensure_byte_strings() def dcgmInit(): dcgm_handle = c_void_p() fn = dcgmFP("dcgmInit") @@ -47,6 +75,7 @@ def dcgmInit(): # This method is used to shutdown DCGM Engine +@ensure_byte_strings() def dcgmShutdown(): fn = dcgmFP("dcgmShutdown") ret = fn() @@ -54,6 +83,7 @@ def dcgmShutdown(): return ret +@ensure_byte_strings() def dcgmStartEmbedded(opMode): dcgm_handle = c_void_p() fn = dcgmFP("dcgmStartEmbedded") @@ -62,6 +92,7 @@ def dcgmStartEmbedded(opMode): return dcgm_handle +@ensure_byte_strings() def dcgmStopEmbedded(dcgm_handle): fn = dcgmFP("dcgmStopEmbedded") ret = fn(dcgm_handle) @@ -69,6 +100,7 @@ def dcgmStopEmbedded(dcgm_handle): return ret +@ensure_byte_strings() def dcgmConnect(ip_address): dcgm_handle = c_void_p() fn = dcgmFP("dcgmConnect") @@ -77,9 +109,10 @@ def dcgmConnect(ip_address): return dcgm_handle -def dcgmConnect_v2( - ip_address, connectParams, version=dcgm_structs.c_dcgmConnectV2Params_version -): +@ensure_byte_strings() +def dcgmConnect_v2(ip_address, + connectParams, + version=dcgm_structs.c_dcgmConnectV2Params_version): connectParams.version = version dcgm_handle = c_void_p() fn = dcgmFP("dcgmConnect_v2") @@ -88,6 +121,7 @@ def dcgmConnect_v2( return dcgm_handle +@ensure_byte_strings() def dcgmDisconnect(dcgm_handle): fn = dcgmFP("dcgmDisconnect") ret = fn(dcgm_handle) @@ -95,6 +129,7 @@ def dcgmDisconnect(dcgm_handle): return ret +@ensure_byte_strings() def dcgmGetAllSupportedDevices(dcgm_handle): c_count = c_uint() gpuid_list = c_uint * dcgm_structs.DCGM_MAX_NUM_DEVICES @@ -102,9 +137,10 @@ def dcgmGetAllSupportedDevices(dcgm_handle): fn = dcgmFP("dcgmGetAllSupportedDevices") ret = fn(dcgm_handle, c_gpuid_list, byref(c_count)) dcgm_structs._dcgmCheckReturn(ret) - return [c_gpuid_list[i] for i in range(c_count.value)[0 : int(c_count.value)]] + return list(c_gpuid_list[0:int(c_count.value)]) +@ensure_byte_strings() def dcgmGetAllDevices(dcgm_handle): c_count = c_uint() gpuid_list = c_uint * dcgm_structs.DCGM_MAX_NUM_DEVICES @@ 
-112,18 +148,26 @@ def dcgmGetAllDevices(dcgm_handle): fn = dcgmFP("dcgmGetAllDevices") ret = fn(dcgm_handle, c_gpuid_list, byref(c_count)) dcgm_structs._dcgmCheckReturn(ret) - return [c_gpuid_list[i] for i in range(c_count.value)[0 : int(c_count.value)]] + return list(c_gpuid_list[0:int(c_count.value)]) -def dcgmGetDeviceAttributes(dcgm_handle, gpuId): +@ensure_byte_strings() +def dcgmGetDeviceAttributes(dcgm_handle, + gpuId, + version=dcgm_structs.dcgmDeviceAttributes_version3): fn = dcgmFP("dcgmGetDeviceAttributes") - device_values = dcgm_structs.c_dcgmDeviceAttributes_v2() - device_values.version = dcgm_structs.dcgmDeviceAttributes_version2 + if version == dcgm_structs.dcgmDeviceAttributes_version3: + device_values = dcgm_structs.c_dcgmDeviceAttributes_v3() + device_values.version = dcgm_structs.dcgmDeviceAttributes_version3 + else: + dcgm_structs._dcgmCheckReturn(dcgm_structs.DCGM_ST_VER_MISMATCH) + ret = fn(dcgm_handle, c_int(gpuId), byref(device_values)) dcgm_structs._dcgmCheckReturn(ret) return device_values +@ensure_byte_strings() def dcgmGetEntityGroupEntities(dcgm_handle, entityGroup, flags): capacity = dcgm_structs.DCGM_GROUP_MAX_ENTITIES c_count = c_int32(capacity) @@ -135,24 +179,27 @@ def dcgmGetEntityGroupEntities(dcgm_handle, entityGroup, flags): return c_entityIds[0 : int(c_count.value)] +@ensure_byte_strings() def dcgmGetNvLinkLinkStatus(dcgm_handle): - linkStatus = dcgm_structs.c_dcgmNvLinkStatus_v2() - linkStatus.version = dcgm_structs.dcgmNvLinkStatus_version2 + linkStatus = dcgm_structs.c_dcgmNvLinkStatus_v3() + linkStatus.version = dcgm_structs.dcgmNvLinkStatus_version3 fn = dcgmFP("dcgmGetNvLinkLinkStatus") ret = fn(dcgm_handle, byref(linkStatus)) dcgm_structs._dcgmCheckReturn(ret) return linkStatus +@ensure_byte_strings() def dcgmGetGpuInstanceHierarchy(dcgm_handle): - hierarchy = dcgm_structs.c_dcgmMigHierarchy_v1() - hierarchy.version = dcgm_structs.c_dcgmMigHierarchy_version1 + hierarchy = dcgm_structs.c_dcgmMigHierarchy_v2() + hierarchy.version = dcgm_structs.c_dcgmMigHierarchy_version2 fn = dcgmFP("dcgmGetGpuInstanceHierarchy") ret = fn(dcgm_handle, byref(hierarchy)) dcgm_structs._dcgmCheckReturn(ret) return hierarchy +@ensure_byte_strings() def dcgmCreateMigEntity(dcgm_handle, parentId, profile, createOption, flags): fn = dcgmFP("dcgmCreateMigEntity") cme = dcgm_structs.c_dcgmCreateMigEntity_v1() @@ -165,6 +212,7 @@ def dcgmCreateMigEntity(dcgm_handle, parentId, profile, createOption, flags): dcgm_structs._dcgmCheckReturn(ret) +@ensure_byte_strings() def dcgmDeleteMigEntity(dcgm_handle, entityGroupId, entityId, flags): fn = dcgmFP("dcgmDeleteMigEntity") dme = dcgm_structs.c_dcgmDeleteMigEntity_v1() @@ -176,6 +224,7 @@ def dcgmDeleteMigEntity(dcgm_handle, entityGroupId, entityId, flags): dcgm_structs._dcgmCheckReturn(ret) +@ensure_byte_strings() def dcgmGroupCreate(dcgm_handle, type, groupName): c_group_id = c_void_p() fn = dcgmFP("dcgmGroupCreate") @@ -184,6 +233,7 @@ def dcgmGroupCreate(dcgm_handle, type, groupName): return c_group_id +@ensure_byte_strings() def dcgmGroupDestroy(dcgm_handle, group_id): fn = dcgmFP("dcgmGroupDestroy") ret = fn(dcgm_handle, group_id) @@ -191,6 +241,7 @@ def dcgmGroupDestroy(dcgm_handle, group_id): return ret +@ensure_byte_strings() def dcgmGroupAddDevice(dcgm_handle, group_id, gpu_id): fn = dcgmFP("dcgmGroupAddDevice") ret = fn(dcgm_handle, group_id, gpu_id) @@ -198,6 +249,7 @@ def dcgmGroupAddDevice(dcgm_handle, group_id, gpu_id): return ret +@ensure_byte_strings() def dcgmGroupAddEntity(dcgm_handle, group_id, entityGroupId, 
entityId): fn = dcgmFP("dcgmGroupAddEntity") ret = fn(dcgm_handle, group_id, entityGroupId, entityId) @@ -205,6 +257,7 @@ def dcgmGroupAddEntity(dcgm_handle, group_id, entityGroupId, entityId): return ret +@ensure_byte_strings() def dcgmGroupRemoveDevice(dcgm_handle, group_id, gpu_id): fn = dcgmFP("dcgmGroupRemoveDevice") ret = fn(dcgm_handle, group_id, gpu_id) @@ -212,6 +265,7 @@ def dcgmGroupRemoveDevice(dcgm_handle, group_id, gpu_id): return ret +@ensure_byte_strings() def dcgmGroupRemoveEntity(dcgm_handle, group_id, entityGroupId, entityId): fn = dcgmFP("dcgmGroupRemoveEntity") ret = fn(dcgm_handle, group_id, entityGroupId, entityId) @@ -219,12 +273,13 @@ def dcgmGroupRemoveEntity(dcgm_handle, group_id, entityGroupId, entityId): return ret -def dcgmGroupGetInfo( - dcgm_handle, group_id, version=dcgm_structs.c_dcgmGroupInfo_version2 -): +@ensure_byte_strings() +def dcgmGroupGetInfo(dcgm_handle, + group_id, + version=dcgm_structs.c_dcgmGroupInfo_version2): fn = dcgmFP("dcgmGroupGetInfo") - # support the old version of the request since the host engine does + #support the old version of the request since the host engine does if version == dcgm_structs.c_dcgmGroupInfo_version2: device_values = dcgm_structs.c_dcgmGroupInfo_v2() device_values.version = dcgm_structs.c_dcgmGroupInfo_version2 @@ -236,6 +291,7 @@ def dcgmGroupGetInfo( return device_values +@ensure_byte_strings() def dcgmGroupGetAllIds(dcgmHandle): fn = dcgmFP("dcgmGroupGetAllIds") c_count = c_uint() @@ -243,25 +299,22 @@ def dcgmGroupGetAllIds(dcgmHandle): c_groupIdList = groupIdList() ret = fn(dcgmHandle, c_groupIdList, byref(c_count)) dcgm_structs._dcgmCheckReturn(ret) - return map(None, c_groupIdList[0 : int(c_count.value)]) + return list(c_groupIdList[0:int(c_count.value)]) +@ensure_byte_strings() def dcgmFieldGroupCreate(dcgm_handle, fieldIds, fieldGroupName): c_field_group_id = c_void_p() c_num_field_ids = c_int32(len(fieldIds)) c_field_ids = (c_uint16 * len(fieldIds))(*fieldIds) fn = dcgmFP("dcgmFieldGroupCreate") - ret = fn( - dcgm_handle, - c_num_field_ids, - byref(c_field_ids), - fieldGroupName, - byref(c_field_group_id), - ) + ret = fn(dcgm_handle, c_num_field_ids, byref(c_field_ids), fieldGroupName, + byref(c_field_group_id)) dcgm_structs._dcgmCheckReturn(ret) return c_field_group_id +@ensure_byte_strings() def dcgmFieldGroupDestroy(dcgm_handle, fieldGroupId): fn = dcgmFP("dcgmFieldGroupDestroy") ret = fn(dcgm_handle, fieldGroupId) @@ -269,6 +322,7 @@ def dcgmFieldGroupDestroy(dcgm_handle, fieldGroupId): return ret +@ensure_byte_strings() def dcgmFieldGroupGetInfo(dcgm_handle, fieldGroupId): c_fieldGroupInfo = dcgm_structs.c_dcgmFieldGroupInfo_v1() c_fieldGroupInfo.version = dcgm_structs.dcgmFieldGroupInfo_version1 @@ -279,6 +333,7 @@ def dcgmFieldGroupGetInfo(dcgm_handle, fieldGroupId): return c_fieldGroupInfo +@ensure_byte_strings() def dcgmFieldGroupGetAll(dcgm_handle): c_allGroupInfo = dcgm_structs.c_dcgmAllFieldGroup_v1() c_allGroupInfo.version = dcgm_structs.dcgmAllFieldGroup_version1 @@ -288,6 +343,7 @@ def dcgmFieldGroupGetAll(dcgm_handle): return c_allGroupInfo +@ensure_byte_strings() def dcgmStatusCreate(): c_status_handle = c_void_p() fn = dcgmFP("dcgmStatusCreate") @@ -296,6 +352,7 @@ def dcgmStatusCreate(): return c_status_handle +@ensure_byte_strings() def dcgmStatusDestroy(status_handle): fn = dcgmFP("dcgmStatusDestroy") ret = fn(status_handle) @@ -303,6 +360,7 @@ def dcgmStatusDestroy(status_handle): return ret +@ensure_byte_strings() def dcgmStatusGetCount(status_handle): c_count = c_uint() fn = 
dcgmFP("dcgmStatusGetCount") @@ -311,6 +369,7 @@ def dcgmStatusGetCount(status_handle): return c_count.value +@ensure_byte_strings() def dcgmStatusPopError(status_handle): c_errorInfo = dcgm_structs.c_dcgmErrorInfo_v1() fn = dcgmFP("dcgmStatusPopError") @@ -321,6 +380,7 @@ def dcgmStatusPopError(status_handle): return None +@ensure_byte_strings() def dcgmStatusClear(status_handle): fn = dcgmFP("dcgmStatusClear") ret = fn(status_handle) @@ -328,6 +388,7 @@ def dcgmStatusClear(status_handle): return ret +@ensure_byte_strings() def dcgmConfigSet(dcgm_handle, group_id, configToSet, status_handle): fn = dcgmFP("dcgmConfigSet") configToSet.version = dcgm_structs.dcgmDeviceConfig_version1 @@ -336,6 +397,7 @@ def dcgmConfigSet(dcgm_handle, group_id, configToSet, status_handle): return ret +@ensure_byte_strings() def dcgmConfigGet(dcgm_handle, group_id, reqCfgType, count, status_handle): fn = dcgmFP("dcgmConfigGet") @@ -345,11 +407,13 @@ def dcgmConfigGet(dcgm_handle, group_id, reqCfgType, count, status_handle): for index in range(0, count): c_config_values[index].version = dcgm_structs.dcgmDeviceConfig_version1 - ret = fn(dcgm_handle, group_id, reqCfgType, count, c_config_values, status_handle) + ret = fn(dcgm_handle, group_id, reqCfgType, count, c_config_values, + status_handle) dcgm_structs._dcgmCheckReturn(ret) - return map(None, c_config_values[0:count]) + return list(c_config_values[0:count]) +@ensure_byte_strings() def dcgmConfigEnforce(dcgm_handle, group_id, status_handle): fn = dcgmFP("dcgmConfigEnforce") ret = fn(dcgm_handle, group_id, status_handle) @@ -358,6 +422,7 @@ def dcgmConfigEnforce(dcgm_handle, group_id, status_handle): # This method is used to tell the cache manager to update all fields +@ensure_byte_strings() def dcgmUpdateAllFields(dcgm_handle, waitForUpdate): fn = dcgmFP("dcgmUpdateAllFields") ret = fn(dcgm_handle, c_int(waitForUpdate)) @@ -366,6 +431,7 @@ def dcgmUpdateAllFields(dcgm_handle, waitForUpdate): # This method is used to get the policy information +@ensure_byte_strings() def dcgmPolicyGet(dcgm_handle, group_id, count, status_handle): fn = dcgmFP("dcgmPolicyGet") policy_array = count * dcgm_structs.c_dcgmPolicy_v1 @@ -381,6 +447,7 @@ def dcgmPolicyGet(dcgm_handle, group_id, count, status_handle): # This method is used to set the policy information +@ensure_byte_strings() def dcgmPolicySet(dcgm_handle, group_id, policy, status_handle): fn = dcgmFP("dcgmPolicySet") ret = fn(dcgm_handle, group_id, byref(policy), status_handle) @@ -388,56 +455,38 @@ def dcgmPolicySet(dcgm_handle, group_id, policy, status_handle): return ret -# First parameter below is the return type +#First parameter below is the return type dcgmFieldValueEnumeration_f = CFUNCTYPE( - c_int32, c_uint32, POINTER(dcgm_structs.c_dcgmFieldValue_v1), c_int32, c_void_p -) + c_int32, c_uint32, POINTER(dcgm_structs.c_dcgmFieldValue_v1), c_int32, + c_void_p) dcgmFieldValueEntityEnumeration_f = CFUNCTYPE( - c_int32, - c_uint32, - c_uint32, - POINTER(dcgm_structs.c_dcgmFieldValue_v1), - c_int32, - c_void_p, -) - - -def dcgmGetValuesSince( - dcgm_handle, groupId, fieldGroupId, sinceTimestamp, enumCB, userData -): + c_int32, c_uint32, c_uint32, POINTER(dcgm_structs.c_dcgmFieldValue_v1), + c_int32, c_void_p) + + +@ensure_byte_strings() +def dcgmGetValuesSince(dcgm_handle, groupId, fieldGroupId, sinceTimestamp, + enumCB, userData): fn = dcgmFP("dcgmGetValuesSince") c_nextSinceTimestamp = c_int64() - ret = fn( - dcgm_handle, - groupId, - fieldGroupId, - c_int64(sinceTimestamp), - byref(c_nextSinceTimestamp), - 
enumCB, - py_object(userData), - ) + ret = fn(dcgm_handle, groupId, fieldGroupId, c_int64(sinceTimestamp), + byref(c_nextSinceTimestamp), enumCB, py_object(userData)) dcgm_structs._dcgmCheckReturn(ret) return c_nextSinceTimestamp.value -def dcgmGetValuesSince_v2( - dcgm_handle, groupId, fieldGroupId, sinceTimestamp, enumCB, userData -): +@ensure_byte_strings() +def dcgmGetValuesSince_v2(dcgm_handle, groupId, fieldGroupId, sinceTimestamp, + enumCB, userData): fn = dcgmFP("dcgmGetValuesSince_v2") c_nextSinceTimestamp = c_int64() - ret = fn( - dcgm_handle, - groupId, - fieldGroupId, - c_int64(sinceTimestamp), - byref(c_nextSinceTimestamp), - enumCB, - py_object(userData), - ) + ret = fn(dcgm_handle, groupId, fieldGroupId, c_int64(sinceTimestamp), + byref(c_nextSinceTimestamp), enumCB, py_object(userData)) dcgm_structs._dcgmCheckReturn(ret) return c_nextSinceTimestamp.value +@ensure_byte_strings() def dcgmGetLatestValues(dcgm_handle, groupId, fieldGroupId, enumCB, userData): fn = dcgmFP("dcgmGetLatestValues") ret = fn(dcgm_handle, groupId, fieldGroupId, enumCB, py_object(userData)) @@ -445,29 +494,26 @@ def dcgmGetLatestValues(dcgm_handle, groupId, fieldGroupId, enumCB, userData): return ret -def dcgmGetLatestValues_v2(dcgm_handle, groupId, fieldGroupId, enumCB, userData): +@ensure_byte_strings() +def dcgmGetLatestValues_v2(dcgm_handle, groupId, fieldGroupId, enumCB, + userData): fn = dcgmFP("dcgmGetLatestValues_v2") ret = fn(dcgm_handle, groupId, fieldGroupId, enumCB, py_object(userData)) dcgm_structs._dcgmCheckReturn(ret) return ret -def dcgmWatchFields( - dcgm_handle, groupId, fieldGroupId, updateFreq, maxKeepAge, maxKeepSamples -): +@ensure_byte_strings() +def dcgmWatchFields(dcgm_handle, groupId, fieldGroupId, updateFreq, maxKeepAge, + maxKeepSamples): fn = dcgmFP("dcgmWatchFields") - ret = fn( - dcgm_handle, - groupId, - fieldGroupId, - c_int64(updateFreq), - c_double(maxKeepAge), - c_int32(maxKeepSamples), - ) + ret = fn(dcgm_handle, groupId, fieldGroupId, c_int64(updateFreq), + c_double(maxKeepAge), c_int32(maxKeepSamples)) dcgm_structs._dcgmCheckReturn(ret) return ret +@ensure_byte_strings() def dcgmUnwatchFields(dcgm_handle, groupId, fieldGroupId): fn = dcgmFP("dcgmUnwatchFields") ret = fn(dcgm_handle, groupId, fieldGroupId) @@ -475,6 +521,7 @@ def dcgmUnwatchFields(dcgm_handle, groupId, fieldGroupId): return ret +@ensure_byte_strings() def dcgmHealthSet(dcgm_handle, groupId, systems): fn = dcgmFP("dcgmHealthSet") ret = fn(dcgm_handle, groupId, systems) @@ -482,6 +529,7 @@ def dcgmHealthSet(dcgm_handle, groupId, systems): return ret +@ensure_byte_strings() def dcgmHealthSet_v2(dcgm_handle, groupId, systems, updateInterval, maxKeepAge): params = dcgm_structs.c_dcgmHealthSetParams_v2() params.version = dcgm_structs.dcgmHealthSetParams_version2 @@ -496,6 +544,7 @@ def dcgmHealthSet_v2(dcgm_handle, groupId, systems, updateInterval, maxKeepAge): return ret +@ensure_byte_strings() def dcgmHealthGet(dcgm_handle, groupId): c_systems = c_int32() fn = dcgmFP("dcgmHealthGet") @@ -504,9 +553,10 @@ def dcgmHealthGet(dcgm_handle, groupId): return c_systems.value -def dcgmHealthCheck( - dcgm_handle, groupId, version=dcgm_structs.dcgmHealthResponse_version4 -): +@ensure_byte_strings() +def dcgmHealthCheck(dcgm_handle, + groupId, + version=dcgm_structs.dcgmHealthResponse_version4): if version != dcgm_structs.dcgmHealthResponse_version4: dcgm_structs._dcgmCheckReturn(dcgm_structs.DCGM_ST_VER_MISMATCH) @@ -518,13 +568,16 @@ def dcgmHealthCheck( return c_results -def dcgmPolicyRegister(dcgm_handle, 
groupId, condition, beginCallback, finishCallback): +@ensure_byte_strings() +def dcgmPolicyRegister(dcgm_handle, groupId, condition, beginCallback, + finishCallback): fn = dcgmFP("dcgmPolicyRegister") ret = fn(dcgm_handle, groupId, condition, beginCallback, finishCallback) dcgm_structs._dcgmCheckReturn(ret) return ret +@ensure_byte_strings() def dcgmPolicyUnregister(dcgm_handle, groupId, condition): fn = dcgmFP("dcgmPolicyUnregister") ret = fn(dcgm_handle, groupId, condition) @@ -532,6 +585,7 @@ def dcgmPolicyUnregister(dcgm_handle, groupId, condition): return ret +@ensure_byte_strings() def dcgmPolicyTrigger(dcgm_handle): fn = dcgmFP("dcgmPolicyTrigger") ret = fn(dcgm_handle) @@ -549,32 +603,34 @@ def helperDiagCheckReturn(ret, response): info = "%s" % response.systemError.msg e.SetAdditionalInfo(info) - raise e # pylint: disable=E0710 + raise e else: raise return response -def dcgmActionValidate_v2( - dcgm_handle, runDiagInfo, runDiagVersion=dcgm_structs.dcgmRunDiag_version6 -): - response = dcgm_structs.c_dcgmDiagResponse_v6() +@ensure_byte_strings() +def dcgmActionValidate_v2(dcgm_handle, + runDiagInfo, + runDiagVersion=dcgm_structs.dcgmRunDiag_version7): + response = dcgm_structs.c_dcgmDiagResponse_v8() runDiagInfo.version = runDiagVersion - response.version = dcgm_structs.dcgmDiagResponse_version6 + response.version = dcgm_structs.dcgmDiagResponse_version8 fn = dcgmFP("dcgmActionValidate_v2") ret = fn(dcgm_handle, byref(runDiagInfo), byref(response)) return helperDiagCheckReturn(ret, response) +@ensure_byte_strings() def dcgmActionValidate(dcgm_handle, group_id, validate): - response = dcgm_structs.c_dcgmDiagResponse_v6() - response.version = dcgm_structs.dcgmDiagResponse_version6 + response = dcgm_structs.c_dcgmDiagResponse_v8() + response.version = dcgm_structs.dcgmDiagResponse_version8 # Put the group_id and validate into a dcgmRunDiag struct - runDiagInfo = dcgm_structs.c_dcgmRunDiag_v6() - runDiagInfo.version = dcgm_structs.dcgmRunDiag_version6 + runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7() + runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7 runDiagInfo.validate = validate runDiagInfo.groupId = group_id @@ -584,28 +640,27 @@ def dcgmActionValidate(dcgm_handle, group_id, validate): return helperDiagCheckReturn(ret, response) +@ensure_byte_strings() def dcgmRunDiagnostic(dcgm_handle, group_id, diagLevel): - response = dcgm_structs.c_dcgmDiagResponse_v6() - response.version = dcgm_structs.dcgmDiagResponse_version6 + response = dcgm_structs.c_dcgmDiagResponse_v8() + response.version = dcgm_structs.dcgmDiagResponse_version8 fn = dcgmFP("dcgmRunDiagnostic") ret = fn(dcgm_handle, group_id, diagLevel, byref(response)) return helperDiagCheckReturn(ret, response) -def dcgmWatchPidFields(dcgm_handle, groupId, updateFreq, maxKeepAge, maxKeepSamples): +@ensure_byte_strings() +def dcgmWatchPidFields(dcgm_handle, groupId, updateFreq, maxKeepAge, + maxKeepSamples): fn = dcgmFP("dcgmWatchPidFields") - ret = fn( - dcgm_handle, - groupId, - c_int64(updateFreq), - c_double(maxKeepAge), - c_int32(maxKeepSamples), - ) + ret = fn(dcgm_handle, groupId, c_int64(updateFreq), c_double(maxKeepAge), + c_int32(maxKeepSamples)) dcgm_structs._dcgmCheckReturn(ret) return ret +@ensure_byte_strings() def dcgmGetPidInfo(dcgm_handle, groupId, pid): fn = dcgmFP("dcgmGetPidInfo") pidInfo = dcgm_structs.c_dcgmPidInfo_v2() @@ -618,6 +673,7 @@ def dcgmGetPidInfo(dcgm_handle, groupId, pid): return pidInfo +@ensure_byte_strings() def dcgmGetDeviceTopology(dcgm_handle, gpuId): devtopo = 
dcgm_structs.c_dcgmDeviceTopology_v1() fn = dcgmFP("dcgmGetDeviceTopology") @@ -626,6 +682,7 @@ def dcgmGetDeviceTopology(dcgm_handle, gpuId): return devtopo +@ensure_byte_strings() def dcgmGetGroupTopology(dcgm_handle, groupId): grouptopo = dcgm_structs.c_dcgmGroupTopology_v1() fn = dcgmFP("dcgmGetGroupTopology") @@ -634,19 +691,17 @@ def dcgmGetGroupTopology(dcgm_handle, groupId): return grouptopo -def dcgmWatchJobFields(dcgm_handle, groupId, updateFreq, maxKeepAge, maxKeepSamples): +@ensure_byte_strings() +def dcgmWatchJobFields(dcgm_handle, groupId, updateFreq, maxKeepAge, + maxKeepSamples): fn = dcgmFP("dcgmWatchJobFields") - ret = fn( - dcgm_handle, - groupId, - c_int64(updateFreq), - c_double(maxKeepAge), - c_int32(maxKeepSamples), - ) + ret = fn(dcgm_handle, groupId, c_int64(updateFreq), c_double(maxKeepAge), + c_int32(maxKeepSamples)) dcgm_structs._dcgmCheckReturn(ret) return ret +@ensure_byte_strings() def dcgmJobStartStats(dcgm_handle, groupId, jobid): fn = dcgmFP("dcgmJobStartStats") ret = fn(dcgm_handle, groupId, jobid) @@ -654,6 +709,7 @@ def dcgmJobStartStats(dcgm_handle, groupId, jobid): return ret +@ensure_byte_strings() def dcgmJobStopStats(dcgm_handle, jobid): fn = dcgmFP("dcgmJobStopStats") ret = fn(dcgm_handle, jobid) @@ -661,6 +717,7 @@ def dcgmJobStopStats(dcgm_handle, jobid): return ret +@ensure_byte_strings() def dcgmJobGetStats(dcgm_handle, jobid): fn = dcgmFP("dcgmJobGetStats") jobInfo = dcgm_structs.c_dcgmJobInfo_v3() @@ -672,6 +729,7 @@ def dcgmJobGetStats(dcgm_handle, jobid): return jobInfo +@ensure_byte_strings() def dcgmJobRemove(dcgm_handle, jobid): fn = dcgmFP("dcgmJobRemove") ret = fn(dcgm_handle, jobid) @@ -679,6 +737,7 @@ def dcgmJobRemove(dcgm_handle, jobid): return ret +@ensure_byte_strings() def dcgmJobRemoveAll(dcgm_handle): fn = dcgmFP("dcgmJobRemoveAll") ret = fn(dcgm_handle) @@ -686,13 +745,7 @@ def dcgmJobRemoveAll(dcgm_handle): return ret -def dcgmIntrospectToggleState(dcgm_handle, enabledState): - fn = dcgmFP("dcgmIntrospectToggleState") - ret = fn(dcgm_handle, enabledState) - dcgm_structs._dcgmCheckReturn(ret) - return ret - - +@ensure_byte_strings() def dcgmIntrospectGetHostengineMemoryUsage(dcgm_handle, waitIfNoData=True): fn = dcgmFP("dcgmIntrospectGetHostengineMemoryUsage") @@ -704,6 +757,7 @@ def dcgmIntrospectGetHostengineMemoryUsage(dcgm_handle, waitIfNoData=True): return memInfo +@ensure_byte_strings() def dcgmIntrospectGetHostengineCpuUtilization(dcgm_handle, waitIfNoData=True): fn = dcgmFP("dcgmIntrospectGetHostengineCpuUtilization") @@ -715,88 +769,45 @@ def dcgmIntrospectGetHostengineCpuUtilization(dcgm_handle, waitIfNoData=True): return cpuUtil -def dcgmIntrospectGetFieldsExecTime(dcgm_handle, introspectContext, waitIfNoData=True): - fn = dcgmFP("dcgmIntrospectGetFieldsExecTime") - - execTime = dcgm_structs.c_dcgmIntrospectFullFieldsExecTime_v2() - execTime.version = dcgm_structs.dcgmIntrospectFullFieldsExecTime_version2 - - ret = fn(dcgm_handle, byref(introspectContext), byref(execTime), waitIfNoData) - dcgm_structs._dcgmCheckReturn(ret) - return execTime - - -def dcgmIntrospectGetFieldsMemoryUsage( - dcgm_handle, introspectContext, waitIfNoData=True -): - fn = dcgmFP("dcgmIntrospectGetFieldsMemoryUsage") - - memInfo = dcgm_structs.c_dcgmIntrospectFullMemory_v1() - memInfo.version = dcgm_structs.dcgmIntrospectFullMemory_version1 - - ret = fn(dcgm_handle, byref(introspectContext), byref(memInfo), waitIfNoData) - dcgm_structs._dcgmCheckReturn(ret) - return memInfo - - -def dcgmIntrospectUpdateAll(dcgmHandle, waitForUpdate): - fn = 
dcgmFP("dcgmIntrospectUpdateAll") - ret = fn(dcgmHandle, c_int(waitForUpdate)) - dcgm_structs._dcgmCheckReturn(ret) - - +@ensure_byte_strings() def dcgmEntityGetLatestValues(dcgmHandle, entityGroup, entityId, fieldIds): fn = dcgmFP("dcgmEntityGetLatestValues") field_values = (dcgm_structs.c_dcgmFieldValue_v1 * len(fieldIds))() id_values = (c_uint16 * len(fieldIds))(*fieldIds) - ret = fn( - dcgmHandle, - c_uint(entityGroup), - dcgm_fields.c_dcgm_field_eid_t(entityId), - id_values, - c_uint(len(fieldIds)), - field_values, - ) + ret = fn(dcgmHandle, c_uint(entityGroup), + dcgm_fields.c_dcgm_field_eid_t(entityId), id_values, + c_uint(len(fieldIds)), field_values) dcgm_structs._dcgmCheckReturn(ret) return field_values +@ensure_byte_strings() def dcgmEntitiesGetLatestValues(dcgmHandle, entities, fieldIds, flags): fn = dcgmFP("dcgmEntitiesGetLatestValues") numFvs = len(fieldIds) * len(entities) field_values = (dcgm_structs.c_dcgmFieldValue_v2 * numFvs)() - entities_values = (dcgm_structs.c_dcgmGroupEntityPair_t * len(entities))(*entities) + entities_values = (dcgm_structs.c_dcgmGroupEntityPair_t * + len(entities))(*entities) field_id_values = (c_uint16 * len(fieldIds))(*fieldIds) - ret = fn( - dcgmHandle, - entities_values, - c_uint(len(entities)), - field_id_values, - c_uint(len(fieldIds)), - flags, - field_values, - ) + ret = fn(dcgmHandle, entities_values, c_uint(len(entities)), + field_id_values, c_uint(len(fieldIds)), flags, field_values) dcgm_structs._dcgmCheckReturn(ret) return field_values +@ensure_byte_strings() def dcgmSelectGpusByTopology(dcgmHandle, inputGpuIds, numGpus, hintFlags): fn = dcgmFP("dcgmSelectGpusByTopology") outputGpuIds = c_int64() - ret = fn( - dcgmHandle, - c_uint64(inputGpuIds), - c_uint32(numGpus), - byref(outputGpuIds), - c_uint64(hintFlags), - ) + ret = fn(dcgmHandle, c_uint64(inputGpuIds), c_uint32(numGpus), + byref(outputGpuIds), c_uint64(hintFlags)) dcgm_structs._dcgmCheckReturn(ret) return outputGpuIds -def dcgmGetFieldSummary( - dcgmHandle, fieldId, entityGroupType, entityId, summaryMask, startTime, endTime -): +@ensure_byte_strings() +def dcgmGetFieldSummary(dcgmHandle, fieldId, entityGroupType, entityId, + summaryMask, startTime, endTime): fn = dcgmFP("dcgmGetFieldSummary") request = dcgm_structs.c_dcgmFieldSummaryRequest_v1() request.version = dcgm_structs.dcgmFieldSummaryRequest_version1 @@ -811,13 +822,15 @@ def dcgmGetFieldSummary( return request -def dcgmModuleBlacklist(dcgmHandle, moduleId): - fn = dcgmFP("dcgmModuleBlacklist") +@ensure_byte_strings() +def dcgmModuleDenylist(dcgmHandle, moduleId): + fn = dcgmFP("dcgmModuleDenylist") ret = fn(dcgmHandle, c_uint32(moduleId)) dcgm_structs._dcgmCheckReturn(ret) return ret +@ensure_byte_strings() def dcgmModuleGetStatuses(dcgmHandle): moduleStatuses = dcgm_structs.c_dcgmModuleGetStatuses_v1() moduleStatuses.version = dcgm_structs.dcgmModuleGetStatuses_version1 @@ -827,45 +840,18 @@ def dcgmModuleGetStatuses(dcgmHandle): return moduleStatuses -def dcgmProfGetSupportedMetricGroups(dcgmHandle, groupId): - msg = dcgm_structs.c_dcgmProfGetMetricGroups_v2() - msg.version = dcgm_structs.dcgmProfGetMetricGroups_version1 - msg.groupId = groupId +@ensure_byte_strings() +def dcgmProfGetSupportedMetricGroups(dcgmHandle, gpuId): + msg = dcgm_structs.c_dcgmProfGetMetricGroups_v3() + msg.version = dcgm_structs.dcgmProfGetMetricGroups_version3 + msg.gpuId = gpuId fn = dcgmFP("dcgmProfGetSupportedMetricGroups") ret = fn(dcgmHandle, byref(msg)) dcgm_structs._dcgmCheckReturn(ret) return msg -def dcgmProfWatchFields( - 
dcgmHandle, fieldIds, groupId, updateFreq, maxKeepAge, maxKeepSamples -): - msg = dcgm_structs.c_dcgmProfWatchFields_v1() - msg.version = dcgm_structs.dcgmProfWatchFields_version1 - msg.groupId = groupId - msg.updateFreq = updateFreq - msg.maxKeepAge = maxKeepAge - msg.maxKeepSamples = maxKeepSamples - msg.numFieldIds = c_uint32(len(fieldIds)) - for i, fieldId in enumerate(fieldIds): - msg.fieldIds[i] = fieldId - - fn = dcgmFP("dcgmProfWatchFields") - ret = fn(dcgmHandle, byref(msg)) - dcgm_structs._dcgmCheckReturn(ret) - return msg - - -def dcgmProfUnwatchFields(dcgmHandle, groupId): - msg = dcgm_structs.c_dcgmProfUnwatchFields_v1() - msg.version = dcgm_structs.dcgmProfUnwatchFields_version1 - msg.groupId = groupId - fn = dcgmFP("dcgmProfUnwatchFields") - ret = fn(dcgmHandle, byref(msg)) - dcgm_structs._dcgmCheckReturn(ret) - return msg - - +@ensure_byte_strings() def dcgmProfPause(dcgmHandle): fn = dcgmFP("dcgmProfPause") ret = fn(dcgmHandle) @@ -873,6 +859,7 @@ def dcgmProfPause(dcgmHandle): return ret +@ensure_byte_strings() def dcgmProfResume(dcgmHandle): fn = dcgmFP("dcgmProfResume") ret = fn(dcgmHandle) @@ -880,6 +867,7 @@ def dcgmProfResume(dcgmHandle): return ret +@ensure_byte_strings() def dcgmVersionInfo(): msg = dcgm_structs.c_dcgmVersionInfo_v2() msg.version = dcgm_structs.dcgmVersionInfo_version2 @@ -889,10 +877,11 @@ def dcgmVersionInfo(): return msg +@ensure_byte_strings() def dcgmHostengineIsHealthy(dcgmHandle): heHealth = dcgm_structs.c_dcgmHostengineHealth_v1() heHealth.version = dcgm_structs.dcgmHostengineHealth_version1 fn = dcgmFP("dcgmHostengineIsHealthy") ret = fn(dcgmHandle, byref(heHealth)) dcgm_structs._dcgmCheckReturn(ret) - return heHealth + return heHealth \ No newline at end of file diff --git a/model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py b/model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py new file mode 100644 index 000000000..d3355c556 --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py @@ -0,0 +1,369 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import subprocess +import signal +import os +import re +import sys + +dir_path = os.path.dirname(os.path.realpath(__file__)) +parent_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir)) +sys.path.insert(0, parent_dir_path) + +import model_analyzer.monitor.dcgm.dcgm_fields_collectd as dcgm_fields_collectd +import model_analyzer.monitor.dcgm.pydcgm as pydcgm +import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +import threading +from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader + +if 'DCGM_TESTING_FRAMEWORK' in os.environ: + try: + import collectd_tester_api as collectd + except: + import collectd +else: + import collectd + +# Set default values for the hostname and the library path +g_dcgmLibPath = '/usr/lib' +g_dcgmHostName = 'localhost' + +# Add overriding through the environment instead of hard coded. 
+if 'DCGM_HOSTNAME' in os.environ: + g_dcgmHostName = os.environ['DCGM_HOSTNAME'] + +if 'DCGMLIBPATH' in os.environ: + g_dcgmLibPath = os.environ['DCGMLIBPATH'] + +c_ONE_SEC_IN_USEC = 1000000 + +g_intervalSec = 10 # Default + +g_dcgmIgnoreFields = [dcgm_fields.DCGM_FI_DEV_UUID] # Fields not to publish + +g_publishFieldIds = [ + dcgm_fields.DCGM_FI_DEV_UUID, #Needed for plugin instance + dcgm_fields.DCGM_FI_DEV_POWER_USAGE, + dcgm_fields.DCGM_FI_DEV_GPU_TEMP, + dcgm_fields.DCGM_FI_DEV_SM_CLOCK, + dcgm_fields.DCGM_FI_DEV_GPU_UTIL, + dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING, + dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, + dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, + dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, + dcgm_fields.DCGM_FI_DEV_FB_TOTAL, + dcgm_fields.DCGM_FI_DEV_FB_FREE, + dcgm_fields.DCGM_FI_DEV_FB_USED, + dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER, + dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION, + dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION, + dcgm_fields.DCGM_FI_DEV_XID_ERRORS, + dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_MEM_CLOCK, + dcgm_fields.DCGM_FI_DEV_MEMORY_TEMP, + dcgm_fields.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, + dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL, + dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, + dcgm_fields.DCGM_FI_DEV_PCIE_TX_THROUGHPUT, + dcgm_fields.DCGM_FI_DEV_PCIE_RX_THROUGHPUT +] + +g_fieldIntervalMap = None +g_parseRegEx = None +g_fieldRegEx = None + +# We build up a regex to match field IDs. These can be numeric IDs, or +# names. We start with field_regex that matches either as a string (as +# well as names that might start with digits, but we do not worry about +# this over-generation of valid IDs at this point). +# +# Basically a field is an integral number or a textual name. A field +# list is a field, or a list of fields separated by commas and enclosed +# in parenthssis. A field list may be optionally followed by a colon, +# indicating a possible non-default interval if also followed by a +# floating point interval value. This is a complete field list. +# Multiple complete field lists may appear, separated by commas. +# +# For example: (1001,tensor_active):5,1002:10 +# +# This specifies that fields 1001 and tensor_active are to be sampled +# at a rate of every 5 seconds, and 1002 every ten seconds. +# +# For example: (1001,tensor_active):5,1002: +# +# This is the same, but field 1002 is to be sampled at the default rate +# (and the colon in entirely unnecessary, but not illegal). + +field_regex = r"[0-9a-zA-Z_]+" +g_fieldRegEx = re.compile("((" + field_regex + "),?)") + +# We now generate a list of field regular expressions, separated by a +# comma, and enclosed with parenthesis, for grouping. + +fields_regex = r"\(" + field_regex + "(," + field_regex + ")*" + r"\)" + +# This is an optional interval specification, allowing an optional :, +# followed by an optional floating point dcgm sampling interval. If any +# are missing, the default collectd sampling interval is used. + +interval_regex = r"(:[0-9]*(\.[0-9]+)?)?,?" + +# Here, we combine a field regex or field list regex with an optional +# interval regex. Multiple of these may appear in succession. 
+ +g_parseRegEx = re.compile("((" + field_regex + "|(" + fields_regex + "))" + + interval_regex + ")") + + +class DcgmCollectdPlugin(DcgmReader): + ########################################################################### + def __init__(self): + global c_ONE_SEC_IN_USEC + + collectd.debug( + 'Initializing DCGM with interval={}s'.format(g_intervalSec)) + DcgmReader.__init__(self, + fieldIds=g_publishFieldIds, + ignoreList=g_dcgmIgnoreFields, + fieldGroupName='collectd_plugin', + updateFrequency=g_intervalSec * c_ONE_SEC_IN_USEC, + fieldIntervalMap=g_fieldIntervalMap) + +########################################################################### + + def CustomDataHandler(self, fvs): + global c_ONE_SEC_IN_USEC + + value = collectd.Values(type='gauge') # pylint: disable=no-member + value.plugin = 'dcgm_collectd' + + for gpuId in list(fvs.keys()): + gpuFv = fvs[gpuId] + + uuid = self.m_gpuIdToUUId[gpuId] + collectd.debug('CustomDataHandler uuid: ' + '%s' % (uuid) + '\n') + value.plugin_instance = '%s' % (uuid) + + typeInstance = str(gpuId) + + for fieldId in list(gpuFv.keys()): + # Skip ignore list + if fieldId in self.m_dcgmIgnoreFields: + continue + + fieldTag = self.m_fieldIdToInfo[fieldId].tag + lastValTime = float("inf") + + # Filter out times too close together (< 1.0 sec) but always + # include latest one. + + for val in gpuFv[fieldId][::-1]: + # Skip blank values. Otherwise, we'd have to insert a placeholder blank value based on the fieldId + if val.isBlank: + continue + + valTimeSec1970 = (val.ts / c_ONE_SEC_IN_USEC + ) #Round down to 1-second for now + if (lastValTime - valTimeSec1970) < 1.0: + collectd.debug( + "DCGM sample for field ID %d too soon at %f, last one sampled at %f" + % (fieldId, valTimeSec1970, lastValTime)) + val.isBlank = True # Filter this one out + continue + + lastValTime = valTimeSec1970 + + i = 0 + + for val in gpuFv[fieldId]: + # Skip blank values. Otherwise, we'd have to insert a placeholder blank value based on the fieldId + if val.isBlank: + continue + + # Round down to 1-second for now + valTimeSec1970 = (val.ts / c_ONE_SEC_IN_USEC) + valueArray = [ + val.value, + ] + value.dispatch(type=fieldTag, + type_instance=typeInstance, + time=valTimeSec1970, + values=valueArray, + plugin=value.plugin) + + collectd.debug( + " gpuId %d, tag %s, sample %d, value %s, time %s" % + (gpuId, fieldTag, i, str(val.value), str(val.ts))) # pylint: disable=no-member + i += 1 + + ########################################################################### + def LogInfo(self, msg): + collectd.info(msg) # pylint: disable=no-member + + ########################################################################### + def LogError(self, msg): + collectd.error(msg) # pylint: disable=no-member + + +############################################################################### +##### Parse supplied collectd configuration object. +############################################################################### +def parse_config(config): + global c_ONE_SEC_IN_USEC + global g_intervalSec + global g_fieldIntervalMap + global g_parseRegEx + global g_fieldRegEx + + g_fieldIntervalMap = {} + + for node in config.children: + if node.key == 'Interval': + g_intervalSec = float(node.values[0]) + elif node.key == 'FieldIds': + fieldIds = node.values[0] + + # And we parse out the field ID list with this regex. + field_set_list = g_parseRegEx.finditer(fieldIds) + + for field_set in field_set_list: + # We get the list of fields... + fields = field_set.group(2) + + # ... and the optional interval. 
+ interval_str = field_set.group(5) + + # We figure out if the default collectd sampling interval is + # to be used, or a different one. + if (interval_str == None) or (interval_str == ":"): + interval = int(g_intervalSec * c_ONE_SEC_IN_USEC) + else: + interval = int(float(interval_str[1:]) * + c_ONE_SEC_IN_USEC) # strip : + + # We keep a set of fields for each unique interval + if interval not in g_fieldIntervalMap.keys(): + g_fieldIntervalMap[interval] = [] + + # Here we parse out either miltiple fields sharing an + # interval, or a single field. + if fields[0:1] == "(": # a true field set + fields = fields[1:-1] + field_list = g_fieldRegEx.finditer(fields) + for field_group in field_list: + + # We map any field names to field numbers, and add + # them to the list for the interval + field = dcgm_fields_collectd.GetFieldByName( + field_group.group(2)) + g_fieldIntervalMap[interval] += [field] + else: # just one field + # Map field name to number. + field = dcgm_fields_collectd.GetFieldByName(fields) + g_fieldIntervalMap[interval] += [field] + + +############################################################################### +##### Wrapper the Class methods for collectd callbacks +############################################################################### +def config_dcgm(config=None): + """ + collectd config for dcgm is in the form of a dcgm.conf file, usually + installed in /etc/collectd/collectd.conf.d/dcgm.conf. + + An example is: + + LoadPlugin python + + ModulePath "/usr/lib64/collectd/dcgm" + LogTraces true + Interactive false + Import "dcgm_collectd_plugin" + + Interval 2 + FieldIds "(1001,tensor_active):5,1002:10,1004:.1,1010:" + FieldIds "1007" + + + + ModulePath indicates where the plugin and supporting files are installed + (generally copied from /usr/local/dcgm/bindings/python3). + + Interval is the default collectd sampling interval in seconds. + + FieldIds may appear several times. One is either a field ID by name or + number. A field ID list is either a single field ID or a list of same, + separated by commas (,) and bounded by parenthesis ( ( and ) ). Each field + ID list can be followed by an optional colon (:) and a floating point + DCGM sampling interval. If no sampling interval is specified the default + collectd sampling interval is used (and the colon is redundant but not + illegal). Multiple field ID lists can appear on one FieldIds entry, + separated by commas (,). FieldIDs are strings and must be enclosed in + quotes ("). Multiple FieldIds lines are permitted. + + DCGM will sample the fields at the interval(s) indicated, and collectd will + collect the samples asynchronously at the Interval specified. Because this + is asynchronous sometimes one less than expected will be collected and other + times one more than expected will be collected. + """ + + # If we throw an exception here, collectd config will terminate loading the + # plugin. + if config is not None: + parse_config(config) + + # Register the read function with the default collectd sampling interval. 
+ collectd.register_read(read_dcgm, interval=g_intervalSec) # pylint: disable=no-member + + +############################################################################### +def init_dcgm(): + global g_dcgmCollectd + + # restore default SIGCHLD behavior to avoid exceptions with new processes + signal.signal(signal.SIGCHLD, signal.SIG_DFL) + + g_dcgmCollectd = DcgmCollectdPlugin() + g_dcgmCollectd.Init() + + +############################################################################### +def shutdown_dcgm(): + g_dcgmCollectd.Shutdown() + + +############################################################################### +def read_dcgm(data=None): + g_dcgmCollectd.Process() + + +def register_collectd_callbacks(): + collectd.register_config(config_dcgm, name="dcgm_collectd_plugin") # pylint: disable=no-member + # config_dcgm registers read since it needs to parse the sampling interval. + collectd.register_init(init_dcgm) # pylint: disable=no-member + collectd.register_shutdown(shutdown_dcgm) # pylint: disable=no-member + + +############################################################################### +##### Main +############################################################################### +register_collectd_callbacks() diff --git a/model_analyzer/monitor/dcgm/dcgm_errors.py b/model_analyzer/monitor/dcgm/dcgm_errors.py new file mode 100644 index 000000000..e52f3b114 --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_errors.py @@ -0,0 +1,395 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
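# Editor's note (not part of this patch): the FieldIds grammar parsed by the
# dcgm_collectd_plugin.py file added above (just before the dcgm_errors.py hunk
# that follows) can be illustrated with a small, self-contained sketch. The
# regex below mirrors the plugin's field_regex/fields_regex/interval_regex
# construction, but keeps field names as strings instead of resolving them via
# dcgm_fields_collectd.GetFieldByName(); the helper name and default interval
# here are illustrative assumptions, not part of the plugin.
import re

_SPEC = re.compile(
    r"(?P<fields>[0-9a-zA-Z_]+|\([0-9a-zA-Z_]+(?:,[0-9a-zA-Z_]+)*\))"
    r"(?P<interval>:[0-9]*(?:\.[0-9]+)?)?,?")

def split_field_ids(spec, default_interval_sec=10.0):
    """Map each sampling interval (seconds) to the fields sampled at it."""
    intervals = {}
    for m in _SPEC.finditer(spec):
        interval = m.group("interval")
        seconds = (default_interval_sec
                   if interval in (None, ":") else float(interval[1:]))
        intervals.setdefault(seconds, []).extend(
            m.group("fields").strip("()").split(","))
    return intervals

# split_field_ids("(1001,tensor_active):5,1002:10,1004:.1,1010:")
# -> {5.0: ['1001', 'tensor_active'], 10.0: ['1002', '1010'], 0.1: ['1004']}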
+import ctypes +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs + +DCGM_FR_OK = 0 # No error +DCGM_FR_UNKNOWN = 1 # Unknown error code +DCGM_FR_UNRECOGNIZED = 2 # Unrecognized error code +DCGM_FR_PCI_REPLAY_RATE = 3 # Unacceptable rate of PCI errors +DCGM_FR_VOLATILE_DBE_DETECTED = 4 # Uncorrectable volatile double bit error +DCGM_FR_VOLATILE_SBE_DETECTED = 5 # Unacceptable rate of volatile single bit errors +DCGM_FR_PENDING_PAGE_RETIREMENTS = 6 # Pending page retirements detected +DCGM_FR_RETIRED_PAGES_LIMIT = 7 # Unacceptable total page retirements detected +DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8 # Unacceptable total page retirements due to uncorrectable errors +DCGM_FR_CORRUPT_INFOROM = 9 # Corrupt inforom found +DCGM_FR_CLOCK_THROTTLE_THERMAL = 10 # Clocks being throttled due to overheating +DCGM_FR_POWER_UNREADABLE = 11 # Cannot get a reading for power from NVML +DCGM_FR_CLOCK_THROTTLE_POWER = 12 # Clock being throttled due to power restrictions +DCGM_FR_NVLINK_ERROR_THRESHOLD = 13 # Unacceptable rate of NVLink errors +DCGM_FR_NVLINK_DOWN = 14 # NVLink is down +DCGM_FR_NVSWITCH_FATAL_ERROR = 15 # Fatal errors on the NVSwitch +DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16 # Non-fatal errors on the NVSwitch +DCGM_FR_NVSWITCH_DOWN = 17 # NVSwitch is down +DCGM_FR_NO_ACCESS_TO_FILE = 18 # Cannot access a file +DCGM_FR_NVML_API = 19 # Error occurred on an NVML API +DCGM_FR_DEVICE_COUNT_MISMATCH = 20 # Disagreement in GPU count between /dev and NVML +DCGM_FR_BAD_PARAMETER = 21 # Bad parameter passed to API +DCGM_FR_CANNOT_OPEN_LIB = 22 # Cannot open a library that must be accessed +DCGM_FR_DENYLISTED_DRIVER = 23 # A driver on the denylist (nouveau) is active +DCGM_FR_NVML_LIB_BAD = 24 # The NVML library is missing expected functions +DCGM_FR_GRAPHICS_PROCESSES = 25 # Graphics processes are active on this GPU +DCGM_FR_HOSTENGINE_CONN = 26 # Unstable connection to nv-hostengine (daemonized DCGM) +DCGM_FR_FIELD_QUERY = 27 # Error querying a field from DCGM +DCGM_FR_BAD_CUDA_ENV = 28 # The environment has variables that hurt CUDA +DCGM_FR_PERSISTENCE_MODE = 29 # Persistence mode is disabled +DCGM_FR_LOW_BANDWIDTH = 30 # The bandwidth is unacceptably low +DCGM_FR_HIGH_LATENCY = 31 # Latency is too high +DCGM_FR_CANNOT_GET_FIELD_TAG = 32 # Cannot find a tag for a field +DCGM_FR_FIELD_VIOLATION = 33 # The value for the specified error field is above 0 +DCGM_FR_FIELD_THRESHOLD = 34 # The value for the specified field is above the threshold +DCGM_FR_FIELD_VIOLATION_DBL = 35 # The value for the specified error field is above 0 +DCGM_FR_FIELD_THRESHOLD_DBL = 36 # The value for the specified field is above the threshold +DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37 # Field type cannot be supported +DCGM_FR_FIELD_THRESHOLD_TS = 38 # The value for the specified field is above the threshold +DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39 # The value for the specified field is above the threshold +DCGM_FR_THERMAL_VIOLATIONS = 40 # Thermal violations detected +DCGM_FR_THERMAL_VIOLATIONS_TS = 41 # Thermal violations detected with a timestamp +DCGM_FR_TEMP_VIOLATION = 42 # Temperature is too high +DCGM_FR_THROTTLING_VIOLATION = 43 # Non-benign clock throttling is occurring +DCGM_FR_INTERNAL = 44 # An internal error was detected +DCGM_FR_PCIE_GENERATION = 45 # PCIe generation is too low +DCGM_FR_PCIE_WIDTH = 46 # PCIe width is too low +DCGM_FR_ABORTED = 47 # Test was aborted by a user signal +DCGM_FR_TEST_DISABLED = 48 # This test is disabled for this GPU +DCGM_FR_CANNOT_GET_STAT = 49 # Cannot get telemetry for a needed 
value +DCGM_FR_STRESS_LEVEL = 50 # Stress level is too low (bad performance) +DCGM_FR_CUDA_API = 51 # Error calling the specified CUDA API +DCGM_FR_FAULTY_MEMORY = 52 # Faulty memory detected on this GPU +DCGM_FR_CANNOT_SET_WATCHES = 53 # Unable to set field watches in DCGM +DCGM_FR_CUDA_UNBOUND = 54 # CUDA context is no longer bound +DCGM_FR_ECC_DISABLED = 55 # ECC memory is disabled right now +DCGM_FR_MEMORY_ALLOC = 56 # Cannot allocate memory +DCGM_FR_CUDA_DBE = 57 # CUDA detected unrecovable double-bit error +DCGM_FR_MEMORY_MISMATCH = 58 # Memory error detected +DCGM_FR_CUDA_DEVICE = 59 # No CUDA device discoverable for existing GPU +DCGM_FR_ECC_UNSUPPORTED = 60 # ECC memory is unsupported by this SKU +DCGM_FR_ECC_PENDING = 61 # ECC memory is in a pending state +DCGM_FR_MEMORY_BANDWIDTH = 62 # Memory bandwidth is too low +DCGM_FR_TARGET_POWER = 63 # Cannot hit the target power draw +DCGM_FR_API_FAIL = 64 # The specified API call failed +DCGM_FR_API_FAIL_GPU = 65 # The specified API call failed for the specified GPU +DCGM_FR_CUDA_CONTEXT = 66 # Cannot create a CUDA context on this GPU +DCGM_FR_DCGM_API = 67 # DCGM API failure +DCGM_FR_CONCURRENT_GPUS = 68 # Need multiple GPUs to run this test +DCGM_FR_TOO_MANY_ERRORS = 69 # More errors than fit in the return struct +DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70 # More than 100 CRC errors are happening per second +DCGM_FR_NVLINK_ERROR_CRITICAL = 71 # NVLink error for a field that should always be 0 +DCGM_FR_ENFORCED_POWER_LIMIT = 72 # The enforced power limit is too low to hit the target +DCGM_FR_MEMORY_ALLOC_HOST = 73 # Cannot allocate memory on the host +DCGM_FR_GPU_OP_MODE = 74 # Bad GPU operating mode for running plugin +DCGM_FR_NO_MEMORY_CLOCKS = 75 # No memory clocks with the needed MHz were found +DCGM_FR_NO_GRAPHICS_CLOCKS = 76 # No graphics clocks with the needed MHz were found +DCGM_FR_HAD_TO_RESTORE_STATE = 77 # Note that we had to restore a GPU's state +DCGM_FR_L1TAG_UNSUPPORTED = 78 # L1TAG test is unsupported by this SKU +DCGM_FR_L1TAG_MISCOMPARE = 79 # L1TAG test failed on a miscompare +DCGM_FR_ROW_REMAP_FAILURE = 80 # Row remapping failed (Ampere or newer GPUs) +DCGM_FR_UNCONTAINED_ERROR = 81 # Uncontained error - XID 95 +DCGM_FR_EMPTY_GPU_LIST = 82 # No GPU information given to plugin +DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83 # Pending page retirements due to a DBE +DCGM_FR_UNCORRECTABLE_ROW_REMAP = 84 # Uncorrectable row remapping +DCGM_FR_PENDING_ROW_REMAP = 85 # Row remapping is pending +DCGM_FR_BROKEN_P2P_MEMORY_DEVICE = 86 # P2P copy test detected an error writing to this GPU +DCGM_FR_BROKEN_P2P_WRITER_DEVICE = 87 # P2P copy test detected an error writing from this GPU +DCGM_FR_NVSWITCH_NVLINK_DOWN = 88 # An NVLink is down +DCGM_FR_EUD_BINARY_PERMISSIONS = 89 # EUD binary permissions are incorrect +DCGM_FR_EUD_NON_ROOT_USER = 90 # EUD plugin is not running as root +DCGM_FR_EUD_SPAWN_FAILURE = 91 # EUD plugin failed to spawn the EUD binary +DCGM_FR_EUD_TIMEOUT = 92 # EUD plugin timed out +DCGM_FR_EUD_ZOMBIE = 93 # EUD process remains running after the plugin considers it finished +DCGM_FR_EUD_NON_ZERO_EXIT_CODE = 94 # EUD process exited with a non-zero exit code +DCGM_FR_EUD_TEST_FAILED = 95 # EUD test failed +DCGM_FR_FILE_CREATE_PERMISSIONS = 96 # We cannot write a file in this directory. +DCGM_FR_PAUSE_RESUME_FAILED = 97 # Pause/Resume failed +DCGM_FR_ERROR_SENTINEL = 98 # MUST BE THE LAST ERROR CODE + +# Standard message for running a field diagnostic +TRIAGE_RUN_FIELD_DIAG_MSG = "Run a field diagnostic on the GPU." 
+DEBUG_COOLING_MSG = "Verify that the cooling on this machine is functional, including external, thermal "\ + "material interface, fans, and any other components." +BUG_REPORT_MSG = "Please capture an nvidia-bug-report and send it to NVIDIA." + +# Define DCGM error priorities +DCGM_ERROR_MONITOR = 0 # Can perform workload, but needs to be monitored. +DCGM_ERROR_ISOLATE = 1 # Cannot perform workload. GPU should be isolated. +DCGM_ERROR_UNKNOWN = 2 # This error code is not recognized + +# Messages for the error codes. All messages must be defined in the ERROR_CODE_MSG format +# where is the actual message. + +DCGM_FR_OK_MSG = "The operation completed successfully." +DCGM_FR_UNKNOWN_MSG = "Unknown error." +DCGM_FR_UNRECOGNIZED_MSG = "Unrecognized error code." +# replay limit, gpu id, replay errors detected +DCGM_FR_PCI_REPLAY_RATE_MSG = "Detected more than %u PCIe replays per minute for GPU %u : %d" +# dbes deteced, gpu id +DCGM_FR_VOLATILE_DBE_DETECTED_MSG = "Detected %d volatile double-bit ECC error(s) in GPU %u." +# sbe limit, gpu id, sbes detected +DCGM_FR_VOLATILE_SBE_DETECTED_MSG = "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld" +# gpu id +DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG = "A pending retired page has been detected in GPU %u." +# retired pages detected, gpud id +DCGM_FR_RETIRED_PAGES_LIMIT_MSG = "%u or more retired pages have been detected in GPU %u. " +# retired pages due to dbes detected, gpu id +DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG = "An excess of %u retired pages due to DBEs have been detected and" \ + " more than one page has been retired due to DBEs in the past" \ + " week in GPU %u." +# gpu id +DCGM_FR_CORRUPT_INFOROM_MSG = "A corrupt InfoROM has been detected in GPU %u." +# gpu id +DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG = "Detected clock throttling due to thermal violation in GPU %u." +# gpu id +DCGM_FR_POWER_UNREADABLE_MSG = "Cannot reliably read the power usage for GPU %u." +# gpu id +DCGM_FR_CLOCK_THROTTLE_POWER_MSG = "Detected clock throttling due to power violation in GPU %u." +# nvlink errors detected, nvlink id, error threshold +DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG = "Detected %ld NvLink errors on NvLink %u which exceeds threshold of %u" +# gpu id, nvlink id +DCGM_FR_NVLINK_DOWN_MSG = "GPU %u's NvLink link %d is currently down" +# nvswitch id, nvlink id +DCGM_FR_NVSWITCH_FATAL_ERROR_MSG = "Detected fatal errors on NvSwitch %u link %u" +# nvswitch id, nvlink id +DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG = "Detected nonfatal errors on NvSwitch %u link %u" +# nvswitch id, nvlink port +DCGM_FR_NVSWITCH_DOWN_MSG = "NvSwitch physical ID %u's NvLink port %d is currently down." +# file path, error detail +DCGM_FR_NO_ACCESS_TO_FILE_MSG = "File %s could not be accessed directly: %s" +# purpose for communicating with NVML, NVML error as string, NVML error +DCGM_FR_NVML_API_MSG = "Error calling NVML API %s: %s" +DCGM_FR_DEVICE_COUNT_MISMATCH_MSG = "The number of devices NVML returns is different than the number "\ + "of devices in /dev." 
+# function name
+DCGM_FR_BAD_PARAMETER_MSG = "Bad parameter to function %s cannot be processed"
+# library name, error returned from dlopen
+DCGM_FR_CANNOT_OPEN_LIB_MSG = "Cannot open library %s: '%s'"
+# the name of the driver on the denylist
+DCGM_FR_DENYLISTED_DRIVER_MSG = "Found driver on the denylist: %s"
+# the name of the function that wasn't found
+DCGM_FR_NVML_LIB_BAD_MSG = "Cannot get pointer to %s from libnvidia-ml.so"
+DCGM_FR_GRAPHICS_PROCESSES_MSG = "NVVS has detected graphics processes running on at least one "\
+    "GPU. This may cause some tests to fail."
+# error message from the API call
+DCGM_FR_HOSTENGINE_CONN_MSG = "Could not connect to the host engine: '%s'"
+# field name, gpu id
+DCGM_FR_FIELD_QUERY_MSG = "Could not query field %s for GPU %u"
+# environment variable name
+DCGM_FR_BAD_CUDA_ENV_MSG = "Found CUDA performance-limiting environment variable '%s'."
+# gpu id
+DCGM_FR_PERSISTENCE_MODE_MSG = "Persistence mode for GPU %u is currently disabled. The DCGM "\
+    "diagnostic requires persistence mode to be enabled."
+DCGM_FR_LOW_BANDWIDTH_MSG = "Bandwidth of GPU %u in direction %s of %.2f did not exceed "\
+    "minimum required bandwidth of %.2f."
+DCGM_FR_HIGH_LATENCY_MSG = "Latency type %s of GPU %u value %.2f exceeded maximum allowed "\
+    "latency of %.2f."
+DCGM_FR_CANNOT_GET_FIELD_TAG_MSG = "Unable to get field information for field id %hu"
+DCGM_FR_FIELD_VIOLATION_MSG = "Detected %ld %s for GPU %u"
+DCGM_FR_FIELD_THRESHOLD_MSG = "Detected %ld %s for GPU %u which is above the threshold %ld"
+DCGM_FR_FIELD_VIOLATION_DBL_MSG = "Detected %.1f %s for GPU %u"
+DCGM_FR_FIELD_THRESHOLD_DBL_MSG = "Detected %.1f %s for GPU %u which is above the threshold %.1f"
+DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG = "Field %s is not supported by this API because it is neither an "\
+    "int64 nor a double type."
+DCGM_FR_FIELD_THRESHOLD_TS_MSG = "%s met or exceeded the threshold of %lu per second: %lu at "\
+    "%.1f seconds into the test."
+DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG = "%s met or exceeded the threshold of %.1f per second: %.1f at "\
+    "%.1f seconds into the test."
+DCGM_FR_THERMAL_VIOLATIONS_MSG = "There were thermal violations totaling %lu seconds for GPU %u"
+DCGM_FR_THERMAL_VIOLATIONS_TS_MSG = "Thermal violations totaling %lu samples started at %.1f seconds "\
+    "into the test for GPU %u"
+DCGM_FR_TEMP_VIOLATION_MSG = "Temperature %lld of GPU %u exceeded user-specified maximum "\
+    "allowed temperature %lld"
+DCGM_FR_THROTTLING_VIOLATION_MSG = "Clocks are being throttled for GPU %u because of clock "\
+    "throttling starting %.1f seconds into the test. %s"
+DCGM_FR_INTERNAL_MSG = "There was an internal error during the test: '%s'"
+DCGM_FR_PCIE_GENERATION_MSG = "GPU %u is running at PCI link generation %d, which is below "\
+    "the minimum allowed link generation of %d (parameter '%s')"
+DCGM_FR_PCIE_WIDTH_MSG = "GPU %u is running at PCI link width %dX, which is below the "\
+    "minimum allowed link width of %d (parameter '%s')"
+DCGM_FR_ABORTED_MSG = "Test was aborted early due to user signal"
+DCGM_FR_TEST_DISABLED_MSG = "The %s test is skipped for this GPU."
+DCGM_FR_CANNOT_GET_STAT_MSG = "Unable to generate / collect stat %s for GPU %u"
+DCGM_FR_STRESS_LEVEL_MSG = "Max stress level of %.1f did not reach desired stress level of "\
+    "%.1f for GPU %u"
+DCGM_FR_CUDA_API_MSG = "Error using CUDA API %s"
+DCGM_FR_FAULTY_MEMORY_MSG = "Found %d faulty memory elements on GPU %u"
+DCGM_FR_CANNOT_SET_WATCHES_MSG = "Unable to add field watches to DCGM: %s"
+DCGM_FR_CUDA_UNBOUND_MSG = "Cuda GPU %d is no longer bound to a CUDA context...Aborting"
+DCGM_FR_ECC_DISABLED_MSG = "Skipping test %s because ECC is not enabled on GPU %u"
+DCGM_FR_MEMORY_ALLOC_MSG = "Couldn't allocate at least %.1f%% of GPU memory on GPU %u"
+DCGM_FR_CUDA_DBE_MSG = "CUDA APIs have indicated that a double-bit ECC error has "\
+    "occurred on GPU %u."
+DCGM_FR_MEMORY_MISMATCH_MSG = "A memory mismatch was detected on GPU %u, but no error was "\
+    "reported by CUDA or NVML."
+DCGM_FR_CUDA_DEVICE_MSG = "Unable to find a corresponding CUDA device for GPU %u: '%s'"
+DCGM_FR_ECC_UNSUPPORTED_MSG = "This card does not support ECC Memory. Skipping test."
+DCGM_FR_ECC_PENDING_MSG = "ECC memory for GPU %u is in a pending state."
+DCGM_FR_MEMORY_BANDWIDTH_MSG = "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing "\
+    "to meet %.2f GB/s for test %d"
+DCGM_FR_TARGET_POWER_MSG = "Max power of %.1f did not reach desired power minimum %s of "\
+    "%.1f for GPU %u"
+DCGM_FR_API_FAIL_MSG = "API call %s failed: '%s'"
+DCGM_FR_API_FAIL_GPU_MSG = "API call %s failed for GPU %u: '%s'"
+DCGM_FR_CUDA_CONTEXT_MSG = "GPU %u failed to create a CUDA context: %s"
+DCGM_FR_DCGM_API_MSG = "Error using DCGM API %s"
+DCGM_FR_CONCURRENT_GPUS_MSG = "Unable to run concurrent pair bandwidth test without 2 or more "\
+    "gpus. Skipping"
+DCGM_FR_TOO_MANY_ERRORS_MSG = "This API can only return up to four errors per system. "\
+    "Additional errors were found for this system that couldn't be "\
+    "communicated."
+DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG = "%.1f %s NvLink errors found occurring per second on GPU %u, "\
+    "exceeding the limit of 100 per second."
+DCGM_FR_NVLINK_ERROR_CRITICAL_MSG = "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)"
+DCGM_FR_ENFORCED_POWER_LIMIT_MSG = "Enforced power limit on GPU %u set to %.1f, which is too low to "\
+    "attempt to achieve target power %.1f"
+DCGM_FR_MEMORY_ALLOC_HOST_MSG = "Cannot allocate %zu bytes on the host"
+DCGM_FR_GPU_OP_MODE_MSG = "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP."
+DCGM_FR_NO_MEMORY_CLOCKS_MSG = "No memory clocks <= %u MHZ were found in %u supported memory clocks."
+DCGM_FR_NO_GRAPHICS_CLOCKS_MSG = "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ."
+DCGM_FR_HAD_TO_RESTORE_STATE_MSG = "Had to restore GPU state on NVML GPU(s): %s"
+DCGM_FR_L1TAG_UNSUPPORTED_MSG = "This card does not support the L1 cache test. Skipping test."
+DCGM_FR_L1TAG_MISCOMPARE_MSG = "The L1 cache test failed with a miscompare."
+DCGM_FR_ROW_REMAP_FAILURE_MSG = "Row remapping failed."
+DCGM_FR_UNCONTAINED_ERROR_MSG = "GPU had an uncontained error (XID 95)"
+DCGM_FR_EMPTY_GPU_LIST_MSG = "No valid GPUs passed to plugin"
+DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_MSG = "Pending page retirements together with a DBE were detected on GPU %u."
+DCGM_FR_UNCORRECTABLE_ROW_REMAP_MSG = "GPU %u has uncorrectable row remappings" +DCGM_FR_PENDING_ROW_REMAP_MSG = "GPU %u has pending row remappings" +DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG = "GPU %u was unsuccessfully written to in a peer-to-peer test: %s" +DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG = "GPU %u unsuccessfully wrote data in a peer-to-peer test: %s" +DCGM_FR_NVSWITCH_NVLINK_DOWN_MSG = "NVSwitch %u's NvLink %u is down." +DCGM_FR_FILE_CREATE_PERMISSIONS_MSG = "The DCGM Diagnostic does not have permissions to create a file in directory '%s'" + +# Suggestions for next steps for the corresponding error message +DCGM_FR_OK_NEXT = "N/A" +DCGM_FR_UNKNOWN_NEXT = "" +DCGM_FR_UNRECOGNIZED_NEXT = "" +DCGM_FR_PCI_REPLAY_RATE_NEXT = "Reconnect PCIe card. Run system side PCIE diagnostic utilities "\ + "to verify hops off the GPU board. If issue is on the board, run "\ + "the field diagnostic." +DCGM_FR_VOLATILE_DBE_DETECTED_NEXT = "Drain the GPU and reset it or reboot the node." +DCGM_FR_VOLATILE_SBE_DETECTED_NEXT = "Monitor - this GPU can still perform workload." +DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT = "If volatile double bit errors exist, drain the GPU and reset it "\ + "or reboot the node. Otherwise, monitor - GPU can still perform "\ + "workload." +DCGM_FR_RETIRED_PAGES_LIMIT_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG +DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG +DCGM_FR_CORRUPT_INFOROM_NEXT = "Flash the InfoROM to clear this corruption." +DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT = DEBUG_COOLING_MSG +DCGM_FR_POWER_UNREADABLE_NEXT = "" +DCGM_FR_CLOCK_THROTTLE_POWER_NEXT = "Monitor the power conditions. This GPU can still perform workload." +DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG +DCGM_FR_NVLINK_DOWN_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG +DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG +DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT = "Monitor the NVSwitch. It can still perform workload." +DCGM_FR_NVSWITCH_DOWN_NEXT = "" +DCGM_FR_NO_ACCESS_TO_FILE_NEXT = "Check relevant permissions, access, and existence of the file." +DCGM_FR_NVML_API_NEXT = "Check the error condition and ensure that appropriate libraries "\ + "are present and accessible." +DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT = "Check for the presence of cgroups, operating system blocks, and "\ + "or unsupported / older cards" +DCGM_FR_BAD_PARAMETER_NEXT = "" +DCGM_FR_CANNOT_OPEN_LIB_NEXT = "Check for the existence of the library and set LD_LIBRARY_PATH "\ + "if needed." +DCGM_FR_DENYLISTED_DRIVER_NEXT = "Please load the appropriate driver." +DCGM_FR_NVML_LIB_BAD_NEXT = "Make sure that the required version of libnvidia-ml.so "\ + "is present and accessible on the system." +DCGM_FR_GRAPHICS_PROCESSES_NEXT = "Stop the graphics processes or run this diagnostic on a server "\ + "that is not being used for display purposes." +DCGM_FR_HOSTENGINE_CONN_NEXT = "If hostengine is run separately, please ensure that it is up "\ + "and responsive." +DCGM_FR_FIELD_QUERY_NEXT = "" +DCGM_FR_BAD_CUDA_ENV_NEXT = "Please unset this environment variable to address test failures." +DCGM_FR_PERSISTENCE_MODE_NEXT = "Enable persistence mode by running \"nvidia-smi -i -pm "\ + "1 \" as root." +DCGM_FR_LOW_BANDWIDTH_NEXT = "Verify that your minimum bandwidth setting is appropriate for "\ + "all topological consequences." 
+DCGM_FR_HIGH_LATENCY_NEXT = ""
+DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT = ""
+DCGM_FR_FIELD_VIOLATION_NEXT = ""
+DCGM_FR_FIELD_THRESHOLD_NEXT = ""
+DCGM_FR_FIELD_VIOLATION_DBL_NEXT = ""
+DCGM_FR_FIELD_THRESHOLD_DBL_NEXT = ""
+DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT = ""
+DCGM_FR_FIELD_THRESHOLD_TS_NEXT = ""
+DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT = ""
+DCGM_FR_THERMAL_VIOLATIONS_NEXT = DEBUG_COOLING_MSG
+DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT = DEBUG_COOLING_MSG
+DCGM_FR_TEMP_VIOLATION_NEXT = "Verify that the user-specified temperature maximum is set "\
+    "correctly. If it is, %s" % DEBUG_COOLING_MSG
+DCGM_FR_THROTTLING_VIOLATION_NEXT = ""
+DCGM_FR_INTERNAL_NEXT = ""
+DCGM_FR_PCIE_GENERATION_NEXT = ""
+DCGM_FR_PCIE_WIDTH_NEXT = ""
+DCGM_FR_ABORTED_NEXT = ""
+DCGM_FR_TEST_DISABLED_NEXT = ""
+DCGM_FR_CANNOT_GET_STAT_NEXT = "If running a standalone nv-hostengine, verify that it is up "\
+    "and responsive."
+DCGM_FR_STRESS_LEVEL_NEXT = ""
+DCGM_FR_CUDA_API_NEXT = ""
+DCGM_FR_FAULTY_MEMORY_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_CANNOT_SET_WATCHES_NEXT = ""
+DCGM_FR_CUDA_UNBOUND_NEXT = ""
+DCGM_FR_ECC_DISABLED_NEXT = "Enable ECC memory by running \"nvidia-smi -i <gpu id> -e 1\" "\
+    "to enable. This may require a GPU reset or reboot to take effect."
+DCGM_FR_MEMORY_ALLOC_NEXT = ""
+DCGM_FR_CUDA_DBE_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_MEMORY_MISMATCH_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_CUDA_DEVICE_NEXT = ""
+DCGM_FR_ECC_UNSUPPORTED_NEXT = ""
+DCGM_FR_ECC_PENDING_NEXT = "Please reboot to activate it."
+DCGM_FR_MEMORY_BANDWIDTH_NEXT = ""
+DCGM_FR_TARGET_POWER_NEXT = ""
+DCGM_FR_API_FAIL_NEXT = ""
+DCGM_FR_API_FAIL_GPU_NEXT = ""
+DCGM_FR_CUDA_CONTEXT_NEXT = "Please make sure the correct driver version is installed and "\
+    "verify that no conflicting libraries are present."
+DCGM_FR_DCGM_API_NEXT = ""
+DCGM_FR_CONCURRENT_GPUS_NEXT = ""
+DCGM_FR_TOO_MANY_ERRORS_NEXT = ""
+DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_ENFORCED_POWER_LIMIT_NEXT = "If this enforced power limit is necessary, then this test "\
+    "cannot be run. If it is unnecessary, then raise the enforced "\
+    "power limit setting to be able to run this test."
+DCGM_FR_MEMORY_ALLOC_HOST_NEXT = "Manually kill processes or restart your machine."
+DCGM_FR_GPU_OP_MODE_NEXT = "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i "\
+    "<gpu index>"
+DCGM_FR_NO_MEMORY_CLOCKS_NEXT = ""
+DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT = ""
+DCGM_FR_HAD_TO_RESTORE_STATE_NEXT = ""
+DCGM_FR_L1TAG_UNSUPPORTED_NEXT = ""
+DCGM_FR_L1TAG_MISCOMPARE_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
+DCGM_FR_ROW_REMAP_FAILURE_NEXT = DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
+DCGM_FR_UNCONTAINED_ERROR_NEXT = DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
+DCGM_FR_EMPTY_GPU_LIST_NEXT = ""
+DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT = "Drain the GPU and reset it or reboot the node to resolve this issue."
+DCGM_FR_UNCORRECTABLE_ROW_REMAP_NEXT = ""
+DCGM_FR_PENDING_ROW_REMAP_NEXT = ""
+DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_NEXT = BUG_REPORT_MSG
+DCGM_FR_BROKEN_P2P_WRITER_DEVICE_NEXT = BUG_REPORT_MSG
+DCGM_FR_NVSWITCH_NVLINK_DOWN_NEXT = "Please check fabric manager and initialization logs to figure out why the link is down. You may also need to run a field diagnostic."
+DCGM_FR_FILE_CREATE_PERMISSIONS_NEXT = "Please restart the hostengine with parameter --home-dir to specify a different home directory for the " \
+    "diagnostic or change permissions in the current directory to allow the user to write files there."
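# Editor's note (not part of this patch): a minimal sketch of how the error
# code, *_MSG format string, and *_NEXT suggestion tables above can be combined
# for triage reporting. It uses only constants defined above in this module;
# the code-to-priority pairing and the helper name are illustrative assumptions,
# not an API provided by dcgm_errors.
_TRIAGE = {
    DCGM_FR_VOLATILE_DBE_DETECTED:
        (DCGM_ERROR_ISOLATE, DCGM_FR_VOLATILE_DBE_DETECTED_MSG,
         DCGM_FR_VOLATILE_DBE_DETECTED_NEXT),
    DCGM_FR_CLOCK_THROTTLE_POWER:
        (DCGM_ERROR_MONITOR, DCGM_FR_CLOCK_THROTTLE_POWER_MSG,
         DCGM_FR_CLOCK_THROTTLE_POWER_NEXT),
}

def describe_error(code, *msg_args):
    """Render one error as a log line with its priority and suggested next step."""
    priority, msg, next_step = _TRIAGE[code]
    return "[prio %d] %s Next step: %s" % (priority, msg % msg_args,
                                           next_step if next_step else "none")

# describe_error(DCGM_FR_VOLATILE_DBE_DETECTED, 2, 0)
# -> "[prio 1] Detected 2 volatile double-bit ECC error(s) in GPU 0. Next step:
#     Drain the GPU and reset it or reboot the node."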
+ + +def dcgmErrorGetPriorityByCode(code): + fn = dcgm_structs._dcgmGetFunctionPointer("dcgmErrorGetPriorityByCode") + ret = fn(code) + return ret + + +def dcgmErrorGetFormatMsgByCode(code): + fn = dcgm_structs._dcgmGetFunctionPointer("dcgmErrorGetFormatMsgByCode") + fn.restype = ctypes.c_char_p + ret = fn(code) + return ret.decode('utf-8') if isinstance(ret, bytes) else ret diff --git a/model_analyzer/monitor/dcgm/dcgm_field_helpers.py b/model_analyzer/monitor/dcgm/dcgm_field_helpers.py index d29a5c412..ceb9f7e0e 100755 --- a/model_analyzer/monitor/dcgm/dcgm_field_helpers.py +++ b/model_analyzer/monitor/dcgm/dcgm_field_helpers.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 - -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,29 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time +import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +import model_analyzer.monitor.dcgm.dcgm_fields_internal as dcgm_fields_internal +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs +import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent import ctypes +import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue +import model_analyzer.monitor.dcgm.pydcgm as pydcgm import json +''' +Helper class that makes a python-friendly field value from one returned from the python bindings +''' -import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent -import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields -import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs -import model_analyzer.monitor.dcgm.dcgm_value as dcgmvalue +class DcgmFieldValue(): + ''' + Constructor -class DcgmFieldValue: - """ - Helper class that makes a python-friendly field value from one returned - from the python bindings - """ + rawValue is the latest dcgm_structs.c_dcgmFieldValue_v? structure of a field value returned from the raw APIs + ''' def __init__(self, rawValue): - """ - rawValue : dcgm_structs.c_dcgmFieldValue_v? 
- is the latest structure of a field value returned from the raw APIs - """ - # Make sure the class passed in is an expected type + #Make sure the class passed in is an expected type if not type(rawValue) == dcgm_structs.c_dcgmFieldValue_v1: - raise Exception(f"Unexpected rawValue type {str(type(rawValue))}") + raise Exception("Unexpected rawValue type %s" % str(type(rawValue))) self.ts = rawValue.ts self.fieldId = rawValue.fieldId @@ -51,10 +51,7 @@ def __init__(self, rawValue): if self.fieldType == dcgm_fields.DCGM_FT_DOUBLE: self.value = float(rawValue.value.dbl) self.isBlank = dcgmvalue.DCGM_FP64_IS_BLANK(self.value) - elif ( - self.fieldType == dcgm_fields.DCGM_FT_INT64 - or self.fieldType == dcgm_fields.DCGM_FT_TIMESTAMP - ): + elif self.fieldType == dcgm_fields.DCGM_FT_INT64 or self.fieldType == dcgm_fields.DCGM_FT_TIMESTAMP: self.value = int(rawValue.value.i64) self.isBlank = dcgmvalue.DCGM_INT64_IS_BLANK(self.value) elif self.fieldType == dcgm_fields.DCGM_FT_STRING: @@ -63,31 +60,33 @@ def __init__(self, rawValue): elif self.fieldType == dcgm_fields.DCGM_FT_BINARY: if self.fieldId == dcgm_fields.DCGM_FI_DEV_ACCOUNTING_DATA: accStats = dcgm_structs.c_dcgmDevicePidAccountingStats_v1() - ctypes.memmove( - ctypes.addressof(accStats), - rawValue.value.blob, - accStats.FieldsSizeof(), - ) - if self.fieldId == dcgm_fields.DCGM_FI_DEV_COMPUTE_PIDS: - accStats = dcgm_structs.c_dcgmDeviceVgpuProcessUtilInfo_v1() - ctypes.memmove( - ctypes.addressof(accStats), - rawValue.value.blob, - accStats.FieldsSizeof(), - ) + ctypes.memmove(ctypes.addressof(accStats), rawValue.value.blob, + accStats.FieldsSizeof()) + if self.fieldId in [ + dcgm_fields_internal.DCGM_FI_DEV_COMPUTE_PIDS, + dcgm_fields_internal.DCGM_FI_DEV_GRAPHICS_PIDS + ]: + processStats = dcgm_structs.c_dcgmRunningProcess_t() + ctypes.memmove(ctypes.addressof(processStats), + rawValue.value.blob, processStats.FieldsSizeof()) + self.value = processStats + self.fieldType = dcgm_fields.DCGM_FT_BINARY + # This should always be false + self.isBlank = dcgmvalue.DCGM_INT64_IS_BLANK(processStats.pid) elif self.fieldId == dcgm_fields.DCGM_FI_SYNC_BOOST: - # Not exposed publicly for now + #Not exposed publicly for now self.value = None else: - raise Exception("Blobs not handled yet for fieldId %d" % self.fieldId) + raise Exception("Blobs not handled yet for fieldId %d" % + self.fieldId) else: raise Exception("Unhandled fieldType: %s" % self.fieldType) class DcgmFieldValueTimeSeries: + def __init__(self): - # Values in timestamp order - self.values = [] + self.values = [] #Values in timestamp order def __len__(self): return len(self.values) @@ -100,7 +99,7 @@ def InsertValue(self, value): self.values.append(value) return - # Otherwise, we need to insert the value in the correct place. + #Otherwise, we need to insert the value in the correct place. 
Find the place for i, existingValue in enumerate(self.values): if value.ts < existingValue.ts: self.values.insert(i, value) @@ -110,75 +109,75 @@ def InsertValue(self, value): class FieldValueEncoder(json.JSONEncoder): - # Pylint does not link overloading the default method, so the comment below - # is WAR for the linting problem + # Pylint does not link overloading the default method, so the comment below is WAR for the linting problem def default(self, obj): # pylint: disable=E0202 nested_json = [] + i = 0 for key in obj: if isinstance(key, DcgmFieldValue): - if key.isBlank: + if (key.isBlank): continue - nested_json.append( - {"Timestamp": key.ts, "FieldId": key.fieldId, "Value": key.value} - ) + nested_json.append({ + 'Timestamp': key.ts, + 'FieldId': key.fieldId, + 'Value': key.value + }) else: return json.JSONEncoder.default( - self, obj - ) # Let default encoder throw exception + self, obj) # Let default encoder throw exception return nested_json -def py_helper_dcgm_field_values_since_callback(gpuId, values, numValues, userData): +def py_helper_dcgm_field_values_since_callback(gpuId, values, numValues, + userData): + userData = ctypes.cast(userData, ctypes.py_object).value userData._ProcessValues(gpuId, values[0:numValues]) return 0 helper_dcgm_field_values_since_callback = dcgm_agent.dcgmFieldValueEnumeration_f( - py_helper_dcgm_field_values_since_callback -) + py_helper_dcgm_field_values_since_callback) -def py_helper_dcgm_field_values_since_callback_v2( - entityGroupId, entityId, values, numValues, userData -): +def py_helper_dcgm_field_values_since_callback_v2(entityGroupId, entityId, + values, numValues, userData): userData = ctypes.cast(userData, ctypes.py_object).value - userData._ProcessValues(entityGroupId, entityId, values[0:numValues]) + userData._ProcessValuesV2(entityGroupId, entityId, values[0:numValues]) return 0 -helper_dcgm_field_values_since_callback_v2 = ( - dcgm_agent.dcgmFieldValueEntityEnumeration_f( - py_helper_dcgm_field_values_since_callback_v2 - ) -) +helper_dcgm_field_values_since_callback_v2 = dcgm_agent.dcgmFieldValueEntityEnumeration_f( + py_helper_dcgm_field_values_since_callback_v2) +''' +Helper class for handling field value update callbacks and storing them in a .values member variable +''' class DcgmFieldValueCollection: - """ - Helper class for handling field value update callbacks and storing them - in a .values member variable - """ def __init__(self, handle, groupId): - self.values = {} - # 2D dictionary of [gpuId][fieldId](DcgmFieldValueTimeSeries) + self.values = { + } #2D dictionary of [gpuId][fieldId](DcgmFieldValueTimeSeries) + self.entityValues = { + } #3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries) self._handle = handle self._groupId = groupId self._numValuesSeen = 0 + self._nextSinceTimestamp = 0 + + ''' + Helper function called by the callback of dcgm_agent.dcgmGetValuesSince to process individual field values + ''' def _ProcessValues(self, gpuId, values): - """ - Helper function called by the callback of - dcgm_agent.dcgmGetValuesSince to process individual field values - """ self._numValuesSeen += len(values) if gpuId not in self.values: self.values[gpuId] = {} for rawValue in values: - # Convert to python-friendly value + #Convert to python-friendly value value = DcgmFieldValue(rawValue) if value.fieldId not in self.values[gpuId]: @@ -186,185 +185,187 @@ def _ProcessValues(self, gpuId, values): self.values[gpuId][value.fieldId].InsertValue(value) - def GetLatestValues(self, fieldGroup): - """ - Get the 
latest values for a fieldGroup and store them to the .values - member variable + ''' + Helper function called by the callback py_helper_dcgm_field_values_since_callback_v2 to process individual field values + ''' + + def _ProcessValuesV2(self, entityGroupId, entityId, values): + self._numValuesSeen += len(values) + + if entityGroupId not in self.entityValues: + self.entityValues[entityGroupId] = {} + + if entityId not in self.entityValues[entityGroupId]: + self.entityValues[entityGroupId][entityId] = {} + + for rawValue in values: + #Convert to python-friendly value + value = DcgmFieldValue(rawValue) - Note: This class does not automatically watch fieldGroup. You must do - that ahead of time with dcgmGroup.samples.WatchFields() - """ + if value.fieldId not in self.entityValues[entityGroupId][entityId]: + self.entityValues[entityGroupId][entityId][ + value.fieldId] = DcgmFieldValueTimeSeries() + + self.entityValues[entityGroupId][entityId][ + value.fieldId].InsertValue(value) + + ''' + Get the latest values for a fieldGroup and store them to the .values member variable + + Note: This class does not automatically watch fieldGroup. You must do that ahead of time with dcgmGroup.samples.WatchFields() + ''' + + def GetLatestValues(self, fieldGroup): ret = dcgm_agent.dcgmGetLatestValues( - self._handle, - self._groupId, - fieldGroup.fieldGroupId, - helper_dcgm_field_values_since_callback, - self, - ) - # Will throw exception on error + self._handle, self._groupId, fieldGroup.fieldGroupId, + helper_dcgm_field_values_since_callback, self) + #Will throw exception on error dcgm_structs._dcgmCheckReturn(ret) + ''' + Method to cause more field values to be retrieved from DCGM. Returns the + number of field values that were retrieved. + ''' + + def GetAllSinceLastCall(self, fieldGroup): + beforeCount = self._numValuesSeen + self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince( + self._handle, self._groupId, fieldGroup.fieldGroupId, + self._nextSinceTimestamp, helper_dcgm_field_values_since_callback, + self) + afterCount = self._numValuesSeen + return afterCount - beforeCount + def GetLatestValues_v2(self, fieldGroup): ret = dcgm_agent.dcgmGetLatestValues_v2( - self._handle, - self._groupId, - fieldGroup.fieldGroupId, - helper_dcgm_field_values_since_callback_v2, - self, - ) - # Will throw exception on error + self._handle, self._groupId, fieldGroup.fieldGroupId, + helper_dcgm_field_values_since_callback_v2, self) + #Will throw exception on error dcgm_structs._dcgmCheckReturn(ret) + ''' + Method to cause more field values to be retrieved from DCGM. Returns the number of field values that were retrieved + ''' + + def GetAllSinceLastCall_v2(self, fieldGroup): + beforeCount = self._numValuesSeen + self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2( + self._handle, self._groupId, fieldGroup.fieldGroupId, + self._nextSinceTimestamp, + helper_dcgm_field_values_since_entity_callback, self) + afterCount = self._numValuesSeen + return afterCount - beforeCount + + ''' + Empty .values{} so that old data is no longer present in this structure. + This can be used to prevent .values from growing over time + ''' + def EmptyValues(self): - """ - Empty .values{} so that old data is no longer present in this - structure. 
This can be used to prevent .values from growing over time - """ self.values = {} self._numValuesSeen = 0 +''' +Helper class for watching a field group and storing fields values returned from it +''' + + class DcgmFieldGroupWatcher(DcgmFieldValueCollection): - """ - Helper class for watching a field group and storing fields values returned - from it - """ - - def __init__( - self, - handle, - groupId, - fieldGroup, - operationMode, - updateFreq, - maxKeepAge, - maxKeepSamples, - startTimestamp, - ): - """ - handle : - DCGM handle from dcgm_agent.dcgmInit() - groupId : - a DCGM group ID returned from dcgm_agent.dcgmGroupCreate - fieldGroup : - DcgmFieldGroup() instance to watch fields for - operationMode : - a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host - engine is running in lock step or auto mode - updateFreq : - how often to update each field in usec - maxKeepAge : - how long DCGM should keep values for in seconds - maxKeepSamples : - is the maximum number of samples DCGM should ever cache for each - field - startTimestamp : - a base timestamp we should start from when first reading - values. This can be used to resume a previous instance of a - DcgmFieldGroupWatcher by using its _nextSinceTimestamp. 0=start - with all cached data - """ + ''' + Constructor + + handle is a DCGM handle from dcgm_agent.dcgmInit() + groupId is a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate + fieldGroup is the DcgmFieldGroup() instance to watch fields for + operationMode is a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host engine is running in lock step or auto mode + updateFreq is how often to update each field in usec + maxKeepAge is how long DCGM should keep values for in seconds + maxKeepSamples is the maximum number of samples DCGM should ever cache for each field + startTimestamp is a base timestamp we should start from when first reading values. This can be used to resume a + previous instance of a DcgmFieldGroupWatcher by using its _nextSinceTimestamp. + 0=start with all cached data + ''' + + def __init__(self, handle, groupId, fieldGroup, operationMode, updateFreq, + maxKeepAge, maxKeepSamples, startTimestamp): self._fieldGroup = fieldGroup - self._oprationMode = operationMode + self._operationMode = operationMode self._updateFreq = updateFreq self._maxKeepAge = maxKeepAge self._maxKeepSamples = maxKeepSamples DcgmFieldValueCollection.__init__(self, handle, groupId) - # Start from beginning of time - self._nextSinceTimestamp = 0 + self._nextSinceTimestamp = 0 #Start from beginning of time if startTimestamp > 0: self._nextSinceTimestamp = startTimestamp - # Start watches + #Start watches self._WatchFieldGroup() + ''' + Initiate the host engine watch on the fields + ''' + def _WatchFieldGroup(self): - """ - Initiate the host engine watch on the fields - """ - ret = dcgm_agent.dcgmWatchFields( - self._handle, - self._groupId, - self._fieldGroup, - self._updateFreq, - self._maxKeepAge, - self._maxKeepSamples, - ) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) + ret = dcgm_agent.dcgmWatchFields(self._handle, self._groupId, + self._fieldGroup.fieldGroupId, + self._updateFreq, self._maxKeepAge, + self._maxKeepSamples) + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error - # Force an update of the fields so that we can fetch initial values + # Force an update of the fields so that we can fetch initial values. 
ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error - # initial update will fetch from startTimestamp - self.GetMore() + # Initial update will fetch from startTimestamp. + self.GetAllSinceLastCall() - def GetMore(self): - """ - Method to cause more field values to be retrieved from DCGM. + ''' + Method to cause more field values to be retrieved from DCGM. Returns the + number of field values that were retrieved + ''' - Returns - ------- - int - the number of field values that were retrieved - """ - beforeCount = self._numValuesSeen - - # If we're in manual mode, force an update - if self._oprationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL: + def GetAllSinceLastCall(self): + #If we're in manual mode, force an update + if self._operationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL: ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error + + return super().GetAllSinceLastCall(self._fieldGroup) - self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince( - self._handle, - self._groupId, - self._fieldGroup, - self._nextSinceTimestamp, - helper_dcgm_field_values_since_callback, - self, - ) - afterCount = self._numValuesSeen - return afterCount - beforeCount +def py_helper_dcgm_field_values_since_entity_callback(entityGroupId, entityId, + values, numValues, + userData): -def py_helper_dcgm_field_values_since_entity_callback( - entityGroupId, entityId, values, numValues, userData -): userData = ctypes.cast(userData, ctypes.py_object).value userData._ProcessValues(entityGroupId, entityId, values[0:numValues]) return 0 -helper_dcgm_field_values_since_entity_callback = ( - dcgm_agent.dcgmFieldValueEntityEnumeration_f( - py_helper_dcgm_field_values_since_entity_callback - ) -) +helper_dcgm_field_values_since_entity_callback = dcgm_agent.dcgmFieldValueEntityEnumeration_f( + py_helper_dcgm_field_values_since_entity_callback) +''' +Helper class for handling field value update callbacks and storing them in a .values member variable +''' class DcgmFieldValueEntityCollection: - """ - Helper class for handling field value update callbacks and storing them - in a .values member variable - """ def __init__(self, handle, groupId): - # 3D dictionary of - # [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries) - self.values = {} + self.values = { + } #3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries) self._handle = handle self._groupId = groupId self._numValuesSeen = 0 + self._nextSinceTimestamp = 0 + + ''' + Helper function called by the callback of dcgm_agent.dcgmGetValuesSince to process individual field values + ''' def _ProcessValues(self, entityGroupId, entityId, values): - """ - Helper function called by the callback of - dcgm_agent.dcgmGetValuesSince to process individual field values - """ self._numValuesSeen += len(values) if entityGroupId not in self.values: @@ -374,141 +375,172 @@ def _ProcessValues(self, entityGroupId, entityId, values): self.values[entityGroupId][entityId] = {} for rawValue in values: - # Convert to python-friendly value + #Convert to python-friendly value value = DcgmFieldValue(rawValue) if value.fieldId not in self.values[entityGroupId][entityId]: self.values[entityGroupId][entityId][ - value.fieldId - ] = DcgmFieldValueTimeSeries() + value.fieldId] = 
DcgmFieldValueTimeSeries() - self.values[entityGroupId][entityId][value.fieldId].InsertValue(value) + self.values[entityGroupId][entityId][value.fieldId].InsertValue( + value) - def GetLatestValues(self, fieldGroup): - """ - Get the latest values for a fieldGroup and store them to the - .values member variable + ''' + Get the latest values for a fieldGroup and store them to the .values member variable + + Note: This class does not automatically watch fieldGroup. You must do that ahead of time with dcgmGroup.samples.WatchFields() + ''' - Note: This class does not automatically watch fieldGroup. You must do - that ahead of time with dcgmGroup.samples.WatchFields() - """ + def GetLatestValues(self, fieldGroup): ret = dcgm_agent.dcgmGetLatestValues_v2( - self._handle, - self._groupId, - fieldGroup.fieldGroupId, - helper_dcgm_field_values_since_entity_callback, - self, - ) - # Will throw exception on error + self._handle, self._groupId, fieldGroup.fieldGroupId, + helper_dcgm_field_values_since_entity_callback, self) + #Will throw exception on error dcgm_structs._dcgmCheckReturn(ret) + ''' + Method to cause more field values to be retrieved from DCGM. Returns the + number of field values that were retrieved. + ''' + + def GetAllSinceLastCall(self, fieldGroup): + beforeCount = self._numValuesSeen + self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2( + self._handle, self._groupId, fieldGroup.fieldGroupId, + self._nextSinceTimestamp, + helper_dcgm_field_values_since_entity_callback, self) + afterCount = self._numValuesSeen + return afterCount - beforeCount + + ''' + Empty .values{} so that old data is no longer present in this structure. + This can be used to prevent .values from growing over time + ''' + def EmptyValues(self): - """ - Empty .values{} so that old data is no longer present in this - structure. This can be used to prevent .values from growing over time - """ self.values = {} self._numValuesSeen = 0 +''' +Helper class for watching a field group and storing fields values returned from it +''' + + class DcgmFieldGroupEntityWatcher(DcgmFieldValueEntityCollection): - """ - Helper class for watching a field group and storing fields values - returned from it - """ - - def __init__( - self, - handle, - groupId, - fieldGroup, - operationMode, - updateFreq, - maxKeepAge, - maxKeepSamples, - startTimestamp, - ): - """ - Constructor - - handle : - a DCGM handle from dcgm_agent.dcgmInit() - groupId : - a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate - fieldGroup : - DcgmFieldGroup() instance to watch fields for - operationMode : - is a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host - engine is running in lock step or auto mode - updateFreq : - how often to update each field in usec - maxKeepAge : - how long DCGM should keep values for in seconds - maxKeepSamples : - the maximum number of samples DCGM should ever cache for each field - startTimestamp : - a base timestamp we should start from when first reading values. - This can be used to resume a previous instance of a - DcgmFieldGroupWatcher by using its _nextSinceTimestamp. 0=start - with all cached data - """ + ''' + Constructor + + handle is a DCGM handle from dcgm_agent.dcgmInit() + groupId is a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate + fieldGroup is the DcgmFieldGroup() instance to watch fields for + operationMode is a dcgm_structs.DCGM_OPERATION_MODE_? 
constant for if the host engine is running in lock step or auto mode + updateFreq is how often to update each field in usec + maxKeepAge is how long DCGM should keep values for in seconds + maxKeepSamples is the maximum number of samples DCGM should ever cache for each field + startTimestamp is a base timestamp we should start from when first reading values. This can be used to resume a + previous instance of a DcgmFieldGroupWatcher by using its _nextSinceTimestamp. + 0=start with all cached data + ''' + + def __init__(self, handle, groupId, fieldGroup, operationMode, updateFreq, + maxKeepAge, maxKeepSamples, startTimestamp): self._fieldGroup = fieldGroup - self._oprationMode = operationMode + self._operationMode = operationMode self._updateFreq = updateFreq self._maxKeepAge = maxKeepAge self._maxKeepSamples = maxKeepSamples DcgmFieldValueEntityCollection.__init__(self, handle, groupId) - # Start from beginning of time - self._nextSinceTimestamp = 0 + self._nextSinceTimestamp = 0 #Start from beginning of time if startTimestamp > 0: self._nextSinceTimestamp = startTimestamp - # Start watches + #Start watches self._WatchFieldGroup() + ''' + Initiate the host engine watch on the fields + ''' + def _WatchFieldGroup(self): - """ - Initiate the host engine watch on the fields - """ - ret = dcgm_agent.dcgmWatchFields( - self._handle, - self._groupId, - self._fieldGroup.fieldGroupId, - self._updateFreq, - self._maxKeepAge, - self._maxKeepSamples, - ) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) + ret = dcgm_agent.dcgmWatchFields(self._handle, self._groupId, + self._fieldGroup.fieldGroupId, + self._updateFreq, self._maxKeepAge, + self._maxKeepSamples) + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error - # Force an update of the fields so that we can fetch initial values + # Force an update of the fields so that we can fetch initial values. ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) - # initial update will fetch from startTimestamp - self.GetMore() - - def GetMore(self): - """ - Method to cause more field values to be retrieved from DCGM. Returns - the number of field values that were retrieved - """ - beforeCount = self._numValuesSeen + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error - # If we're in manual mode, force an update - if self._oprationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL: - ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1) - # Will throw exception on error - dcgm_structs._dcgmCheckReturn(ret) + # Initial update will fetch from startTimestamp. + self.GetAllSinceLastCall() - self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2( - self._handle, - self._groupId, - self._fieldGroup.fieldGroupId, - self._nextSinceTimestamp, - helper_dcgm_field_values_since_entity_callback, - self, - ) - afterCount = self._numValuesSeen - return afterCount - beforeCount + ''' + Method to cause more field values to be retrieved from DCGM. 
Returns the + number of field values that were retrieved + ''' + + def GetAllSinceLastCall(self): + #If we're in manual mode, force an update + if self._operationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL: + ret = dcgm_agent.dcgmUpdateAllFields(self._handle, 1) + dcgm_structs._dcgmCheckReturn(ret) #Will throw exception on error + + return super().GetAllSinceLastCall(self._fieldGroup) + + +#Test program for demonstrating how this module works +def main(): + operationMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO + timeStep = 1.0 + + dcgm_structs._dcgmInit() + dcgm_agent.dcgmInit() #Will throw an exception on error + handle = dcgm_agent.dcgmStartEmbedded(operationMode) + handleObj = pydcgm.DcgmHandle(handle=handle) + groupId = dcgm_structs.DCGM_GROUP_ALL_GPUS + fieldIds = [ + dcgm_fields.DCGM_FI_DEV_SM_CLOCK, dcgm_fields.DCGM_FI_DEV_MEM_CLOCK + ] + + fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds) + + updateFreq = int(timeStep * 1000000.0) + maxKeepAge = 3600.0 #1 hour + maxKeepSamples = 0 #unlimited. maxKeepAge will enforce quota + startTimestamp = 0 #beginning of time + + dfcw = DcgmFieldGroupWatcher(handle, groupId, fieldGroup, operationMode, + updateFreq, maxKeepAge, maxKeepSamples, + startTimestamp) + dfcw2 = DcgmFieldGroupEntityWatcher(handle, groupId, fieldGroup, + operationMode, updateFreq, maxKeepAge, + maxKeepSamples, startTimestamp) + + while (True): + newUpdateCount = dfcw.GetAllSinceLastCall() + newUpdateCount2 = dfcw2.GetAllSinceLastCall() + print("Got %d and %d new field value updates" % + (newUpdateCount, newUpdateCount2)) + for gpuId in list(dfcw.values.keys()): + print("gpuId %d" % gpuId) + for fieldId in list(dfcw.values[gpuId].keys()): + print(" fieldId %d: %d values. latest timestamp %d" % \ + (fieldId, len(dfcw.values[gpuId][fieldId]), dfcw.values[gpuId][fieldId][-1].ts)) + + for entityGroupId in list(dfcw2.values.keys()): + print("entityGroupId %d" % entityGroupId) + for entityId in list(dfcw2.values[entityGroupId].keys()): + print(" entityId %d" % entityId) + for fieldId in list( + dfcw2.values[entityGroupId][entityId].keys()): + print(" fieldId %d: %d values. latest timestamp %d" % \ + (fieldId, len(dfcw2.values[entityGroupId][entityId][fieldId]), dfcw2.values[entityGroupId][entityId][fieldId][-1].ts)) + + time.sleep(timeStep) + + +if __name__ == "__main__": + main() diff --git a/model_analyzer/monitor/dcgm/dcgm_fields.py b/model_analyzer/monitor/dcgm/dcgm_fields.py index 708008233..7c07111cd 100755 --- a/model_analyzer/monitor/dcgm/dcgm_fields.py +++ b/model_analyzer/monitor/dcgm/dcgm_fields.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 - -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,38 +11,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
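Before moving on to dcgm_fields.py: the dcgm_field_helpers.py hunk above renames the watcher polling entry point from GetMore() to GetAllSinceLastCall() and now passes fieldGroup.fieldGroupId (rather than the DcgmFieldGroup object) to dcgmWatchFields(). A minimal usage sketch of the updated polling loop, modeled on the module's own test program; the single field ID, the group name "example_fields", and the one-second cadence are illustrative assumptions, not part of the patch:

# Sketch only -- not part of the patch. Mirrors the bootstrap used by the test
# program in dcgm_field_helpers.py: embedded host engine, AUTO operation mode.
import time

import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
import model_analyzer.monitor.dcgm.pydcgm as pydcgm
from model_analyzer.monitor.dcgm.dcgm_field_helpers import DcgmFieldGroupWatcher

operationMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO
dcgm_structs._dcgmInit()
dcgm_agent.dcgmInit()
handle = dcgm_agent.dcgmStartEmbedded(operationMode)
handleObj = pydcgm.DcgmHandle(handle=handle)

fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "example_fields",
                                   [dcgm_fields.DCGM_FI_DEV_GPU_UTIL])

watcher = DcgmFieldGroupWatcher(handle, dcgm_structs.DCGM_GROUP_ALL_GPUS,
                                fieldGroup, operationMode,
                                updateFreq=1000000,   # usec between field updates
                                maxKeepAge=3600.0,    # seconds of history DCGM keeps
                                maxKeepSamples=0,     # 0 = bounded by maxKeepAge
                                startTimestamp=0)     # 0 = start with all cached data

while True:
    # Formerly watcher.GetMore(); the renamed method keeps the same contract and
    # returns how many new samples were appended to watcher.values since last call.
    newSamples = watcher.GetAllSinceLastCall()
    for gpuId, fields in watcher.values.items():
        for fieldId, series in fields.items():
            print(gpuId, fieldId, series[-1].ts, series[-1].value)
    watcher.EmptyValues()  # keep the in-memory time series from growing unbounded
    time.sleep(1.0)

Calling EmptyValues() between polls keeps .values bounded, while the internal _nextSinceTimestamp still ensures samples already seen are not delivered again.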
+## +# Python bindings for the internal API of DCGM library (dcgm_fields.h) +## -from ctypes import ( - POINTER, - Structure, - addressof, - c_char, - c_char_p, - c_int, - c_short, - c_ubyte, - c_uint32, - memmove, - sizeof, -) - +from ctypes import * +from ctypes.util import find_library import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs # Provides access to functions dcgmFP = dcgm_structs._dcgmGetFunctionPointer # Field Types are a single byte. List these in ASCII order -DCGM_FT_BINARY = "b" # Blob of binary data representing a structure -DCGM_FT_DOUBLE = "d" # 8-byte double precision -DCGM_FT_INT64 = "i" # 8-byte signed integer -DCGM_FT_STRING = "s" # Null-terminated ASCII Character string -DCGM_FT_TIMESTAMP = "t" # 8-byte signed integer usec since 1970 +DCGM_FT_BINARY = 'b' # Blob of binary data representing a structure +DCGM_FT_DOUBLE = 'd' # 8-byte double precision +DCGM_FT_INT64 = 'i' # 8-byte signed integer +DCGM_FT_STRING = 's' # Null-terminated ASCII Character string +DCGM_FT_TIMESTAMP = 't' # 8-byte signed integer usec since 1970 # Field scope. What are these fields associated with DCGM_FS_GLOBAL = 0 # Field is global (ex: driver version) DCGM_FS_ENTITY = 1 # Field is associated with an entity (GPU, VGPU, ..etc) -# Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY -DCGM_FS_DEVICE = DCGM_FS_ENTITY +DCGM_FS_DEVICE = DCGM_FS_ENTITY # Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY # DCGM_FI_DEV_CLOCK_THROTTLE_REASONS is a bitmap of why the clock is throttled. # These macros are masks for relevant throttling, and are a 1:1 map to the NVML @@ -63,8 +51,7 @@ # # This is an indicator of: # - temperature being too high -# - External Power Brake Assertion is triggered -# (e.g. by the system power supply) +# - External Power Brake Assertion is triggered (e.g. by the system power supply) # - Power draw is too high and Fast Trigger protection is reducing the clocks # - May be also reported during PState or clock change # - This behavior may be removed in a later release. @@ -87,635 +74,451 @@ # - Current memory temperature above the Memory Max Operating Temperature DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL = 0x0000000000000020 -# HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is -# engaged +# HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged # # This is an indicator of: # - temperature being too high DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL = 0x0000000000000040 -# HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) -# is engaged +# HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged # # This is an indicator of: -# - External Power Brake Assertion being triggered (e.g. by the system power -# supply) +# - External Power Brake Assertion being triggered (e.g. by the system power supply) DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE = 0x0000000000000080 # GPU clocks are limited by current setting of Display clocks DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS = 0x0000000000000100 -# Field entity groups. Which type of entity is this field or field value -# associated with - -# Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL -DCGM_FE_NONE = 0 +#Field entity groups. Which type of entity is this field or field value associated with +DCGM_FE_NONE = 0 # Field is not associated with an entity. 
Field scope should be DCGM_FS_GLOBAL DCGM_FE_GPU = 1 # Field is associated with a GPU entity DCGM_FE_VGPU = 2 # Field is associated with a VGPU entity DCGM_FE_SWITCH = 3 # Field is associated with a Switch entity DCGM_FE_GPU_I = 4 # Field is associated with a GPU Instance entity DCGM_FE_GPU_CI = 5 # Field is associated with a GPU Compute Instance entity +DCGM_FE_LINK = 6 # Field is associated with an NVLINK -# Represents an identifier for an entity within a field entity. For instance, -# this is the gpuId for DCGM_FE_GPU. -c_dcgm_field_eid_t = c_uint32 +c_dcgm_field_eid_t = c_uint32 #Represents an identifier for an entity within a field entity. For instance, this is the gpuId for DCGM_FE_GPU. -# -# System attributes -# +#System attributes DCGM_FI_UNKNOWN = 0 -# Driver Version -DCGM_FI_DRIVER_VERSION = 1 -# Underlying NVML version -DCGM_FI_NVML_VERSION = 2 -# Process Name. Will be nv-hostengine or your process's name in embedded mode -DCGM_FI_PROCESS_NAME = 3 -# Number of Devices on the node -DCGM_FI_DEV_COUNT = 4 - -# -# Device attributes -# -# Name of the GPU device -DCGM_FI_DEV_NAME = 50 -# Device Brand -DCGM_FI_DEV_BRAND = 51 -# NVML index of this GPU -DCGM_FI_DEV_NVML_INDEX = 52 -# Device Serial Number -DCGM_FI_DEV_SERIAL = 53 -# UUID corresponding to the device -DCGM_FI_DEV_UUID = 54 -# Device node minor number /dev/nvidia# -DCGM_FI_DEV_MINOR_NUMBER = 55 -# OEM inforom version -DCGM_FI_DEV_OEM_INFOROM_VER = 56 -# PCI attributes for the device -DCGM_FI_DEV_PCI_BUSID = 57 -# The combined 16-bit device id and 16-bit vendor id -DCGM_FI_DEV_PCI_COMBINED_ID = 58 -# The 32-bit Sub System Device ID -DCGM_FI_DEV_PCI_SUBSYS_ID = 59 -# Topology of all GPUs on the system via PCI (static) -DCGM_FI_GPU_TOPOLOGY_PCI = 60 -# Topology of all GPUs on the system via NVLINK (static) -DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 -# Affinity of all GPUs on the system (static) -DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 -# Compute mode for the device -DCGM_FI_DEV_COMPUTE_MODE = 65 -# Persistence mode for the device -DCGM_FI_DEV_PERSISTENCE_MODE = 66 -# MIG mode for the device -DCGM_FI_DEV_MIG_MODE = 67 -# String value for CUDA_VISIBLE_DEVICES for the device -DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = 68 -# Device CPU affinity. part 1/8 = cpus 0 - 63 -DCGM_FI_DEV_CPU_AFFINITY_0 = 70 -# Device CPU affinity. part 1/8 = cpus 64 - 127 -DCGM_FI_DEV_CPU_AFFINITY_1 = 71 -# Device CPU affinity. part 2/8 = cpus 128 - 191 -DCGM_FI_DEV_CPU_AFFINITY_2 = 72 -# Device CPU affinity. 
part 3/8 = cpus 192 - 255 -DCGM_FI_DEV_CPU_AFFINITY_3 = 73 -# ECC inforom version -DCGM_FI_DEV_ECC_INFOROM_VER = 80 -# Power management object inforom version -DCGM_FI_DEV_POWER_INFOROM_VER = 81 -# Inforom image version -DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 -# Inforom configuration checksum -DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 -# Reads the infoROM from the flash and verifies the checksums -DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 -# VBIOS version of the device -DCGM_FI_DEV_VBIOS_VERSION = 85 -# Total BAR1 of the GPU -DCGM_FI_DEV_BAR1_TOTAL = 90 -# Deprecated - Sync boost settings on the node -DCGM_FI_SYNC_BOOST = 91 -# Used BAR1 of the GPU in MB -DCGM_FI_DEV_BAR1_USED = 92 -# Free BAR1 of the GPU in MB -DCGM_FI_DEV_BAR1_FREE = 93 - -# -# Clocks and power -# -# SM clock for the device -DCGM_FI_DEV_SM_CLOCK = 100 -# Memory clock for the device -DCGM_FI_DEV_MEM_CLOCK = 101 -# Video encoder/decoder clock for the device -DCGM_FI_DEV_VIDEO_CLOCK = 102 -# SM Application clocks -DCGM_FI_DEV_APP_SM_CLOCK = 110 -# Memory Application clocks -DCGM_FI_DEV_APP_MEM_CLOCK = 111 -# Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*) -DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 -# Maximum supported SM clock for the device -DCGM_FI_DEV_MAX_SM_CLOCK = 113 -# Maximum supported Memory clock for the device -DCGM_FI_DEV_MAX_MEM_CLOCK = 114 -# Maximum supported Video encoder/decoder clock for the device -DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 -# Auto-boost for the device (1 = enabled. 0 = disabled) -DCGM_FI_DEV_AUTOBOOST = 120 -# Supported clocks for the device -DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 -# Memory temperature for the device -DCGM_FI_DEV_MEMORY_TEMP = 140 -# Current temperature readings for the device, in degrees C -DCGM_FI_DEV_GPU_TEMP = 150 -# Power usage for the device in Watts -DCGM_FI_DEV_POWER_USAGE = 155 -# Total energy consumption for the GPU in mJ since the driver was last reloaded -DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 -# Slowdown temperature for the device -DCGM_FI_DEV_SLOWDOWN_TEMP = 158 -# Shutdown temperature for the device -DCGM_FI_DEV_SHUTDOWN_TEMP = 159 -# Current Power limit for the device -DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 -# Minimum power management limit for the device -DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 -# Maximum power management limit for the device -DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 -# Default power management limit for the device -DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 -# Effective power limit that the driver enforces after taking into account all -# limiters -DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 -# Performance state (P-State) 0-15. 0=highest -DCGM_FI_DEV_PSTATE = 190 -# Fan speed for the device in percent 0-100 -DCGM_FI_DEV_FAN_SPEED = 191 - -# -# Device utilization and telemetry -# -# Deprecated - PCIe Tx utilization information -DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 -# Deprecated - PCIe Rx utilization information -DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 -# PCIe replay counter -DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 -# GPU Utilization -DCGM_FI_DEV_GPU_UTIL = 203 -# Memory Utilization -DCGM_FI_DEV_MEM_COPY_UTIL = 204 -# Process accounting stats -DCGM_FI_DEV_ACCOUNTING_DATA = 205 -# Encoder utilization -DCGM_FI_DEV_ENC_UTIL = 206 -# Decoder utilization -DCGM_FI_DEV_DEC_UTIL = 207 -# Memory utilization samples -DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210 -# SM utilization samples -DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211 -# Graphics processes running on the GPU. -DCGM_FI_DEV_GRAPHICS_PIDS = 220 -# Compute processes running on the GPU. 
-DCGM_FI_DEV_COMPUTE_PIDS = 221 -# XID errors. The value is the specific XID error -DCGM_FI_DEV_XID_ERRORS = 230 -# PCIe Max Link Generation -DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 -# PCIe Max Link Width -DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 -# PCIe Current Link Generation -DCGM_FI_DEV_PCIE_LINK_GEN = 237 -# PCIe Current Link Width -DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 - -# -# Violation counters -# -# Power Violation time in usec -DCGM_FI_DEV_POWER_VIOLATION = 240 -# Thermal Violation time in usec -DCGM_FI_DEV_THERMAL_VIOLATION = 241 -# Sync Boost Violation time in usec -DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 -# Board Limit Violation time in usec. -DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 -# Low Utilization Violation time in usec. -DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 -# Reliability Violation time in usec. -DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 -# App Clocks Violation time in usec. -DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 -# Base Clocks Violation time in usec. -DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 - -# -# Framebuffer usage -# -# Total framebuffer memory in MB -DCGM_FI_DEV_FB_TOTAL = 250 -# Total framebuffer used in MB -DCGM_FI_DEV_FB_FREE = 251 -# Total framebuffer free in MB -DCGM_FI_DEV_FB_USED = 252 - -# -# Device ECC Counters -# -# Current ECC mode for the device -DCGM_FI_DEV_ECC_CURRENT = 300 -# Pending ECC mode for the device -DCGM_FI_DEV_ECC_PENDING = 301 -# Total single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 -# Total double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 -# Total single bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 -# Total double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 -# L1 cache single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 -# L1 cache double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 -# L2 cache single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 -# L2 cache double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 -# Device memory single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 -# Device memory double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 -# Register file single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 -# Register file double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 -# Texture memory single bit volatile ecc errors -DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 -# Texture memory double bit volatile ecc errors -DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 -# L1 cache single bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 -# L1 cache double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 -# L2 cache single bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 -# L2 cache double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 -# Device memory single bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 -# Device memory double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 -# Register File single bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 -# Register File double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 -# Texture memory single bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 -# Texture memory double bit aggregate (persistent) ecc errors -DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 -# Number of retired pages because of single bit errors 
-DCGM_FI_DEV_RETIRED_SBE = 390 -# Number of retired pages because of double bit errors -DCGM_FI_DEV_RETIRED_DBE = 391 -# Number of pages pending retirement -DCGM_FI_DEV_RETIRED_PENDING = 392 - -# -# Row remapper fields (Ampere and newer) -# -# Number of remapped rows for uncorrectable errors -DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 -# Number of remapped rows for correctable errors -DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 -# Whether remapping of rows has failed -DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 - -# -# Device NvLink Bandwidth and Error Counters -# -# NV Link flow control CRC Error Counter for Lane 0 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 -# NV Link flow control CRC Error Counter for Lane 1 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 -# NV Link flow control CRC Error Counter for Lane 2 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 -# NV Link flow control CRC Error Counter for Lane 3 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 -# NV Link flow control CRC Error Counter for Lane 4 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 -# NV Link flow control CRC Error Counter for Lane 5 -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 -# NV Link flow control CRC Error Counter total for all Lanes -DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 -# NV Link data CRC Error Counter for Lane 0 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 -# NV Link data CRC Error Counter for Lane 1 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 -# NV Link data CRC Error Counter for Lane 2 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 -# NV Link data CRC Error Counter for Lane 3 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 -# NV Link data CRC Error Counter for Lane 4 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 -# NV Link data CRC Error Counter for Lane 5 -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 -# NV Link data CRC Error Counter total for all Lanes -DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 -# NV Link Replay Error Counter for Lane 0 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 -# NV Link Replay Error Counter for Lane 1 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 -# NV Link Replay Error Counter for Lane 2 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 -# NV Link Replay Error Counter for Lane 3 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 -# NV Link Replay Error Counter for Lane 4 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 -# NV Link Replay Error Counter for Lane 3 -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 -# NV Link Replay Error Counter total for all Lanes -DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 -# NV Link Recovery Error Counter for Lane 0 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 -# NV Link Recovery Error Counter for Lane 1 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 -# NV Link Recovery Error Counter for Lane 2 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 -# NV Link Recovery Error Counter for Lane 3 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 -# NV Link Recovery Error Counter for Lane 4 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 -# NV Link Recovery Error Counter for Lane 5 -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 -# NV Link Recovery Error Counter total for all Lanes -DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 -# NV Link Bandwidth Counter for Lane 0 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 -# NV Link Bandwidth Counter for Lane 1 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 -# NV Link Bandwidth Counter for Lane 2 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 -# NV Link 
Bandwidth Counter for Lane 3 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 -# NV Link Bandwidth Counter for Lane 4 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 -# NV Link Bandwidth Counter for Lane 5 -DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 -# NV Link Bandwidth Counter total for all Lanes -DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 -# GPU NVLink error information -DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 - -# -# Device Attributes associated with virtualization -# -# Operating mode of the GPU -DCGM_FI_DEV_VIRTUAL_MODE = 500 -# Includes Count and Supported vGPU type information -DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 -# Includes Count and List of Creatable vGPU type IDs -DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 -# Includes Count and List of vGPU instance IDs -DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 -# Utilization values for vGPUs running on the device -DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 -# Utilization values for processes running within vGPU VMs using the device -DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 -# Current encoder statistics for a given device -DCGM_FI_DEV_ENC_STATS = 506 -# Statistics of current active frame buffer capture sessions on a given device -DCGM_FI_DEV_FBC_STATS = 507 -# Information about active frame buffer capture sessions on a target device -DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 - -# -# Related to vGPU Instance IDs -# -# vGPU VM ID -DCGM_FI_DEV_VGPU_VM_ID = 520 -# vGPU VM name -DCGM_FI_DEV_VGPU_VM_NAME = 521 -# vGPU type of the vGPU instance -DCGM_FI_DEV_VGPU_TYPE = 522 -# UUID of the vGPU instance -DCGM_FI_DEV_VGPU_UUID = 523 -# Driver version of the vGPU instance -DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 -# Memory usage of the vGPU instance -DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 -# License status of the vGPU instance -DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 -# Frame rate limit of the vGPU instance -DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 -# Current encoder statistics of the vGPU instance -DCGM_FI_DEV_VGPU_ENC_STATS = 528 -# Information about all active encoder sessions on the vGPU instance -DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 -# Statistics of current active frame buffer capture sessions on the vGPU -# instance -DCGM_FI_DEV_VGPU_FBC_STATS = 530 -# Information about active frame buffer capture sessions on the vGPU instance -DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 - -# Internal fields reserve the range 600..699 -# below fields related to NVSwitch -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 = 700 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 = 701 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 = 702 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 = 703 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 = 704 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 = 705 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 = 706 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 = 707 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 = 708 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 = 709 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 = 710 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 = 711 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 = 712 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 = 713 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 = 714 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 = 715 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 = 716 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 = 717 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 = 718 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 = 719 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 = 720 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 = 721 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 = 722 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 = 723 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 = 724 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 = 725 
-DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 = 726 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 = 727 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 = 728 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 = 729 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 = 730 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 = 731 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 = 732 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 = 733 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 = 734 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 = 735 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 = 736 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 = 737 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 = 738 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 = 739 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 = 740 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 = 741 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 = 742 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 = 743 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 = 744 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 = 745 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 = 746 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 = 747 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 = 748 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 = 749 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 = 750 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 = 751 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 = 752 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 = 753 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 = 754 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 = 755 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 = 756 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 = 757 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 = 758 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 = 759 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 = 760 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 = 761 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 = 762 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 = 763 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 = 764 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 = 765 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 = 766 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 = 767 -DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 = 768 -DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 = 769 -DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 = 770 -DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 = 771 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 = 780 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 = 781 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 = 782 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 = 783 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 = 784 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 = 785 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 = 786 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 = 787 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 = 788 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 = 789 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 = 790 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 = 791 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 = 792 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 = 793 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 = 794 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 = 795 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 = 796 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 = 797 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 = 798 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 = 799 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 = 800 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 = 801 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 = 802 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 = 803 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 = 804 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 = 805 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 = 806 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 = 807 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 = 808 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 = 809 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 = 810 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 = 811 
-DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 = 812 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 = 813 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 = 814 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 = 815 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 = 820 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 = 821 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 = 822 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 = 823 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 = 824 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 = 825 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 = 826 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 = 827 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 = 828 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 = 829 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 = 830 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 = 831 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 = 832 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 = 833 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 = 834 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 = 835 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 = 836 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 = 837 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 = 838 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 = 839 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 = 840 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 = 841 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 = 842 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 = 843 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 = 844 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 = 845 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 = 846 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 = 847 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 = 848 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 = 849 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 = 850 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 = 851 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 = 852 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 = 853 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 = 854 -DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 = 855 +DCGM_FI_DRIVER_VERSION = 1 #Driver Version +DCGM_FI_NVML_VERSION = 2 #Underlying NVML version +DCGM_FI_PROCESS_NAME = 3 #Process Name. Will be nv-hostengine or your process's name in embedded mode +DCGM_FI_DEV_COUNT = 4 #Number of Devices on the node +DCGM_FI_CUDA_DRIVER_VERSION = 5 #Cuda Driver Version as an integer. 
CUDA 11.1 = 11100 +#Device attributes +DCGM_FI_DEV_NAME = 50 #Name of the GPU device +DCGM_FI_DEV_BRAND = 51 #Device Brand +DCGM_FI_DEV_NVML_INDEX = 52 #NVML index of this GPU +DCGM_FI_DEV_SERIAL = 53 #Device Serial Number +DCGM_FI_DEV_UUID = 54 #UUID corresponding to the device +DCGM_FI_DEV_MINOR_NUMBER = 55 #Device node minor number /dev/nvidia# +DCGM_FI_DEV_OEM_INFOROM_VER = 56 #OEM inforom version +DCGM_FI_DEV_PCI_BUSID = 57 #PCI attributes for the device +DCGM_FI_DEV_PCI_COMBINED_ID = 58 #The combined 16-bit device id and 16-bit vendor id +DCGM_FI_DEV_PCI_SUBSYS_ID = 59 #The 32-bit Sub System Device ID +DCGM_FI_GPU_TOPOLOGY_PCI = 60 #Topology of all GPUs on the system via PCI (static) +DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 #Topology of all GPUs on the system via NVLINK (static) +DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 #Affinity of all GPUs on the system (static) +DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 #Cuda compute capability for the device +DCGM_FI_DEV_COMPUTE_MODE = 65 #Compute mode for the device +DCGM_FI_DEV_PERSISTENCE_MODE = 66 #Persistence mode for the device +DCGM_FI_DEV_MIG_MODE = 67 #MIG mode for the device +DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = 68 #String value for CUDA_VISIBLE_DEVICES for the device +DCGM_FI_DEV_MIG_MAX_SLICES = 69 #The maximum number of slices this GPU supports +DCGM_FI_DEV_CPU_AFFINITY_0 = 70 #Device CPU affinity. part 1/8 = cpus 0 - 63 +DCGM_FI_DEV_CPU_AFFINITY_1 = 71 #Device CPU affinity. part 1/8 = cpus 64 - 127 +DCGM_FI_DEV_CPU_AFFINITY_2 = 72 #Device CPU affinity. part 2/8 = cpus 128 - 191 +DCGM_FI_DEV_CPU_AFFINITY_3 = 73 #Device CPU affinity. part 3/8 = cpus 192 - 255 +DCGM_FI_DEV_CC_MODE = 74 #Device CC/APM mode +DCGM_FI_DEV_MIG_ATTRIBUTES = 75 #MIG device attributes +DCGM_FI_DEV_MIG_GI_INFO = 76 #GPU instance profile information +DCGM_FI_DEV_MIG_CI_INFO = 77 #Compute instance profile information +DCGM_FI_DEV_ECC_INFOROM_VER = 80 #ECC inforom version +DCGM_FI_DEV_POWER_INFOROM_VER = 81 #Power management object inforom version +DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 #Inforom image version +DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 #Inforom configuration checksum +DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 #Reads the infoROM from the flash and verifies the checksums +DCGM_FI_DEV_VBIOS_VERSION = 85 #VBIOS version of the device +DCGM_FI_DEV_BAR1_TOTAL = 90 #Total BAR1 of the GPU +DCGM_FI_SYNC_BOOST = 91 #Deprecated - Sync boost settings on the node +DCGM_FI_DEV_BAR1_USED = 92 #Used BAR1 of the GPU in MB +DCGM_FI_DEV_BAR1_FREE = 93 #Free BAR1 of the GPU in MB +#Clocks and power +DCGM_FI_DEV_SM_CLOCK = 100 #SM clock for the device +DCGM_FI_DEV_MEM_CLOCK = 101 #Memory clock for the device +DCGM_FI_DEV_VIDEO_CLOCK = 102 #Video encoder/decoder clock for the device +DCGM_FI_DEV_APP_SM_CLOCK = 110 #SM Application clocks +DCGM_FI_DEV_APP_MEM_CLOCK = 111 #Memory Application clocks +DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 #Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*) +DCGM_FI_DEV_MAX_SM_CLOCK = 113 #Maximum supported SM clock for the device +DCGM_FI_DEV_MAX_MEM_CLOCK = 114 #Maximum supported Memory clock for the device +DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 #Maximum supported Video encoder/decoder clock for the device +DCGM_FI_DEV_AUTOBOOST = 120 #Auto-boost for the device (1 = enabled. 
0 = disabled) +DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 #Supported clocks for the device +DCGM_FI_DEV_MEMORY_TEMP = 140 #Memory temperature for the device +DCGM_FI_DEV_GPU_TEMP = 150 #Current temperature readings for the device, in degrees C +DCGM_FI_DEV_MEM_MAX_OP_TEMP = 151 #Maximum operating temperature for the memory of this GPU +DCGM_FI_DEV_GPU_MAX_OP_TEMP = 152 #Maximum operating temperature for this GPU +DCGM_FI_DEV_POWER_USAGE = 155 #Power usage for the device in Watts +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 #Total energy consumption for the GPU in mJ since the driver was last reloaded +DCGM_FI_DEV_SLOWDOWN_TEMP = 158 #Slowdown temperature for the device +DCGM_FI_DEV_SHUTDOWN_TEMP = 159 #Shutdown temperature for the device +DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 #Current Power limit for the device +DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 #Minimum power management limit for the device +DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 #Maximum power management limit for the device +DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 #Default power management limit for the device +DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 #Effective power limit that the driver enforces after taking into account all limiters +DCGM_FI_DEV_PSTATE = 190 #Performance state (P-State) 0-15. 0=highest +DCGM_FI_DEV_FAN_SPEED = 191 #Fan speed for the device in percent 0-100 +#Device utilization and telemetry +DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 #Deprecated - PCIe Tx utilization information +DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 #Deprecated - PCIe Rx utilization information +DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 #PCIe replay counter +DCGM_FI_DEV_GPU_UTIL = 203 #GPU Utilization +DCGM_FI_DEV_MEM_COPY_UTIL = 204 #Memory Utilization +DCGM_FI_DEV_ACCOUNTING_DATA = 205 #Process accounting stats +DCGM_FI_DEV_ENC_UTIL = 206 #Encoder utilization +DCGM_FI_DEV_DEC_UTIL = 207 #Decoder utilization +# Fields 210, 211, 220, and 221 are internal-only. see dcgm_fields_internal.py +DCGM_FI_DEV_XID_ERRORS = 230 #XID errors. The value is the specific XID error +DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 #PCIe Max Link Generation +DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 #PCIe Max Link Width +DCGM_FI_DEV_PCIE_LINK_GEN = 237 #PCIe Current Link Generation +DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 #PCIe Current Link Width +#Violation counters +DCGM_FI_DEV_POWER_VIOLATION = 240 #Power Violation time in usec +DCGM_FI_DEV_THERMAL_VIOLATION = 241 #Thermal Violation time in usec +DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 #Sync Boost Violation time in usec +DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 #Board Limit Violation time in usec. +DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 #Low Utilization Violation time in usec. +DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 #Reliability Violation time in usec. +DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 #App Clocks Violation time in usec. +DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 #Base Clocks Violation time in usec. 
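Since DCGM_FI_DEV_CLOCK_THROTTLE_REASONS (112) above reports a bitmask of the DCGM_CLOCKS_THROTTLE_REASON_* constants declared near the top of this file, here is a short hedged sketch of decoding such a reading; the sample mask value is invented for illustration:

# Illustrative only; a real mask would come from reading the
# DCGM_FI_DEV_CLOCK_THROTTLE_REASONS field. The value below is made up.
import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields

throttle_mask = 0x0000000000000060  # hypothetical reading: SW thermal + HW thermal

reasons = {
    "sw_thermal": dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL,
    "hw_thermal": dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL,
    "hw_power_brake": dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE,
    "display_clocks": dcgm_fields.DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS,
}

active = [name for name, bit in reasons.items() if throttle_mask & bit]
print("active throttle reasons:", active)  # -> ['sw_thermal', 'hw_thermal']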
+#Framebuffer usage +DCGM_FI_DEV_FB_TOTAL = 250 #Total framebuffer memory in MB +DCGM_FI_DEV_FB_FREE = 251 #Total framebuffer used in MB +DCGM_FI_DEV_FB_USED = 252 #Total framebuffer free in MB +DCGM_FI_DEV_FB_RESERVED = 253 #Total framebuffer reserved in MB +#Device ECC Counters +DCGM_FI_DEV_ECC_CURRENT = 300 #Current ECC mode for the device +DCGM_FI_DEV_ECC_PENDING = 301 #Pending ECC mode for the device +DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 #Total single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 #Total double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 #Total single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 #Total double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 #L1 cache single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 #L1 cache double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 #L2 cache single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 #L2 cache double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 #Device memory single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 #Device memory double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 #Register file single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 #Register file double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 #Texture memory single bit volatile ecc errors +DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 #Texture memory double bit volatile ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 #L1 cache single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 #L1 cache double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 #L2 cache single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 #L2 cache double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 #Device memory single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 #Device memory double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 #Register File single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 #Register File double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 #Texture memory single bit aggregate (persistent) ecc errors +DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 #Texture memory double bit aggregate (persistent) ecc errors +DCGM_FI_DEV_RETIRED_SBE = 390 #Number of retired pages because of single bit errors +DCGM_FI_DEV_RETIRED_DBE = 391 #Number of retired pages because of double bit errors +DCGM_FI_DEV_RETIRED_PENDING = 392 #Number of pages pending retirement +#Row remapper fields (Ampere and newer) +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 #Number of remapped rows for uncorrectable errors +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 #Number of remapped rows for correctable errors +DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 #Whether remapping of rows has failed +DCGM_FI_DEV_ROW_REMAP_PENDING = 396 #Whether remapping of rows is pending + +#Device NvLink Bandwidth and Error Counters +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 #NV Link flow control CRC Error Counter for Lane 0 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 #NV Link flow control CRC Error Counter for Lane 1 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 #NV Link flow control CRC Error Counter for Lane 2 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 #NV Link flow control CRC Error Counter for Lane 3 
+DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 #NV Link flow control CRC Error Counter for Lane 4 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 #NV Link flow control CRC Error Counter for Lane 5 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 #NV Link flow control CRC Error Counter total for all Lanes +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 #NV Link data CRC Error Counter for Lane 0 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 #NV Link data CRC Error Counter for Lane 1 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 #NV Link data CRC Error Counter for Lane 2 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 #NV Link data CRC Error Counter for Lane 3 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 #NV Link data CRC Error Counter for Lane 4 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 #NV Link data CRC Error Counter for Lane 5 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 #NV Link data CRC Error Counter total for all Lanes +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 #NV Link Replay Error Counter for Lane 0 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 #NV Link Replay Error Counter for Lane 1 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 #NV Link Replay Error Counter for Lane 2 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 #NV Link Replay Error Counter for Lane 3 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 #NV Link Replay Error Counter for Lane 4 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 #NV Link Replay Error Counter for Lane 3 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 #NV Link Replay Error Counter total for all Lanes + +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 #NV Link Recovery Error Counter for Lane 0 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 #NV Link Recovery Error Counter for Lane 1 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 #NV Link Recovery Error Counter for Lane 2 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 #NV Link Recovery Error Counter for Lane 3 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 #NV Link Recovery Error Counter for Lane 4 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 #NV Link Recovery Error Counter for Lane 5 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 #NV Link Recovery Error Counter total for all Lanes +DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 #NV Link Bandwidth Counter for Lane 0 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 #NV Link Bandwidth Counter for Lane 1 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 #NV Link Bandwidth Counter for Lane 2 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 #NV Link Bandwidth Counter for Lane 3 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 #NV Link Bandwidth Counter for Lane 4 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 #NV Link Bandwidth Counter for Lane 5 +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 #NV Link Bandwidth Counter total for all Lanes +DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 #GPU NVLink error information +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 451 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 452 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 453 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 454 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 455 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 456 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 = 406 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 = 407 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 = 408 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 = 481 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 = 482 +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 = 483 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 457 
+DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 458 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 459 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 460 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 461 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 462 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 = 416 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 = 417 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 = 418 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 = 484 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 = 485 +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 = 486 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 463 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 464 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 465 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 466 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 467 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 468 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 = 426 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 = 427 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 = 428 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 = 487 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 = 488 +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 = 489 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 469 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 470 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 471 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 472 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 473 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 474 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 = 436 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 = 437 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 = 438 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 = 491 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 = 492 +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 = 493 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 = 475 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 = 476 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 = 477 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 = 478 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 = 479 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 = 480 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 = 446 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 = 447 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 = 448 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 = 494 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 = 495 +DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 = 496 + +#Device Attributes associated with virtualization +DCGM_FI_DEV_VIRTUAL_MODE = 500 #Operating mode of the GPU +DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 #Includes Count and Supported vGPU type information +DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 #Includes Count and List of Creatable vGPU type IDs +DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 #Includes Count and List of vGPU instance IDs +DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 #Utilization values for vGPUs running on the device +DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 #Utilization values for processes running within vGPU VMs using the device +DCGM_FI_DEV_ENC_STATS = 506 #Current encoder statistics for a given device +DCGM_FI_DEV_FBC_STATS = 507 #Statistics of current active frame buffer capture sessions on a given device +DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 #Information about active frame buffer capture sessions on a target device +DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS = 509 #Includes Count and currently Supported vGPU types on a device +DCGM_FI_DEV_VGPU_TYPE_INFO = 510 #Includes Static info of vGPU types supported on a device +DCGM_FI_DEV_VGPU_TYPE_NAME = 511 #Includes the name of a vGPU type supported on a device +DCGM_FI_DEV_VGPU_TYPE_CLASS = 512 #Includes the class of a vGPU type supported on a device +DCGM_FI_DEV_VGPU_TYPE_LICENSE = 513 #Includes the license 
info for a vGPU type supported on a device +#Related to vGPU Instance IDs +DCGM_FI_DEV_VGPU_VM_ID = 520 #vGPU VM ID +DCGM_FI_DEV_VGPU_VM_NAME = 521 #vGPU VM name +DCGM_FI_DEV_VGPU_TYPE = 522 #vGPU type of the vGPU instance +DCGM_FI_DEV_VGPU_UUID = 523 #UUID of the vGPU instance +DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 #Driver version of the vGPU instance +DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 #Memory usage of the vGPU instance +DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 #License status of the vGPU +DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 #Frame rate limit of the vGPU instance +DCGM_FI_DEV_VGPU_ENC_STATS = 528 #Current encoder statistics of the vGPU instance +DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 #Information about all active encoder sessions on the vGPU instance +DCGM_FI_DEV_VGPU_FBC_STATS = 530 #Statistics of current active frame buffer capture sessions on the vGPU instance +DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 #Information about active frame buffer capture sessions on the vGPU instance +DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE = 532 #License state information of the vGPU instance +DCGM_FI_DEV_VGPU_PCI_ID = 533 #PCI Id of the vGPU instance +DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID = 534 #GPU Instance Id of the vGPU instance +#Internal fields reserve the range 600..699 +#below fields related to NVSwitch +DCGM_FI_FIRST_NVSWITCH_FIELD_ID = 700 #Starting field ID of the NVSwitch instance +DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX = 780 +DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX = 781 +DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS = 782 +DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS = 783 +DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS = 784 +DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS = 785 +DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS = 786 +DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS = 787 +DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS = 788 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 = 789 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 = 790 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 = 791 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 = 792 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 = 793 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 = 794 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 = 795 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 = 796 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 = 797 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 = 798 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 = 799 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 = 800 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 = 801 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 = 802 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 = 803 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 = 804 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 = 805 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 = 806 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 = 807 +DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 = 808 +DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 = 809 +DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 = 810 +DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 = 811 +DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 = 812 +DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 = 813 +DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 = 814 +DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 = 815 +DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 = 816 DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856 DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857 - -# -# Profiling Fields -# -# Ratio of time the graphics engine is active. The graphics engine is active if -# a graphics/compute context is bound and the graphics pipe or compute pipe is -# busy. 
-DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 - -# The ratio of cycles an SM has at least 1 warp assigned -DCGM_FI_PROF_SM_ACTIVE = 1002 -# (computed from the number of cycles and elapsed cycles) - -# The ratio of number of warps resident on an SM. -DCGM_FI_PROF_SM_OCCUPANCY = 1003 -# (number of resident as a ratio of the theoretical -# maximum number of warps per elapsed cycle) - -# The ratio of cycles the tensor (HMMA) pipe is active -DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 -# (off the peak sustained elapsed cycles) - -# The ratio of cycles the device memory interface is active sending or -# receiving data. -DCGM_FI_PROF_DRAM_ACTIVE = 1005 -# Ratio of cycles the fp64 pipe is active. -DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 -# Ratio of cycles the fp32 pipe is active. -DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 -# Ratio of cycles the fp16 pipe is active. This does not include HMMA. -DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 -# The number of bytes of active PCIe tx (transmit) data including both header -# and payload. -DCGM_FI_PROF_PCIE_TX_BYTES = 1009 -# The number of bytes of active PCIe rx (read) data including both header and -# payload. -DCGM_FI_PROF_PCIE_RX_BYTES = 1010 -# The number of bytes of active NvLink tx (transmit) data including both header -# and payload. -DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 -# The number of bytes of active NvLink rx (receive) data including both header -# and payload. -DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 - -# greater than maximum fields above. This value can increase in the future -DCGM_FI_MAX_FIELDS = 1013 - - -class struct_c_dcgm_field_meta_t(Structure): +DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT = 858 +DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN = 859 +DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN = 860 +DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX = 861 +DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX = 862 + +DCGM_FI_LAST_NVSWITCH_FIELD_ID = 899 #Last field ID of the NVSwitch instance +''' +Profiling Fields +''' +DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 #Ratio of time the graphics engine is active. The graphics engine is +#active if a graphics/compute context is bound and the graphics pipe or +#compute pipe is busy. + +DCGM_FI_PROF_SM_ACTIVE = 1002 #The ratio of cycles an SM has at least 1 warp assigned +#(computed from the number of cycles and elapsed cycles) + +DCGM_FI_PROF_SM_OCCUPANCY = 1003 #The ratio of number of warps resident on an SM. +#(number of resident as a ratio of the theoretical +#maximum number of warps per elapsed cycle) + +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 #The ratio of cycles the any tensor pipe is active +#(off the peak sustained elapsed cycles) + +DCGM_FI_PROF_DRAM_ACTIVE = 1005 #The ratio of cycles the device memory interface is active sending or receiving data. +DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 #Ratio of cycles the fp64 pipe is active. +DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 #Ratio of cycles the fp32 pipe is active. +DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 #Ratio of cycles the fp16 pipe is active. This does not include HMMA. +DCGM_FI_PROF_PCIE_TX_BYTES = 1009 #The number of bytes of active PCIe tx (transmit) data including both header and payload. +DCGM_FI_PROF_PCIE_RX_BYTES = 1010 #The number of bytes of active PCIe rx (read) data including both header and payload. +DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 #The number of bytes of active NvLink tx (transmit) data including both header and payload. +DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 #The number of bytes of active NvLink rx (receive) data including both header and payload. 
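# Illustrative sketch, not part of this change: each profiling field ID above can be
# resolved to its metadata at runtime via DcgmFieldGetById(), defined later in this
# module; per its docstring it returns a c_dcgm_field_meta_t struct on success and None
# on error (the .tag attribute is assumed from that struct's definition in this module).
# Requires the DCGM shared library to be loadable.
import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields

meta = dcgm_fields.DcgmFieldGetById(dcgm_fields.DCGM_FI_PROF_GR_ENGINE_ACTIVE)
if meta is not None:
    print(meta.tag)  # byte-string tag registered for this field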
+DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE = 1013 #The ratio of cycles the IMMA tensor pipe is active (off the peak sustained elapsed cycles) +DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE = 1014 #The ratio of cycles the HMMA tensor pipe is active (off the peak sustained elapsed cycles) +DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE = 1015 #The ratio of cycles the tensor (DFMA) pipe is active (off the peak sustained elapsed cycles) +DCGM_FI_PROF_PIPE_INT_ACTIVE = 1016 #Ratio of cycles the integer pipe is active. + +#Ratio of cycles each of the NVDEC engines are active. +DCGM_FI_PROF_NVDEC0_ACTIVE = 1017 +DCGM_FI_PROF_NVDEC1_ACTIVE = 1018 +DCGM_FI_PROF_NVDEC2_ACTIVE = 1019 +DCGM_FI_PROF_NVDEC3_ACTIVE = 1020 +DCGM_FI_PROF_NVDEC4_ACTIVE = 1021 +DCGM_FI_PROF_NVDEC5_ACTIVE = 1022 +DCGM_FI_PROF_NVDEC6_ACTIVE = 1023 +DCGM_FI_PROF_NVDEC7_ACTIVE = 1024 + +#Ratio of cycles each of the NVJPG engines are active. +DCGM_FI_PROF_NVJPG0_ACTIVE = 1025 +DCGM_FI_PROF_NVJPG1_ACTIVE = 1026 +DCGM_FI_PROF_NVJPG2_ACTIVE = 1027 +DCGM_FI_PROF_NVJPG3_ACTIVE = 1028 +DCGM_FI_PROF_NVJPG4_ACTIVE = 1029 +DCGM_FI_PROF_NVJPG5_ACTIVE = 1030 +DCGM_FI_PROF_NVJPG6_ACTIVE = 1031 +DCGM_FI_PROF_NVJPG7_ACTIVE = 1032 + +#Ratio of cycles each of the NVOFA engines are active. +DCGM_FI_PROF_NVOFA0_ACTIVE = 1033 +''' +The per-link number of bytes of active NvLink TX (transmit) or RX (transmit) data including both header and payload. +For example: DCGM_FI_PROF_NVLINK_L0_TX_BYTES -> L0 TX +To get the bandwidth for a link, add the RX and TX value together like +total = DCGM_FI_PROF_NVLINK_L0_TX_BYTES + DCGM_FI_PROF_NVLINK_L0_RX_BYTES +''' +DCGM_FI_PROF_NVLINK_L0_TX_BYTES = 1040 +DCGM_FI_PROF_NVLINK_L0_RX_BYTES = 1041 +DCGM_FI_PROF_NVLINK_L1_TX_BYTES = 1042 +DCGM_FI_PROF_NVLINK_L1_RX_BYTES = 1043 +DCGM_FI_PROF_NVLINK_L2_TX_BYTES = 1044 +DCGM_FI_PROF_NVLINK_L2_RX_BYTES = 1045 +DCGM_FI_PROF_NVLINK_L3_TX_BYTES = 1046 +DCGM_FI_PROF_NVLINK_L3_RX_BYTES = 1047 +DCGM_FI_PROF_NVLINK_L4_TX_BYTES = 1048 +DCGM_FI_PROF_NVLINK_L4_RX_BYTES = 1049 +DCGM_FI_PROF_NVLINK_L5_TX_BYTES = 1050 +DCGM_FI_PROF_NVLINK_L5_RX_BYTES = 1051 +DCGM_FI_PROF_NVLINK_L6_TX_BYTES = 1052 +DCGM_FI_PROF_NVLINK_L6_RX_BYTES = 1053 +DCGM_FI_PROF_NVLINK_L7_TX_BYTES = 1054 +DCGM_FI_PROF_NVLINK_L7_RX_BYTES = 1055 +DCGM_FI_PROF_NVLINK_L8_TX_BYTES = 1056 +DCGM_FI_PROF_NVLINK_L8_RX_BYTES = 1057 +DCGM_FI_PROF_NVLINK_L9_TX_BYTES = 1058 +DCGM_FI_PROF_NVLINK_L9_RX_BYTES = 1059 +DCGM_FI_PROF_NVLINK_L10_TX_BYTES = 1060 +DCGM_FI_PROF_NVLINK_L10_RX_BYTES = 1061 +DCGM_FI_PROF_NVLINK_L11_TX_BYTES = 1062 +DCGM_FI_PROF_NVLINK_L11_RX_BYTES = 1063 +DCGM_FI_PROF_NVLINK_L12_TX_BYTES = 1064 +DCGM_FI_PROF_NVLINK_L12_RX_BYTES = 1065 +DCGM_FI_PROF_NVLINK_L13_TX_BYTES = 1066 +DCGM_FI_PROF_NVLINK_L13_RX_BYTES = 1067 +DCGM_FI_PROF_NVLINK_L14_TX_BYTES = 1068 +DCGM_FI_PROF_NVLINK_L14_RX_BYTES = 1069 +DCGM_FI_PROF_NVLINK_L15_TX_BYTES = 1070 +DCGM_FI_PROF_NVLINK_L15_RX_BYTES = 1071 +DCGM_FI_PROF_NVLINK_L16_TX_BYTES = 1072 +DCGM_FI_PROF_NVLINK_L16_RX_BYTES = 1073 +DCGM_FI_PROF_NVLINK_L17_TX_BYTES = 1074 +DCGM_FI_PROF_NVLINK_L17_RX_BYTES = 1075 + +DCGM_FI_PROF_NVLINK_THROUGHPUT_FIRST = DCGM_FI_PROF_NVLINK_L0_TX_BYTES +DCGM_FI_PROF_NVLINK_THROUGHPUT_LAST = DCGM_FI_PROF_NVLINK_L17_RX_BYTES + +#greater than maximum fields above. 
This value can increase in the future +DCGM_FI_MAX_FIELDS = 1076 + + +class struct_c_dcgm_field_meta_t(dcgm_structs._DcgmStructure): # struct_c_dcgm_field_meta_t structure pass # opaque handle @@ -723,7 +526,7 @@ class struct_c_dcgm_field_meta_t(Structure): dcgm_field_meta_t = POINTER(struct_c_dcgm_field_meta_t) -class _PrintableStructure(Structure): +class _PrintableStructure(dcgm_structs._DcgmStructure): """ Abstract class that produces nicer __str__ output than ctypes.Structure. e.g. instead of: @@ -736,13 +539,11 @@ class _PrintableStructure(Structure): e.g. class that has _field_ 'hex_value', c_uint could be formatted with _fmt_ = {"hex_value" : "%08X"} to produce nicer output. - Default formatting string for all fields can be set with key "" - like: + Default fomratting string for all fields can be set with key "" like: _fmt_ = {"" : "%d MHz"} # e.g all values are numbers in MHz. If not set it's assumed to be just "%s" - Exact format of returned str from this class is subject to change in the - future. + Exact format of returned str from this class is subject to change in the future. """ _fmt_ = {} @@ -770,11 +571,8 @@ def __str__(self): # Structure to hold formatting information for values class c_dcgm_field_output_format_t(_PrintableStructure): - _fields_ = [ - ("shortName", c_char * SHORTNAME_LENGTH), - ("unit", c_char * UNIT_LENGTH), - ("width", c_short), - ] + _fields_ = [('shortName', c_char * SHORTNAME_LENGTH), + ('unit', c_char * UNIT_LENGTH), ('width', c_short)] TAG_LENGTH = 48 @@ -793,24 +591,14 @@ class c_dcgm_field_meta_t(_PrintableStructure): ] -# Class for maintaining properties for each sampling type like Power, -# Utilization and Clock. +# Class for maintaining properties for each sampling type like Power, Utilization and Clock. class pySamplingProperties: - """ - The instance of this class is used to hold information related to each - sampling event type. - """ + ''' + The instance of this class is used to hold information related to each sampling event type. + ''' - def __init__( - self, - name, - sampling_type, - sample_val_type, - timeIntervalIdle, - timeIntervalBoost, - min_value, - max_value, - ): + def __init__(self, name, sampling_type, sample_val_type, timeIntervalIdle, + timeIntervalBoost, min_value, max_value): self.name = name self.sampling_type = sampling_type self.timeIntervalIdle = timeIntervalIdle @@ -827,19 +615,12 @@ def DcgmFieldsInit(): def DcgmFieldGetById(fieldId): - """ + ''' Get metadata for a field, given its fieldId - Parameters - ---------- - fieldId : - Field ID to get metadata for. - - Returns - ------- - c_dcgm_field_meta_t or None - Returns c_dcgm_field_meta_t on success or None on error. - """ + :param fieldId: Field ID to get metadata for + :return: c_dcgm_field_meta_t struct on success. None on error. + ''' DcgmFieldsInit() fn = dcgmFP("DcgmFieldGetById") @@ -854,25 +635,18 @@ def DcgmFieldGetById(fieldId): def DcgmFieldGetByTag(tag): - """ + ''' Get metadata for a field, given its string tag - Parameters - --------- - tag : - Field tag to get metadata for. Example 'brand'. - - Returns - ------- - c_dcgm_field_meta_t or None - Returns c_dcgm_field_meta_t on success or None on error. - """ + :param tag: Field tag to get metadata for. Example 'brand' + :return: c_dcgm_field_meta_t struct on success. None on error. 
+ ''' DcgmFieldsInit() c_dcgm_field_meta_t() fn = dcgmFP("DcgmFieldGetByTag") fn.restype = POINTER(c_dcgm_field_meta_t) - c_field_meta_ptr = fn(c_char_p(tag)) + c_field_meta_ptr = fn(c_char_p(tag.encode('utf-8'))) if not c_field_meta_ptr: return None diff --git a/model_analyzer/monitor/dcgm/dcgm_fields_collectd.py b/model_analyzer/monitor/dcgm/dcgm_fields_collectd.py new file mode 100644 index 000000000..7a29edc9e --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_fields_collectd.py @@ -0,0 +1,671 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from model_analyzer.monitor.dcgm.dcgm_fields import * +from model_analyzer.monitor.dcgm.dcgm_fields_internal import * +import sys + + +class CollectdMetadata: + ''' + Constructor + @params: + name: string identifying the dcgm field. The field_name as opposed to + field_id.Address:port of the host to connect. Defaults to localhost + kind: collectd type string. + used: a bool indicating whether or not the field is to be defined in + a collectd types.db file when GenerateCollectdTypesDB() is called + (generally if this file is run as a python3 mainline). We enumerate + all the dcgm fields, but only generate types.db records for those + supported at the current time. Others may or may not have correct + collectd type definitions (generally one might be a guage where it + is more correctly a counter). The idea is that an intrepid user may + enable generation of additional dcgm fields that they wish to collect + but are not officially supported yet. + ''' + + def __init__(self, name, kind, used=False): + self.name = name + self.kind = kind + self.used = used + + +# collectd metadata definition table. 
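# The table that follows maps each DCGM field ID to its collectd metadata; entries
# constructed with used=True are emitted by GenerateCollectdTypesDB() (defined at the
# end of this file) as one types.db record per field. Illustrative example, assuming the
# DCGM_FI_DEV_GPU_TEMP entry defined below:
entry = CollectdMetadata("gpu_temp", "value:GAUGE:U:U", used=True)
# Mirrors the name-padded formatting used by GenerateCollectdTypesDB():
print("{0:<40}".format(entry.name), entry.kind)  # -> "gpu_temp ...  value:GAUGE:U:U"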
+ +CollectdMetadataDict = { + DCGM_FI_DRIVER_VERSION: + None, + DCGM_FI_NVML_VERSION: + None, + DCGM_FI_PROCESS_NAME: + None, + DCGM_FI_CUDA_DRIVER_VERSION: + CollectdMetadata("cuda_driver_version", "value:GAUGE:U:U"), + DCGM_FI_DEV_COUNT: + CollectdMetadata("device_count", "value:GAUGE:U:U"), + DCGM_FI_DEV_NAME: + None, + DCGM_FI_DEV_BRAND: + None, + DCGM_FI_DEV_NVML_INDEX: + CollectdMetadata("nvml_index", "value:GAUGE:U:U"), + DCGM_FI_DEV_SERIAL: + None, + DCGM_FI_DEV_CPU_AFFINITY_0: + CollectdMetadata("cpu_affinity_0", "value:GAUGE:U:U"), + DCGM_FI_DEV_CPU_AFFINITY_1: + CollectdMetadata("cpu_affinity_1", "value:GAUGE:U:U"), + DCGM_FI_DEV_CPU_AFFINITY_2: + CollectdMetadata("cpu_affinity_2", "value:GAUGE:U:U"), + DCGM_FI_DEV_CPU_AFFINITY_3: + CollectdMetadata("cpu_affinity_3", "value:GAUGE:U:U"), + DCGM_FI_DEV_UUID: + None, + DCGM_FI_DEV_MINOR_NUMBER: + CollectdMetadata("minor_number", "value:GAUGE:U:U"), + DCGM_FI_DEV_OEM_INFOROM_VER: + None, + DCGM_FI_DEV_ECC_INFOROM_VER: + None, + DCGM_FI_DEV_POWER_INFOROM_VER: + None, + DCGM_FI_DEV_INFOROM_IMAGE_VER: + None, + DCGM_FI_DEV_INFOROM_CONFIG_CHECK: + CollectdMetadata("inforom_config_checksum", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCI_BUSID: + None, + DCGM_FI_DEV_PCI_COMBINED_ID: + CollectdMetadata("pci_combined_id", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCI_SUBSYS_ID: + CollectdMetadata("pci_subsys_id", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCIE_TX_THROUGHPUT: + CollectdMetadata("pcie_tx_throughput", "value:GAUGE:0:U", True), + DCGM_FI_DEV_PCIE_RX_THROUGHPUT: + CollectdMetadata("pcie_rx_throughput", "value:GAUGE:0:U", True), + DCGM_FI_DEV_PCIE_REPLAY_COUNTER: + CollectdMetadata("pcie_replay_counter", "value:COUNTER:0:U", True), + DCGM_FI_DEV_SM_CLOCK: + CollectdMetadata("sm_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_MEM_CLOCK: + CollectdMetadata("memory_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_VIDEO_CLOCK: + CollectdMetadata("video_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_APP_SM_CLOCK: + CollectdMetadata("sm_app_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_APP_MEM_CLOCK: + CollectdMetadata("mem_app_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_CLOCK_THROTTLE_REASONS: + CollectdMetadata("current_clock_throttle_reasons", "value:GAUGE:U:U"), + DCGM_FI_DEV_MAX_SM_CLOCK: + CollectdMetadata("sm_max_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_MAX_MEM_CLOCK: + CollectdMetadata("memory_max_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_MAX_VIDEO_CLOCK: + CollectdMetadata("video_max_clock", "value:GAUGE:0:U", True), + DCGM_FI_DEV_AUTOBOOST: + CollectdMetadata("autoboost", "value:GAUGE:U:U"), + DCGM_FI_DEV_GPU_TEMP: + CollectdMetadata("gpu_temp", "value:GAUGE:U:U", True), + DCGM_FI_DEV_MEM_MAX_OP_TEMP: + CollectdMetadata("gpu_mem_max_op_temp", "value:GAUGE:U:U"), + DCGM_FI_DEV_GPU_MAX_OP_TEMP: + CollectdMetadata("gpu_max_op_temp", "value:GAUGE:U:U"), + DCGM_FI_DEV_SLOWDOWN_TEMP: + CollectdMetadata("slowdown_temp", "value:GAUGE:U:U"), + DCGM_FI_DEV_SHUTDOWN_TEMP: + CollectdMetadata("shutdown_temp", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_MGMT_LIMIT: + CollectdMetadata("power_management_limit", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN: + CollectdMetadata("power_management_limit_min", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX: + CollectdMetadata("power_management_limit_max", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF: + CollectdMetadata("power_management_limit_default", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_USAGE: + CollectdMetadata("power_usage", "value:GAUGE:0:U", True), + 
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION: + CollectdMetadata("total_energy_consumption", "value:GAUGE:0:U", + True), # left as guage since zeroed at driver reload + DCGM_FI_DEV_ENFORCED_POWER_LIMIT: + CollectdMetadata("enforced_power_limit", "value:GAUGE:U:U"), + DCGM_FI_DEV_PSTATE: + CollectdMetadata("pstate", "value:GAUGE:U:U"), + DCGM_FI_DEV_FAN_SPEED: + CollectdMetadata("fan_speed", "value:GAUGE:U:U"), + DCGM_FI_DEV_COMPUTE_MODE: + CollectdMetadata("compute_mode", "value:GAUGE:U:U"), + DCGM_FI_DEV_PERSISTENCE_MODE: + CollectdMetadata("persistance_mode", "value:GAUGE:U:U"), + DCGM_FI_DEV_MIG_MODE: + CollectdMetadata("mig_mode", "value:GAUGE:U:U"), + DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR: + None, + DCGM_FI_DEV_MIG_MAX_SLICES: + CollectdMetadata("mig_max_slices", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_CURRENT: + CollectdMetadata("ecc", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_PENDING: + CollectdMetadata("ecc_pending", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_VOL_TOTAL: + CollectdMetadata("ecc_sbe_volatile_total", "value:COUNTER:0:U", True), + DCGM_FI_DEV_ECC_DBE_VOL_TOTAL: + CollectdMetadata("ecc_dbe_volatile_total", "value:COUNTER:0:U", True), + DCGM_FI_DEV_ECC_SBE_AGG_TOTAL: + CollectdMetadata("ecc_sbe_aggregate_total", "value:COUNTER:0:U", True), + DCGM_FI_DEV_ECC_DBE_AGG_TOTAL: + CollectdMetadata("ecc_dbe_aggregate_total", "value:COUNTER:0:U", True), + DCGM_FI_DEV_ECC_SBE_VOL_L1: + CollectdMetadata("ecc_sbe_volatile_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_VOL_L1: + CollectdMetadata("ecc_dbe_volatile_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_VOL_L2: + CollectdMetadata("ecc_sbe_volatile_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_VOL_L2: + CollectdMetadata("ecc_dbe_volatile_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_VOL_DEV: + CollectdMetadata("ecc_sbe_volatile_device", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_VOL_DEV: + CollectdMetadata("ecc_dbe_volatile_device", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_VOL_REG: + CollectdMetadata("ecc_sbe_volatile_register", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_VOL_REG: + CollectdMetadata("ecc_dbe_volatile_register", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_VOL_TEX: + CollectdMetadata("ecc_sbe_volatile_texture", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_VOL_TEX: + CollectdMetadata("ecc_dbe_volatile_texture", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_AGG_L1: + CollectdMetadata("ecc_sbe_aggregate_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_AGG_L1: + CollectdMetadata("ecc_dbe_aggregate_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_AGG_L2: + CollectdMetadata("ecc_sbe_aggregate_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_AGG_L2: + CollectdMetadata("ecc_dbe_aggregate_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_AGG_DEV: + CollectdMetadata("ecc_sbe_aggregate_device", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_AGG_DEV: + CollectdMetadata("ecc_dbe_aggregate_device", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_AGG_REG: + CollectdMetadata("ecc_sbe_aggregate_register", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_AGG_REG: + CollectdMetadata("ecc_dbe_aggregate_register", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_SBE_AGG_TEX: + CollectdMetadata("ecc_sbe_aggregate_texture", "value:GAUGE:U:U"), + DCGM_FI_DEV_ECC_DBE_AGG_TEX: + CollectdMetadata("ecc_dbe_aggregate_texture", "value:GAUGE:U:U"), + DCGM_FI_DEV_GPU_UTIL: + CollectdMetadata("gpu_utilization", "value:GAUGE:0.0:1.0", True), + DCGM_FI_DEV_MEM_COPY_UTIL: + CollectdMetadata("mem_copy_utilization", "value:GAUGE:0:100", True), + DCGM_FI_DEV_ENC_UTIL: + CollectdMetadata("enc_utilization", 
"value:GAUGE:0:100"), + DCGM_FI_DEV_DEC_UTIL: + CollectdMetadata("dec_utilization", "value:GAUGE:0:100"), + DCGM_FI_DEV_VBIOS_VERSION: + None, + DCGM_FI_DEV_BAR1_TOTAL: + CollectdMetadata("bar1_total", "value:GAUGE:U:U"), + DCGM_FI_DEV_BAR1_USED: + CollectdMetadata("bar1_used", "value:GAUGE:U:U"), + DCGM_FI_DEV_BAR1_FREE: + CollectdMetadata("bar1_free", "value:GAUGE:U:U"), + DCGM_FI_DEV_FB_TOTAL: + CollectdMetadata("fb_total", "value:GAUGE:0.0:U", True), + DCGM_FI_DEV_FB_FREE: + CollectdMetadata("fb_free", "value:GAUGE:0.0:U", True), + DCGM_FI_DEV_FB_USED: + CollectdMetadata("fb_used", "value:GAUGE:0.0:U", True), + DCGM_FI_DEV_FB_RESERVED: + CollectdMetadata("fb_resv", "value:GAUGE:0.0:U", True), + DCGM_FI_DEV_VIRTUAL_MODE: + CollectdMetadata("virtualization_mode", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_INSTANCE_IDS: + None, + DCGM_FI_DEV_VGPU_UTILIZATIONS: + None, + DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION: + None, + DCGM_FI_DEV_VGPU_VM_ID: + None, + DCGM_FI_DEV_VGPU_VM_NAME: + None, + DCGM_FI_DEV_VGPU_TYPE: + CollectdMetadata("vgpu_instance_type", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_UUID: + None, + DCGM_FI_DEV_VGPU_DRIVER_VERSION: + None, + DCGM_FI_DEV_VGPU_MEMORY_USAGE: + CollectdMetadata("vgpu_instance_memory_usage", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE: + CollectdMetadata("vgpu_instance_license_state", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_LICENSE_STATUS: + CollectdMetadata("vgpu_instance_license_status", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT: + CollectdMetadata("vgpu_instance_frame_rate_limit", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_PCI_ID: + CollectdMetadata("vgpu_instance_pci_id", "value:GAUGE:U:U"), + DCGM_FI_DEV_VGPU_ENC_STATS: + None, + DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO: + None, + DCGM_FI_DEV_VGPU_FBC_STATS: + None, + DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO: + None, + DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID: + None, + DCGM_FI_DEV_SUPPORTED_TYPE_INFO: + None, + DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS: + None, + DCGM_FI_DEV_VGPU_TYPE_INFO: + None, + DCGM_FI_DEV_VGPU_TYPE_NAME: + None, + DCGM_FI_DEV_VGPU_TYPE_CLASS: + None, + DCGM_FI_DEV_VGPU_TYPE_LICENSE: + None, + DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS: + None, + DCGM_FI_DEV_ENC_STATS: + None, + DCGM_FI_DEV_FBC_STATS: + None, + DCGM_FI_DEV_FBC_SESSIONS_INFO: + None, + DCGM_FI_DEV_ACCOUNTING_DATA: + None, + DCGM_FI_DEV_RETIRED_SBE: + CollectdMetadata("retired_pages_sbe", "value:COUNTER:0:U", True), + DCGM_FI_DEV_RETIRED_DBE: + CollectdMetadata("retired_pages_dbe", "value:COUNTER:0:U", True), + DCGM_FI_DEV_GRAPHICS_PIDS: + None, + DCGM_FI_DEV_COMPUTE_PIDS: + None, + DCGM_FI_DEV_SUPPORTED_CLOCKS: + None, + DCGM_FI_SYNC_BOOST: + None, + DCGM_FI_DEV_RETIRED_PENDING: + CollectdMetadata("retired_pages_pending", "value:GAUGE:0:1", + True), # boolean 1 = yes, 0 = no + DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS: + CollectdMetadata("uncorrectable_remapped_rows", "value:GAUGE:U:U"), + DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS: + CollectdMetadata("correctable_remapped_rows", "value:GAUGE:U:U"), + DCGM_FI_DEV_ROW_REMAP_FAILURE: + CollectdMetadata("row_remap_failure", "value:GAUGE:U:U"), + DCGM_FI_DEV_ROW_REMAP_PENDING: + CollectdMetadata("row_remap_pending", "value:GAUGE:U:U"), + DCGM_FI_DEV_INFOROM_CONFIG_VALID: + CollectdMetadata("inforom_config_valid", "value:GAUGE:U:U"), + DCGM_FI_DEV_XID_ERRORS: + CollectdMetadata("xid_errors", "value:GAUGE:0:U", True), + DCGM_FI_DEV_PCIE_MAX_LINK_GEN: + CollectdMetadata("pcie_max_link_gen", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH: + 
CollectdMetadata("pcie_max_link_width", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCIE_LINK_GEN: + CollectdMetadata("pcie_link_gen", "value:GAUGE:U:U"), + DCGM_FI_DEV_PCIE_LINK_WIDTH: + CollectdMetadata("pcie_link_width", "value:GAUGE:U:U"), + DCGM_FI_DEV_POWER_VIOLATION: + CollectdMetadata("power_violation", "value:COUNTER:0:U", True), + DCGM_FI_DEV_THERMAL_VIOLATION: + CollectdMetadata("thermal_violation", "value:COUNTER:0:U", True), + DCGM_FI_GPU_TOPOLOGY_PCI: + None, + DCGM_FI_GPU_TOPOLOGY_NVLINK: + None, + DCGM_FI_GPU_TOPOLOGY_AFFINITY: + None, + DCGM_FI_DEV_SYNC_BOOST_VIOLATION: + CollectdMetadata("sync_boost_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_BOARD_LIMIT_VIOLATION: + CollectdMetadata("board_limit_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_LOW_UTIL_VIOLATION: + CollectdMetadata("low_util_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_RELIABILITY_VIOLATION: + CollectdMetadata("reliability_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION: + CollectdMetadata("app_clock_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION: + CollectdMetadata("base_clock_violation", "value:GAUGE:U:U"), + DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES: + CollectdMetadata("mem_util_samples", "value:GAUGE:U:U"), + DCGM_FI_DEV_GPU_UTIL_SAMPLES: + CollectdMetadata("gpu_util_samples", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0: + CollectdMetadata("nvlink_flit_crc_error_count_l0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1: + CollectdMetadata("nvlink_flit_crc_error_count_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2: + CollectdMetadata("nvlink_flit_crc_error_count_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3: + CollectdMetadata("nvlink_flit_crc_error_count_l3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4: + CollectdMetadata("nvlink_flit_crc_error_count_l4", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5: + CollectdMetadata("nvlink_flit_crc_error_count_l5", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL: + CollectdMetadata("nvlink_flit_crc_error_count_total", + "value:COUNTER:0:U", True), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0: + CollectdMetadata("nvlink_data_crc_error_count_l0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1: + CollectdMetadata("nvlink_data_crc_error_count_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2: + CollectdMetadata("nvlink_data_crc_error_count_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3: + CollectdMetadata("nvlink_data_crc_error_count_l3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4: + CollectdMetadata("nvlink_data_crc_error_count_l4", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5: + CollectdMetadata("nvlink_data_crc_error_count_l5", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL: + CollectdMetadata("nvlink_data_crc_error_count_total", + "value:COUNTER:0:U", True), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0: + CollectdMetadata("nvlink_replay_error_count_l0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1: + CollectdMetadata("nvlink_replay_error_count_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2: + CollectdMetadata("nvlink_replay_error_count_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3: + CollectdMetadata("nvlink_replay_error_count_l3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4: + 
CollectdMetadata("nvlink_replay_error_count_l4", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5: + CollectdMetadata("nvlink_replay_error_count_l5", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL: + CollectdMetadata("nvlink_replay_error_count_total", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0: + CollectdMetadata("nvlink_recovery_error_count_l0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1: + CollectdMetadata("nvlink_recovery_error_count_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2: + CollectdMetadata("nvlink_recovery_error_count_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3: + CollectdMetadata("nvlink_recovery_error_count_l3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4: + CollectdMetadata("nvlink_recovery_error_count_l4", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5: + CollectdMetadata("nvlink_recovery_error_count_l5", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL: + CollectdMetadata("nvlink_recovery_error_count_total", + "value:COUNTER:0:U", True), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L0: + CollectdMetadata("nvlink_bandwidth_l0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L1: + CollectdMetadata("nvlink_bandwidth_l1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L2: + CollectdMetadata("nvlink_bandwidth_l2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L3: + CollectdMetadata("nvlink_bandwidth_l3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L4: + CollectdMetadata("nvlink_bandwidth_l4", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L5: + CollectdMetadata("nvlink_bandwidth_l5", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL: + CollectdMetadata("nvlink_bandwidth_total", "value:GAUGE:0:U", True), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6: + CollectdMetadata("nvlink_flit_crc_error_count_l6", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7: + CollectdMetadata("nvlink_flit_crc_error_count_l7", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8: + CollectdMetadata("nvlink_flit_crc_error_count_l8", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9: + CollectdMetadata("nvlink_flit_crc_error_count_l9", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10: + CollectdMetadata("nvlink_flit_crc_error_count_l10", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11: + CollectdMetadata("nvlink_flit_crc_error_count_l11", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6: + CollectdMetadata("nvlink_data_crc_error_count_l6", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7: + CollectdMetadata("nvlink_data_crc_error_count_l7", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8: + CollectdMetadata("nvlink_data_crc_error_count_l8", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9: + CollectdMetadata("nvlink_data_crc_error_count_l9", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10: + CollectdMetadata("nvlink_data_crc_error_count_l10", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11: + CollectdMetadata("nvlink_data_crc_error_count_l11", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6: + CollectdMetadata("nvlink_replay_error_count_l6", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7: + CollectdMetadata("nvlink_replay_error_count_l7", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8: 
+ CollectdMetadata("nvlink_replay_error_count_l8", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9: + CollectdMetadata("nvlink_replay_error_count_l9", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10: + CollectdMetadata("nvlink_replay_error_count_l10", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11: + CollectdMetadata("nvlink_replay_error_count_l11", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6: + CollectdMetadata("nvlink_recovery_error_count_l6", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7: + CollectdMetadata("nvlink_recovery_error_count_l7", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8: + CollectdMetadata("nvlink_recovery_error_count_l8", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9: + CollectdMetadata("nvlink_recovery_error_count_l9", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10: + CollectdMetadata("nvlink_recovery_error_count_l10", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11: + CollectdMetadata("nvlink_recovery_error_count_l11", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L6: + CollectdMetadata("nvlink_bandwidth_l6", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L7: + CollectdMetadata("nvlink_bandwidth_l7", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L8: + CollectdMetadata("nvlink_bandwidth_l8", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L9: + CollectdMetadata("nvlink_bandwidth_l9", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L10: + CollectdMetadata("nvlink_bandwidth_l10", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVLINK_BANDWIDTH_L11: + CollectdMetadata("nvlink_bandwidth_l11", "value:GAUGE:U:U"), + DCGM_FI_DEV_MEMORY_TEMP: + CollectdMetadata("memory_temp", "value:GAUGE:U:U", True), + DCGM_FI_DEV_GPU_NVLINK_ERRORS: + CollectdMetadata("gpu_nvlink_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX: + CollectdMetadata("nvswitch_link_bandwidth_tx", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX: + CollectdMetadata("nvswitch_link_bandwidth_rx", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS: + CollectdMetadata("nvswitch_link_fatal_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS: + CollectdMetadata("nvswitch_link_non_fatal_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS: + CollectdMetadata("nvswitch_link_recovery_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS: + CollectdMetadata("nvswitch_link_flit_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS: + CollectdMetadata("nvswitch_link_crc_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS: + CollectdMetadata("nvswitch_link_ecc_errors", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0: + CollectdMetadata("nvswitch_link_latency_low_vc0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1: + CollectdMetadata("nvswitch_link_latency_low_vc1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2: + CollectdMetadata("nvswitch_link_latency_low_vc2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3: + CollectdMetadata("nvswitch_link_latency_low_vc3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0: + CollectdMetadata("nvswitch_link_latency_medium_vc0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1: + CollectdMetadata("nvswitch_link_latency_medium_vc1", "value:GAUGE:U:U"), + 
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2: + CollectdMetadata("nvswitch_link_latency_medium_vc2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3: + CollectdMetadata("nvswitch_link_latency_medium_vc3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0: + CollectdMetadata("nvswitch_link_latency_high_vc0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1: + CollectdMetadata("nvswitch_link_latency_high_vc1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2: + CollectdMetadata("nvswitch_link_latency_high_vc2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3: + CollectdMetadata("nvswitch_link_latency_high_vc3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0: + CollectdMetadata("nvswitch_link_latency_panic_vc0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1: + CollectdMetadata("nvswitch_link_latency_panic_vc1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2: + CollectdMetadata("nvswitch_link_latency_panic_vc2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3: + CollectdMetadata("nvswitch_link_latency_panic_vc3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0: + CollectdMetadata("nvswitch_link_latency_count_vc0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1: + CollectdMetadata("nvswitch_link_latency_count_vc1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2: + CollectdMetadata("nvswitch_link_latency_count_vc2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3: + CollectdMetadata("nvswitch_link_latency_count_vc3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0: + CollectdMetadata("nvswitch_link_crc_errors_lane0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1: + CollectdMetadata("nvswitch_link_crc_errors_lane1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2: + CollectdMetadata("nvswitch_link_crc_errors_lane2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3: + CollectdMetadata("nvswitch_link_crc_errors_lane3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0: + CollectdMetadata("nvswitch_link_ecc_errors_lane0", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1: + CollectdMetadata("nvswitch_link_ecc_errors_lane1", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2: + CollectdMetadata("nvswitch_link_ecc_errors_lane2", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3: + CollectdMetadata("nvswitch_link_ecc_errors_lane3", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS: + CollectdMetadata("nvswitch_fatal_error", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS: + CollectdMetadata("nvswitch_non_fatal_error", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT: + CollectdMetadata("nvswitch_temperature_current", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN: + CollectdMetadata("nvswitch_temperature_limit_slowdown", + "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN: + CollectdMetadata("nvswitch_temperature_limit_shutdown", + "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX: + CollectdMetadata("nvswitch_throughput_tx", "value:GAUGE:U:U"), + DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX: + CollectdMetadata("nvswitch_throughput_rx", "value:GAUGE:U:U"), + DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY: + CollectdMetadata("cuda_compute_capability", "value:GAUGE:U:U"), + 
DCGM_FI_PROF_GR_ENGINE_ACTIVE: + CollectdMetadata("gr_engine_active", "value:GAUGE:0.0:1.0", True), + DCGM_FI_PROF_SM_ACTIVE: + CollectdMetadata("sm_active", "value:GAUGE:0.0:1.0", True), + DCGM_FI_PROF_SM_OCCUPANCY: + CollectdMetadata("sm_occupancy", "value:GAUGE:0:U", True), + DCGM_FI_PROF_PIPE_TENSOR_ACTIVE: + CollectdMetadata("tensor_active", "value:GAUGE:0.0:1.0", True), + DCGM_FI_PROF_DRAM_ACTIVE: + CollectdMetadata("dram_active", "value:GAUGE:0.0:1.0", True), + DCGM_FI_PROF_PIPE_FP64_ACTIVE: + CollectdMetadata("fp64_active", "value:GAUGE:U:U"), + DCGM_FI_PROF_PIPE_FP32_ACTIVE: + CollectdMetadata("fp32_active", "value:GAUGE:U:U"), + DCGM_FI_PROF_PIPE_FP16_ACTIVE: + CollectdMetadata("fp16_active", "value:GAUGE:U:U"), + DCGM_FI_PROF_PCIE_TX_BYTES: + CollectdMetadata("pcie_tx_bytes", "value:GAUGE:U:U"), + DCGM_FI_PROF_PCIE_RX_BYTES: + CollectdMetadata("pcie_rx_bytes", "value:GAUGE:U:U"), + DCGM_FI_PROF_NVLINK_TX_BYTES: + CollectdMetadata("nvlink_tx_bytes", "value:GAUGE:U:U"), + DCGM_FI_PROF_NVLINK_RX_BYTES: + CollectdMetadata("nvlink_rx_bytes", "value:GAUGE:U:U"), + DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE: + CollectdMetadata("tensor_imma_active", "value:GAUGE:0.0:1.0", True), + DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE: + CollectdMetadata("tensor_hmma_active", "value:GAUGE:0.0:1.0", True), +} + +__fieldDict = None + + +def GenerateCollectdTypesDB(): + length = max( + map(lambda x: len(x.name) if x else 0, CollectdMetadataDict.values())) + + fmt = "{0:<" + str(length) + "}" + fail = False + + for item in filter(None, CollectdMetadataDict.values()): + item_list = item.kind.split(':') + + # Some rudimentary syntax checking. + + if len(item_list) != 4: + sys.stderr.write( + 'Item ' + item.name + + ' has wrong number of collectd type fields - four required.\n') + fail = True + + if item_list[1] not in ['GAUGE', 'COUNTER', 'DERIVE', 'ABSOLUTE']: + sys.stderr.write( + 'Item ' + item.name + + ' should be one of GAUGE, COUNTER, DERIVE, ABSOLUTE.\n') + fail = True + + # We check this so we can enumerate all dcgm fields for possible + # inclusion, even if some are not (yet) formally supported. + + if item.used: + print(fmt.format(item.name), item.kind) + + if fail: + exit("Failed on db.types table syntax errors.\n") + + +def GetFieldByName(name): + global __fieldDict + + if name.isnumeric(): + return int(name) + + if __fieldDict == None: + __fieldDict = {} + + for key in CollectdMetadataDict: + item = CollectdMetadataDict[key] + + if item != None: + __fieldDict[item.name] = key + + if name not in __fieldDict.keys(): + return -1 + + return __fieldDict[name] + + +if __name__ == '__main__': + GenerateCollectdTypesDB() diff --git a/model_analyzer/monitor/dcgm/dcgm_fields_internal.py b/model_analyzer/monitor/dcgm/dcgm_fields_internal.py new file mode 100644 index 000000000..9502c959a --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_fields_internal.py @@ -0,0 +1,29 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
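# Illustrative usage, not part of this change, of GetFieldByName() from
# dcgm_fields_collectd.py above: the helper accepts either a collectd metric name or a
# numeric string and returns the corresponding DCGM field ID, or -1 for unknown names.
from model_analyzer.monitor.dcgm.dcgm_fields_collectd import GetFieldByName

print(GetFieldByName("gpu_temp"))        # field ID constant DCGM_FI_DEV_GPU_TEMP
print(GetFieldByName("42"))              # numeric strings are returned as ints -> 42
print(GetFieldByName("no_such_metric"))  # unknown names yield -1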
+## +# Python bindings for the internal API of DCGM library (dcgm_fields_internal.hpp) +## + +from ctypes import * +from ctypes.util import find_library +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs + +# Provides access to functions +dcgmFP = dcgm_structs._dcgmGetFunctionPointer + +#internal-only fields +DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210 #Memory utilization samples +DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211 #SM utilization samples +DCGM_FI_DEV_GRAPHICS_PIDS = 220 #Graphics processes running on the GPU. +DCGM_FI_DEV_COMPUTE_PIDS = 221 #Compute processes running on the GPU. diff --git a/model_analyzer/monitor/dcgm/dcgm_fluentd.py b/model_analyzer/monitor/dcgm/dcgm_fluentd.py new file mode 100644 index 000000000..24a345100 --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_fluentd.py @@ -0,0 +1,45 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from model_analyzer.monitor.dcgm.common.dcgm_client_main import main +from model_analyzer.monitor.dcgm.DcgmJsonReader import DcgmJsonReader +from socket import socket, AF_INET, SOCK_DGRAM + +# Displayed to the user +FLUENTD_NAME = 'Fluentd' +DEFAULT_FLUENTD_PORT = 24225 + +# Fluentd Configuration +# ===================== +# In order to use this client, Fluentd needs to accept json over udp. +# The default port is 24225 + + +class DcgmFluentd(DcgmJsonReader): + ########################################################################### + def __init__(self, publish_hostname, publish_port, **kwargs): + self.m_sock = socket(AF_INET, SOCK_DGRAM) + self.m_dest = (publish_hostname, publish_port) + super(DcgmFluentd, self).__init__(**kwargs) + + ########################################################################### + def SendToFluentd(self, payload): + self.m_sock.sendto(payload, self.m_dest) + + ########################################################################### + def CustomJsonHandler(self, outJson): + self.SendToFluentd(outJson) + + +if __name__ == '__main__': # pragma: no cover + main(DcgmFluentd, FLUENTD_NAME, DEFAULT_FLUENTD_PORT, add_target_host=True) diff --git a/model_analyzer/monitor/dcgm/dcgm_prometheus.py b/model_analyzer/monitor/dcgm/dcgm_prometheus.py new file mode 100644 index 000000000..f6f69a613 --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_prometheus.py @@ -0,0 +1,326 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
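# Illustrative sketch, not part of this change: DcgmFluentd above publishes each JSON
# payload over UDP to Fluentd's default port (DEFAULT_FLUENTD_PORT = 24225). A minimal
# stand-in listener for local smoke testing, using only the standard library:
from socket import socket, AF_INET, SOCK_DGRAM

listener = socket(AF_INET, SOCK_DGRAM)
listener.bind(("127.0.0.1", 24225))
while True:
    payload, sender = listener.recvfrom(65535)
    print(sender, payload.decode("utf-8", errors="replace"))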
+import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields +import time +import logging +import os +import argparse +import sys +import signal + +dir_path = os.path.dirname(os.path.realpath(__file__)) +parent_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir)) +sys.path.insert(0, parent_dir_path) + +from model_analyzer.monitor.dcgm.DcgmReader import DcgmReader +from model_analyzer.monitor.dcgm.common import dcgm_client_cli_parser as cli + +if 'DCGM_TESTING_FRAMEWORK' in os.environ: + try: + from prometheus_tester_api import start_http_server, Gauge + except: + logging.critical( + "prometheus_tester_api missing, reinstall test framework.") + sys.exit(3) +else: + try: + from prometheus_client import start_http_server, Gauge + except ImportError: + pass + logging.critical( + "prometheus_client not installed, please run: \"pip install prometheus_client\"" + ) + sys.exit(3) + +DEFAULT_FIELDS = [ + dcgm_fields.DCGM_FI_DEV_PCI_BUSID, #Needed for plugin_instance + dcgm_fields.DCGM_FI_DEV_POWER_USAGE, + dcgm_fields.DCGM_FI_DEV_GPU_TEMP, + dcgm_fields.DCGM_FI_DEV_SM_CLOCK, + dcgm_fields.DCGM_FI_DEV_GPU_UTIL, + dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING, + dcgm_fields.DCGM_FI_DEV_RETIRED_SBE, + dcgm_fields.DCGM_FI_DEV_RETIRED_DBE, + dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, + dcgm_fields.DCGM_FI_DEV_FB_TOTAL, + dcgm_fields.DCGM_FI_DEV_FB_FREE, + dcgm_fields.DCGM_FI_DEV_FB_USED, + dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER, + dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, + dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, + dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION, + dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION, + dcgm_fields.DCGM_FI_DEV_XID_ERRORS, + dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, + dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, +] + + +class DcgmPrometheus(DcgmReader): + ########################################################################### + def __init__(self): + #Have DCGM update its watches twice as fast as our update interval so we don't get out of phase by our update interval + updateIntervalUsec = int( + (1000000 * g_settings['prometheusPublishInterval']) / 2) + #Add our PID to our field group name so we can have multiple instances running + fieldGroupName = 'dcgm_prometheus_' + str(os.getpid()) + + DcgmReader.__init__(self, + ignoreList=g_settings['ignoreList'], + fieldIds=g_settings['publishFieldIds'], + updateFrequency=updateIntervalUsec, + fieldGroupName=fieldGroupName, + hostname=g_settings['dcgmHostName']) + self.m_existingGauge = {} + + ########################################################################### + ''' + This function is implemented from the base class : DcgmReader. It converts each + field / value from the fvs dictionary to a gauge and publishes the gauge to the + prometheus client server. + + @params: + fvs : The fieldvalue dictionary that contains info about the values of field Ids for each gpuId. + ''' + + def CustomDataHandler(self, fvs): + if not self.m_existingGauge: + self.SetupGauges() + + for _, fieldIds in self.m_publishFields.items(): + if fieldIds is None: + continue + + for fieldId in fieldIds: + if fieldId in self.m_dcgmIgnoreFields: + continue + + g = self.m_existingGauge[fieldId] + + for gpuId in list(fvs.keys()): + gpuFv = fvs[gpuId] + val = gpuFv[fieldId][-1] + + #Skip blank values. 
Otherwise, we'd have to insert a placeholder blank value based on the fieldId + if val.isBlank: + continue + + gpuUuid = self.m_gpuIdToUUId[gpuId] + gpuBusId = self.m_gpuIdToBusId[gpuId] + gpuUniqueId = gpuUuid if g_settings['sendUuid'] else gpuBusId + + # pylint doesn't find the labels member for Gauge, but it exists. Ignore the warning + g.labels(gpuId, gpuUniqueId).set(val.value) # pylint: disable=no-member + + logging.debug( + 'Sent GPU %d %s %s = %s' % + (gpuId, gpuUniqueId, self.m_fieldIdToInfo[fieldId].tag, + str(val.value))) + + ############################################################################### + ''' + NOTE: even though some fields are monotonically increasing and therefore fit the mold to be + counters, all are published as gauges so that DCGM is the sole authority on the state of the + system, preventing problems around down times, driver reboots, and the unlikely event of + flashing the inforom. + For specific information about which fields monotonically increase, see the API guide or + dcgm_fields.h + ''' + + def SetupGauges(self): + for _, fieldIds in self.m_publishFields.items(): + if fieldIds is None: + continue + + for fieldId in fieldIds: + if fieldId in self.m_dcgmIgnoreFields: + continue + + uniqueIdName = 'GpuUuid' if g_settings[ + 'sendUuid'] else 'GpuBusID' + + fieldTag = self.m_fieldIdToInfo[fieldId].tag + self.m_existingGauge[fieldId] = Gauge("dcgm_" + fieldTag, + 'DCGM_PROMETHEUS', + ['GpuID', uniqueIdName]) + + ############################################################################### + ''' + Scrape the fieldvalue data and publish. This function calls the process function of + the base class DcgmReader. + ''' + + def Scrape(self, data=None): + return self.Process() + + ############################################################################### + def LogBasicInformation(self): + # Reconnect causes everything to get initialized + self.Reconnect() + + logging.info('Started prometheus client') + + fieldTagList = '' + + for _, fieldIds in self.m_publishFields.items(): + if fieldIds is None: + continue + + for fieldId in fieldIds: + if fieldId in self.m_dcgmIgnoreFields: + continue + + if fieldTagList == '': + fieldTagList = self.m_fieldIdToInfo[fieldId].tag + else: + fieldTagList = fieldTagList + ", %s" % ( + self.m_fieldIdToInfo[fieldId].tag) + + logging.info("Publishing fields: '%s'" % (fieldTagList)) + + ############################################################################### + def LogError(self, msg): + logging.error(msg) + + ############################################################################### + def LogInfo(self, msg): + logging.info(msg) + + +############################################################################### +def exit_handler(signum, frame): + g_settings['shouldExit'] = True + + +############################################################################### +def main_loop(prometheus_obj, publish_interval): + try: + while True: + prometheus_obj.Scrape(prometheus_obj) + time.sleep(publish_interval) + + if g_settings['shouldExit'] == True: + prometheus_obj.LogInfo('Received a signal...shutting down') + break + except KeyboardInterrupt: + print("Caught CTRL-C. Exiting") + + +############################################################################### +def initialize_globals(): + ''' + Name of the host. + ''' + global g_settings + g_settings = {} + + g_settings['shouldExit'] = False + ''' + List of the ids that are present in g_settings['publishFieldIds'] but ignored for watch. 
+ ''' + g_settings['ignoreList'] = [ + dcgm_fields.DCGM_FI_DEV_PCI_BUSID, + ] + ''' + Those are initialized by the CLI parser. We only list them here for clarity. + ''' + for key in [ + 'dcgmHostName', + 'prometheusPort', + 'prometheusPublishInterval', + 'publishFieldIds', + ]: + g_settings[key] = None + + +############################################################################### +def parse_command_line(): + parser = cli.create_parser( + name='Prometheus', + field_ids=DEFAULT_FIELDS, + ) + + cli.add_custom_argument(parser, + '--send-uuid', + dest='send_uuid', + default=False, + action='store_true', + help='Send GPU UUID instead of bus id') + + args = cli.run_parser(parser) + field_ids = cli.get_field_ids(args) + numeric_log_level = cli.get_log_level(args) + + # Defaults to localhost, so we need to set it to None + if args.embedded: + g_settings['dcgmHostName'] = None + else: + g_settings['dcgmHostName'] = args.hostname + + g_settings['prometheusPort'] = args.publish_port + + g_settings['prometheusPublishInterval'] = args.interval + + logfile = args.logfile + + g_settings['publishFieldIds'] = field_ids + + g_settings['sendUuid'] = args.send_uuid + + if logfile != None: + logging.basicConfig(level=numeric_log_level, + filename=logfile, + filemode='w+', + format='%(asctime)s %(levelname)s: %(message)s') + else: + logging.basicConfig(level=numeric_log_level, + stream=sys.stdout, + filemode='w+', + format='%(asctime)s %(levelname)s: %(message)s') + + +############################################################################### +def initialize_signal_handlers(): + signal.signal(signal.SIGINT, exit_handler) + signal.signal(signal.SIGTERM, exit_handler) + + +############################################################################### +def main(): + initialize_globals() + + initialize_signal_handlers() + + parse_command_line() + + prometheus_obj = DcgmPrometheus() + + logging.info("Starting Prometheus server on port " + + str(g_settings['prometheusPort'])) + + #start prometheus client server. + start_http_server(g_settings['prometheusPort']) + + prometheus_obj.LogBasicInformation() + + main_loop(prometheus_obj, g_settings['prometheusPublishInterval']) + + prometheus_obj.Shutdown() + + +if __name__ == '__main__': + main() diff --git a/model_analyzer/monitor/dcgm/dcgm_structs.py b/model_analyzer/monitor/dcgm/dcgm_structs.py index e401c4181..233d15564 100755 --- a/model_analyzer/monitor/dcgm/dcgm_structs.py +++ b/model_analyzer/monitor/dcgm/dcgm_structs.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 - -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,49 +11,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+## +# Python bindings for "dcgm_structs.h" +## -import json -import os -import platform -import string +from ctypes import * +from ctypes.util import find_library import sys +import os import threading -from ctypes import ( - CDLL, - POINTER, - Array, - Structure, - Union, - c_bool, - c_byte, - c_char, - c_char_p, - c_double, - c_int, - c_int32, - c_int64, - c_longlong, - c_short, - c_uint, - c_uint16, - c_uint32, - c_uint64, - c_ulong, - c_ushort, - c_void_p, - sizeof, -) - -import distro - -import model_analyzer.monitor.dcgm.dcgm_value as dcgmvalue +import string +import json +import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue +import platform +from inspect import isclass DCGM_MAX_STR_LENGTH = 256 DCGM_MAX_NUM_DEVICES = 32 # DCGM 2.0 and newer = 32. DCGM 1.8 and older = 16 DCGM_MAX_NUM_SWITCHES = 12 -DCGM_NVLINK_MAX_LINKS_PER_GPU = 12 +DCGM_NVLINK_MAX_LINKS_PER_GPU = 18 DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 = 6 -DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH = 36 +DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2 = 12 +DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH_V1 = 36 # Max NvLinks per NvSwitch pre-Hopper +DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH = 64 +DCGM_LANE_MAX_LANES_PER_NVSWICH_LINK = 4 DCGM_MAX_CLOCKS = 256 DCGM_MAX_NUM_GROUPS = 64 DCGM_MAX_BLOB_LENGTH = 4096 @@ -66,8 +45,7 @@ DCGM_DEVICE_UUID_BUFFER_SIZE = 80 DCGM_MAX_FBC_SESSIONS = 256 -# When more than one value is returned from a query, which order should it be -# returned in? +#When more than one value is returned from a query, which order should it be returned in? DCGM_ORDER_ASCENDING = 1 DCGM_ORDER_DESCENDING = 2 @@ -83,125 +61,72 @@ DCGM_FBC_SESSION_TYPE_VID = 3 # FB capture for a Vid buffer DCGM_FBC_SESSION_TYPE_HWENC = 4 # FB capture for a NVENC HW buffer -# C Type mappings # -# Enums +## C Type mappings ## +## Enums # Return types _dcgmReturn_t = c_uint -# Success -DCGM_ST_OK = 0 -# A bad parameter was passed to a function -DCGM_ST_BADPARAM = -1 -# A generic, unspecified error -DCGM_ST_GENERIC_ERROR = -3 -# An out of memory error occurred -DCGM_ST_MEMORY = -4 -# Setting not configured -DCGM_ST_NOT_CONFIGURED = -5 -# Feature not supported -DCGM_ST_NOT_SUPPORTED = -6 -# DCGM Init error -DCGM_ST_INIT_ERROR = -7 -# When NVML returns error. -DCGM_ST_NVML_ERROR = -8 -# Object is in pending state of something else -DCGM_ST_PENDING = -9 -# Object is in undefined state -DCGM_ST_UNINITIALIZED = -10 -# Requested operation timed out -DCGM_ST_TIMEOUT = -11 -# Version mismatch between received and understood API -DCGM_ST_VER_MISMATCH = -12 -# Unknown field id -DCGM_ST_UNKNOWN_FIELD = -13 -# No data is available -DCGM_ST_NO_DATA = -14 +DCGM_ST_OK = 0 # Success +DCGM_ST_BADPARAM = -1 # A bad parameter was passed to a function +DCGM_ST_GENERIC_ERROR = -3 # A generic, unspecified error +DCGM_ST_MEMORY = -4 # An out of memory error occurred +DCGM_ST_NOT_CONFIGURED = -5 # Setting not configured +DCGM_ST_NOT_SUPPORTED = -6 # Feature not supported +DCGM_ST_INIT_ERROR = -7 # DCGM Init error +DCGM_ST_NVML_ERROR = -8 # When NVML returns error. 
+DCGM_ST_PENDING = -9 # Object is in pending state of something else +DCGM_ST_UNINITIALIZED = -10 # Object is in undefined state +DCGM_ST_TIMEOUT = -11 # Requested operation timed out +DCGM_ST_VER_MISMATCH = -12 # Version mismatch between received and understood API +DCGM_ST_UNKNOWN_FIELD = -13 # Unknown field id +DCGM_ST_NO_DATA = -14 # No data is available DCGM_ST_STALE_DATA = -15 -# The given field is not being updated by the cache manager -DCGM_ST_NOT_WATCHED = -16 -# We are not permissioned to perform the desired action -DCGM_ST_NO_PERMISSION = -17 -# GPU is no longer reachable -DCGM_ST_GPU_IS_LOST = -18 -# GPU requires a reset -DCGM_ST_RESET_REQUIRED = -19 -# Unable to find function -DCGM_ST_FUNCTION_NOT_FOUND = -20 -# Connection to the host engine is not valid any longer -DCGM_ST_CONNECTION_NOT_VALID = -21 -# This GPU is not supported by DCGM -DCGM_ST_GPU_NOT_SUPPORTED = -22 -# The GPUs of the provided group are not compatible with each other for the -# requested operation -DCGM_ST_GROUP_INCOMPATIBLE = -23 +DCGM_ST_NOT_WATCHED = -16 # The given field is not being updated by the cache manager +DCGM_ST_NO_PERMISSION = -17 # We are not permissioned to perform the desired action +DCGM_ST_GPU_IS_LOST = -18 # GPU is no longer reachable +DCGM_ST_RESET_REQUIRED = -19 # GPU requires a reset +DCGM_ST_FUNCTION_NOT_FOUND = -20 # Unable to find function +DCGM_ST_CONNECTION_NOT_VALID = -21 # Connection to the host engine is not valid any longer +DCGM_ST_GPU_NOT_SUPPORTED = -22 # This GPU is not supported by DCGM +DCGM_ST_GROUP_INCOMPATIBLE = -23 # The GPUs of the provided group are not compatible with each other for the requested operation DCGM_ST_MAX_LIMIT = -24 -# DCGM library could not be found -DCGM_ST_LIBRARY_NOT_FOUND = -25 -# Duplicate key passed to the function -DCGM_ST_DUPLICATE_KEY = -26 -# GPU is already a part of a sync boost group -DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27 -# GPU is a not a part of sync boost group -DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28 -# This operation cannot be performed when the host engine is running as -# non-root -DCGM_ST_REQUIRES_ROOT = -29 -# DCGM GPU Diagnostic was successfully executed, but reported an error. -DCGM_ST_NVVS_ERROR = -30 -# An input argument is not large enough -DCGM_ST_INSUFFICIENT_SIZE = -31 -# The given field ID is not supported by the API being called -DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32 -# This request is serviced by a module of DCGM that is not currently loaded -DCGM_ST_MODULE_NOT_LOADED = -33 -# The requested operation could not be completed because the affected resource -# is in use -DCGM_ST_IN_USE = -34 -# The specified group is empty and this operation is not valid with an empty -# group -DCGM_ST_GROUP_IS_EMPTY = -35 -# Profiling is not supported for this group of GPUs or GPU -DCGM_ST_PROFILING_NOT_SUPPORTED = -36 -# The third-party Profiling module returned an unrecoverable error -DCGM_ST_PROFILING_LIBRARY_ERROR = -37 -# The requested profiling metrics cannot be collected in a single pass -DCGM_ST_PROFILING_MULTI_PASS = -38 -# A diag instance is already running, cannot run a new diag until the current -# one finishes. -DCGM_ST_DIAG_ALREADY_RUNNING = -39 -# The DCGM GPU Diagnostic returned JSON that cannot be parsed -DCGM_ST_DIAG_BAD_JSON = -40 -# Error while launching the DCGM GPU Diagnostic -DCGM_ST_DIAG_BAD_LAUNCH = -41 -# There is too much variance while training the diagnostic -DCGM_ST_DIAG_VARIANCE = -42 -# A field value met or exceeded the error threshold. 
-DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43 -# The installed driver version is insufficient for this API -DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44 -# The specified GPU instance does not exist -DCGM_ST_INSTANCE_NOT_FOUND = -45 -# The specified GPU compute instance does not exist -DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46 -# Could not kill a child process within the retries -DCGM_ST_CHILD_NOT_KILLED = -47 -# Detected an error in a 3rd-party library -DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48 -# Not enough resources available -DCGM_ST_INSUFFICIENT_RESOURCES = -49 - -# All the GPUs on the node are added to the group -DCGM_GROUP_DEFAULT = 0 -# Creates an empty group -DCGM_GROUP_EMPTY = 1 -# All NvSwitches of the node are added to the group -DCGM_GROUP_DEFAULT_NVSWITCHES = 2 -# All GPU instances of the node are added to the group -DCGM_GROUP_DEFAULT_INSTANCES = 3 -# All compute instances of the node are added to the group -DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4 -# All entities are added to this default group -DCGM_GROUP_DEFAULT_ENTITIES = 5 +DCGM_ST_LIBRARY_NOT_FOUND = -25 # DCGM library could not be found +DCGM_ST_DUPLICATE_KEY = -26 #Duplicate key passed to the function +DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27 #GPU is already a part of a sync boost group +DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28 #GPU is a not a part of sync boost group +DCGM_ST_REQUIRES_ROOT = -29 #This operation cannot be performed when the host engine is running as non-root +DCGM_ST_NVVS_ERROR = -30 #DCGM GPU Diagnostic was successfully executed, but reported an error. +DCGM_ST_INSUFFICIENT_SIZE = -31 #An input argument is not large enough +DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32 #The given field ID is not supported by the API being called +DCGM_ST_MODULE_NOT_LOADED = -33 #This request is serviced by a module of DCGM that is not currently loaded +DCGM_ST_IN_USE = -34 #The requested operation could not be completed because the affected resource is in use +DCGM_ST_GROUP_IS_EMPTY = -35 # The specified group is empty and this operation is not valid with an empty group +DCGM_ST_PROFILING_NOT_SUPPORTED = -36 # Profiling is not supported for this group of GPUs or GPU +DCGM_ST_PROFILING_LIBRARY_ERROR = -37 # The third-party Profiling module returned an unrecoverable error +DCGM_ST_PROFILING_MULTI_PASS = -38 # The requested profiling metrics cannot be collected in a single pass +DCGM_ST_DIAG_ALREADY_RUNNING = -39 # A diag instance is already running, cannot run a new diag until the current one finishes. +DCGM_ST_DIAG_BAD_JSON = -40 # The DCGM GPU Diagnostic returned JSON that cannot be parsed +DCGM_ST_DIAG_BAD_LAUNCH = -41 # Error while launching the DCGM GPU Diagnostic +DCGM_ST_DIAG_UNUSED = -42 # Unused +DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43 # A field value met or exceeded the error threshold. 
+DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44 # The installed driver version is insufficient for this API +DCGM_ST_INSTANCE_NOT_FOUND = -45 # The specified GPU instance does not exist +DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46 # The specified GPU compute instance does not exist +DCGM_ST_CHILD_NOT_KILLED = -47 # Couldn't kill a child process within the retries +DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48 # Detected an error in a 3rd-party library +DCGM_ST_INSUFFICIENT_RESOURCES = -49 # Not enough resources available +DCGM_ST_PLUGIN_EXCEPTION = -50 # Exception thrown from a diagnostic plugin +DCGM_ST_NVVS_ISOLATE_ERROR = -51 # The diagnostic returned an error that indicates the need for isolation +DCGM_ST_NVVS_BINARY_NOT_FOUND = -52 # The NVVS binary was not found in the specified location +DCGM_ST_NVVS_KILLED = -53 # The NVVS process was killed by a signal +DCGM_ST_PAUSED = -54 # The hostengine and all modules are paused + +DCGM_GROUP_DEFAULT = 0 # All the GPUs on the node are added to the group +DCGM_GROUP_EMPTY = 1 # Creates an empty group +DCGM_GROUP_DEFAULT_NVSWITCHES = 2 # All NvSwitches of the node are added to the group +DCGM_GROUP_DEFAULT_INSTANCES = 3 # All GPU instances of the node are added to the group +DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4 # All compute instances of the node are added to the group +DCGM_GROUP_DEFAULT_ENTITIES = 5 # All entities are added to this default group DCGM_GROUP_ALL_GPUS = 0x7FFFFFFF DCGM_GROUP_ALL_NVSWITCHES = 0x7FFFFFFE @@ -209,26 +134,17 @@ DCGM_GROUP_ALL_COMPUTE_INSTANCES = 0x7FFFFFFC DCGM_GROUP_ALL_ENTITIES = 0x7FFFFFFB -# Maximum number of entities per entity group -DCGM_GROUP_MAX_ENTITIES = 64 +DCGM_GROUP_MAX_ENTITIES = 64 #Maximum number of entities per entity group -# The target configuration values to be applied -DCGM_CONFIG_TARGET_STATE = 0 -# The current configuration state -DCGM_CONFIG_CURRENT_STATE = 1 +DCGM_CONFIG_TARGET_STATE = 0 # The target configuration values to be applied +DCGM_CONFIG_CURRENT_STATE = 1 # The current configuration state -# Represents the power cap to be applied for each member of the group -DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0 -# Represents the power budget for the entire group -DCGM_CONFIG_POWER_BUDGET_GROUP = 1 +DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0 # Represents the power cap to be applied for each member of the group +DCGM_CONFIG_POWER_BUDGET_GROUP = 1 # Represents the power budget for the entire group -# Default compute mode -- multiple contexts per device -DCGM_CONFIG_COMPUTEMODE_DEFAULT = 0 -# Compute-prohibited mode -- no contexts per device -DCGM_CONFIG_COMPUTEMODE_PROHIBITED = 1 -# Compute-exclusive-process mode -- only one context per device, usable from -# multiple threads at a time -DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS = 2 +DCGM_CONFIG_COMPUTEMODE_DEFAULT = 0 # Default compute mode -- multiple contexts per device +DCGM_CONFIG_COMPUTEMODE_PROHIBITED = 1 # Compute-prohibited mode -- no contexts per device +DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS = 2 #* Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time DCGM_TOPOLOGY_BOARD = 0x1 DCGM_TOPOLOGY_SINGLE = 0x2 @@ -249,19 +165,26 @@ DCGM_TOPOLOGY_NVLINK11 = 0x40000 DCGM_TOPOLOGY_NVLINK12 = 0x80000 -# Diagnostic per gpu tests - fixed indices for -# dcgmDiagResponsePerGpu_t.results[] +# Diagnostic per gpu tests - fixed indices for dcgmDiagResponsePerGpu_t.results[] DCGM_MEMORY_INDEX = 0 DCGM_DIAGNOSTIC_INDEX = 1 DCGM_PCI_INDEX = 2 -DCGM_SM_PERF_INDEX = 3 -DCGM_TARGETED_PERF_INDEX = 4 +DCGM_SM_STRESS_INDEX = 3 
+DCGM_TARGETED_STRESS_INDEX = 4 DCGM_TARGETED_POWER_INDEX = 5 DCGM_MEMORY_BANDWIDTH_INDEX = 6 -DCGM_PER_GPU_TEST_COUNT = 7 +DCGM_MEMTEST_INDEX = 7 +DCGM_PULSE_TEST_INDEX = 8 +DCGM_EUD_TEST_INDEX = 9 +DCGM_UNUSED2_TEST_INDEX = 10 +DCGM_UNUSED3_TEST_INDEX = 11 +DCGM_UNUSED4_TEST_INDEX = 12 +DCGM_UNUSED5_TEST_INDEX = 13 +DCGM_PER_GPU_TEST_COUNT_V7 = 9 +DCGM_PER_GPU_TEST_COUNT_V8 = 13 # DCGM Diag Level One test indices -DCGM_SWTEST_BLACKLIST = 0 +DCGM_SWTEST_DENYLIST = 0 DCGM_SWTEST_NVML_LIBRARY = 1 DCGM_SWTEST_CUDA_MAIN_LIBRARY = 2 DCGM_SWTEST_CUDA_RUNTIME_LIBRARY = 3 @@ -284,70 +207,116 @@ class DCGM_INTROSPECT_STATE(object): # Lib loading dcgmLib = None libLoadLock = threading.Lock() -# Incremented on each dcgmInit and decremented on dcgmShutdown -_dcgmLib_refcount = 0 +_dcgmLib_refcount = 0 # Incremented on each dcgmInit and decremented on dcgmShutdown class DCGMError(Exception): - """ - Class to return error values for DCGM - """ - + """ Class to return error values for DCGM """ _valClassMapping = dict() # List of currently known error codes _error_code_to_string = { - DCGM_ST_OK: "Success", - DCGM_ST_BADPARAM: "Bad parameter passed to function", - DCGM_ST_GENERIC_ERROR: "Generic unspecified error", - DCGM_ST_MEMORY: "Out of memory error", - DCGM_ST_NOT_CONFIGURED: "Setting not configured", - DCGM_ST_NOT_SUPPORTED: "Feature not supported", - DCGM_ST_INIT_ERROR: "DCGM initialization error", - DCGM_ST_NVML_ERROR: "NVML error", - DCGM_ST_PENDING: "Object is in a pending state", - DCGM_ST_UNINITIALIZED: "Object is in an undefined state", - DCGM_ST_TIMEOUT: "Timeout", - DCGM_ST_VER_MISMATCH: "API version mismatch", - DCGM_ST_UNKNOWN_FIELD: "Unknown field", - DCGM_ST_NO_DATA: "No data is available", - DCGM_ST_STALE_DATA: "Data is considered stale", - DCGM_ST_NOT_WATCHED: "Field is not being updated", - DCGM_ST_NO_PERMISSION: "Not permissioned", - DCGM_ST_GPU_IS_LOST: "GPU is unreachable", - DCGM_ST_RESET_REQUIRED: "GPU requires a reset", - DCGM_ST_FUNCTION_NOT_FOUND: "Unable to find function", - DCGM_ST_CONNECTION_NOT_VALID: "The connection to the host engine is not valid any longer", - DCGM_ST_GPU_NOT_SUPPORTED: "This GPU is not supported by DCGM", - DCGM_ST_GROUP_INCOMPATIBLE: "GPUs are incompatible with each other for\ - the requested operation", - DCGM_ST_MAX_LIMIT: "Max limit reached for the object", - DCGM_ST_LIBRARY_NOT_FOUND: "DCGM library could not be found", - DCGM_ST_DUPLICATE_KEY: "Duplicate key passed to function", - DCGM_ST_GPU_IN_SYNC_BOOST_GROUP: "GPU is already a part of a sync boost group", - DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP: "GPU is not a part of the sync boost group", - DCGM_ST_REQUIRES_ROOT: "This operation is not supported when the host engine\ - is running as non root", - DCGM_ST_NVVS_ERROR: "DCGM GPU Diagnostic returned an error.", - DCGM_ST_INSUFFICIENT_SIZE: "An input argument is not large enough", - DCGM_ST_FIELD_UNSUPPORTED_BY_API: "The given field ID is not supported by the API being called", - DCGM_ST_MODULE_NOT_LOADED: "This request is serviced by a module of DCGM that\ - is not currently loaded", - DCGM_ST_IN_USE: "The requested operation could not be completed because\ - the affected resource is in use", - DCGM_ST_GROUP_IS_EMPTY: "The specified group is empty, and this operation\ - is incompatible with an empty group", - DCGM_ST_PROFILING_NOT_SUPPORTED: "Profiling is not supported for this group of GPUs or GPU", - DCGM_ST_PROFILING_LIBRARY_ERROR: "The third-party Profiling module returned an unrecoverable error", - DCGM_ST_PROFILING_MULTI_PASS: "The requested 
profiling metrics\ - cannot be collected in a single pass", - DCGM_ST_DIAG_ALREADY_RUNNING: "A diag instance is already running, cannot\ - run a new diag until the current one finishes", - DCGM_ST_DIAG_BAD_JSON: "The GPU Diagnostic returned Json that cannot be parsed.", - DCGM_ST_DIAG_BAD_LAUNCH: "Error while launching the GPU Diagnostic.", - DCGM_ST_DIAG_VARIANCE: "The results of training DCGM GPU Diagnostic cannot\ - be trusted because they vary too much from run to run", - DCGM_ST_DIAG_THRESHOLD_EXCEEDED: "A field value met or exceeded the error threshold.", - DCGM_ST_INSUFFICIENT_DRIVER_VERSION: "The installed driver version is insufficient for this API", + DCGM_ST_OK: + "Success", + DCGM_ST_BADPARAM: + "Bad parameter passed to function", + DCGM_ST_GENERIC_ERROR: + "Generic unspecified error", + DCGM_ST_MEMORY: + "Out of memory error", + DCGM_ST_NOT_CONFIGURED: + "Setting not configured", + DCGM_ST_NOT_SUPPORTED: + "Feature not supported", + DCGM_ST_INIT_ERROR: + "DCGM initialization error", + DCGM_ST_NVML_ERROR: + "NVML error", + DCGM_ST_PENDING: + "Object is in a pending state", + DCGM_ST_UNINITIALIZED: + "Object is in an undefined state", + DCGM_ST_TIMEOUT: + "Timeout", + DCGM_ST_VER_MISMATCH: + "API version mismatch", + DCGM_ST_UNKNOWN_FIELD: + "Unknown field", + DCGM_ST_NO_DATA: + "No data is available", + DCGM_ST_STALE_DATA: + "Data is considered stale", + DCGM_ST_NOT_WATCHED: + "Field is not being updated", + DCGM_ST_NO_PERMISSION: + "Not permissioned", + DCGM_ST_GPU_IS_LOST: + "GPU is unreachable", + DCGM_ST_RESET_REQUIRED: + "GPU requires a reset", + DCGM_ST_FUNCTION_NOT_FOUND: + "Unable to find function", + DCGM_ST_CONNECTION_NOT_VALID: + "The connection to the host engine is not valid any longer", + DCGM_ST_GPU_NOT_SUPPORTED: + "This GPU is not supported by DCGM", + DCGM_ST_GROUP_INCOMPATIBLE: + "GPUs are incompatible with each other for the requested operation", + DCGM_ST_MAX_LIMIT: + "Max limit reached for the object", + DCGM_ST_LIBRARY_NOT_FOUND: + "DCGM library could not be found", + DCGM_ST_DUPLICATE_KEY: + "Duplicate key passed to function", + DCGM_ST_GPU_IN_SYNC_BOOST_GROUP: + "GPU is already a part of a sync boost group", + DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP: + "GPU is not a part of the sync boost group", + DCGM_ST_REQUIRES_ROOT: + "This operation is not supported when the host engine is running as non root", + DCGM_ST_NVVS_ERROR: + "DCGM GPU Diagnostic returned an error.", + DCGM_ST_INSUFFICIENT_SIZE: + "An input argument is not large enough", + DCGM_ST_FIELD_UNSUPPORTED_BY_API: + "The given field ID is not supported by the API being called", + DCGM_ST_MODULE_NOT_LOADED: + "This request is serviced by a module of DCGM that is not currently loaded", + DCGM_ST_IN_USE: + "The requested operation could not be completed because the affected resource is in use", + DCGM_ST_GROUP_IS_EMPTY: + "The specified group is empty, and this operation is incompatible with an empty group", + DCGM_ST_PROFILING_NOT_SUPPORTED: + "Profiling is not supported for this group of GPUs or GPU", + DCGM_ST_PROFILING_LIBRARY_ERROR: + "The third-party Profiling module returned an unrecoverable error", + DCGM_ST_PROFILING_MULTI_PASS: + "The requested profiling metrics cannot be collected in a single pass", + DCGM_ST_DIAG_ALREADY_RUNNING: + "A diag instance is already running, cannot run a new diag until the current one finishes", + DCGM_ST_DIAG_BAD_JSON: + "The GPU Diagnostic returned Json that cannot be parsed.", + DCGM_ST_DIAG_BAD_LAUNCH: + "Error while launching the GPU Diagnostic.", + 
DCGM_ST_DIAG_UNUSED: + "Unused error code", + DCGM_ST_DIAG_THRESHOLD_EXCEEDED: + "A field value met or exceeded the error threshold.", + DCGM_ST_INSUFFICIENT_DRIVER_VERSION: + "The installed driver version is insufficient for this API", + DCGM_ST_INSTANCE_NOT_FOUND: + "The specified GPU instance does not exist", + DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND: + "The specified GPU compute instance does not exist", + DCGM_ST_CHILD_NOT_KILLED: + "Couldn't kill a child process within the retries", + DCGM_ST_3RD_PARTY_LIBRARY_ERROR: + "Detected an error in a 3rd-party library", + DCGM_ST_INSUFFICIENT_RESOURCES: + "Not enough resources available", + DCGM_ST_PLUGIN_EXCEPTION: + "Exception thrown from a diagnostic plugin", + DCGM_ST_NVVS_ISOLATE_ERROR: + "The diagnostic returned an error that indicates the need for isolation", } def __new__(typ, value): @@ -366,11 +335,9 @@ def __str__(self): try: if self.value not in DCGMError._error_code_to_string: DCGMError._error_code_to_string[self.value] = str( - _dcgmErrorString(self.value) - ) + _dcgmErrorString(self.value)) msg = DCGMError._error_code_to_string[self.value] - # Ensure we catch all exceptions, otherwise the error code will be - # hidden in a traceback + # Ensure we catch all exceptions, otherwise the error code will be hidden in a traceback except BaseException: msg = "DCGM Error with code %d" % self.value @@ -383,16 +350,16 @@ def __str__(self): def __eq__(self, other): return self.value == other.value + def __hash__(self): + return hash(self.value) + def SetAdditionalInfo(self, msg): """ - Sets msg as additional information returned by the string - representation of DCGMError and subclasses. Example output for - DCGMError_Uninitialized subclass, with msg set to 'more info msg - here' is "DCGMError_Uninitialized: Object is in an undefined state: - 'more info msg here'". - - Ensure that msg is a string or an object for which the __str__() - method does not throw an error + Sets msg as additional information returned by the string representation of DCGMError and subclasses. + Example output for DCGMError_Uninitialized subclass, with msg set to 'more info msg here' is + "DCGMError_Uninitialized: Object is in an undefined state: 'more info msg here'". + + Ensure that msg is a string or an object for which the __str__() method does not throw an error """ self.info = msg @@ -402,33 +369,34 @@ def dcgmExceptionClass(error_code): def _extractDCGMErrorsAsClasses(): - """ + ''' Generates a hierarchy of classes on top of DCGMLError class. - Each DCGM Error gets a new DCGMError subclass. This way try,except blocks - can filter appropriate exceptions more easily. + Each DCGM Error gets a new DCGMError subclass. This way try,except blocks can filter appropriate + exceptions more easily. DCGMError is a parent class. Each DCGM_ST_* gets it's own subclass. e.g. DCGM_ST_UNINITIALIZED will be turned into DCGMError_Uninitialized - """ - + ''' this_module = sys.modules[__name__] dcgmErrorsNames = filter(lambda x: x.startswith("DCGM_ST_"), dir(this_module)) for err_name in dcgmErrorsNames: # e.g. 
Turn DCGM_ST_UNINITIALIZED into DCGMError_Uninitialized class_name = "DCGMError_" + string.capwords( - err_name.replace("DCGM_ST_", ""), "_" - ).replace("_", "") + err_name.replace("DCGM_ST_", ""), "_").replace("_", "") err_val = getattr(this_module, err_name) def gen_new(val): + def new(typ): + # pylint: disable=E1121 obj = DCGMError.__new__(typ, val) return obj return new - new_error_class = type(class_name, (DCGMError,), {"__new__": gen_new(err_val)}) + new_error_class = type(class_name, (DCGMError,), + {'__new__': gen_new(err_val)}) new_error_class.__module__ = __name__ setattr(this_module, class_name, new_error_class) DCGMError._valClassMapping[err_val] = new_error_class @@ -445,7 +413,118 @@ class struct_c_dcgmUnit_t(Structure): _dcgmUnit_t = POINTER(struct_c_dcgmUnit_t) -class _PrintableStructure(Structure): +class _WrappedStructure(): + + def __init__(self, obj): + self.__dict__["_obj"] = obj + + def __getattr__(self, key): + value = getattr(self._obj, key) + if isinstance(value, bytes): + return value.decode('utf-8') + if isclass(value): + return _WrappedStructure(value) + return value + + def __getitem__(self, key): + value = self._obj[key] + if isinstance(value, bytes): + return value.decode('utf-8') + if isclass(value): + return _WrappedStructure(value) + return value + + def __setattr__(self, key, raw_value): + + def find_field_type(fields, key): + field = (f[1] for f in fields if f[0] == key) + try: + return next(field) + except StopIteration: + return None + + if (key == '_obj'): + raise RuntimeError("Cannot set _obj") + + value = raw_value + fieldtype = find_field_type(self._obj._fields_, key) + + if fieldtype == c_uint and not isinstance(value, c_uint32): + value = int(value) + elif fieldtype == c_int and not isinstance(value, c_int32): + value = int(value) + elif isinstance(raw_value, str): + value = raw_value.encode('utf-8') + + self._obj[key] = value + return value + + +class _DcgmStructure(Structure): + + def __getattribute__(self, key): + value = super().__getattribute__(key) + if isinstance(value, bytes): + return value.decode('utf-8') + if isclass(value): + return _WrappedStructure(value) + return value + + def __setattr__(self, key, raw_value): + + def find_field_type(fields, key): + field = (f[1] for f in fields if f[0] == key) + try: + return next(field) + except StopIteration: + return None + + value = raw_value + fieldtype = find_field_type(self._fields_, key) + + if fieldtype == c_uint and not isinstance(value, c_uint32): + value = int(value) + elif fieldtype == c_int and not isinstance(value, c_int32): + value = int(value) + elif isinstance(raw_value, str): + value = raw_value.encode('utf-8') + + return super().__setattr__(key, value) + + +class DcgmUnion(Union): + + def __getattribute__(self, key): + value = super().__getattribute__(key) + if isinstance(value, bytes): + return value.decode('utf-8') + if isclass(value): + return _WrappedStructure(value) + return value + + def __setattr__(self, key, raw_value): + + def find_field_type(fields, key): + field = (f[1] for f in fields if f[0] == key) + try: + return next(field) + except StopIteration: + return None + + value = raw_value + fieldtype = find_field_type(self._fields_, key) + + if fieldtype == c_uint and not isinstance(value, c_uint32): + value = int(value) + elif fieldtype == c_int and not isinstance(value, c_int32): + value = int(value) + elif isinstance(raw_value, str): + value = raw_value.encode('utf-8') + + return super().__setattr__(key, value) + + +class _PrintableStructure(_DcgmStructure): """ 
Abstract class that produces nicer __str__ output than ctypes.Structure. e.g. instead of: @@ -458,15 +537,12 @@ class _PrintableStructure(Structure): e.g. class that has _field_ 'hex_value', c_uint could be formatted with _fmt_ = {"hex_value" : "%08X"} to produce nicer output. - Default formatting string for all fields can be set with key "" - like: + Default formatting string for all fields can be set with key "" like: _fmt_ = {"" : "%d MHz"} # e.g all values are numbers in MHz. If not set it's assumed to be just "%s" - Exact format of returned str from this class is subject to change in the - future. + Exact format of returned str from this class is subject to change in the future. """ - _fmt_ = {} def __str__(self): @@ -480,7 +556,7 @@ def __str__(self): elif "" in self._fmt_: fmt = self._fmt_[""] result.append(("%s: " + fmt) % (key, value)) - return self.__class__.__name__ + "(" + string.join(result, ", ") + ")" + return self.__class__.__name__ + "(" + ", ".join(result) + ")" def FieldsSizeof(self): size = 0 @@ -489,10 +565,8 @@ def FieldsSizeof(self): return size +#JSON serializer for DCGM structures class DcgmJSONEncoder(json.JSONEncoder): - """ - JSON serializer for DCGM structures - """ def default(self, o): # pylint: disable=method-hidden if isinstance(o, _PrintableStructure): @@ -519,21 +593,18 @@ def default(self, o): # pylint: disable=method-hidden retVal.append(subVal) return retVal - # Let the parent class handle this/fail + #Let the parent class handle this/fail return json.JSONEncoder.default(self, o) +# Creates a unique version number for each struct def make_dcgm_version(struct, ver): - """ - Creates a unique version number for each struct - """ - return sizeof(struct) | (ver << 24) -# Function access -# function pointers are cached to prevent unnecessary libLoadLock locking -_dcgmGetFunctionPointer_cache = dict() +# Function access ## +_dcgmGetFunctionPointer_cache = dict( +) # function pointers are cached to prevent unnecessary libLoadLock locking def _dcgmGetFunctionPointer(name): @@ -557,18 +628,14 @@ def _dcgmGetFunctionPointer(name): libLoadLock.release() -# -# C function wrappers -# +# C function wrappers ## def _LoadDcgmLibrary(libDcgmPath=None): """ Load the library if it isn't loaded already - :param libDcgmPath: Optional path to the libdcgm*.so libraries. Will use - system defaults if not specified. + :param libDcgmPath: Optional path to the libdcgm*.so libraries. Will use system defaults if not specified. 
:type libDcgmPath: str :return: None """ - global dcgmLib if dcgmLib is None: @@ -580,30 +647,24 @@ def _LoadDcgmLibrary(libDcgmPath=None): if dcgmLib is None: try: if sys.platform[:3] == "win": - # cdecl calling convention load nvml.dll from - # %ProgramFiles%/NVIDIA Corporation/NVSMI/nvml.dll + # cdecl calling convention + # load nvml.dll from %ProgramFiles%/NVIDIA Corporation/NVSMI/nvml.dll dcgmLib = CDLL( os.path.join( os.getenv("ProgramFiles", "C:/Program Files"), - "NVIDIA Corporation/NVSMI/dcgm.dll", - ) - ) + "NVIDIA Corporation/NVSMI/dcgm.dll")) else: - if not libDcgmPath: - ( - dist_name, - dist_version, - dist_id, - ) = distro.linux_distribution(full_distribution_name=0) - dist_name = dist_name.lower() - if dist_name in {"ubuntu", "debian"}: - libDcgmPath = "/usr/lib/{}-linux-gnu".format( - platform.machine() - ) - elif dist_name in {"fedora", "redhat", "centos", "suse"}: - libDcgmPath = "/usr/lib64" - - dcgmLib = CDLL(os.path.join(libDcgmPath, "libdcgm.so.2")) + if libDcgmPath: + lib_file = os.path.join(libDcgmPath, "libdcgm.so.3") + else: + # Try Debian-based distros + lib_file = '/usr/lib/{}-linux-gnu/libdcgm.so.3'.format( + platform.machine()) + if not os.path.isfile(lib_file): + # Presume Redhat-based distros + lib_file = '/usr/lib64/libdcgm.so.3' + + dcgmLib = CDLL(lib_file) except OSError as ose: _dcgmCheckReturn(DCGM_ST_LIBRARY_NOT_FOUND) @@ -652,20 +713,23 @@ def _dcgmErrorString(result): return str +# Represents a link object. type should be one of DCGM_FE_GPU or +# DCGM_FE_SWITCH. gpuId or switchID the associated gpu or switch; +# +class c_dcgm_link_t(_PrintableStructure): + _fields = [('type', c_uint8), ('index', c_uint8), ('id', c_uint16)] + + class c_dcgmConnectV2Params_v1(_PrintableStructure): - _fields_ = [("version", c_uint), ("persistAfterDisconnect", c_uint)] + _fields_ = [('version', c_uint), ('persistAfterDisconnect', c_uint)] c_dcgmConnectV2Params_version1 = make_dcgm_version(c_dcgmConnectV2Params_v1, 1) class c_dcgmConnectV2Params_v2(_PrintableStructure): - _fields_ = [ - ("version", c_uint), - ("persistAfterDisconnect", c_uint), - ("timeoutMs", c_uint), - ("addressIsUnixSocket", c_uint), - ] + _fields_ = [('version', c_uint), ('persistAfterDisconnect', c_uint), + ('timeoutMs', c_uint), ('addressIsUnixSocket', c_uint)] c_dcgmConnectV2Params_version2 = make_dcgm_version(c_dcgmConnectV2Params_v2, 2) @@ -683,22 +747,22 @@ class c_dcgmHostengineHealth_v1(_PrintableStructure): dcgmHostengineHealth_version = dcgmHostengineHealth_version1 -# Represents memory and proc clocks for a device +#Represents memory and proc clocks for a device class c_dcgmClockSet_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("memClock", c_uint), # Memory Clock - ("smClock", c_uint), # SM Clock + ('version', c_uint), + ('memClock', c_uint), #/* Memory Clock */ + ('smClock', c_uint) #/* SM Clock */ ] -# Represents a entityGroupId + entityId pair to uniquely identify a given -# entityId inside a group of entities +# Represents a entityGroupId + entityId pair to uniquely identify a given entityId inside +# a group of entities # Added in DCGM 1.5.0 class c_dcgmGroupEntityPair_t(_PrintableStructure): _fields_ = [ - ("entityGroupId", c_uint32), # Entity Group ID entity belongs to - ("entityId", c_uint32), # Entity ID of the entity + ('entityGroupId', c_uint32), #Entity Group ID entity belongs to + ('entityId', c_uint32) #Entity ID of the entity ] @@ -707,12 +771,10 @@ class c_dcgmGroupEntityPair_t(_PrintableStructure): # * Added in DCGM 1.5.0 # */ class 
c_dcgmGroupInfo_v2(_PrintableStructure): - _fields_ = [ - ("version", c_uint), - ("count", c_uint), - ("groupName", c_char * DCGM_MAX_STR_LENGTH), - ("entityList", c_dcgmGroupEntityPair_t * DCGM_GROUP_MAX_ENTITIES), - ] + _fields_ = [('version', c_uint), ('count', c_uint), + ('groupName', c_char * DCGM_MAX_STR_LENGTH), + ('entityList', + c_dcgmGroupEntityPair_t * DCGM_GROUP_MAX_ENTITIES)] c_dcgmGroupInfo_version2 = make_dcgm_version(c_dcgmGroupInfo_v2, 2) @@ -723,19 +785,25 @@ class c_dcgmGroupInfo_v2(_PrintableStructure): DcgmMigProfileGpuInstanceSlice3 = 3 # GPU instance slice 3 DcgmMigProfileGpuInstanceSlice4 = 4 # GPU instance slice 4 DcgmMigProfileGpuInstanceSlice7 = 5 # GPU instance slice 7 +DcgmMigProfileGpuInstanceSlice8 = 6 # GPU instance slice 8 +DcgmMigProfileGpuInstanceSlice6 = 7 # GPU instance slice 6 +DcgmMigProfileGpuInstanceSlice1Rev1 = 8 # GPU instance slice 1 revision 1 +DcgmMigProfileGpuInstanceSlice2Rev1 = 9 # GPU instance slice 2 revision 1 +DcgmMigProfileGpuInstanceSlice1Rev2 = 10 # GPU instance slice 1 revision 2 DcgmMigProfileComputeInstanceSlice1 = 30 # compute instance slice 1 DcgmMigProfileComputeInstanceSlice2 = 31 # compute instance slice 2 DcgmMigProfileComputeInstanceSlice3 = 32 # compute instance slice 3 DcgmMigProfileComputeInstanceSlice4 = 33 # compute instance slice 4 DcgmMigProfileComputeInstanceSlice7 = 34 # compute instance slice 7 +DcgmMigProfileComputeInstanceSlice8 = 35 # compute instance slice 8 +DcgmMigProfileComputeInstanceSlice6 = 36 # compute instance slice 6 +DcgmMigProfileComputeInstanceSlice1Rev1 = 37 # compute instance slice 1 revision 1 +# /** +# * Represents a pair of entity pairings to uniquely identify an entity and its place in the hierarchy. +# */ class c_dcgmMigHierarchyInfo_t(_PrintableStructure): - """ - Represents a pair of entity pairings to uniquely identify an entity and - its place in the hierarchy. - """ - _fields_ = [ ("entity", c_dcgmGroupEntityPair_t), ("parent", c_dcgmGroupEntityPair_t), @@ -743,36 +811,50 @@ class c_dcgmMigHierarchyInfo_t(_PrintableStructure): ] -DCGM_MAX_INSTANCES_PER_GPU = 7 -# There can never be more compute instances per GPU than instances per GPU -# because a compute instance is part of an instance +class c_dcgmMigEntityInfo_t(_PrintableStructure): + _fields_ = [ + ('gpuUuid', c_char * 128), # GPU UUID + ('nvmlGpuIndex', c_uint), # GPU index from NVML + ('nvmlInstanceId', c_uint), # GPU instance index within GPU + ('nvmlComputeInstanceId', + c_uint), # GPU Compute instance index within GPU instance + ('nvmlMigProfileId', + c_uint), # Unique profile ID for GPU or Compute instances + ('nvmlProfileSlices', c_uint), # Number of slices in the MIG profile + ] + + +class c_dcgmMigHierarchyInfo_v2(_PrintableStructure): + _fields_ = [ + ('entity', c_dcgmGroupEntityPair_t), + ('parent', c_dcgmGroupEntityPair_t), + ('info', c_dcgmMigEntityInfo_t), + ] + + +DCGM_MAX_INSTANCES_PER_GPU = 8 +# There can never be more compute instances per GPU than instances per GPU because a compute instance +# is part of an instance DCGM_MAX_COMPUTE_INSTANCES_PER_GPU = DCGM_MAX_INSTANCES_PER_GPU -# Currently, there cannot be more than 14 instances + compute instances. There -# are always 7 compute instances and never more than 7 instances +# Currently, there cannot be more than 14 instances + compute instances. 
There are always 7 compute instances +# and never more than 7 instances DCGM_MAX_TOTAL_INSTANCES = 14 DCGM_MAX_HIERARCHY_INFO = DCGM_MAX_NUM_DEVICES * DCGM_MAX_TOTAL_INSTANCES DCGM_MAX_INSTANCES = DCGM_MAX_NUM_DEVICES * DCGM_MAX_INSTANCES_PER_GPU -# The maximum compute instances are always the same as the maximum instances -# because each compute instances is part of an instance +# The maximum compute instances are always the same as the maximum instances because each compute instances +# is part of an instance DCGM_MAX_COMPUTE_INSTANCES = DCGM_MAX_INSTANCES -# Ask the hostengine to wait to process reconfiguring the GPUs -DCGM_MIG_RECONFIG_DELAY_PROCESSING = 0x1 +DCGM_MIG_RECONFIG_DELAY_PROCESSING = 0x1 # Ask the hostengine to wait to process reconfiguring the GPUs -class c_dcgmMigHierarchy_v1(_PrintableStructure): - """ - Structure to store the GPU hierarchy for a system - """ - - _fields_ = [ - ("version", c_uint), - ("count", c_uint), - ("entityList", c_dcgmMigHierarchyInfo_t * DCGM_MAX_HIERARCHY_INFO), - ] +class c_dcgmMigHierarchy_v2(_PrintableStructure): + _fields_ = [('version', c_uint), ('count', c_uint), + ('entityList', + c_dcgmMigHierarchyInfo_v2 * DCGM_MAX_HIERARCHY_INFO)] -c_dcgmMigHierarchy_version1 = make_dcgm_version(c_dcgmMigHierarchy_v1, 1) +c_dcgmMigHierarchy_version2 = make_dcgm_version(c_dcgmMigHierarchy_v2, 2) class c_dcgmDeleteMigEntity_v1(_PrintableStructure): @@ -786,13 +868,11 @@ class c_dcgmDeleteMigEntity_v1(_PrintableStructure): c_dcgmDeleteMigEntity_version1 = make_dcgm_version(c_dcgmDeleteMigEntity_v1, 1) -# -# Enum values for the kinds of MIG creations -# -# Create a GPU instance -DcgmMigCreateGpuInstance = 0 -# Create a compute instance -DcgmMigCreateComputeInstance = 1 +# /** +# * Enum values for the kinds of MIG creations +# */ +DcgmMigCreateGpuInstance = 0 # Create a GPU instance +DcgmMigCreateComputeInstance = 1 # Create a compute instance class c_dcgmCreateMigEntity_v1(_PrintableStructure): @@ -808,331 +888,391 @@ class c_dcgmCreateMigEntity_v1(_PrintableStructure): c_dcgmCreateMigEntity_version1 = make_dcgm_version(c_dcgmCreateMigEntity_v1, 1) +# /** +# * Structure to represent error attributes +# */ class c_dcgmErrorInfo_v1(_PrintableStructure): - """ - Structure to represent error attributes - """ - - _fields_ = [("gpuId", c_uint), ("fieldId", c_ushort), ("status", c_int)] + _fields_ = [('gpuId', c_uint), ('fieldId', c_ushort), ('status', c_int)] +# /** +# * Represents list of supported clocks for a device +# */ class c_dcgmDeviceSupportedClockSets_v1(_PrintableStructure): - """ - Represents list of supported clocks for a device - """ - - _fields_ = [ - ("version", c_uint), - ("count", c_uint), - ("clockSet", c_dcgmClockSet_v1 * DCGM_MAX_CLOCKS), - ] + _fields_ = [('version', c_uint), ('count', c_uint), + ('clockSet', c_dcgmClockSet_v1 * DCGM_MAX_CLOCKS)] +# /** +# * Represents accounting information for a device and pid +# */ class c_dcgmDevicePidAccountingStats_v1(_PrintableStructure): - """ - epresents accounting information for a device and pid - """ - - _fields_ = [ - ("version", c_uint32), - ("pid", c_uint32), - ("gpuUtilization", c_uint32), - ("memoryUtilization", c_uint32), - ("maxMemoryUsage", c_uint64), - ("startTimestamp", c_uint64), - ("activeTimeUsec", c_uint64), - ] + _fields_ = [('version', c_uint32), ('pid', c_uint32), + ('gpuUtilization', c_uint32), ('memoryUtilization', c_uint32), + ('maxMemoryUsage', c_uint64), ('startTimestamp', c_uint64), + ('activeTimeUsec', c_uint64)] +# /** +# * Represents thermal information +# */ class 
c_dcgmDeviceThermals_v1(_PrintableStructure): - """ - Represents thermal information - """ - - _fields_ = [("version", c_uint), ("slowdownTemp", c_uint), ("shutdownTemp", c_uint)] + _fields_ = [('version', c_uint), ('slowdownTemp', c_uint), + ('shutdownTemp', c_uint)] +# /** +# * Represents various power limits +# */ class c_dcgmDevicePowerLimits_v1(_PrintableStructure): - """ - Represents various power limits - """ - - _fields_ = [ - ("version", c_uint), - ("curPowerLimit", c_uint), - ("defaultPowerLimit", c_uint), - ("enforcedPowerLimit", c_uint), - ("minPowerLimit", c_uint), - ("maxPowerLimit", c_uint), - ] + _fields_ = [('version', c_uint), ('curPowerLimit', c_uint), + ('defaultPowerLimit', c_uint), ('enforcedPowerLimit', c_uint), + ('minPowerLimit', c_uint), ('maxPowerLimit', c_uint)] +# /** +# * Represents device identifiers +# */ class c_dcgmDeviceIdentifiers_v1(_PrintableStructure): - """ - Represents device identifiers - """ - - _fields_ = [ - ("version", c_uint), - ("brandName", c_char * DCGM_MAX_STR_LENGTH), - ("deviceName", c_char * DCGM_MAX_STR_LENGTH), - ("pciBusId", c_char * DCGM_MAX_STR_LENGTH), - ("serial", c_char * DCGM_MAX_STR_LENGTH), - ("uuid", c_char * DCGM_MAX_STR_LENGTH), - ("vbios", c_char * DCGM_MAX_STR_LENGTH), - ("inforomImageVersion", c_char * DCGM_MAX_STR_LENGTH), - ("pciDeviceId", c_uint32), - ("pciSubSystemId", c_uint32), - ("driverVersion", c_char * DCGM_MAX_STR_LENGTH), - ("virtualizationMode", c_uint32), - ] + _fields_ = [('version', c_uint), + ('brandName', c_char * DCGM_MAX_STR_LENGTH), + ('deviceName', c_char * DCGM_MAX_STR_LENGTH), + ('pciBusId', c_char * DCGM_MAX_STR_LENGTH), + ('serial', c_char * DCGM_MAX_STR_LENGTH), + ('uuid', c_char * DCGM_MAX_STR_LENGTH), + ('vbios', c_char * DCGM_MAX_STR_LENGTH), + ('inforomImageVersion', c_char * DCGM_MAX_STR_LENGTH), + ('pciDeviceId', c_uint32), ('pciSubSystemId', c_uint32), + ('driverVersion', c_char * DCGM_MAX_STR_LENGTH), + ('virtualizationMode', c_uint32)] +# /** +# * Represents memory utilization +# */ class c_dcgmDeviceMemoryUsage_v1(_PrintableStructure): - """ - Represents memory utilization - """ - - _fields_ = [ - ("version", c_uint), - ("bar1Total", c_uint), - ("fbTotal", c_uint), - ("fbUsed", c_uint), - ("fbFree", c_uint), - ] + _fields_ = [('version', c_uint), ('bar1Total', c_uint), ('fbTotal', c_uint), + ('fbUsed', c_uint), ('fbFree', c_uint)] +# /** +# * Represents utilization values of vGPUs running on the device +# */ class c_dcgmDeviceVgpuUtilInfo_v1(_PrintableStructure): - """ - Represents utilization values of vGPUs running on the device - """ - - _fields_ = [ - ("version", c_uint), - ("vgpuId", c_uint), - ("smUtil", c_uint), - ("memUtil", c_uint), - ("encUtil", c_uint), - ("decUtil", c_uint), - ] + _fields_ = [('version', c_uint), ('vgpuId', c_uint), ('smUtil', c_uint), + ('memUtil', c_uint), ('encUtil', c_uint), ('decUtil', c_uint)] # /** # * Utilization values for processes running within vGPU VMs using the device # */ class c_dcgmDeviceVgpuProcessUtilInfo_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint), - ("vgpuId", c_uint), - ("pid", c_uint), - ("processName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE), - ("smUtil", c_uint), - ("memUtil", c_uint), - ("encUtil", c_uint), - ("decUtil", c_uint), - ] + _fields_ = [('version', c_uint), ('vgpuId', c_uint), ('pid', c_uint), + ('processName', c_char * DCGM_VGPU_NAME_BUFFER_SIZE), + ('smUtil', c_uint), ('memUtil', c_uint), ('encUtil', c_uint), + ('decUtil', c_uint)] # /** # * Represents current encoder statistics for the given 
device/vGPU instance # */ class c_dcgmDeviceEncStats_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint), - ("sessionCount", c_uint), - ("averageFps", c_uint), - ("averageLatency", c_uint), - ] + _fields_ = [('version', c_uint), ('sessionCount', c_uint), + ('averageFps', c_uint), ('averageLatency', c_uint)] +# /** +# * Represents information about active encoder sessions on the given vGPU instance +# */ class c_dcgmDeviceVgpuEncSessions_v1(_PrintableStructure): - """ - Represents information about active encoder sessions on the given vGPU - instance - """ + _fields_ = [('version', c_uint), ('vgpuId', c_uint), ('sessionId', c_uint), + ('pid', c_uint), ('codecType', c_uint), ('hResolution', c_uint), + ('vResolution', c_uint), ('averageFps', c_uint), + ('averageLatency', c_uint)] + + +# /** +# * Represents current frame buffer capture sessions statistics for the given device/vGPU instance +# */ +class c_dcgmDeviceFbcStats_v1(_PrintableStructure): + _fields_ = [('version', c_uint), ('sessionCount', c_uint), + ('averageFps', c_uint), ('averageLatency', c_uint)] + +# /** +# * Represents information about active FBC session on the given device/vGPU instance +# */ +class c_dcgmDeviceFbcSessionInfo_t(_PrintableStructure): + _fields_ = [('version', c_uint), ('sessionId', c_uint), ('pid', c_uint), + ('vgpuId', c_uint), ('displayOrdinal', c_uint), + ('sessionType', c_uint), ('sessionFlags', c_uint), + ('hMaxResolution', c_uint), ('vMaxResolution', c_uint), + ('hResolution', c_uint), ('vResolution', c_uint), + ('averageFps', c_uint), ('averageLatency', c_uint)] + + +# /** +# * Represents all the active FBC sessions on the given device/vGPU instance +# */ +class c_dcgmDeviceFbcSessions_v1(_PrintableStructure): + _fields_ = [('version', c_uint), ('sessionCount', c_uint), + ('sessionInfo', + c_dcgmDeviceFbcSessionInfo_t * DCGM_MAX_FBC_SESSIONS)] + + +# /** +# * Represents static info related to vGPU types supported on the device +# */ +class c_dcgmDeviceVgpuTypeInfo_v1(_PrintableStructure): + _fields_ = [('version', c_uint), ('vgpuTypeId', c_uint), + ('vgpuTypeName', c_char * DCGM_VGPU_NAME_BUFFER_SIZE), + ('vgpuTypeClass', c_char * DCGM_VGPU_NAME_BUFFER_SIZE), + ('vgpuTypeLicense', c_char * DCGM_GRID_LICENSE_BUFFER_SIZE), + ('deviceId', c_uint), ('subsystemId', c_uint), + ('numDisplayHeads', c_uint), ('maxInstances', c_uint), + ('frameRateLimit', c_uint), ('maxResolutionX', c_uint), + ('maxResolutionY', c_uint), ('fbTotal', c_uint)] + + +class c_dcgmDeviceVgpuTypeInfo_v2(_PrintableStructure): + _fields_ = [('version', c_uint), ('vgpuTypeId', c_uint), + ('vgpuTypeName', c_char * DCGM_VGPU_NAME_BUFFER_SIZE), + ('vgpuTypeClass', c_char * DCGM_VGPU_NAME_BUFFER_SIZE), + ('vgpuTypeLicense', c_char * DCGM_GRID_LICENSE_BUFFER_SIZE), + ('deviceId', c_uint), ('subsystemId', c_uint), + ('numDisplayHeads', c_uint), ('maxInstances', c_uint), + ('frameRateLimit', c_uint), ('maxResolutionX', c_uint), + ('maxResolutionY', c_uint), ('fbTotal', c_uint), + ('gpuInstanceProfileId', c_uint)] + + +dcgmDeviceVgpuTypeInfo_version2 = make_dcgm_version(c_dcgmDeviceVgpuTypeInfo_v2, + 2) + + +class c_dcgmDeviceSettings_v2(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("vgpuId", c_uint), - ("sessionId", c_uint), - ("pid", c_uint), - ("codecType", c_uint), - ("hResolution", c_uint), - ("vResolution", c_uint), - ("averageFps", c_uint), - ("averageLatency", c_uint), + ('version', c_uint), + ('persistenceModeEnabled', c_uint), + ('migModeEnabled', c_uint), + ('confidentialComputeMode', c_uint), ] -class 
c_dcgmDeviceFbcStats_v1(_PrintableStructure): - """ - Represents current frame buffer capture sessions statistics for the given - device/vGPU instance - """ +# /** +# * Represents attributes corresponding to a device +# */ +class c_dcgmDeviceAttributes_deprecated_v1(_PrintableStructure): + _fields_ = [('version', c_uint), + ('clockSets', c_dcgmDeviceSupportedClockSets_v1), + ('thermalSettings', c_dcgmDeviceThermals_v1), + ('powerLimits', c_dcgmDevicePowerLimits_v1), + ('identifiers', c_dcgmDeviceIdentifiers_v1), + ('memoryUsage', c_dcgmDeviceMemoryUsage_v1), + ('unused', c_char * 208)] + + +dcgmDeviceAttributes_deprecated_version1 = make_dcgm_version( + c_dcgmDeviceAttributes_deprecated_v1, 1) + +# /** +# * Represents attributes corresponding to a device +# */ +class c_dcgmDeviceAttributes_v3(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("sessionCount", c_uint), - ("averageFps", c_uint), - ("averageLatency", c_uint), + ('version', c_uint), + ('clockSets', c_dcgmDeviceSupportedClockSets_v1), + ('thermalSettings', c_dcgmDeviceThermals_v1), + ('powerLimits', c_dcgmDevicePowerLimits_v1), + ('identifiers', c_dcgmDeviceIdentifiers_v1), + ('memoryUsage', c_dcgmDeviceMemoryUsage_v1), + ('settings', c_dcgmDeviceSettings_v2), ] -class c_dcgmDeviceFbcSessionInfo_t(_PrintableStructure): - """ - Represents information about active FBC session on the given device/vGPU - instance - """ +dcgmDeviceAttributes_version3 = make_dcgm_version(c_dcgmDeviceAttributes_v3, 3) + +# /** +# * Represents attributes info for a MIG device +# */ +class c_dcgmDeviceMigAttributesInfo_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("sessionId", c_uint), - ("pid", c_uint), - ("vgpuId", c_uint), - ("displayOrdinal", c_uint), - ("sessionType", c_uint), - ("sessionFlags", c_uint), - ("hMaxResolution", c_uint), - ("vMaxResolution", c_uint), - ("hResolution", c_uint), - ("vResolution", c_uint), - ("averageFps", c_uint), - ("averageLatency", c_uint), + ('version', c_uint), + ('gpuInstanceId', c_uint), + ('computeInstanceId', c_uint), + ('multiprocessorCount', c_uint), + ('sharedCopyEngineCount', c_uint), + ('sharedDecoderCount', c_uint), + ('sharedEncoderCount', c_uint), + ('sharedJpegCount', c_uint), + ('sharedOfaCount', c_uint), + ('gpuInstanceSliceCount', c_uint), + ('computeInstanceSliceCount', c_uint), + ('memorySizeMB', c_uint64), ] -class c_dcgmDeviceFbcSessions_v1(_PrintableStructure): - """ - Represents all the active FBC sessions on the given device/vGPU instance - """ +dcgmDeviceMigAttributesInfo_version1 = make_dcgm_version( + c_dcgmDeviceMigAttributesInfo_v1, 1) + +# /** +# * Represents attributes for a MIG device +# */ +class c_dcgmDeviceMigAttributes_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("sessionCount", c_uint), - ("sessionInfo", c_dcgmDeviceFbcSessionInfo_t * DCGM_MAX_FBC_SESSIONS), + ('version', c_uint), + ('migDevicesCount', c_uint), + ('migAttributesInfo', c_dcgmDeviceMigAttributesInfo_v1), ] -class c_dcgmDeviceVgpuTypeInfo_v1(_PrintableStructure): - """ - Represents static info related to vGPU types supported on the device - """ +dcgmDeviceMigAttributes_version1 = make_dcgm_version( + c_dcgmDeviceMigAttributes_v1, 1) + +# /** +# * Represents GPU instance profile information +# */ +class c_dcgmGpuInstanceProfileInfo_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("vgpuTypeId", c_uint), - ("vgpuTypeName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE), - ("vgpuTypeClass", c_char * DCGM_VGPU_NAME_BUFFER_SIZE), - ("vgpuTypeLicense", c_char * 
DCGM_GRID_LICENSE_BUFFER_SIZE), - ("deviceId", c_uint), - ("subsystemId", c_uint), - ("numDisplayHeads", c_uint), - ("maxInstances", c_uint), - ("frameRateLimit", c_uint), - ("maxResolutionX", c_uint), - ("maxResolutionY", c_uint), - ("fbTotal", c_uint), + ('version', c_uint), + ('id', c_uint), + ('isP2pSupported', c_uint), + ('sliceCount', c_uint), + ('instanceCount', c_uint), + ('multiprocessorCount', c_uint), + ('copyEngineCount', c_uint), + ('decoderCount', c_uint), + ('encoderCount', c_uint), + ('jpegCount', c_uint), + ('ofaCount', c_uint), + ('memorySizeMB', c_uint64), ] -class c_dcgmDeviceSettings_v1(_PrintableStructure): +dcgmGpuInstanceProfileInfo_version1 = make_dcgm_version( + c_dcgmGpuInstanceProfileInfo_v1, 1) + + +# /** +# * Represents GPU instance profiles +# */ +class c_dcgmGpuInstanceProfiles_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("persistenceModeEnabled", c_uint), - ("migModeEnabled", c_uint), + ('version', c_uint), + ('profileCount', c_uint), + ('profileInfo', c_dcgmGpuInstanceProfileInfo_v1), ] -class c_dcgmDeviceAttributes_v1(_PrintableStructure): - """ - Represents attributes corresponding to a device - """ +dcgmGpuInstanceProfiles_version1 = make_dcgm_version( + c_dcgmGpuInstanceProfiles_v1, 1) + +# /** +# * Represents Compute instance profile information +# */ +class c_dcgmComputeInstanceProfileInfo_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("clockSets", c_dcgmDeviceSupportedClockSets_v1), - ("thermalSettings", c_dcgmDeviceThermals_v1), - ("powerLimits", c_dcgmDevicePowerLimits_v1), - ("identifiers", c_dcgmDeviceIdentifiers_v1), - ("memoryUsage", c_dcgmDeviceMemoryUsage_v1), - ("unused", c_char * 208), + ('version', c_uint), + ('gpuInstanceId', c_uint), + ('id', c_uint), + ('sliceCount', c_uint), + ('instanceCount', c_uint), + ('multiprocessorCount', c_uint), + ('sharedCopyEngineCount', c_uint), + ('sharedDecoderCount', c_uint), + ('sharedEncoderCount', c_uint), + ('sharedJpegCount', c_uint), + ('sharedOfaCount', c_uint), ] -dcgmDeviceAttributes_version1 = make_dcgm_version(c_dcgmDeviceAttributes_v1, 1) - +dcgmComputeInstanceProfileInfo_version1 = make_dcgm_version( + c_dcgmComputeInstanceProfileInfo_v1, 1) -class c_dcgmDeviceAttributes_v2(_PrintableStructure): - """ - Represents attributes corresponding to a device - """ +# /** +# * Represents Compute instance profiles +# */ +class c_dcgmComputeInstanceProfiles_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("clockSets", c_dcgmDeviceSupportedClockSets_v1), - ("thermalSettings", c_dcgmDeviceThermals_v1), - ("powerLimits", c_dcgmDevicePowerLimits_v1), - ("identifiers", c_dcgmDeviceIdentifiers_v1), - ("memoryUsage", c_dcgmDeviceMemoryUsage_v1), - ("settings", c_dcgmDeviceSettings_v1), + ('version', c_uint), + ('profileCount', c_uint), + ('profileInfo', c_dcgmComputeInstanceProfileInfo_v1), ] -dcgmDeviceAttributes_version2 = make_dcgm_version(c_dcgmDeviceAttributes_v2, 2) +dcgmComputeInstanceProfiles_version1 = make_dcgm_version( + c_dcgmComputeInstanceProfiles_v1, 1) +# /** +# * Represents vGPU attributes corresponding to a device +# */ class c_dcgmVgpuDeviceAttributes_v6(_PrintableStructure): - """ - Represents vGPU attributes corresponding to a device - """ - _fields_ = [ - ("version", c_uint), - ("activeVgpuInstanceCount", c_uint), - ("activeVgpuInstanceIds", c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU), - ("creatableVgpuTypeCount", c_uint), - ("creatableVgpuTypeIds", c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU), - ("supportedVgpuTypeCount", c_uint), - ( - 
"supportedVgpuTypeInfo", - c_dcgmDeviceVgpuTypeInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU, - ), - ("vgpuUtilInfo", c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU), - ("gpuUtil", c_uint), - ("memCopyUtil", c_uint), - ("encUtil", c_uint), - ("decUtil", c_uint), + ('version', c_uint), ('activeVgpuInstanceCount', c_uint), + ('activeVgpuInstanceIds', c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU), + ('creatableVgpuTypeCount', c_uint), + ('creatableVgpuTypeIds', c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('supportedVgpuTypeCount', c_uint), + ('supportedVgpuTypeInfo', + c_dcgmDeviceVgpuTypeInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('vgpuUtilInfo', + c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('gpuUtil', c_uint), ('memCopyUtil', c_uint), ('encUtil', c_uint), + ('decUtil', c_uint) ] -dcgmVgpuDeviceAttributes_version6 = make_dcgm_version(c_dcgmVgpuDeviceAttributes_v6, 1) +dcgmVgpuDeviceAttributes_version6 = make_dcgm_version( + c_dcgmVgpuDeviceAttributes_v6, 1) -class c_dcgmVgpuInstanceAttributes_v1(_PrintableStructure): - """ - Represents attributes specific to vGPU instance - """ - +class c_dcgmVgpuDeviceAttributes_v7(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("vmId", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), - ("vmName", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), - ("vgpuTypeId", c_uint), - ("vgpuUuid", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), - ("vgpuDriverVersion", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), - ("fbUsage", c_uint), - ("licenseStatus", c_uint), - ("frameRateLimit", c_uint), + ('version', c_uint), ('activeVgpuInstanceCount', c_uint), + ('activeVgpuInstanceIds', c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU), + ('creatableVgpuTypeCount', c_uint), + ('creatableVgpuTypeIds', c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('supportedVgpuTypeCount', c_uint), + ('supportedVgpuTypeInfo', + c_dcgmDeviceVgpuTypeInfo_v2 * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('vgpuUtilInfo', + c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU), + ('gpuUtil', c_uint), ('memCopyUtil', c_uint), ('encUtil', c_uint), + ('decUtil', c_uint) ] +dcgmVgpuDeviceAttributes_version7 = make_dcgm_version( + c_dcgmVgpuDeviceAttributes_v7, 7) + + +# /** +# * Represents attributes specific to vGPU instance +# */ +class c_dcgmVgpuInstanceAttributes_v1(_PrintableStructure): + _fields_ = [('version', c_uint), + ('vmId', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), + ('vmName', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), + ('vgpuTypeId', c_uint), + ('vgpuUuid', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), + ('vgpuDriverVersion', c_char * DCGM_DEVICE_UUID_BUFFER_SIZE), + ('fbUsage', c_uint), ('licenseStatus', c_uint), + ('frameRateLimit', c_uint)] + + dcgmVgpuInstanceAttributes_version1 = make_dcgm_version( - c_dcgmVgpuInstanceAttributes_v1, 1 -) + c_dcgmVgpuInstanceAttributes_v1, 1) class c_dcgmConfigPowerLimit(_PrintableStructure): - _fields_ = [("type", c_uint), ("val", c_uint)] + _fields_ = [('type', c_uint), ('val', c_uint)] class c_dcgmConfigPerfStateSettings_t(_PrintableStructure): @@ -1146,12 +1286,12 @@ class c_dcgmConfigPerfStateSettings_t(_PrintableStructure): class c_dcgmDeviceConfig_v1(_PrintableStructure): _fields_ = [ # version must always be first - ("version", c_uint), - ("gpuId", c_uint), - ("mEccMode", c_uint), - ("mComputeMode", c_uint), - ("mPerfState", c_dcgmConfigPerfStateSettings_t), - ("mPowerLimit", c_dcgmConfigPowerLimit), + ('version', c_uint), + ('gpuId', c_uint), + ('mEccMode', c_uint), + ('mComputeMode', c_uint), + ('mPerfState', c_dcgmConfigPerfStateSettings_t), + ('mPowerLimit', 
c_dcgmConfigPowerLimit) ] @@ -1162,16 +1302,16 @@ class c_dcgmDeviceConfig_v1(_PrintableStructure): class c_dcgmDeviceVgpuConfig_v1(_PrintableStructure): _fields_ = [ # version must always be first - ("version", c_uint), - ("gpuId", c_uint), - ("mEccMode", c_uint), - ("mComputeMode", c_uint), - ("mPerfState", c_dcgmConfigPerfStateSettings_t), - ("mPowerLimit", c_dcgmConfigPowerLimit), + ('version', c_uint), + ('gpuId', c_uint), + ('mEccMode', c_uint), + ('mComputeMode', c_uint), + ('mPerfState', c_dcgmConfigPerfStateSettings_t), + ('mPowerLimit', c_dcgmConfigPowerLimit) ] def SetBlank(self): - # Does not set version or gpuId + #Does not set version or gpuId self.mEccMode = dcgmvalue.DCGM_INT32_BLANK self.mPerfState.syncBoost = dcgmvalue.DCGM_INT32_BLANK self.mPerfState.targetClocks.memClock = dcgmvalue.DCGM_INT32_BLANK @@ -1224,7 +1364,8 @@ class c_dcgmUnwatchFieldValue_v1(_PrintableStructure): _fields_ = [] -dcgmUnwatchFieldValue_version1 = make_dcgm_version(c_dcgmUnwatchFieldValue_v1, 1) +dcgmUnwatchFieldValue_version1 = make_dcgm_version(c_dcgmUnwatchFieldValue_v1, + 1) class c_dcgmUpdateAllFields_v1(_PrintableStructure): @@ -1233,9 +1374,19 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure): dcgmUpdateAllFields_version1 = make_dcgm_version(c_dcgmUpdateAllFields_v1, 1) -dcgmGetMultipleValuesForField_version1 = 1 +dcgmGetMultipleValuesForFieldResponse_version1 = 1 + +# policy enums (and table indices) +DCGM_POLICY_COND_IDX_DBE = 0 +DCGM_POLICY_COND_IDX_PCI = 1 +DCGM_POLICY_COND_IDX_MAX_PAGES_RETIRED = 2 +DCGM_POLICY_COND_IDX_THERMAL = 3 +DCGM_POLICY_COND_IDX_POWER = 4 +DCGM_POLICY_COND_IDX_NVLINK = 5 +DCGM_POLICY_COND_IDX_XID = 6 +DCGM_POLICY_COND_IDX_MAX = 7 -# policy enums +# policy enum bitmasks DCGM_POLICY_COND_DBE = 0x1 DCGM_POLICY_COND_PCI = 0x2 DCGM_POLICY_COND_MAX_PAGES_RETIRED = 0x4 @@ -1251,12 +1402,13 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure): DCGM_POLICY_ISOLATION_NONE = 0 DCGM_POLICY_ACTION_NONE = 0 -DCGM_POLICY_ACTION_GPURESET = 1 # Deprecated +DCGM_POLICY_ACTION_GPURESET = 1 #Deprecated DCGM_POLICY_VALID_NONE = 0 DCGM_POLICY_VALID_SV_SHORT = 1 DCGM_POLICY_VALID_SV_MED = 2 DCGM_POLICY_VALID_SV_LONG = 3 +DCGM_POLICY_VALID_SV_XLONG = 4 DCGM_POLICY_FAILURE_NONE = 0 @@ -1264,6 +1416,7 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure): DCGM_DIAG_LVL_SHORT = 10 DCGM_DIAG_LVL_MED = 20 DCGM_DIAG_LVL_LONG = 30 +DCGM_DIAG_LVL_XLONG = 40 DCGM_DIAG_RESULT_PASS = 0 DCGM_DIAG_RESULT_SKIP = 1 @@ -1272,7 +1425,7 @@ class c_dcgmUpdateAllFields_v1(_PrintableStructure): DCGM_DIAG_RESULT_NOT_RUN = 4 -class c_dcgmPolicyConditionParmTypes_t(Union): +class c_dcgmPolicyConditionParmTypes_t(DcgmUnion): _fields_ = [ ("boolean", c_bool), ("llval", c_longlong), @@ -1280,7 +1433,7 @@ class c_dcgmPolicyConditionParmTypes_t(Union): class c_dcgmPolicyConditionParms_t(_PrintableStructure): - _fields_ = [("tag", c_uint), ("val", c_dcgmPolicyConditionParmTypes_t)] + _fields_ = [('tag', c_uint), ('val', c_dcgmPolicyConditionParmTypes_t)] class c_dcgmPolicy_v1(_PrintableStructure): @@ -1303,39 +1456,39 @@ class c_dcgmPolicy_v1(_PrintableStructure): class c_dcgmPolicyConditionPci_t(_PrintableStructure): _fields_ = [ ("timestamp", c_longlong), # timestamp of the error - ("counter", c_uint), # value of the PCIe replay counter + ("counter", c_uint) # value of the PCIe replay counter ] class c_dcgmPolicyConditionDbe_t(_PrintableStructure): - LOCATIONS = {"L1": 0, "L2": 1, "DEVICE": 2, "REGISTER": 3, "TEXTURE": 4} + LOCATIONS = {'L1': 0, 'L2': 1, 'DEVICE': 2, 'REGISTER': 3, 'TEXTURE': 4} 
_fields_ = [ - ("timestamp", c_longlong), # timestamp of the error + ("timestamp", c_longlong), # timestamp of the error ("location", c_int), # location of the error (one of self.LOCATIONS) - ("numerrors", c_uint), # number of errors + ("numerrors", c_uint) # number of errors ] class c_dcgmPolicyConditionMpr_t(_PrintableStructure): _fields_ = [ - ("timestamp", c_longlong), # timestamp of the error - ("sbepages", c_uint), # number of pending pages due to SBE - ("dbepages", c_uint), # number of pending pages due to DBE + ("timestamp", c_longlong), # timestamp of the error + ("sbepages", c_uint), # number of pending pages due to SBE + ("dbepages", c_uint) # number of pending pages due to DBE ] class c_dcgmPolicyConditionThermal_t(_PrintableStructure): _fields_ = [ ("timestamp", c_longlong), # timestamp of the error - ("thermalViolation", c_uint), # Temperature reached that violated policy + ("thermalViolation", c_uint) # Temperature reached that violated policy ] class c_dcgmPolicyConditionPower_t(_PrintableStructure): _fields_ = [ ("timestamp", c_longlong), # timestamp of the error - ("powerViolation", c_uint), # Power value reached that violated policyy + ("powerViolation", c_uint) # Power value reached that violated policyy ] @@ -1343,59 +1496,49 @@ class c_dcgmPolicyConditionNvlink_t(_PrintableStructure): _fields_ = [ ("timestamp", c_longlong), # timestamp of the error ("fieldId", c_ushort), # FieldId of the nvlink error counter - ("counter", c_uint), # Error value reached that violated policyy + ("counter", c_uint) # Error value reached that violated policyy ] class c_dcgmPolicyConditionXID_t(_PrintableStructure): _fields_ = [ ("timestamp", c_longlong), # timestamp of the error - ("errnum", c_uint), # XID error number + ("errnum", c_uint) # XID error number ] class c_dcgmPolicyCallbackResponse_v1(_PrintableStructure): - class Value(Union): + + class Value(DcgmUnion): # implement more of the fields when a test requires them _fields_ = [ - ("dbe", c_dcgmPolicyConditionDbe_t), # ECC DBE return structure - ("pci", c_dcgmPolicyConditionPci_t), # PCI replay error return structure - ( - "mpr", - c_dcgmPolicyConditionMpr_t, - ), # Max retired pages limit return structure - ( - "thermal", - c_dcgmPolicyConditionThermal_t, - ), # Thermal policy violations return structure - ( - "power", - c_dcgmPolicyConditionPower_t, - ), # Power policy violations return structure - ( - "nvlink", - c_dcgmPolicyConditionNvlink_t, + ("dbe", c_dcgmPolicyConditionDbe_t + ), # ECC DBE return structure + ("pci", c_dcgmPolicyConditionPci_t + ), # PCI replay error return structure + ("mpr", c_dcgmPolicyConditionMpr_t + ), # Max retired pages limit return structure + ("thermal", c_dcgmPolicyConditionThermal_t + ), # Thermal policy violations return structure + ("power", c_dcgmPolicyConditionPower_t + ), # Power policy violations return structure + ("nvlink", c_dcgmPolicyConditionNvlink_t ), # Nvlink policy violations return structure.. 
- ( - "xid", - c_dcgmPolicyConditionXID_t, - ), # XID policy violations return structure + ("xid", c_dcgmPolicyConditionXID_t + ) # XID policy violations return structure ] _fields_ = [ ("version", c_uint), ("condition", c_int), # an OR'ed list of DCGM_POLICY_COND_* - ("val", Value), + ("val", Value) ] -class c_dcgmFieldValue_v1_value(Union): - _fields_ = [ - ("i64", c_int64), - ("dbl", c_double), - ("str", c_char * DCGM_MAX_STR_LENGTH), - ("blob", c_byte * DCGM_MAX_BLOB_LENGTH), - ] +class c_dcgmFieldValue_v1_value(DcgmUnion): + _fields_ = [('i64', c_int64), ('dbl', c_double), + ('str', c_char * DCGM_MAX_STR_LENGTH), + ('blob', c_byte * DCGM_MAX_BLOB_LENGTH)] # This structure is used to represent value for the field to be queried. @@ -1414,8 +1557,7 @@ class c_dcgmFieldValue_v1(_PrintableStructure): dcgmFieldValue_version1 = make_dcgm_version(c_dcgmFieldValue_v1, 1) -# This structure is used to represent value for the field to be queried -# (version 2) +# This structure is used to represent value for the field to be queried (version 2) class c_dcgmFieldValue_v2(_PrintableStructure): _fields_ = [ # version must always be first @@ -1433,7 +1575,7 @@ class c_dcgmFieldValue_v2(_PrintableStructure): dcgmFieldValue_version2 = make_dcgm_version(c_dcgmFieldValue_v2, 2) -# Field value flags used by dcgm_agent.dcgmEntitiesGetLatestValues() +#Field value flags used by dcgm_agent.dcgmEntitiesGetLatestValues() DCGM_FV_FLAG_LIVE_DATA = 0x00000001 DCGM_HEALTH_WATCH_PCIE = 0x1 @@ -1458,7 +1600,7 @@ class c_dcgmFieldValue_v2(_PrintableStructure): class c_dcgmDiagErrorDetail_t(_PrintableStructure): - _fields_ = [("msg", c_char * 1024), ("code", c_uint)] + _fields_ = [('msg', c_char * 1024), ('code', c_uint)] DCGM_HEALTH_WATCH_MAX_INCIDENTS = DCGM_GROUP_MAX_ENTITIES @@ -1486,37 +1628,36 @@ class c_dcgmHealthResponse_v4(_PrintableStructure): class c_dcgmHealthSetParams_v2(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("groupId", c_void_p), - ("systems", c_uint32), - ("updateInterval", c_int64), - ("maxKeepAge", c_double), - ] + _fields_ = [('version', c_uint32), ('groupId', c_void_p), + ('systems', c_uint32), ('updateInterval', c_int64), + ('maxKeepAge', c_double)] dcgmHealthSetParams_version2 = make_dcgm_version(c_dcgmHealthSetParams_v2, 2) -# Pid info structs +#Pid info structs class c_dcgmStatSummaryInt64_t(_PrintableStructure): - _fields_ = [("minValue", c_int64), ("maxValue", c_int64), ("average", c_int64)] + _fields_ = [('minValue', c_int64), ('maxValue', c_int64), + ('average', c_int64)] class c_dcgmStatSummaryInt32_t(_PrintableStructure): - _fields_ = [("minValue", c_int32), ("maxValue", c_int32), ("average", c_int32)] + _fields_ = [('minValue', c_int32), ('maxValue', c_int32), + ('average', c_int32)] class c_dcgmStatSummaryFp64_t(_PrintableStructure): - _fields_ = [("minValue", c_double), ("maxValue", c_double), ("average", c_double)] + _fields_ = [('minValue', c_double), ('maxValue', c_double), + ('average', c_double)] class c_dcgmProcessUtilInfo_t(_PrintableStructure): - _fields_ = [("pid", c_uint), ("smUtil", c_double), ("memUtil", c_double)] + _fields_ = [('pid', c_uint), ('smUtil', c_double), ('memUtil', c_double)] class c_dcgmHealthResponseInfo_t(_PrintableStructure): - _fields_ = [("system", c_uint), ("health", c_uint)] + _fields_ = [('system', c_uint), ('health', c_uint)] DCGM_MAX_PID_INFO_NUM = 16 @@ -1524,167 +1665,153 @@ class c_dcgmHealthResponseInfo_t(_PrintableStructure): class c_dcgmPidSingleInfo_t(_PrintableStructure): _fields_ = [ - ("gpuId", c_uint32), - 
("energyConsumed", c_int64), - ("pcieRxBandwidth", c_dcgmStatSummaryInt64_t), - ("pcieTxBandwidth", c_dcgmStatSummaryInt64_t), - ("pcieReplays", c_int64), - ("startTime", c_int64), - ("endTime", c_int64), - ("processUtilization", c_dcgmProcessUtilInfo_t), - ("smUtilization", c_dcgmStatSummaryInt32_t), - ("memoryUtilization", c_dcgmStatSummaryInt32_t), - ("eccSingleBit", c_uint32), # Deprecated - ("eccDoubleBit", c_uint32), - ("memoryClock", c_dcgmStatSummaryInt32_t), - ("smClock", c_dcgmStatSummaryInt32_t), - ("numXidCriticalErrors", c_int32), - ("xidCriticalErrorsTs", c_int64 * 10), - ("numOtherComputePids", c_int32), - ("otherComputePids", c_uint32 * DCGM_MAX_PID_INFO_NUM), - ("numOtherGraphicsPids", c_int32), - ("otherGraphicsPids", c_uint32 * DCGM_MAX_PID_INFO_NUM), - ("maxGpuMemoryUsed", c_int64), - ("powerViolationTime", c_int64), - ("thermalViolationTime", c_int64), - ("reliabilityViolationTime", c_int64), - ("boardLimitViolationTime", c_int64), - ("lowUtilizationTime", c_int64), - ("syncBoostTime", c_int64), - ("overallHealth", c_uint), - ("incidentCount", c_uint), - ("systems", c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1), + ('gpuId', c_uint32), + ('energyConsumed', c_int64), + ('pcieRxBandwidth', c_dcgmStatSummaryInt64_t), + ('pcieTxBandwidth', c_dcgmStatSummaryInt64_t), + ('pcieReplays', c_int64), + ('startTime', c_int64), + ('endTime', c_int64), + ('processUtilization', c_dcgmProcessUtilInfo_t), + ('smUtilization', c_dcgmStatSummaryInt32_t), + ('memoryUtilization', c_dcgmStatSummaryInt32_t), + ('eccSingleBit', c_uint32), #Deprecated + ('eccDoubleBit', c_uint32), + ('memoryClock', c_dcgmStatSummaryInt32_t), + ('smClock', c_dcgmStatSummaryInt32_t), + ('numXidCriticalErrors', c_int32), + ('xidCriticalErrorsTs', c_int64 * 10), + ('numOtherComputePids', c_int32), + ('otherComputePids', c_uint32 * DCGM_MAX_PID_INFO_NUM), + ('numOtherGraphicsPids', c_int32), + ('otherGraphicsPids', c_uint32 * DCGM_MAX_PID_INFO_NUM), + ('maxGpuMemoryUsed', c_int64), + ('powerViolationTime', c_int64), + ('thermalViolationTime', c_int64), + ('reliabilityViolationTime', c_int64), + ('boardLimitViolationTime', c_int64), + ('lowUtilizationTime', c_int64), + ('syncBoostTime', c_int64), + ('overallHealth', c_uint), + ('incidentCount', c_uint), + ('systems', c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1) ] class c_dcgmPidInfo_v2(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("pid", c_uint32), - ("unused", c_uint32), - ("numGpus", c_int32), - ("summary", c_dcgmPidSingleInfo_t), - ("gpus", c_dcgmPidSingleInfo_t * DCGM_MAX_NUM_DEVICES), - ] + _fields_ = [('version', c_uint32), ('pid', c_uint32), ('unused', c_uint32), + ('numGpus', c_int32), ('summary', c_dcgmPidSingleInfo_t), + ('gpus', c_dcgmPidSingleInfo_t * DCGM_MAX_NUM_DEVICES)] dcgmPidInfo_version2 = make_dcgm_version(c_dcgmPidInfo_v2, 2) class c_dcgmRunningProcess_v1(_PrintableStructure): - _fields_ = [("version", c_uint32), ("pid", c_uint32), ("memoryUsed", c_uint64)] + _fields_ = [('version', c_uint32), ('pid', c_uint32), + ('memoryUsed', c_uint64)] dcgmRunningProcess_version1 = make_dcgm_version(c_dcgmRunningProcess_v1, 1) +c_dcgmRunningProcess_t = c_dcgmRunningProcess_v1 + class c_dcgmGpuUsageInfo_t(_PrintableStructure): _fields_ = [ - ("gpuId", c_uint32), - ("energyConsumed", c_int64), - ("powerUsage", c_dcgmStatSummaryFp64_t), - ("pcieRxBandwidth", c_dcgmStatSummaryInt64_t), - ("pcieTxBandwidth", c_dcgmStatSummaryInt64_t), - ("pcieReplays", c_int64), - ("startTime", c_int64), - ("endTime", c_int64), - 
("smUtilization", c_dcgmStatSummaryInt32_t), - ("memoryUtilization", c_dcgmStatSummaryInt32_t), - ("eccSingleBit", c_uint32), # Deprecated - ("eccDoubleBit", c_uint32), - ("memoryClock", c_dcgmStatSummaryInt32_t), - ("smClock", c_dcgmStatSummaryInt32_t), - ("numXidCriticalErrors", c_int32), - ("xidCriticalErrorsTs", c_int64 * 10), - ("numComputePids", c_int32), - ("computePids", c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM), - ("numGraphicsPids", c_int32), - ("graphicsPids", c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM), - ("maxGpuMemoryUsed", c_int64), - ("powerViolationTime", c_int64), - ("thermalViolationTime", c_int64), - ("reliabilityViolationTime", c_int64), - ("boardLimitViolationTime", c_int64), - ("lowUtilizationTime", c_int64), - ("syncBoostTime", c_int64), - ("overallHealth", c_uint), - ("incidentCount", c_uint), - ("systems", c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1), + ('gpuId', c_uint32), + ('energyConsumed', c_int64), + ('powerUsage', c_dcgmStatSummaryFp64_t), + ('pcieRxBandwidth', c_dcgmStatSummaryInt64_t), + ('pcieTxBandwidth', c_dcgmStatSummaryInt64_t), + ('pcieReplays', c_int64), + ('startTime', c_int64), + ('endTime', c_int64), + ('smUtilization', c_dcgmStatSummaryInt32_t), + ('memoryUtilization', c_dcgmStatSummaryInt32_t), + ('eccSingleBit', c_uint32), #Deprecated + ('eccDoubleBit', c_uint32), + ('memoryClock', c_dcgmStatSummaryInt32_t), + ('smClock', c_dcgmStatSummaryInt32_t), + ('numXidCriticalErrors', c_int32), + ('xidCriticalErrorsTs', c_int64 * 10), + ('numComputePids', c_int32), + ('computePids', c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM), + ('numGraphicsPids', c_int32), + ('graphicsPids', c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM), + ('maxGpuMemoryUsed', c_int64), + ('powerViolationTime', c_int64), + ('thermalViolationTime', c_int64), + ('reliabilityViolationTime', c_int64), + ('boardLimitViolationTime', c_int64), + ('lowUtilizationTime', c_int64), + ('syncBoostTime', c_int64), + ('overallHealth', c_uint), + ('incidentCount', c_uint), + ('systems', c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1) ] class c_dcgmJobInfo_v3(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("numGpus", c_int32), - ("summary", c_dcgmGpuUsageInfo_t), - ("gpus", c_dcgmGpuUsageInfo_t * DCGM_MAX_NUM_DEVICES), - ] + _fields_ = [('version', c_uint32), ('numGpus', c_int32), + ('summary', c_dcgmGpuUsageInfo_t), + ('gpus', c_dcgmGpuUsageInfo_t * DCGM_MAX_NUM_DEVICES)] dcgmJobInfo_version3 = make_dcgm_version(c_dcgmJobInfo_v3, 3) class c_dcgmDiagTestResult_v2(_PrintableStructure): - _fields_ = [ - ("result", c_uint), - ("error", c_dcgmDiagErrorDetail_t), - ("info", c_char * 1024), - ] + _fields_ = [('result', c_uint), ('error', c_dcgmDiagErrorDetail_t), + ('info', c_char * 1024)] -class c_dcgmDiagResponsePerGpu_v2(_PrintableStructure): - _fields_ = [ - ("gpuId", c_uint), - ("hwDiagnosticReturn", c_uint), - ("results", c_dcgmDiagTestResult_v2 * DCGM_PER_GPU_TEST_COUNT), - ] +class c_dcgmDiagResponsePerGpu_v4(_PrintableStructure): + _fields_ = [('gpuId', c_uint), ('hwDiagnosticReturn', c_uint), + ('results', + c_dcgmDiagTestResult_v2 * DCGM_PER_GPU_TEST_COUNT_V8)] DCGM_SWTEST_COUNT = 10 LEVEL_ONE_MAX_RESULTS = 16 -class c_dcgmDiagResponse_v6(_PrintableStructure): +class c_dcgmDiagResponse_v8(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("gpuCount", c_uint), - ("levelOneTestCount", c_uint), - ("levelOneResults", c_dcgmDiagTestResult_v2 * LEVEL_ONE_MAX_RESULTS), - ("perGpuResponses", c_dcgmDiagResponsePerGpu_v2 * 
DCGM_MAX_NUM_DEVICES), - ("systemError", c_dcgmDiagErrorDetail_t), - ("trainingMsg", c_char * 1024), + ('version', c_uint), ('gpuCount', c_uint), + ('levelOneTestCount', c_uint), + ('levelOneResults', c_dcgmDiagTestResult_v2 * LEVEL_ONE_MAX_RESULTS), + ('perGpuResponses', c_dcgmDiagResponsePerGpu_v4 * DCGM_MAX_NUM_DEVICES), + ('systemError', c_dcgmDiagErrorDetail_t), ('_unused', c_char * 1024) ] -dcgmDiagResponse_version6 = make_dcgm_version(c_dcgmDiagResponse_v6, 6) +dcgmDiagResponse_version8 = make_dcgm_version(c_dcgmDiagResponse_v8, 8) DCGM_AFFINITY_BITMASK_ARRAY_SIZE = 8 class c_dcgmDeviceTopologyPath_t(_PrintableStructure): - _fields_ = [("gpuId", c_uint32), ("path", c_uint32), ("localNvLinkIds", c_uint32)] + _fields_ = [('gpuId', c_uint32), ('path', c_uint32), + ('localNvLinkIds', c_uint32)] class c_dcgmDeviceTopology_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("cpuAffinityMask", c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE), - ("numGpus", c_uint32), - ("gpuPaths", c_dcgmDeviceTopologyPath_t * (DCGM_MAX_NUM_DEVICES - 1)), - ] + _fields_ = [('version', c_uint32), + ('cpuAffinityMask', c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE), + ('numGpus', c_uint32), + ('gpuPaths', + c_dcgmDeviceTopologyPath_t * (DCGM_MAX_NUM_DEVICES - 1))] dcgmDeviceTopology_version1 = make_dcgm_version(c_dcgmDeviceTopology_v1, 1) class c_dcgmGroupTopology_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("groupCpuAffinityMask", c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE), - ("numaOptimalFlag", c_uint32), - ("slowestPath", c_uint32), - ] + _fields_ = [('version', c_uint32), + ('groupCpuAffinityMask', + c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE), + ('numaOptimalFlag', c_uint32), ('slowestPath', c_uint32)] dcgmGroupTopology_version1 = make_dcgm_version(c_dcgmGroupTopology_v1, 1) @@ -1697,202 +1824,50 @@ class c_dcgmGroupTopology_v1(_PrintableStructure): class c_dcgmFieldGroupInfo_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("numFieldIds", c_uint32), - ("fieldGroupId", c_void_p), - ("fieldGroupName", c_char * DCGM_MAX_STR_LENGTH), - ("fieldIds", c_uint16 * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP), - ] + _fields_ = [('version', c_uint32), ('numFieldIds', c_uint32), + ('fieldGroupId', c_void_p), + ('fieldGroupName', c_char * DCGM_MAX_STR_LENGTH), + ('fieldIds', c_uint16 * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP)] dcgmFieldGroupInfo_version1 = make_dcgm_version(c_dcgmFieldGroupInfo_v1, 1) class c_dcgmAllFieldGroup_v1(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("numFieldGroups", c_uint32), - ("fieldGroups", c_dcgmFieldGroupInfo_v1 * DCGM_MAX_NUM_FIELD_GROUPS), - ] + _fields_ = [('version', c_uint32), ('numFieldGroups', c_uint32), + ('fieldGroups', + c_dcgmFieldGroupInfo_v1 * DCGM_MAX_NUM_FIELD_GROUPS)] dcgmAllFieldGroup_version1 = make_dcgm_version(c_dcgmAllFieldGroup_v1, 1) -class DCGM_INTROSPECT_LVL(object): - """ - Identifies a level to retrieve field introspection info for - """ - - INVALID = 0 - FIELD = 1 - FIELD_GROUP = 2 - ALL_FIELDS = 3 - - -class c_dcgmIntrospectContext_v1(_PrintableStructure): - """ - Identifies the retrieval context for introspection API calls. - """ - - _fields_ = [ - ("version", c_uint32), - # one of DCGM_INTROSPECT_LVL_? 
- ("introspectLvl", c_int), - # Only needed if \ref introspectLvl is FIELD_GROUP - ("fieldGroupId", c_void_p), - ] - - -dcgmIntrospectContext_version1 = make_dcgm_version(c_dcgmIntrospectContext_v1, 1) - - class c_dcgmIntrospectMemory_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint32), - ( - # The total number of bytes being used to store all of the fields - # being watched - "bytesUsed", - c_longlong, - ), + ('version', c_uint32), + ('bytesUsed', c_longlong + ) # The total number of bytes being used to store all of the fields being watched ] dcgmIntrospectMemory_version1 = make_dcgm_version(c_dcgmIntrospectMemory_v1, 1) -class c_dcgmIntrospectFieldsExecTime_v1(_PrintableStructure): - _fields_ = [ - ( - # version number (dcgmIntrospectFieldsExecTime_version) - "version", - c_uint32, - ), - ( - # the mean update frequency of all fields - "meanUpdateFreqUsec", - c_longlong, - ), - ( - # the sum of every field's most recent execution time after they - # have been normalized to \ref meanUpdateFreqUsec. - # This is roughly how long it takes to update fields every \ref - # meanUpdateFreqUsec - "recentUpdateUsec", - c_double, - ), - ( - # The total amount of time, ever, that has been spent updating all - # the fields - "totalEverUpdateUsec", - c_longlong, - ), - ] - - -dcgmIntrospectFieldsExecTime_version1 = make_dcgm_version( - c_dcgmIntrospectFieldsExecTime_v1, 1 -) - - -class c_dcgmIntrospectFullFieldsExecTime_v2(_PrintableStructure): - """ - Full introspection info for field execution time - """ - - _fields_ = [ - ("version", c_uint32), - ( - "aggregateInfo", - c_dcgmIntrospectFieldsExecTime_v1, - ), # info that includes global and device scope - ( - "hasGlobalInfo", - c_int, - ), # 0 means \ref globalInfo is populated, !0 means it's not - ( - "globalInfo", - c_dcgmIntrospectFieldsExecTime_v1, - ), # info that only includes global field scope - ( - "gpuInfoCount", - c_uint, - ), # count of how many entries in \ref gpuInfo are populated - ( - "gpuIdsForGpuInfo", - c_uint * DCGM_MAX_NUM_DEVICES, - ), # the GPU ID at a given index identifies which gpu - # the corresponding entry in \ref gpuInfo is from - ( - "gpuInfo", - c_dcgmIntrospectFieldsExecTime_v1 * DCGM_MAX_NUM_DEVICES, - ), # info that is separated by the - # GPU ID that the watches were for - ] - - -dcgmIntrospectFullFieldsExecTime_version2 = make_dcgm_version( - c_dcgmIntrospectFullFieldsExecTime_v2, 2 -) - - -class c_dcgmIntrospectFullMemory_v1(_PrintableStructure): - """ - Full introspection info for field memory - """ - - _fields_ = [ - ("version", c_uint32), - ( - "aggregateInfo", - c_dcgmIntrospectMemory_v1, - ), # info that includes global and device scope - ( - "hasGlobalInfo", - c_int, - ), # 0 means \ref globalInfo is populated, !0 means it's not - ( - "globalInfo", - c_dcgmIntrospectMemory_v1, - ), # info that only includes global field scope - ( - "gpuInfoCount", - c_uint, - ), # count of how many entries in \ref gpuInfo are populated - ( - "gpuIdsForGpuInfo", - c_uint * DCGM_MAX_NUM_DEVICES, - ), # the GPU ID at a given index identifies which gpu - # the corresponding entry in \ref gpuInfo is from - ( - "gpuInfo", - c_dcgmIntrospectMemory_v1 * DCGM_MAX_NUM_DEVICES, - ), # info that is separated by the - # GPU ID that the watches were for - ] - - -dcgmIntrospectFullMemory_version1 = make_dcgm_version(c_dcgmIntrospectFullMemory_v1, 1) - - class c_dcgmIntrospectCpuUtil_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint32), # version number (dcgmIntrospectCpuUtil_version) - ("total", c_double), # fraction of 
device's CPU resources that were used - ( - "kernel", - c_double, - ), # fraction of device's CPU resources that were used in kernel mode - ( - "user", - c_double, - ), # fraction of device's CPU resources that were used in user mode + ('version', c_uint32 + ), #!< version number (dcgmIntrospectCpuUtil_version) + ('total', c_double + ), #!< fraction of device's CPU resources that were used + ('kernel', c_double + ), #!< fraction of device's CPU resources that were used in kernel mode + ('user', c_double + ), #!< fraction of device's CPU resources that were used in user mode ] -dcgmIntrospectCpuUtil_version1 = make_dcgm_version(c_dcgmIntrospectCpuUtil_v1, 1) +dcgmIntrospectCpuUtil_version1 = make_dcgm_version(c_dcgmIntrospectCpuUtil_v1, + 1) DCGM_MAX_CONFIG_FILE_LEN = 10000 DCGM_MAX_TEST_NAMES = 20 @@ -1907,229 +1882,173 @@ class c_dcgmIntrospectCpuUtil_v1(_PrintableStructure): # Flags options for running the GPU diagnostic DCGM_RUN_FLAGS_VERBOSE = 0x0001 DCGM_RUN_FLAGS_STATSONFAIL = 0x0002 +# UNUSED DCGM_RUN_FLAGS_TRAIN = 0x0004 +# UNUSED DCGM_RUN_FLAGS_FORCE_TRAIN = 0x0008 -# Enable fail early checks for the Targeted Stress, Targeted Power, SM Stress, -# and Diagnostic tests -DCGM_RUN_FLAGS_FAIL_EARLY = 0x0010 +DCGM_RUN_FLAGS_FAIL_EARLY = 0x0010 # Enable fail early checks for the Targeted Stress, Targeted Power, SM Stress, and Diagnostic tests -class c_dcgmRunDiag_v6(_PrintableStructure): +class c_dcgmRunDiag_v7(_PrintableStructure): _fields_ = [ - ("version", c_uint), # version of this message - ( - # flags specifying binary options for running it. Currently verbose - # and stats on fail - "flags", - c_uint, - ), - ( - "debugLevel", - c_uint, + ('version', c_uint), # version of this message + ('flags', c_uint + ), # flags specifying binary options for running it. Currently verbose and stats on fail + ('debugLevel', c_uint ), # 0-5 for the debug level the GPU diagnostic will use for logging - ( - # group of GPUs to verify. Cannot be specified together with - # gpuList. - "groupId", - c_void_p, - ), - ("validate", c_uint), # 0-3 for which tests to run. Optional. - ( - "testNames", - c_char * DCGM_MAX_TEST_NAMES * DCGM_MAX_TEST_NAMES_LEN, - ), # Specified list of test names. Optional. - ( - # Parameters to set for specified tests in the format: - # testName.parameterName=parameterValue. Optional. - "testParms", - c_char * DCGM_MAX_TEST_PARMS * DCGM_MAX_TEST_PARMS_LEN, - ), - ( - # Comma-separated list of gpus. Cannot be specified with the - # groupId. - "gpuList", - c_char * DCGM_GPU_LIST_LEN, - ), - ( - "debugLogFile", - c_char * DCGM_PATH_LEN, + ('groupId', c_void_p + ), # group of GPUs to verify. Cannot be specified together with gpuList. + ('validate', c_uint), # 0-3 for which tests to run. Optional. + ('testNames', c_char * DCGM_MAX_TEST_NAMES * + DCGM_MAX_TEST_NAMES_LEN), # Specifed list of test names. Optional. + ('testParms', c_char * DCGM_MAX_TEST_PARMS * DCGM_MAX_TEST_PARMS_LEN + ), # Parameters to set for specified tests in the format: testName.parameterName=parameterValue. Optional. + ('fakeGpuList', c_char * DCGM_GPU_LIST_LEN + ), # Comma-separated list of fake gpus. Cannot be specified with the groupId or gpuList. + ('gpuList', c_char * DCGM_GPU_LIST_LEN + ), # Comma-separated list of gpus. Cannot be specified with the groupId. 
+ ('debugLogFile', c_char * DCGM_PATH_LEN ), # Alternate name for the debug log file that should be used - ( - "statsPath", - c_char * DCGM_PATH_LEN, + ('statsPath', c_char * DCGM_PATH_LEN ), # Path that the plugin's statistics files should be written to - ( - "configFileContents", - c_char * DCGM_MAX_CONFIG_FILE_LEN, + ('configFileContents', c_char * DCGM_MAX_CONFIG_FILE_LEN ), # Contents of nvvs config file (likely yaml) - ( - # Throttle reasons to ignore as either integer mask or csv list of - # reasons - "throttleMask", - c_char * DCGM_THROTTLE_MASK_LEN, - ), - ("pluginPath", c_char * DCGM_PATH_LEN), # Custom path to the diagnostic plugins - ("trainingValues", c_uint), # Number of iterations for training. - ( - # Acceptable training variance as a percentage of the value. - # (0-100) - "trainingVariance", - c_uint, - ), - ( - # Acceptable training tolerance as a percentage of the value. - # (0-100) - "trainingTolerance", - c_uint, - ), - ( - "goldenValuesFile", - c_char * DCGM_PATH_LEN, - ), # The path where the golden values should be recorded - ( - # How often the fail early checks should occur when - # DCGM_RUN_FLAGS_FAIL_EARLY is set. - "failCheckInterval", - c_uint, - ), + ('throttleMask', c_char * DCGM_THROTTLE_MASK_LEN + ), # Throttle reasons to ignore as either integer mask or csv list of reasons + ('pluginPath', + c_char * DCGM_PATH_LEN), # Custom path to the diagnostic plugins + ('_unusedInt1', c_uint), # Unused + ('_unusedInt2', c_uint), # Unused + ('_unusedInt3', c_uint), # Unused + ('_unusedBuf', c_char * DCGM_PATH_LEN), # Unused + ('failCheckInterval', c_uint + ), # How often the fail early checks should occur when DCGM_RUN_FLAGS_FAIL_EARLY is set. ] -dcgmRunDiag_version6 = make_dcgm_version(c_dcgmRunDiag_v6, 6) +dcgmRunDiag_version7 = make_dcgm_version(c_dcgmRunDiag_v7, 7) # Latest c_dcgmRunDiag class -c_dcgmRunDiag_t = c_dcgmRunDiag_v6 +c_dcgmRunDiag_t = c_dcgmRunDiag_v7 # Latest version for dcgmRunDiag_t -dcgmRunDiag_version = dcgmRunDiag_version6 +dcgmRunDiag_version = dcgmRunDiag_version7 -# Flags for dcgmGetEntityGroupEntities's flags parameter -# Only return entities that are supported by DCGM. -DCGM_GEGE_FLAG_ONLY_SUPPORTED = 0x00000001 +#Flags for dcgmGetEntityGroupEntities's flags parameter +DCGM_GEGE_FLAG_ONLY_SUPPORTED = 0x00000001 #Only return entities that are supported by DCGM. -# Identifies a GPU NVLink error type returned by DCGM_FI_DEV_GPU_NVLINK_ERRORS -# NVLink link recovery error occurred -DCGM_GPU_NVLINK_ERROR_RECOVERY_REQUIRED = 1 -# NVLink link fatal error occurred -DCGM_GPU_NVLINK_ERROR_FATAL = 2 +#Identifies a GPU NVLink error type returned by DCGM_FI_DEV_GPU_NVLINK_ERRORS +DCGM_GPU_NVLINK_ERROR_RECOVERY_REQUIRED = 1 # NVLink link recovery error occurred +DCGM_GPU_NVLINK_ERROR_FATAL = 2 # NVLink link fatal error occurred # Topology hints for dcgmSelectGpusByTopology() DCGM_TOPO_HINT_F_NONE = 0x00000000 # No hints specified -# Ignore the health of the GPUs when picking GPUs for job execution. -DCGM_TOPO_HINT_F_IGNOREHEALTH = 0x00000001 +DCGM_TOPO_HINT_F_IGNOREHEALTH = 0x00000001 # Ignore the health of the GPUs when picking GPUs for job execution. # By default, only healthy GPUs are considered. 
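A minimal sketch of how the new v7 run-diag request defined above can be exercised through these bindings, assuming a local nv-hostengine and GPU id 0 (both illustrative); the call pattern follows dcgmActionValidate_v2 as used by denylist_recommendations.py later in this patch:

import model_analyzer.monitor.dcgm.pydcgm as pydcgm
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs

# Connect to a running nv-hostengine (the hostname here is an assumption).
handle = pydcgm.DcgmHandle(None, 'localhost',
                           dcgm_structs.DCGM_OPERATION_MODE_AUTO)

# Populate the v7 request; version must match the v7 version constant.
run_diag = dcgm_structs.c_dcgmRunDiag_v7()
run_diag.version = dcgm_structs.dcgmRunDiag_version7
run_diag.flags = dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
run_diag.validate = 1    # 0-3 selects which tests to run; 1 = short validation
run_diag.gpuList = b'0'  # comma-separated GPU ids; cannot be combined with groupId

# Run the diagnostic and report any per-GPU test failures.
response = dcgm_agent.dcgmActionValidate_v2(handle.handle, run_diag)
for gpu_index in range(response.gpuCount):
    for test_index in range(dcgm_structs.DCGM_PER_GPU_TEST_COUNT_V8):
        result = response.perGpuResponses[gpu_index].results[test_index]
        if result.result == dcgm_structs.DCGM_DIAG_RESULT_FAIL:
            print(result.error.msg)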
class c_dcgmTopoSchedHint_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), # version of this message - ("inputGpuIds", c_uint64), # bitmask of the GPU ids to choose from - ("numGpus", c_uint32), # the number of GPUs that DCGM should choose - ( - "hintFlags", - c_uint64, - ), # Hints to ignore certain factors for the scheduling hint + ('version', c_uint), # version of this message + ('inputGpuIds', c_uint64), # bitmask of the GPU ids to choose from + ('numGpus', c_uint32), # the number of GPUs that DCGM should chooose + ('hintFlags', + c_uint64), # Hints to ignore certain factors for the scheduling hint ] dcgmTopoSchedHint_version1 = make_dcgm_version(c_dcgmTopoSchedHint_v1, 1) -# DCGM NvLink link states used by c_dcgmNvLinkGpuLinkStatus_v1 & 2 and -# c_dcgmNvLinkNvSwitchLinkStatus_t's linkState field -# NvLink is unsupported by this GPU (Default for GPUs) -DcgmNvLinkLinkStateNotSupported = 0 -# NvLink is supported for this link but this link is disabled (Default for -# NvSwitches) -DcgmNvLinkLinkStateDisabled = 1 -# This NvLink link is down (inactive) -DcgmNvLinkLinkStateDown = 2 -# This NvLink link is up (active) -DcgmNvLinkLinkStateUp = 3 +#DCGM NvLink link states used by c_dcgmNvLinkGpuLinkStatus_v1 & 2 and c_dcgmNvLinkNvSwitchLinkStatus_t's linkState field +DcgmNvLinkLinkStateNotSupported = 0 # NvLink is unsupported by this GPU (Default for GPUs) +DcgmNvLinkLinkStateDisabled = 1 # NvLink is supported for this link but this link is disabled (Default for NvSwitches) +DcgmNvLinkLinkStateDown = 2 # This NvLink link is down (inactive) +DcgmNvLinkLinkStateUp = 3 # This NvLink link is up (active) # State of NvLink links for a GPU class c_dcgmNvLinkGpuLinkStatus_v1(_PrintableStructure): _fields_ = [ - ("entityId", c_uint32), # Entity ID of the GPU (gpuId) - ( - "linkState", - c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1, - ), # Link state of each link of this GPU + ('entityId', c_uint32), # Entity ID of the GPU (gpuId) + ('linkState', c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 + ), #Link state of each link of this GPU ] # State of NvLink links for a GPU class c_dcgmNvLinkGpuLinkStatus_v2(_PrintableStructure): _fields_ = [ - ("entityId", c_uint32), # Entity ID of the GPU (gpuId) - ( - "linkState", - c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU, - ), # Link state of each link of this GPU + ('entityId', c_uint32), # Entity ID of the GPU (gpuId) + ('linkState', c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2 + ), #Link state of each link of this GPU + ] + + +class c_dcgmNvLinkGpuLinkStatus_v3(_PrintableStructure): + _fields_ = [ + ('entityId', c_uint32), # Entity ID of the GPU (gpuId) + ('linkState', c_uint32 * + DCGM_NVLINK_MAX_LINKS_PER_GPU), #Link state of each link of this GPU ] -# State of NvLink links for a NvSwitch -class c_dcgmNvLinkNvSwitchLinkStatus_t(_PrintableStructure): +#State of NvLink links for a NvSwitch +class c_dcgmNvLinkNvSwitchLinkStatus_v1(_PrintableStructure): _fields_ = [ - ("entityId", c_uint32), # Entity ID of the NvSwitch (physicalId) - ( - "linkState", - c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH, - ), # Link state of each link of this NvSwitch + ('entityId', c_uint32), # Entity ID of the NvSwitch (physicalId) + ('linkState', c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH_V1 + ) #Link state of each link of this NvSwitch ] -class c_dcgmNvLinkStatus_v1(_PrintableStructure): +class c_dcgmNvLinkStatus_v2(_PrintableStructure): """ NvSwitch link status for all GPUs and NvSwitches in the system """ _fields_ = [ - ( - "version", - c_uint32, + ('version', 
c_uint32 ), # version of this message. Should be dcgmNvLinkStatus_version1 - ("numGpus", c_uint32), # Number of GPUs populated in gpus[] - ( - "gpus", - c_dcgmNvLinkGpuLinkStatus_v1 * DCGM_MAX_NUM_DEVICES, - ), # Per-GPU NvLink link statuses - ("numNvSwitches", c_uint32), # Number of NvSwitches populated in nvSwitches[] - ( - "nvSwitches", - c_dcgmNvLinkNvSwitchLinkStatus_t * DCGM_MAX_NUM_SWITCHES, - ), # Per-NvSwitch NvLink link statuses + ('numGpus', c_uint32), # Number of GPUs populated in gpus[] + ('gpus', c_dcgmNvLinkGpuLinkStatus_v2 * + DCGM_MAX_NUM_DEVICES), #Per-GPU NvLink link statuses + ('numNvSwitches', + c_uint32), # Number of NvSwitches populated in nvSwitches[] + ('nvSwitches', c_dcgmNvLinkNvSwitchLinkStatus_v1 * DCGM_MAX_NUM_SWITCHES + ) #Per-NvSwitch NvLink link statuses ] -dcgmNvLinkStatus_version1 = make_dcgm_version(c_dcgmNvLinkStatus_v1, 1) +dcgmNvLinkStatus_version2 = make_dcgm_version(c_dcgmNvLinkStatus_v2, 2) -class c_dcgmNvLinkStatus_v2(_PrintableStructure): - """ - NvSwitch link status for all GPUs and NvSwitches in the system - """ +#State of NvLink links for a NvSwitch +class c_dcgmNvLinkNvSwitchLinkStatus_v2(_PrintableStructure): + _fields_ = [ + ('entityId', c_uint32), # Entity ID of the NvSwitch (physicalId) + ('linkState', c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH + ) #Link state of each link of this NvSwitch + ] + +class c_dcgmNvLinkStatus_v3(_PrintableStructure): + ''' + NvSwitch link status for all GPUs and NvSwitches in the system + ''' _fields_ = [ - ( - "version", - c_uint32, + ('version', c_uint32 ), # version of this message. Should be dcgmNvLinkStatus_version1 - ("numGpus", c_uint32), # Number of GPUs populated in gpus[] - ( - "gpus", - c_dcgmNvLinkGpuLinkStatus_v2 * DCGM_MAX_NUM_DEVICES, - ), # Per-GPU NvLink link statuses - ("numNvSwitches", c_uint32), # Number of NvSwitches populated in nvSwitches[] - ( - "nvSwitches", - c_dcgmNvLinkNvSwitchLinkStatus_t * DCGM_MAX_NUM_SWITCHES, - ), # Per-NvSwitch NvLink link statuses + ('numGpus', c_uint32), # Number of GPUs populated in gpus[] + ('gpus', c_dcgmNvLinkGpuLinkStatus_v3 * + DCGM_MAX_NUM_DEVICES), #Per-GPU NvLink link statuses + ('numNvSwitches', + c_uint32), # Number of NvSwitches populated in nvSwitches[] + ('nvSwitches', c_dcgmNvLinkNvSwitchLinkStatus_v2 * DCGM_MAX_NUM_SWITCHES + ) #Per-NvSwitch NvLink link statuses ] -dcgmNvLinkStatus_version2 = make_dcgm_version(c_dcgmNvLinkStatus_v2, 2) +dcgmNvLinkStatus_version3 = make_dcgm_version(c_dcgmNvLinkStatus_v3, 3) # Bitmask values for dcgmGetFieldIdSummary DCGM_SUMMARY_MIN = 0x00000001 @@ -2143,7 +2062,8 @@ class c_dcgmNvLinkStatus_v2(_PrintableStructure): class c_dcgmSummaryResponse_t(_PrintableStructure): - class ResponseValue(Union): + + class ResponseValue(DcgmUnion): _fields_ = [ ("i64", c_int64), ("dbl", c_double), @@ -2169,7 +2089,8 @@ class c_dcgmFieldSummaryRequest_v1(_PrintableStructure): ] -dcgmFieldSummaryRequest_version1 = make_dcgm_version(c_dcgmFieldSummaryRequest_v1, 1) +dcgmFieldSummaryRequest_version1 = make_dcgm_version( + c_dcgmFieldSummaryRequest_v1, 1) # Module IDs DcgmModuleIdCore = 0 # Core DCGM @@ -2184,90 +2105,61 @@ class c_dcgmFieldSummaryRequest_v1(_PrintableStructure): DcgmModuleIdCount = 9 # 1 greater than largest ID above # Module Status -# Module has not been loaded yet -DcgmModuleStatusNotLoaded = 0 -# Module has been blacklisted from being loaded -DcgmModuleStatusBlacklisted = 1 -# Loading the module failed -DcgmModuleStatusFailed = 2 -# Module has been loaded -DcgmModuleStatusLoaded = 3 
+DcgmModuleStatusNotLoaded = 0 # Module has not been loaded yet +DcgmModuleStatusDenylisted = 1 # Module has been added to the denylist so it can't be loaded +DcgmModuleStatusFailed = 2 # Loading the module failed +DcgmModuleStatusLoaded = 3 # Module has been loaded +DcgmModuleStatusUnloaded = 4 # Module has been unloaded +DcgmModuleStatusPaused = 5 # Module has been paused. Implies it's been loaded DCGM_MODULE_STATUSES_CAPACITY = 16 class c_dcgmModuleGetStatusesModule_t(_PrintableStructure): _fields_ = [ - ("id", c_uint32), # One of DcgmModuleId* - ("status", c_uint32), # One of DcgmModuleStatus* + ('id', c_uint32), #One of DcgmModuleId* + ('status', c_uint32), #One of DcgmModuleStatus* ] class c_dcgmModuleGetStatuses_v1(_PrintableStructure): _fields_ = [ - ("version", c_uint), - ("numStatuses", c_uint32), - ("statuses", c_dcgmModuleGetStatusesModule_t * DCGM_MODULE_STATUSES_CAPACITY), - ] - - -dcgmModuleGetStatuses_version1 = make_dcgm_version(c_dcgmModuleGetStatuses_v1, 1) - -# Maximum number of metric ID groups that can exist in DCGM -DCGM_PROF_MAX_NUM_GROUPS = 10 -# Maximum number of field IDs that can be in a single DCGM profiling metric -# group -DCGM_PROF_MAX_FIELD_IDS_PER_GROUP = 8 - - -class c_dcgmProfMetricGroupInfo_t(_PrintableStructure): - _fields_ = [ - ("majorId", c_ushort), - ("minorId", c_ushort), - ("numFieldIds", c_uint32), - ("fieldIds", c_ushort * DCGM_PROF_MAX_FIELD_IDS_PER_GROUP), + ('version', c_uint), + ('numStatuses', c_uint32), + ('statuses', + c_dcgmModuleGetStatusesModule_t * DCGM_MODULE_STATUSES_CAPACITY), ] -class c_dcgmProfGetMetricGroups_v2(_PrintableStructure): - _fields_ = [ - ("version", c_uint32), - ("unused", c_uint32), - ("groupId", c_void_p), - ("numMetricGroups", c_uint32), - ("unused1", c_uint32), - ("metricGroups", c_dcgmProfMetricGroupInfo_t * DCGM_PROF_MAX_NUM_GROUPS), - ] +dcgmModuleGetStatuses_version1 = make_dcgm_version(c_dcgmModuleGetStatuses_v1, + 1) +DCGM_PROF_MAX_NUM_GROUPS_V2 = 10 # Maximum number of metric ID groups that can exist in DCGM +DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2 = 64 # Maximum number of field IDs that can be in a single DCGM profiling metric group -dcgmProfGetMetricGroups_version1 = make_dcgm_version(c_dcgmProfGetMetricGroups_v2, 2) - -class c_dcgmProfWatchFields_v1(_PrintableStructure): +class c_dcgmProfMetricGroupInfo_v2(_PrintableStructure): _fields_ = [ - ("version", c_uint32), - ("groupId", c_void_p), - ("numFieldIds", c_uint32), - ("fieldIds", c_ushort * 16), - ("updateFreq", c_int64), - ("maxKeepAge", c_double), - ("maxKeepSamples", c_int32), - ("flags", c_uint32), + ('majorId', c_ushort), + ('minorId', c_ushort), + ('numFieldIds', c_uint32), + ('fieldIds', c_ushort * DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2), ] -dcgmProfWatchFields_version1 = make_dcgm_version(c_dcgmProfWatchFields_v1, 1) - - -class c_dcgmProfUnwatchFields_v1(_PrintableStructure): +class c_dcgmProfGetMetricGroups_v3(_PrintableStructure): _fields_ = [ - ("version", c_uint32), - ("groupId", c_void_p), - ("flags", c_uint32), + ('version', c_uint32), + ('unused', c_uint32), + ('gpuId', c_uint32), + ('numMetricGroups', c_uint32), + ('metricGroups', + c_dcgmProfMetricGroupInfo_v2 * DCGM_PROF_MAX_NUM_GROUPS_V2), ] -dcgmProfUnwatchFields_version1 = make_dcgm_version(c_dcgmProfUnwatchFields_v1, 1) +dcgmProfGetMetricGroups_version3 = make_dcgm_version( + c_dcgmProfGetMetricGroups_v3, 3) class c_dcgmVersionInfo_v2(_PrintableStructure): diff --git a/model_analyzer/monitor/dcgm/dcgm_telegraf.py b/model_analyzer/monitor/dcgm/dcgm_telegraf.py new file mode 100644 
index 000000000..63563662e --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgm_telegraf.py @@ -0,0 +1,65 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from model_analyzer.monitor.dcgm.common.dcgm_client_main import main +from model_analyzer.monitor.dcgm.DcgmJsonReader import DcgmJsonReader +from socket import socket, AF_INET, SOCK_DGRAM + +# Displayed to the user +TELEGRAF_NAME = 'Telegraf' +DEFAULT_TELEGRAF_PORT = 8094 + +# Telegraf Configuration +# ====================== +# +# In order for Telegraf to understand the format of the data sent by this +# module, it needs to be configured with the input plugin below +# +# If you modify the list of published fields, you will need to add non-numeric +# ones as tag_keys for Telegraf to store them +# +# [[inputs.socket_listener]] +# name_override = "dcgm" +# service_address = "udp://:8094" +# data_format = "json" +# tag_keys = [ +# "compute_pids", +# "driver_version", +# "gpu_uuid", +# "nvml_version", +# "process_name", +# "xid_errors" +# ] + + +class DcgmTelegraf(DcgmJsonReader): + ########################################################################### + def __init__(self, publish_hostname, publish_port, **kwargs): + self.m_sock = socket(AF_INET, SOCK_DGRAM) + self.m_dest = (publish_hostname, publish_port) + super(DcgmTelegraf, self).__init__(**kwargs) + + ########################################################################### + def SendToTelegraf(self, payload): + self.m_sock.sendto(payload, self.m_dest) + + ########################################################################### + def CustomJsonHandler(self, outJson): + self.SendToTelegraf(outJson) + + +if __name__ == '__main__': # pragma: no cover + main(DcgmTelegraf, + TELEGRAF_NAME, + DEFAULT_TELEGRAF_PORT, + add_target_host=True) diff --git a/model_analyzer/monitor/dcgm/dcgmvalue.py b/model_analyzer/monitor/dcgm/dcgmvalue.py new file mode 100644 index 000000000..d26625d50 --- /dev/null +++ b/model_analyzer/monitor/dcgm/dcgmvalue.py @@ -0,0 +1,155 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Base value for integer blank. can be used as an unspecified blank +DCGM_INT32_BLANK = 0x7ffffff0 +DCGM_INT64_BLANK = 0x7ffffffffffffff0 + +# Base value for double blank. 2 ** 47. 
FP 64 has 52 bits of mantissa, +#so 47 bits can still increment by 1 and represent each value from 0-15 +DCGM_FP64_BLANK = 140737488355328.0 + +DCGM_STR_BLANK = "<<>>" + +# Represents an error where data was not found +DCGM_INT32_NOT_FOUND = (DCGM_INT32_BLANK + 1) +DCGM_INT64_NOT_FOUND = (DCGM_INT64_BLANK + 1) +DCGM_FP64_NOT_FOUND = (DCGM_FP64_BLANK + 1.0) +DCGM_STR_NOT_FOUND = "<<>>" + +# Represents an error where fetching the value is not supported +DCGM_INT32_NOT_SUPPORTED = (DCGM_INT32_BLANK + 2) +DCGM_INT64_NOT_SUPPORTED = (DCGM_INT64_BLANK + 2) +DCGM_FP64_NOT_SUPPORTED = (DCGM_FP64_BLANK + 2.0) +DCGM_STR_NOT_SUPPORTED = "<<>>" + +# Represents and error where fetching the value is not allowed with our current credentials +DCGM_INT32_NOT_PERMISSIONED = (DCGM_INT32_BLANK + 3) +DCGM_INT64_NOT_PERMISSIONED = (DCGM_INT64_BLANK + 3) +DCGM_FP64_NOT_PERMISSIONED = (DCGM_FP64_BLANK + 3.0) +DCGM_STR_NOT_PERMISSIONED = "<<>>" + + +############################################################################### +# Functions to check if a value is blank or not +def DCGM_INT32_IS_BLANK(val): + if val >= DCGM_INT32_BLANK: + return True + else: + return False + + +def DCGM_INT64_IS_BLANK(val): + if val >= DCGM_INT64_BLANK: + return True + else: + return False + + +def DCGM_FP64_IS_BLANK(val): + if val >= DCGM_FP64_BLANK: + return True + else: + return False + + +#Looks for <<< at first position and >>> inside string +def DCGM_STR_IS_BLANK(val): + if 0 != val.find("<<<"): + return False + elif 0 > val.find(">>>"): + return False + return True + + +############################################################################### +class DcgmValue: + + def __init__(self, value): + self.value = value #Contains either an integer (int64), string, or double of the actual value + + ########################################################################### + def SetFromInt32(self, i32Value): + ''' + Handle the special case where our source data was an int32 but is currently + stored in a python int (int64), dealing with blanks + ''' + value = int(i32Value) + + if not DCGM_INT32_IS_BLANK(i32Value): + self.value = value + return + + if value == DCGM_INT32_NOT_FOUND: + self.value = DCGM_INT64_NOT_FOUND + elif value == DCGM_INT32_NOT_SUPPORTED: + self.value = DCGM_INT64_NOT_SUPPORTED + elif value == DCGM_INT32_NOT_PERMISSIONED: + self.value = DCGM_INT64_NOT_PERMISSIONED + else: + self.value = DCGM_INT64_BLANK + + ########################################################################### + def IsBlank(self): + ''' + Returns True if the currently-stored value is a blank value. 
False if not + ''' + if self.value is None: + return True + elif type(self.value) == int or type(self.value) == int: + return DCGM_INT64_IS_BLANK(self.value) + elif type(self.value) == float: + return DCGM_FP64_IS_BLANK(self.value) + elif type(self.value) == str: + return DCGM_STR_IS_BLANK(self.value) + else: + raise Exception("Unknown type: %s") % str(type(self.value)) + + ########################################################################### + def __str__(self): + return str(self.value) + + ########################################################################### + + +############################################################################### +def self_test(): + + v = DcgmValue(1.0) + assert (not v.IsBlank()) + assert (v.value == 1.0) + + v = DcgmValue(100) + assert (not v.IsBlank()) + assert (v.value == 100) + + v = DcgmValue(DCGM_INT64_NOT_FOUND) + assert (v.IsBlank()) + + v = DcgmValue(DCGM_FP64_NOT_FOUND) + assert (v.IsBlank()) + + v.SetFromInt32(DCGM_INT32_NOT_SUPPORTED) + assert (v.IsBlank()) + assert (v.value == DCGM_INT64_NOT_SUPPORTED) + + print("Tests passed") + return + + +############################################################################### +if __name__ == "__main__": + self_test() + +############################################################################### diff --git a/model_analyzer/monitor/dcgm/denylist_recommendations.py b/model_analyzer/monitor/dcgm/denylist_recommendations.py new file mode 100644 index 000000000..38dafc624 --- /dev/null +++ b/model_analyzer/monitor/dcgm/denylist_recommendations.py @@ -0,0 +1,573 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
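The blank-value scheme in dcgmvalue.py above reserves the top of each type's range: every sentinel is the base blank plus a small offset, so a single comparison against the blank base classifies a sample, and the exact offset says why the data is missing. A short sketch of how a consumer of these bindings might interpret an int64 field sample (the sample values are illustrative only):

import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue

def describe_int64_sample(value):
    # Real data sits below DCGM_INT64_BLANK; anything at or above it is a sentinel.
    if not dcgmvalue.DCGM_INT64_IS_BLANK(value):
        return "value=%d" % value
    if value == dcgmvalue.DCGM_INT64_NOT_FOUND:
        return "no data collected yet"
    if value == dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
        return "field not supported on this device"
    if value == dcgmvalue.DCGM_INT64_NOT_PERMISSIONED:
        return "insufficient permissions to read field"
    return "blank (unspecified)"

print(describe_int64_sample(42))                                  # value=42
print(describe_int64_sample(dcgmvalue.DCGM_INT64_NOT_SUPPORTED))  # field not supported on this device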
+import argparse +import sys +import logging +import json +import os + +try: + import model_analyzer.monitor.dcgm.pydcgm as pydcgm + import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent + import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs + import model_analyzer.monitor.dcgm.dcgm_errors as dcgm_errors + import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields + import model_analyzer.monitor.dcgm.DcgmSystem as DcgmSystem +except: + # If we don't find the bindings, add the default path and try again + if 'PYTHONPATH' in os.environ: + os.environ['PYTHONPATH'] = os.environ[ + 'PYTHONPATH'] + ":/usr/local/dcgm/bindings" + else: + os.environ['PYTHONPATH'] = '/usr/local/dcgm/bindings' + + import model_analyzer.monitor.dcgm.pydcgm as pydcgm + import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent + import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs + import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields + import model_analyzer.monitor.dcgm.DcgmSystem as DcgmSystem + +BR_ST_HEALTHY = 0x0000 +BR_ST_NOT_DETECTED = 0x0001 +BR_ST_FAILED_PASSIVE_HEALTH = 0x0002 +BR_ST_FAILED_ACTIVE_HEALTH = 0x0004 + +BR_HEALTH_WATCH_BITMAP = dcgm_structs.DCGM_HEALTH_WATCH_ALL + +DIAG_SM_STRESS_DURATION = 90.0 +DIAG_CONSTANT_POWER_DURATION = 120.0 +DIAG_CONSTANT_STRESS_DURATION = 120.0 +DIAG_DIAGNOSTIC_DURATION = 300.0 + +global g_gpus +global g_switches +g_gpus = [] +g_switches = [] + + +class Entity(object): + + def __init__(self, + entityId, + entityType=dcgm_fields.DCGM_FE_GPU, + uuid=None, + bdf=None): + self.health = BR_ST_HEALTHY + self.entityType = entityType + self.entityId = entityId + self.reasonsUnhealthy = [] + if uuid: + self.uuid = uuid + if bdf: + self.bdf = bdf + + def IsHealthy(self): + return self.health == BR_ST_HEALTHY + + def MarkUnhealthy(self, failCondition, reason): + self.health = self.health | failCondition + self.reasonsUnhealthy.append(reason) + + def WhyUnhealthy(self): + return self.reasonsUnhealthy + + def SetEntityId(self, entityId): + self.entityId = entityId + + def GetEntityId(self): + return self.entityId + + def GetUUID(self): + return self.uuid + + def GetBDF(self): + return self.bdf + + +def mark_entity_unhealthy(entities, entityId, code, reason): + found = False + for entity in entities: + if entityId == entity.GetEntityId(): + entity.MarkUnhealthy(code, reason) + found = True + + return found + + +def addParamString(runDiagInfo, paramIndex, paramStr): + strIndex = 0 + for c in paramStr: + runDiagInfo.testParms[paramIndex][strIndex] = c + strIndex = strIndex + 1 + + +def setTestDurations(runDiagInfo, timePercentage): + # We only are reducing the test time for the default case + if runDiagInfo.validate != 3: + return + + stressDuration = int(DIAG_SM_STRESS_DURATION * timePercentage) + powerDuration = int(DIAG_CONSTANT_POWER_DURATION * timePercentage) + constantStressDuration = int(DIAG_CONSTANT_STRESS_DURATION * timePercentage) + diagDuration = int(DIAG_DIAGNOSTIC_DURATION * timePercentage) + + smParam = "sm stress.test_duration=%d" % (stressDuration) + powerParam = "targeted power.test_duration=%d" % (powerDuration) + constantStressParam = "targeted stress.test_duration=%d" % ( + constantStressDuration) + diagParam = "diagnostic.test_duration=%d" % (diagDuration) + + addParamString(runDiagInfo, 0, diagParam) + addParamString(runDiagInfo, 1, smParam) + addParamString(runDiagInfo, 2, constantStressParam) + addParamString(runDiagInfo, 3, powerParam) + + +def initialize_run_diag_info(settings): + runDiagInfo = 
dcgm_structs.c_dcgmRunDiag_v7() + runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7 + runDiagInfo.flags = dcgm_structs.DCGM_RUN_FLAGS_VERBOSE + testNamesStr = settings['testNames'] + if testNamesStr == '1': + runDiagInfo.validate = 1 + elif testNamesStr == '2': + runDiagInfo.validate = 2 + elif testNamesStr == '3': + runDiagInfo.validate = 3 + else: + # Make sure no number other that 1-3 were submitted + if testNamesStr.isdigit(): + raise ValueError("'%s' is not a valid test name" % testNamesStr) + + # Copy to the testNames portion of the object + names = testNamesStr.split(',') + testIndex = 0 + if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES: + err = 'Aborting DCGM Diag because %d test names were specified exceeding the limit of %d' %\ + (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES) + raise ValueError(err) + + for testName in names: + testNameIndex = 0 + if len(testName) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN: + err = 'Aborting DCGM Diag because test name %s exceeds max length %d' % \ + (testName, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN) + raise ValueError(err) + + for c in testName: + runDiagInfo.testNames[testIndex][testNameIndex] = c + testNameIndex = testNameIndex + 1 + + testIndex = testIndex + 1 + + if 'timePercentage' in settings: + setTestDurations(runDiagInfo, settings['timePercentage']) + + activeGpuIds = [] + + first = True + for gpuObj in g_gpus: + if gpuObj.IsHealthy(): + activeGpuIds.append(gpuObj.GetEntityId()) + if first: + runDiagInfo.gpuList = str(gpuObj.GetEntityId()) + first = False + else: + to_append = ',%s' % (str(gpuObj.GetEntityId())) + runDiagInfo.gpuList = runDiagInfo.gpuList + to_append + + return runDiagInfo, activeGpuIds + + +def mark_all_unhealthy(activeGpuIds, reason): + for gpuId in activeGpuIds: + mark_entity_unhealthy(g_gpus, gpuId, BR_ST_FAILED_ACTIVE_HEALTH, reason) + + +def result_to_str(result): + if result == dcgm_structs.DCGM_DIAG_RESULT_PASS: + return 'PASS' + elif result == dcgm_structs.DCGM_DIAG_RESULT_SKIP: + return 'SKIP' + elif result == dcgm_structs.DCGM_DIAG_RESULT_WARN: + return 'WARN' + elif result == dcgm_structs.DCGM_DIAG_RESULT_FAIL: + return 'FAIL' + else: + return 'NOT RUN' + + +def check_passive_health_checks(response, activeGpuIds): + unhealthy = False + for i in range(0, dcgm_structs.DCGM_SWTEST_COUNT): + if response.levelOneResults[ + i].result == dcgm_structs.DCGM_DIAG_RESULT_FAIL: + mark_all_unhealthy(activeGpuIds, + response.levelOneResults[i].error.msg) + unhealthy = True + break + + return unhealthy + + +def check_gpu_diagnostic(handleObj, settings): + runDiagInfo, activeGpuIds = initialize_run_diag_info(settings) + if len(activeGpuIds) == 0: + return + + response = dcgm_agent.dcgmActionValidate_v2(handleObj.handle, runDiagInfo) + + sysError = response.systemError + if (sysError.code != dcgm_errors.DCGM_FR_OK): + raise ValueError(sysError) + + if check_passive_health_checks(response, activeGpuIds) == False: + for gpuIndex in range(response.gpuCount): + for testIndex in range(dcgm_structs.DCGM_PER_GPU_TEST_COUNT_V8): + if response.perGpuResponses[gpuIndex].results[ + testIndex].result == dcgm_structs.DCGM_DIAG_RESULT_FAIL: + gpuId = response.perGpuResponses[gpuIndex].gpuId + mark_entity_unhealthy( + g_gpus, gpuId, BR_ST_FAILED_ACTIVE_HEALTH, + response.perGpuResponses[gpuIndex].results[testIndex]. 
+                        result.error.msg)
+
+                    # NVVS marks all subsequent tests as failed so there's no point in continuing
+                    break
+
+
+def query_passive_health(handleObj, desired_watches):
+    dcgmGroup = handleObj.GetSystem().GetDefaultGroup()
+    watches = dcgmGroup.health.Get()
+
+    # Check for the correct watches to be set and set them if necessary
+    if watches != desired_watches:
+        dcgmGroup.health.Set(desired_watches)
+
+    return dcgmGroup.health.Check()
+
+
+def denylist_from_passive_health_check(response):
+    for incidentIndex in range(response.incidentCount):
+        if response.incidents[
+                incidentIndex].health != dcgm_structs.DCGM_HEALTH_RESULT_FAIL:
+            # Only add to the denylist for failures; ignore warnings
+            continue
+
+        entityId = response.incidents[incidentIndex].entityInfo.entityId
+        entityGroupId = response.incidents[
+            incidentIndex].entityInfo.entityGroupId
+        errorString = response.incidents[incidentIndex].error.msg
+
+        if entityGroupId == dcgm_fields.DCGM_FE_GPU:
+            mark_entity_unhealthy(g_gpus, entityId, BR_ST_FAILED_PASSIVE_HEALTH,
+                                  errorString)
+        else:
+            mark_entity_unhealthy(g_switches, entityId,
+                                  BR_ST_FAILED_PASSIVE_HEALTH, errorString)
+
+
+def check_passive_health(handleObj, watches):
+    response = query_passive_health(handleObj, watches)
+
+    if response.overallHealth != dcgm_structs.DCGM_HEALTH_RESULT_PASS:
+        denylist_from_passive_health_check(response)
+
+
+def initialize_devices(handle, flags):
+    gpuIds = dcgm_agent.dcgmGetEntityGroupEntities(handle,
+                                                   dcgm_fields.DCGM_FE_GPU,
+                                                   flags)
+    switchIds = dcgm_agent.dcgmGetEntityGroupEntities(
+        handle, dcgm_fields.DCGM_FE_SWITCH, flags)
+
+    i = 0
+    for gpuId in gpuIds:
+        attributes = dcgm_agent.dcgmGetDeviceAttributes(handle, gpuId)
+        gpuObj = Entity(gpuId,
+                        entityType=dcgm_fields.DCGM_FE_GPU,
+                        uuid=attributes.identifiers.uuid,
+                        bdf=attributes.identifiers.pciBusId)
+        g_gpus.append(gpuObj)
+        i = i + 1
+
+    i = 0
+    for switchId in switchIds:
+        switchObj = Entity(switchId, entityType=dcgm_fields.DCGM_FE_SWITCH)
+        g_switches.append(switchObj)
+        i = i + 1
+
+
+# Process command line arguments
+def __process_command_line__(settings):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-g',
+                        '--num-gpus',
+                        dest='num_gpus',
+                        type=int,
+                        help='The expected number of GPUs.')
+    parser.add_argument('-s',
+                        '--num-switches',
+                        dest='num_switches',
+                        type=int,
+                        help='The expected number of NvSwitches.')
+    parser.add_argument(
+        '-n',
+        '--hostname',
+        dest='hostname',
+        type=str,
+        help='The hostname of the nv-hostengine we want to query.')
+    parser.add_argument(
+        '-d',
+        '--detect',
+        dest='detect',
+        action='store_true',
+        help='Run on whatever GPUs can be detected. Do not check counts.')
+    parser.add_argument(
+        '-l',
+        '--log-file',
+        dest='logfileName',
+        type=str,
+        help=
+        'The name of the log file where details should be stored. Default is stdout'
+    )
+    parser.add_argument(
+        '-u',
+        '--unsupported-too',
+        dest='unsupported',
+        action='store_true',
+        help='Get unsupported devices in addition to the ones DCGM supports')
+    parser.add_argument('-f',
+                        '--full-report',
+                        dest='fullReport',
+                        action='store_true',
+                        help='Print a health status for each GPU')
+    parser.add_argument(
+        '-c',
+        '--csv',
+        dest='outfmtCSV',
+        action='store_true',
+        help='Write output in csv format. By default, output is in json format.'
+    )
+    parser.add_argument(
+        '-w',
+        '--watches',
+        dest='watches',
+        type=str,
+        help=
+        'Specify which health watches to monitor. By default, all are watched. Any list of the following may be specified:\n\ta = All watches\n\tp = PCIE\n\tm = Memory\n\ti = Inforom\n\tt = Thermal and Power\n\tn = NVLINK'
+    )
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        '-r',
+        '--specified-test',
+        dest='testNames',
+        type=str,
+        help='Option to specify what tests are run in dcgmi diag.')
+    group.add_argument(
+        '-i',
+        '--instantaneous',
+        dest='instant',
+        action='store_true',
+        help='Specify to skip the longer tests and run instantaneously')
+    group.add_argument(
+        '-t',
+        '--time-limit',
+        dest='timeLimit',
+        type=int,
+        help=
+        'The time limit in seconds that all the tests should not exceed. Diagnostics will be reduced in their time to meet this boundary.'
+    )
+
+    parser.set_defaults(instant=False, detect=False, fullReport=False)
+    args = parser.parse_args()
+
+    if args.num_gpus is not None and args.num_switches is not None:
+        settings['numGpus'] = args.num_gpus
+        settings['numSwitches'] = args.num_switches
+    elif args.detect == False:
+        raise ValueError(
+            'Must specify either a number of gpus and switches with -g and -s or auto-detect with -d'
+        )
+
+    if args.hostname:
+        settings['hostname'] = args.hostname
+    else:
+        settings['hostname'] = 'localhost'
+
+    if args.unsupported:
+        settings['entity_get_flags'] = 0
+    else:
+        settings[
+            'entity_get_flags'] = dcgm_structs.DCGM_GEGE_FLAG_ONLY_SUPPORTED
+
+    settings['instant'] = args.instant
+    settings['fullReport'] = args.fullReport
+
+    if args.testNames:
+        settings['testNames'] = args.testNames
+    else:
+        settings['testNames'] = '3'
+
+    if args.timeLimit:
+        settings['timePercentage'] = float(args.timeLimit) / 840.0
+
+    if args.logfileName:
+        logging.basicConfig(filename=args.logfileName)
+
+    if args.outfmtCSV:
+        settings['outfmtCSV'] = 1
+
+    if args.watches:
+        health_watches = 0
+        for c in args.watches:
+            if c == 'p':
+                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_PCIE
+            elif c == 'm':
+                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_MEM
+            elif c == 'i':
+                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_INFOROM
+            elif c == 't':
+                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_THERMAL
+                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_POWER
+            elif c == 'n':
+                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_NVLINK
+            elif c == 'a':
+                health_watches |= dcgm_structs.DCGM_HEALTH_WATCH_ALL
+            else:
+                print(("Unrecognized character %s found in watch string '%s'" %
+                       (c, args.watches)))
+                sys.exit(-1)
+        settings['watches'] = health_watches
+    else:
+        settings['watches'] = BR_HEALTH_WATCH_BITMAP
+
+
+def get_entity_id_list(entities):
+    ids = ""
+    first = True
+    for entity in entities:
+        if first:
+            ids = str(entity.GetEntityId())
+        else:
+            ids += ",%d" % (entity.GetEntityId())
+        first = False
+
+    return ids
+
+
+def check_health(handleObj, settings, error_list):
+    initialize_devices(handleObj.handle, settings['entity_get_flags'])
+
+    if 'numGpus' in settings:
+        if len(g_gpus) != settings['numGpus']:
+            error_list.append(
+                "%d GPUs were specified but only %d were detected with ids '%s'"
+                %
+                (settings['numGpus'], len(g_gpus), get_entity_id_list(g_gpus)))
+
+    if 'numSwitches' in settings:
+        if len(g_switches) != settings['numSwitches']:
+            error_list.append(
+                "%d switches were specified but only %d were detected with ids '%s'"
+                % (settings['numSwitches'], len(g_switches),
+                   get_entity_id_list(g_switches)))
+
+    check_passive_health(handleObj, settings['watches'])  # quick check
+
+    if settings['instant'] == False:
+        check_gpu_diagnostic(handleObj, settings)
+
+
+def process_command_line(settings):
+    try:
+        __process_command_line__(settings)
+    except ValueError as e:
+        return str(e)
+
+
+def main():
+    # Parse the command line
+    settings = {}
+    error_list = []
+
+    exitCode = 0
+    jsonTop = {}
+
+    error = process_command_line(settings)
+    if error:
+        # If we had an error processing the command line, don't attempt to check anything
+        error_list.append(error)
+    else:
+        try:
+            handleObj = pydcgm.DcgmHandle(None, settings['hostname'],
+                                          dcgm_structs.DCGM_OPERATION_MODE_AUTO)
+
+            check_health(handleObj, settings, error_list)
+        except dcgm_structs.DCGMError as e:
+            # Catch any exceptions from DCGM and add them to the error_list so they'll be printed as JSON
+            error_list.append(str(e))
+        except ValueError as e:
+            error_list.append(str(e))
+
+    if 'outfmtCSV' in settings:  # show all health, then all un-healthy
+        for gpuObj in g_gpus:
+            if gpuObj.IsHealthy() == True:
+                print("healthy,%s,%s" % (gpuObj.GetBDF(), gpuObj.GetUUID()))
+        for gpuObj in g_gpus:
+            if gpuObj.IsHealthy() == False:
+                print("unhealthy,%s,%s,\"%s\"" %
+                      (gpuObj.GetBDF(), gpuObj.GetUUID(),
+                       gpuObj.WhyUnhealthy()))
+
+    else:  # build obj that can be output in json
+        denylistGpus = {}
+        healthyGpus = {}
+        for gpuObj in g_gpus:
+            if gpuObj.IsHealthy() == False:
+                details = {}
+                details['UUID'] = gpuObj.GetUUID()
+                details['BDF'] = gpuObj.GetBDF()
+                details['Failure Explanation'] = gpuObj.WhyUnhealthy()
+                denylistGpus[gpuObj.GetEntityId()] = details
+            elif settings['fullReport']:
+                details = {}
+                details['UUID'] = gpuObj.GetUUID()
+                details['BDF'] = gpuObj.GetBDF()
+                healthyGpus[gpuObj.GetEntityId()] = details
+
+        jsonTop['denylistedGpus'] = denylistGpus
+        if settings['fullReport']:
+            jsonTop['Healthy GPUs'] = healthyGpus
+
+    if len(error_list):  # had error processing the command line
+        exitCode = 1
+        if 'outfmtCSV' in settings:  # csv output
+            if len(error_list):
+                for errObj in error_list:
+                    print("errors,\"%s\"" % (errObj))
+        else:
+            jsonTop['errors'] = error_list
+
+    if 'outfmtCSV' in settings:  # csv output was already printed above
+        pass
+    else:
+        print(json.dumps(jsonTop, indent=4, separators=(',', ': ')))
+
+    sys.exit(exitCode)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_analyzer/monitor/dcgm/pydcgm.py b/model_analyzer/monitor/dcgm/pydcgm.py
new file mode 100644
index 000000000..da6157471
--- /dev/null
+++ b/model_analyzer/monitor/dcgm/pydcgm.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def _python_version_check():
+    import sys
+    python_version = sys.version.split(None, 1)[0]
+    if python_version < '3':
+        print(
+            '[ERROR] Detected Python version {}. These bindings are for Python 3.5+. Please load the Python 2 bindings found at /usr/local/dcgm/bindings'
+            .format(python_version))
+        sys.exit(1)
+
+
+_python_version_check()
+
+# Bring classes into this namespace
+from model_analyzer.monitor.dcgm.DcgmHandle import *
+from model_analyzer.monitor.dcgm.DcgmGroup import *
+from model_analyzer.monitor.dcgm.DcgmStatus import *
+from model_analyzer.monitor.dcgm.DcgmSystem import *
+from model_analyzer.monitor.dcgm.DcgmFieldGroup import *
+
+import os
+if '__DCGM_TESTING_FRAMEWORK_ACTIVE' in os.environ and os.environ[
+        '__DCGM_TESTING_FRAMEWORK_ACTIVE'] == '1':
+    import utils
+    import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+    dcgm_structs._dcgmInit(utils.get_testing_framework_library_path())
+'''
+Define a unique exception type we will return so that callers can distinguish our exceptions from python standard ones
+'''
+
+
+class DcgmException(Exception):
+    pass
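
Usage note (not part of the patch): denylist_recommendations.py is intended to be run as a standalone health checker against a running nv-hostengine, using the argparse options shown above. The sketch below is a minimal, hypothetical way to drive the same helpers directly from Python instead of the CLI; the import path for the script, the hostengine on localhost, and the chosen watch mask are assumptions, while the function names, settings keys, and constants are the ones used in the code above.

# Hypothetical driver sketch (not part of the patch). Assumes DCGM 3.2.6 is
# installed, nv-hostengine is reachable on localhost, and the script added
# above is importable from this package path.
import model_analyzer.monitor.dcgm.pydcgm as pydcgm
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
import model_analyzer.monitor.dcgm.denylist_recommendations as denylist

settings = {
    'hostname': 'localhost',
    # Same default the CLI uses when -u/--unsupported-too is not given.
    'entity_get_flags': dcgm_structs.DCGM_GEGE_FLAG_ONLY_SUPPORTED,
    'instant': True,     # like -i: skip the long-running active diagnostic
    'fullReport': True,  # like -f
    'testNames': '3',    # the script's default validation level
    # Watch only PCIe and memory health for this example.
    'watches': dcgm_structs.DCGM_HEALTH_WATCH_PCIE |
               dcgm_structs.DCGM_HEALTH_WATCH_MEM,
}

errors = []
handleObj = pydcgm.DcgmHandle(None, settings['hostname'],
                              dcgm_structs.DCGM_OPERATION_MODE_AUTO)
denylist.check_health(handleObj, settings, errors)

# Report per-GPU health, mirroring what main() prints in CSV/JSON form.
for gpuObj in denylist.g_gpus:
    status = 'healthy' if gpuObj.IsHealthy() else gpuObj.WhyUnhealthy()
    print('%s (%s): %s' % (gpuObj.GetUUID(), gpuObj.GetBDF(), status))
for err in errors:
    print('error: %s' % err)

The rough CLI equivalent of the settings above would be running the script with -d -i -f -w pm, since -d auto-detects devices instead of requiring -g/-s counts.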