diff --git a/internal/lm/nvml.go b/internal/lm/nvml.go
index 0b5ed6e9a..b4f0b09aa 100644
--- a/internal/lm/nvml.go
+++ b/internal/lm/nvml.go
@@ -80,6 +80,11 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
 		return nil, fmt.Errorf("error creating resource labeler: %v", err)
 	}
 
+	imexLabeler, err := newImexDomainLabeler(devices)
+	if err != nil {
+		return nil, fmt.Errorf("error creating imex domain labeler: %v", err)
+	}
+
 	l := Merge(
 		machineTypeLabeler,
 		versionLabeler,
@@ -87,6 +92,7 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
 		sharingLabeler,
 		resourceLabeler,
 		gpuModeLabeler,
+		imexLabeler,
 	)
 
 	return l, nil
@@ -218,6 +224,55 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
 	return labels, nil
 }
 
+// newImexDomainLabeler creates a labeler for the IMEX domain of the node. The
+// domain is built from the cluster UUID and clique ID, which must be identical
+// for all devices; a mismatch is returned as an error. If the devices report an
+// empty cluster UUID or clique ID, an empty set of labels is returned.
+func newImexDomainLabeler(devices []resource.Device) (Labeler, error) {
+	var commonClusterUUID, commonCliqueID string
+
+	for i, d := range devices {
+		clusterUUID, err := d.GetClusterUUID()
+		if err != nil {
+			return nil, fmt.Errorf("error getting cluster UUID: %w", err)
+		}
+		cliqueID, err := d.GetCliqueID()
+		if err != nil {
+			return nil, fmt.Errorf("error getting clique ID: %w", err)
+		}
+
+		// The first device defines the reference values. Keying this on the
+		// index instead of on an empty string means a device with empty IDs
+		// followed by one with non-empty IDs is reported as a mismatch.
+		if i == 0 {
+			commonClusterUUID, commonCliqueID = clusterUUID, cliqueID
+			continue
+		}
+		if commonClusterUUID != clusterUUID {
+			return nil, fmt.Errorf("cluster UUID mismatch: %s != %s", commonClusterUUID, clusterUUID)
+		}
+		if commonCliqueID != cliqueID {
+			return nil, fmt.Errorf("clique ID mismatch: %s != %s", commonCliqueID, cliqueID)
+		}
+	}
+
+	// Without both IDs there is no domain to report. Generating the labels
+	// anyway would yield values such as "-", which Kubernetes rejects because a
+	// label value must begin and end with an alphanumeric character.
+	if commonClusterUUID == "" || commonCliqueID == "" {
+		return Labels{}, nil
+	}
+
+	domain := fmt.Sprintf("%s-%s", commonClusterUUID, commonCliqueID)
+	labels := Labels{
+		"nvidia.com/gpu.clusteruuid": commonClusterUUID,
+		"nvidia.com/gpu.cliqueid":    commonCliqueID,
+		"nvidia.com/imex-domain":     domain,
+	}
+
+	return labels, nil
+}
+
 func getModeForClasses(classes []uint32) string {
 	if len(classes) == 0 {
 		return "unknown"
diff --git a/internal/resource/cuda-device.go b/internal/resource/cuda-device.go
index a4f4bc4a4..5bcc60839 100644
--- a/internal/resource/cuda-device.go
+++ b/internal/resource/cuda-device.go
@@ -100,3 +100,13 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) {
 func (d *cudaDevice) GetPCIClass() (uint32, error) {
 	return 0, nil
 }
+
+// GetClusterUUID always returns an empty cluster UUID for CUDA devices.
+func (d *cudaDevice) GetClusterUUID() (string, error) {
+	return "", nil
+}
+
+// GetCliqueID always returns an empty clique ID for CUDA devices.
+func (d *cudaDevice) GetCliqueID() (string, error) {
+	return "", nil
+}
diff --git a/internal/resource/device_mock.go b/internal/resource/device_mock.go
index 1024ec96b..56e94b0d2 100644
--- a/internal/resource/device_mock.go
+++ b/internal/resource/device_mock.go
@@ -20,6 +20,12 @@ var _ Device = &DeviceMock{}
 //			GetAttributesFunc: func() (map[string]interface{}, error) {
 //				panic("mock out the GetAttributes method")
 //			},
+//			GetCliqueIDFunc: func() (string, error) {
+//				panic("mock out the GetCliqueID method")
+//			},
+//			GetClusterUUIDFunc: func() (string, error) {
+//				panic("mock out the GetClusterUUID method")
+//			},
 //			GetCudaComputeCapabilityFunc: func() (int, int, error) {
 //				panic("mock out the GetCudaComputeCapability method")
 //			},
@@ -54,6 +60,12 @@ type DeviceMock struct {
 	// GetAttributesFunc mocks the GetAttributes method.
 	GetAttributesFunc func() (map[string]interface{}, error)
 
+	// GetCliqueIDFunc mocks the GetCliqueID method.
+	GetCliqueIDFunc func() (string, error)
+
+	// GetClusterUUIDFunc mocks the GetClusterUUID method.
+	GetClusterUUIDFunc func() (string, error)
+
 	// GetCudaComputeCapabilityFunc mocks the GetCudaComputeCapability method.
 	GetCudaComputeCapabilityFunc func() (int, int, error)
 
@@ -83,6 +95,12 @@ type DeviceMock struct {
 		// GetAttributes holds details about calls to the GetAttributes method.
 		GetAttributes []struct {
 		}
+		// GetCliqueID holds details about calls to the GetCliqueID method.
+		GetCliqueID []struct {
+		}
+		// GetClusterUUID holds details about calls to the GetClusterUUID method.
+		GetClusterUUID []struct {
+		}
 		// GetCudaComputeCapability holds details about calls to the GetCudaComputeCapability method.
 		GetCudaComputeCapability []struct {
 		}
@@ -109,6 +127,8 @@ type DeviceMock struct {
 		}
 	}
 	lockGetAttributes                      sync.RWMutex
+	lockGetCliqueID                        sync.RWMutex
+	lockGetClusterUUID                     sync.RWMutex
 	lockGetCudaComputeCapability           sync.RWMutex
 	lockGetDeviceHandleFromMigDeviceHandle sync.RWMutex
 	lockGetMigDevices                      sync.RWMutex
@@ -146,6 +166,60 @@ func (mock *DeviceMock) GetAttributesCalls() []struct {
 	return calls
 }
 
+// GetCliqueID calls GetCliqueIDFunc.
+func (mock *DeviceMock) GetCliqueID() (string, error) {
+	if mock.GetCliqueIDFunc == nil {
+		panic("DeviceMock.GetCliqueIDFunc: method is nil but Device.GetCliqueID was just called")
+	}
+	callInfo := struct {
+	}{}
+	mock.lockGetCliqueID.Lock()
+	mock.calls.GetCliqueID = append(mock.calls.GetCliqueID, callInfo)
+	mock.lockGetCliqueID.Unlock()
+	return mock.GetCliqueIDFunc()
+}
+
+// GetCliqueIDCalls gets all the calls that were made to GetCliqueID.
+// Check the length with:
+//
+//	len(mockedDevice.GetCliqueIDCalls())
+func (mock *DeviceMock) GetCliqueIDCalls() []struct {
+} {
+	var calls []struct {
+	}
+	mock.lockGetCliqueID.RLock()
+	calls = mock.calls.GetCliqueID
+	mock.lockGetCliqueID.RUnlock()
+	return calls
+}
+
+// GetClusterUUID calls GetClusterUUIDFunc.
+func (mock *DeviceMock) GetClusterUUID() (string, error) {
+	if mock.GetClusterUUIDFunc == nil {
+		panic("DeviceMock.GetClusterUUIDFunc: method is nil but Device.GetClusterUUID was just called")
+	}
+	callInfo := struct {
+	}{}
+	mock.lockGetClusterUUID.Lock()
+	mock.calls.GetClusterUUID = append(mock.calls.GetClusterUUID, callInfo)
+	mock.lockGetClusterUUID.Unlock()
+	return mock.GetClusterUUIDFunc()
+}
+
+// GetClusterUUIDCalls gets all the calls that were made to GetClusterUUID.
+// Check the length with:
+//
+//	len(mockedDevice.GetClusterUUIDCalls())
+func (mock *DeviceMock) GetClusterUUIDCalls() []struct {
+} {
+	var calls []struct {
+	}
+	mock.lockGetClusterUUID.RLock()
+	calls = mock.calls.GetClusterUUID
+	mock.lockGetClusterUUID.RUnlock()
+	return calls
+}
+
 // GetCudaComputeCapability calls GetCudaComputeCapabilityFunc.
 func (mock *DeviceMock) GetCudaComputeCapability() (int, int, error) {
 	if mock.GetCudaComputeCapabilityFunc == nil {
diff --git a/internal/resource/nvml-device.go b/internal/resource/nvml-device.go
index 1184657d2..e3d4ae2e7 100644
--- a/internal/resource/nvml-device.go
+++ b/internal/resource/nvml-device.go
@@ -17,7 +17,9 @@
 package resource
 
 import (
+	"encoding/hex"
 	"fmt"
+	"strconv"
 
 	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
 	"github.com/NVIDIA/go-nvlib/pkg/nvpci"
@@ -99,3 +101,47 @@ func (d nvmlDevice) GetPCIClass() (uint32, error) {
 	}
 	return nvDevice.Class, nil
 }
+
+// GetClusterUUID returns the cluster UUID from the GPU fabric info of the
+// device, formatted as an 8-4-4-4-12 hex string. An empty string is returned
+// if NVML reports that GPU fabric info is not supported for the device.
+func (d nvmlDevice) GetClusterUUID() (string, error) {
+	gfInfo, ret := d.GetGpuFabricInfo()
+	if ret == nvml.ERROR_NOT_SUPPORTED {
+		return "", nil
+	}
+	if ret != nvml.SUCCESS {
+		return "", ret
+	}
+
+	// Convert the array to a byte slice
+	byteSlice := gfInfo.ClusterUuid[:]
+
+	// Encode the byte slice as a hex string
+	hexStr := hex.EncodeToString(byteSlice)
+
+	// Format the hex string with dashes to match UUID format
+	uuid := fmt.Sprintf("%s-%s-%s-%s-%s",
+		hexStr[0:8],   // First 8 characters
+		hexStr[8:12],  // Next 4 characters
+		hexStr[12:16], // Next 4 characters
+		hexStr[16:20], // Next 4 characters
+		hexStr[20:32]) // Last 12 characters
+
+	return uuid, nil
+}
+
+// GetCliqueID returns the clique ID from the GPU fabric info of the device as
+// a decimal string. An empty string is returned if NVML reports that GPU
+// fabric info is not supported for the device.
+func (d nvmlDevice) GetCliqueID() (string, error) {
+	gfInfo, ret := d.GetGpuFabricInfo()
+	if ret == nvml.ERROR_NOT_SUPPORTED {
+		return "", nil
+	}
+	if ret != nvml.SUCCESS {
+		return "", ret
+	}
+
+	return strconv.FormatUint(uint64(gfInfo.CliqueId), 10), nil
+}
diff --git a/internal/resource/nvml-mig-device.go b/internal/resource/nvml-mig-device.go
index 8ef933ff5..d3b4d3cd9 100644
--- a/internal/resource/nvml-mig-device.go
+++ b/internal/resource/nvml-mig-device.go
@@ -138,3 +138,13 @@ func (d nvmlMigDevice) GetPCIClass() (uint32, error) {
 	// GPU devices that support MIG do not support switching mode between graphics and compute, so they are always in compute mode.
 	return nvpci.PCI3dControllerClass, nil
 }
+
+// GetClusterUUID always returns an error for MIG devices.
+func (d nvmlMigDevice) GetClusterUUID() (string, error) {
+	return "", fmt.Errorf("GetClusterUUID is not supported for MIG devices")
+}
+
+// GetCliqueID always returns an error for MIG devices.
+func (d nvmlMigDevice) GetCliqueID() (string, error) {
+	return "", fmt.Errorf("GetCliqueID is not supported for MIG devices")
+}
diff --git a/internal/resource/sysfs-device.go b/internal/resource/sysfs-device.go
index 105229fe4..c1074e017 100644
--- a/internal/resource/sysfs-device.go
+++ b/internal/resource/sysfs-device.go
@@ -68,3 +68,13 @@ func (d vfioDevice) IsMigCapable() (bool, error) {
 func (d vfioDevice) GetPCIClass() (uint32, error) {
 	return d.nvidiaPCIDevice.Class, nil
 }
+
+// GetClusterUUID always returns an empty cluster UUID for VFIO devices.
+func (d vfioDevice) GetClusterUUID() (string, error) {
+	return "", nil
+}
+
+// GetCliqueID always returns an empty clique ID for VFIO devices.
+func (d vfioDevice) GetCliqueID() (string, error) {
+	return "", nil
+}
diff --git a/internal/resource/types.go b/internal/resource/types.go
index ec89ec579..bd6b93a3e 100644
--- a/internal/resource/types.go
+++ b/internal/resource/types.go
@@ -40,4 +40,6 @@ type Device interface {
 	GetDeviceHandleFromMigDeviceHandle() (Device, error)
 	GetCudaComputeCapability() (int, int, error)
 	GetPCIClass() (uint32, error)
+	GetClusterUUID() (string, error)
+	GetCliqueID() (string, error)
 }