Skip to content

Commit

Permalink
Enable labels for ClusterUUID and CliqueId
Browse files Browse the repository at this point in the history
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
  • Loading branch information
ArangoGutierrez committed Oct 2, 2024
1 parent 37d2609 commit 1e9f62e
Show file tree
Hide file tree
Showing 19 changed files with 325 additions and 4 deletions.
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ type GFDCommandLineFlags struct {
NoTimestamp *bool `json:"noTimestamp" yaml:"noTimestamp"`
SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"`
OutputFile *string `json:"outputFile" yaml:"outputFile"`
ImexNodesConfig *string `json:"imexNodesConfig" yaml:"imexNodesConfig"`
MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"`
}

Expand Down Expand Up @@ -162,6 +163,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.GFD.Oneshot, c, n)
case "output-file":
updateFromCLIFlag(&f.GFD.OutputFile, c, n)
case "imex-nodes-config":
updateFromCLIFlag(&f.GFD.ImexNodesConfig, c, n)
case "sleep-interval":
updateFromCLIFlag(&f.GFD.SleepInterval, c, n)
case "no-timestamp":
Expand Down
6 changes: 6 additions & 0 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ func main() {
Value: "/etc/kubernetes/node-feature-discovery/features.d/gfd",
EnvVars: []string{"GFD_OUTPUT_FILE"},
},
&cli.StringFlag{
Name: "imex-nodes-config",
Value: "/etc/nvidia-imex/nodes_config.cfg",
Usage: "the path to nvidia-imex nodes config file",
EnvVars: []string{"GFD_IMEX_NODES_CONFIG"},
},
&cli.StringFlag{
Name: "machine-type-file",
Value: "/sys/class/dmi/id/product_name",
Expand Down
4 changes: 4 additions & 0 deletions cmd/gpu-feature-discovery/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ func TestRunOneshot(t *testing.T) {
GFD: &spec.GFDCommandLineFlags{
Oneshot: ptr(true),
OutputFile: ptr("./gfd-test-oneshot"),
ImexNodesConfig: ptr(""),
SleepInterval: ptr(spec.Duration(time.Second)),
NoTimestamp: ptr(false),
MachineTypeFile: ptr(testMachineTypeFile),
Expand Down Expand Up @@ -158,6 +159,7 @@ func TestRunWithNoTimestamp(t *testing.T) {
GFD: &spec.GFDCommandLineFlags{
Oneshot: ptr(true),
OutputFile: ptr("./gfd-test-with-no-timestamp"),
ImexNodesConfig: ptr(""),
SleepInterval: ptr(spec.Duration(time.Second)),
NoTimestamp: ptr(true),
MachineTypeFile: ptr(testMachineTypeFile),
Expand Down Expand Up @@ -218,6 +220,7 @@ func TestRunSleep(t *testing.T) {
GFD: &spec.GFDCommandLineFlags{
Oneshot: ptr(false),
OutputFile: ptr("./gfd-test-loop"),
ImexNodesConfig: ptr(""),
SleepInterval: ptr(spec.Duration(time.Second)),
NoTimestamp: ptr(false),
MachineTypeFile: ptr(testMachineTypeFile),
Expand Down Expand Up @@ -390,6 +393,7 @@ func TestFailOnNVMLInitError(t *testing.T) {
GFD: &spec.GFDCommandLineFlags{
Oneshot: ptr(true),
OutputFile: ptr(outputFile),
ImexNodesConfig: ptr(""),
SleepInterval: ptr(spec.Duration(500 * time.Millisecond)),
NoTimestamp: ptr(false),
MachineTypeFile: ptr(testMachineTypeFile),
Expand Down
5 changes: 5 additions & 0 deletions cmd/gpu-feature-discovery/mig_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ func TestMigStrategyNone(t *testing.T) {
GFD: &spec.GFDCommandLineFlags{
Oneshot: ptr(true),
OutputFile: ptr("./gfd-test-mig-none"),
ImexNodesConfig: ptr(""),
SleepInterval: ptr(spec.Duration(time.Second)),
NoTimestamp: ptr(false),
MachineTypeFile: ptr(testMachineTypeFile),
Expand Down Expand Up @@ -97,6 +98,7 @@ func TestMigStrategySingleForNoMigDevices(t *testing.T) {
GFD: &spec.GFDCommandLineFlags{
Oneshot: ptr(true),
OutputFile: ptr("./gfd-test-mig-single-no-mig"),
ImexNodesConfig: ptr(""),
SleepInterval: ptr(spec.Duration(time.Second)),
NoTimestamp: ptr(false),
MachineTypeFile: ptr(testMachineTypeFile),
Expand Down Expand Up @@ -165,6 +167,7 @@ func TestMigStrategySingleForMigDeviceMigDisabled(t *testing.T) {
GFD: &spec.GFDCommandLineFlags{
Oneshot: ptr(true),
OutputFile: ptr("./gfd-test-mig-single-no-mig"),
ImexNodesConfig: ptr(""),
SleepInterval: ptr(spec.Duration(time.Second)),
NoTimestamp: ptr(false),
MachineTypeFile: ptr(testMachineTypeFile),
Expand Down Expand Up @@ -233,6 +236,7 @@ func TestMigStrategySingle(t *testing.T) {
GFD: &spec.GFDCommandLineFlags{
Oneshot: ptr(true),
OutputFile: ptr("./gfd-test-mig-single"),
ImexNodesConfig: ptr(""),
SleepInterval: ptr(spec.Duration(time.Second)),
NoTimestamp: ptr(false),
MachineTypeFile: ptr(testMachineTypeFile),
Expand Down Expand Up @@ -302,6 +306,7 @@ func TestMigStrategyMixed(t *testing.T) {
GFD: &spec.GFDCommandLineFlags{
Oneshot: ptr(true),
OutputFile: ptr("./gfd-test-mig-mixed"),
ImexNodesConfig: ptr(""),
SleepInterval: ptr(spec.Duration(time.Second)),
NoTimestamp: ptr(false),
MachineTypeFile: ptr(testMachineTypeFile),
Expand Down
1 change: 1 addition & 0 deletions deployments/devel/go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module github.com/NVIDIA/k8s-device-plugin/deployments/devel

go 1.22

toolchain go1.23.0

require github.com/matryer/moq v0.5.0
Expand Down
18 changes: 18 additions & 0 deletions deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ spec:
- name: GFD_USE_NODE_FEATURE_API
value: {{ .Values.nfd.enableNodeFeatureApi | quote }}
{{- end }}
{{- if and (typeIs "string" .Values.imex.nodesConfigFile) (ne .Values.imex.nodesConfigFile "") }}
- name: GFD_IMEX_NODES_CONFIG
value: {{ .Values.imex.nodesConfigFile | quote }}
{{- end }}
{{- if $options.hasConfigMap }}
- name: CONFIG_FILE
value: /config/config.yaml
Expand All @@ -182,6 +186,12 @@ spec:
mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
- name: host-sys
mountPath: "/sys"
- name: imex-nodes-config
  # A space is required after "mountPath:"; without it YAML does not parse
  # the line as a key/value pair and the rendered manifest is invalid.
  {{- if typeIs "string" .Values.nvidiaDriverRoot }}
  mountPath: "{{ .Values.nvidiaDriverRoot }}/etc/nvidia-imex"
  {{- else }}
  mountPath: "/etc/nvidia-imex"
  {{- end }}
{{- if $options.hasConfigMap }}
- name: available-configs
mountPath: /available-configs
Expand All @@ -199,6 +209,14 @@ spec:
- name: host-sys
hostPath:
path: "/sys"
- name: imex-nodes-config
  hostPath:
    # "type" is a field of hostPath, not of the volume itself, and a space is
    # required after "path:" for the line to parse as a key/value pair.
    {{- if typeIs "string" .Values.nvidiaDriverRoot }}
    path: "{{ .Values.nvidiaDriverRoot }}/etc/nvidia-imex"
    {{- else }}
    path: "/etc/nvidia-imex"
    {{- end }}
    type: DirectoryOrCreate
{{- if $options.hasConfigMap }}
- name: available-configs
configMap:
Expand Down
2 changes: 2 additions & 0 deletions deployments/helm/nvidia-device-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ deviceIDStrategy: null
nvidiaDriverRoot: null
gdsEnabled: null
mofedEnabled: null
imex:
nodesConfigFile: null
deviceDiscoveryStrategy: null

nameOverride: ""
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.22.2

require (
github.com/NVIDIA/go-gpuallocator v0.5.0
github.com/NVIDIA/go-nvlib v0.6.1
github.com/NVIDIA/go-nvlib v0.6.2-0.20240928162840-41955a08425b
github.com/NVIDIA/go-nvml v0.12.4-0
github.com/NVIDIA/nvidia-container-toolkit v1.16.1
github.com/fsnotify/fsnotify v1.7.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
github.com/NVIDIA/go-gpuallocator v0.5.0 h1:166ICvPv2dU9oZ2J3kJ4y3XdbGCi6LhXgFZJtrqeu3A=
github.com/NVIDIA/go-gpuallocator v0.5.0/go.mod h1:zos5bTIN01hpQioOyu9oRKglrznImMQvm0bZllMmckw=
github.com/NVIDIA/go-nvlib v0.6.1 h1:0/5FvaKvDJoJeJ+LFlh+NDQMxMlVw9wOXrOVrGXttfE=
github.com/NVIDIA/go-nvlib v0.6.1/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY=
github.com/NVIDIA/go-nvlib v0.6.2-0.20240928162840-41955a08425b h1:k5ptZB9RGUaR5RcK0R8Cfa4mtTHrSZZ73BFyD3c6KvM=
github.com/NVIDIA/go-nvlib v0.6.2-0.20240928162840-41955a08425b/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY=
github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg=
github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
github.com/NVIDIA/nvidia-container-toolkit v1.16.1 h1:PkY6RqYD1wIt1izCvYZ7kr7IitxK8e9+k/prO6b3vD0=
Expand Down
113 changes: 113 additions & 0 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,20 @@
package lm

import (
"bufio"
"errors"
"fmt"
"math/rand" // nolint:gosec
"net"
"os"
"sort"
"strconv"
"strings"

"k8s.io/klog/v2"

"github.com/google/uuid"

"github.com/NVIDIA/go-nvlib/pkg/nvpci"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
Expand Down Expand Up @@ -80,13 +87,19 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return nil, fmt.Errorf("error creating resource labeler: %v", err)
}

imexLabeler, err := newImexDomainLabeler(*config.Flags.GFD.ImexNodesConfig, devices)
if err != nil {
return nil, fmt.Errorf("error creating imex domain labeler: %v", err)
}

l := Merge(
machineTypeLabeler,
versionLabeler,
migCapabilityLabeler,
sharingLabeler,
resourceLabeler,
gpuModeLabeler,
imexLabeler,
)

return l, nil
Expand Down Expand Up @@ -218,6 +231,88 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
return labels, nil
}

// newImexDomainLabeler builds the labels that describe the fabric membership
// of the node: the cluster UUID and clique ID shared by all fabric-attached
// devices, and an imex-domain identifier derived from the sorted list of IP
// addresses found in configFile.
//
// Empty labels (and a nil error) are returned when:
//   - configFile is the empty string or does not exist,
//   - no fabric-attached device reports a cluster UUID and clique ID,
//   - fabric-attached devices disagree on the cluster UUID or clique ID,
//   - configFile contains no IP addresses.
//
// An error is returned when a device query fails, when configFile cannot be
// opened or read, or when it contains a non-blank line that is not a valid IP
// address.
func newImexDomainLabeler(configFile string, devices []resource.Device) (Labeler, error) {
	if configFile == "" {
		// No imex config file, return empty labels
		return empty{}, nil
	}

	var commonClusterUUID string
	var commonCliqueID string
	for _, d := range devices {
		isFabricAttached, err := d.IsFabricAttached()
		if err != nil {
			return nil, fmt.Errorf("error checking imex capability: %w", err)
		}
		if !isFabricAttached {
			continue
		}

		clusterUUID, cliqueID, err := d.GetFabricIds()
		if err != nil {
			return nil, fmt.Errorf("error getting cluster UUID: %w", err)
		}
		if commonClusterUUID == "" {
			commonClusterUUID = clusterUUID
		}
		if commonClusterUUID != clusterUUID {
			klog.Warningf("Cluster UUIDs are different: %s != %s", commonClusterUUID, clusterUUID)
			// Return empty labels if cluster UUIDs are different
			return empty{}, nil
		}

		if commonCliqueID == "" {
			commonCliqueID = cliqueID
		}
		if commonCliqueID != cliqueID {
			klog.Warningf("Clique IDs are different: %s != %s", commonCliqueID, cliqueID)
			// Return empty labels if clique IDs are different
			return empty{}, nil
		}
	}

	if commonClusterUUID == "" || commonCliqueID == "" {
		// No fabric-attached device supplied fabric IDs. Publishing labels
		// with empty values (and an imex-domain ending in a bare "-") would
		// be misleading, so publish nothing instead.
		return empty{}, nil
	}

	// Read file and parse it
	imexConfig, err := os.Open(configFile)
	if errors.Is(err, os.ErrNotExist) {
		// No imex config file, return empty labels
		return empty{}, nil
	}
	if err != nil {
		return nil, fmt.Errorf("failed to open imex config file: %w", err)
	}
	defer imexConfig.Close()

	// Read the file line by line
	var ips []string
	scanner := bufio.NewScanner(imexConfig)
	for scanner.Scan() {
		ip := strings.TrimSpace(scanner.Text())
		if ip == "" {
			// Tolerate blank lines (e.g. a trailing empty line) instead of
			// rejecting the whole file as containing an invalid address.
			continue
		}
		if net.ParseIP(ip) == nil {
			return nil, fmt.Errorf("invalid IP address in imex config file: %s", ip)
		}
		ips = append(ips, ip)
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("failed to read imex config file: %w", err)
	}

	if len(ips) == 0 {
		// No IPs in the file, return empty labels
		return empty{}, nil
	}

	// Sort the IP addresses so the derived domain ID does not depend on the
	// order in which the nodes are listed in the file.
	sort.Strings(ips)

	labels := Labels{
		"nvidia.com/gpu.clusteruuid": commonClusterUUID,
		"nvidia.com/gpu.cliqueid":    commonCliqueID,
		"nvidia.com/gpu.imex-domain": generateUUID(strings.Join(ips, "\n")) + "-" + commonCliqueID,
	}

	return labels, nil
}

func getModeForClasses(classes []uint32) string {
if len(classes) == 0 {
return "unknown"
Expand Down Expand Up @@ -254,3 +349,21 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
}
return classes, nil
}

// generateUUID derives a UUID-formatted string from seed. A pseudo-random
// generator is seeded with hash(seed) and its first 16 bytes are handed to
// uuid.FromBytes, so equal seeds produce equal strings.
func generateUUID(seed string) string {
	// nolint:gosec
	rng := rand.New(rand.NewSource(hash(seed)))

	raw := make([]byte, 16)
	rng.Read(raw)
	// The error from FromBytes is discarded, as before; raw is always
	// exactly 16 bytes long.
	id, _ := uuid.FromBytes(raw)
	return id.String()
}

// hash folds the runes of s into an int64 using the polynomial
// sum = sum*31 + rune, starting from zero. The empty string hashes to 0.
func hash(s string) int64 {
	var sum int64
	for _, r := range s {
		sum = sum*31 + int64(r)
	}
	return sum
}
8 changes: 8 additions & 0 deletions internal/resource/cuda-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) {
return false, nil
}

// IsFabricAttached always reports false with a nil error for CUDA devices.
func (d *cudaDevice) IsFabricAttached() (bool, error) {
return false, nil
}

// GetPCIClass always returns 0 with a nil error for CUDA devices.
func (d *cudaDevice) GetPCIClass() (uint32, error) {
return 0, nil
}

// GetFabricIds is not supported for CUDA devices: it always returns two empty
// strings and a non-nil error.
func (d *cudaDevice) GetFabricIds() (string, string, error) {
return "", "", fmt.Errorf("GetFabricIds is not supported for CUDA devices")
}
Loading

0 comments on commit 1e9f62e

Please sign in to comment.