diff --git a/api/config/v1/flags.go b/api/config/v1/flags.go
index 2720379e2..60cc0d67c 100644
--- a/api/config/v1/flags.go
+++ b/api/config/v1/flags.go
@@ -107,6 +107,7 @@ type GFDCommandLineFlags struct {
 	NoTimestamp     *bool     `json:"noTimestamp"     yaml:"noTimestamp"`
 	SleepInterval   *Duration `json:"sleepInterval"   yaml:"sleepInterval"`
 	OutputFile      *string   `json:"outputFile"      yaml:"outputFile"`
+	ImexNodesConfig *string   `json:"imexNodesConfig" yaml:"imexNodesConfig"`
 	MachineTypeFile *string   `json:"machineTypeFile" yaml:"machineTypeFile"`
 }
 
@@ -162,6 +163,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
 				updateFromCLIFlag(&f.GFD.Oneshot, c, n)
 			case "output-file":
 				updateFromCLIFlag(&f.GFD.OutputFile, c, n)
+			case "imex-nodes-config":
+				updateFromCLIFlag(&f.GFD.ImexNodesConfig, c, n)
 			case "sleep-interval":
 				updateFromCLIFlag(&f.GFD.SleepInterval, c, n)
 			case "no-timestamp":
diff --git a/cmd/gpu-feature-discovery/main.go b/cmd/gpu-feature-discovery/main.go
index c824ffcc2..6427d6a57 100644
--- a/cmd/gpu-feature-discovery/main.go
+++ b/cmd/gpu-feature-discovery/main.go
@@ -86,6 +86,12 @@ func main() {
 			Value:   "/etc/kubernetes/node-feature-discovery/features.d/gfd",
 			EnvVars: []string{"GFD_OUTPUT_FILE"},
 		},
+		&cli.StringFlag{
+			Name:    "imex-nodes-config",
+			Usage:   "the path to nvidia-imex nodes config file",
+			Value:   "/etc/nvidia-imex/nodes_config.cfg",
+			EnvVars: []string{"GFD_IMEX_NODES_CONFIG"},
+		},
 		&cli.StringFlag{
 			Name:    "machine-type-file",
 			Value:   "/sys/class/dmi/id/product_name",
diff --git a/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml b/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml
index 940dcc902..218810a6f 100644
--- a/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml
+++ b/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml
@@ -163,6 +163,10 @@ spec:
           - name: GFD_USE_NODE_FEATURE_API
             value: {{ .Values.nfd.enableNodeFeatureApi | quote }}
         {{- end }}
+        {{- if .Values.imex.enabled }}
+          - name: GFD_IMEX_NODES_CONFIG
+            value: {{ .Values.imex.configFile | quote }}
+        {{- end }}
         {{- if $options.hasConfigMap }}
           - name: CONFIG_FILE
             value: /config/config.yaml
@@ -182,6 +186,10 @@ spec:
             mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
           - name: host-sys
             mountPath: "/sys"
+        {{- if .Values.imex.enabled }}
+          - name: imex-nodes-config
+            mountPath: {{ .Values.imex.configFile | quote }}
+        {{- end }}
         {{- if $options.hasConfigMap }}
           - name: available-configs
             mountPath: /available-configs
@@ -199,6 +207,11 @@ spec:
         - name: host-sys
           hostPath:
             path: "/sys"
+      {{- if .Values.imex.enabled }}
+        - name: imex-nodes-config
+          hostPath:
+            path: {{ .Values.imex.configFile | quote }}
+      {{- end }}
       {{- if $options.hasConfigMap }}
         - name: available-configs
           configMap:
diff --git a/deployments/helm/nvidia-device-plugin/values.yaml b/deployments/helm/nvidia-device-plugin/values.yaml
index b0c624295..4bbb4eeb4 100644
--- a/deployments/helm/nvidia-device-plugin/values.yaml
+++ b/deployments/helm/nvidia-device-plugin/values.yaml
@@ -35,6 +35,9 @@ deviceIDStrategy: null
 nvidiaDriverRoot: null
 gdsEnabled: null
 mofedEnabled: null
+imex:
+  enabled: false
+  configFile: "/etc/nvidia-imex/nodes_config.cfg"
 deviceDiscoveryStrategy: null
 
 nameOverride: ""
diff --git a/internal/lm/nvml.go b/internal/lm/nvml.go
index 0b5ed6e9a..2624b22c9 100644
--- a/internal/lm/nvml.go
+++ b/internal/lm/nvml.go
@@ -17,8 +17,12 @@
 package lm
 
 import (
+	"bufio"
 	"errors"
 	"fmt"
+	"math/rand"
+	"os"
+	"sort"
 	"strconv"
 	"strings"
 
@@ -28,6 +32,7 @@ import (
 
 	spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
 	"github.com/NVIDIA/k8s-device-plugin/internal/resource"
+	"github.com/google/uuid"
 )
 
 var errMPSSharingNotSupported = errors.New("MPS sharing is not supported")
@@ -80,6 +85,17 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
 		return nil, fmt.Errorf("error creating resource labeler: %v", err)
 	}
 
+	// The flag pointer may be unset; treat that like an empty path, for which
+	// newImexDomainLabeler returns an empty (but non-nil) labeler.
+	var imexConfigFile string
+	if config.Flags.GFD.ImexNodesConfig != nil {
+		imexConfigFile = *config.Flags.GFD.ImexNodesConfig
+	}
+	imexLabeler, err := newImexDomainLabeler(imexConfigFile)
+	if err != nil {
+		return nil, fmt.Errorf("error creating imex domain labeler: %v", err)
+	}
+
 	l := Merge(
 		machineTypeLabeler,
 		versionLabeler,
@@ -87,6 +103,7 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
 		sharingLabeler,
 		resourceLabeler,
 		gpuModeLabeler,
+		imexLabeler,
 	)
 
 	return l, nil
@@ -218,6 +235,57 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
 	return labels, nil
 }
 
+// newImexDomainLabeler creates a labeler that exposes the IMEX domain of this
+// node as the nvidia.com/gpu.imex-domain label. The domain ID is derived from
+// the sorted entries of the nvidia-imex nodes config file, so that all nodes
+// sharing the same set of entries generate the same label value regardless of
+// the order in which the entries are listed.
+// An empty path or a missing config file is not an error: no label is
+// generated in that case and an empty labeler is returned.
+func newImexDomainLabeler(configFile string) (Labeler, error) {
+	if configFile == "" {
+		return Labels{}, nil
+	}
+
+	imexConfig, err := os.Open(configFile)
+	if os.IsNotExist(err) {
+		// The flag has a default value, so the file is absent on every node
+		// without nvidia-imex; this must not make labeler creation fail.
+		return Labels{}, nil
+	}
+	if err != nil {
+		return nil, fmt.Errorf("failed to open imex config file: %v", err)
+	}
+	defer imexConfig.Close()
+
+	// Collect one entry per line, ignoring blank lines and surrounding
+	// whitespace so that formatting differences do not change the domain ID.
+	var ips []string
+	scanner := bufio.NewScanner(imexConfig)
+	for scanner.Scan() {
+		ip := strings.TrimSpace(scanner.Text())
+		if ip == "" {
+			continue
+		}
+		ips = append(ips, ip)
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, fmt.Errorf("failed to read imex config file: %v", err)
+	}
+	if len(ips) == 0 {
+		return Labels{}, nil
+	}
+
+	// Sort the entries to make the generated ID independent of their order.
+	sort.Strings(ips)
+
+	labels := Labels{
+		"nvidia.com/gpu.imex-domain": generateUUIDs(strings.Join(ips, "\n"), 1)[0],
+	}
+
+	return labels, nil
+}
+
 func getModeForClasses(classes []uint32) string {
 	if len(classes) == 0 {
 		return "unknown"
@@ -254,3 +322,32 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
 	}
 	return classes, nil
 }
+
+// generateUUIDs returns count UUID-formatted strings derived deterministically
+// from seed: the same seed always produces the same sequence of values.
+func generateUUIDs(seed string, count int) []string {
+	// A seeded (non-cryptographic) generator is used on purpose so that the
+	// result only depends on the seed.
+	rng := rand.New(rand.NewSource(hash(seed)))
+
+	uuids := make([]string, count)
+	for i := 0; i < count; i++ {
+		charset := make([]byte, 16)
+		rng.Read(charset)
+		// FromBytes only fails for slices that are not exactly 16 bytes long.
+		id, _ := uuid.FromBytes(charset)
+		uuids[i] = id.String()
+	}
+
+	return uuids
+}
+
+// hash folds the runes of s into a 64-bit value used to seed the generator.
+// It is a simple polynomial hash (base 31) that wraps on overflow.
+func hash(s string) int64 {
+	h := int64(0)
+	for _, c := range s {
+		h = 31*h + int64(c)
+	}
+	return h
+}