Skip to content

Commit

Permalink
Enable labels for ClusterUUID and CliqueId
Browse files Browse the repository at this point in the history
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
  • Loading branch information
ArangoGutierrez committed Sep 25, 2024
1 parent 71c1fa7 commit 53ca695
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 0 deletions.
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ type GFDCommandLineFlags struct {
NoTimestamp *bool `json:"noTimestamp" yaml:"noTimestamp"`
SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"`
OutputFile *string `json:"outputFile" yaml:"outputFile"`
ImexNodesConfig *string `json:"imexNodesConfig" yaml:"imexNodesConfig"`
MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"`
}

Expand Down Expand Up @@ -162,6 +163,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.GFD.Oneshot, c, n)
case "output-file":
updateFromCLIFlag(&f.GFD.OutputFile, c, n)
case "imex-nodes-config":
updateFromCLIFlag(&f.GFD.ImexNodesConfig, c, n)
case "sleep-interval":
updateFromCLIFlag(&f.GFD.SleepInterval, c, n)
case "no-timestamp":
Expand Down
7 changes: 7 additions & 0 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,13 @@ func main() {
Value: "/etc/kubernetes/node-feature-discovery/features.d/gfd",
EnvVars: []string{"GFD_OUTPUT_FILE"},
},
&cli.StringFlag{
Name: "imex-nopdes-config",
Aliases: []string{"imex-nodes-config"},
Usage: "the path to nvidia-imex nodes config file",
Value: "/etc/nvidia-imex/nodes_config.cfg",
EnvVars: []string{"GFD_IMEX_NODES_CONFIG"},
},
&cli.StringFlag{
Name: "machine-type-file",
Value: "/sys/class/dmi/id/product_name",
Expand Down
13 changes: 13 additions & 0 deletions deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ spec:
- name: GFD_USE_NODE_FEATURE_API
value: {{ .Values.nfd.enableNodeFeatureApi | quote }}
{{- end }}
{{- if .Values.imex.enabled }}
- name: GFD_IMEX_NODES_CONFIG
value: {{ .Values.imex.configFile | quote }}
{{- end }}
{{- if $options.hasConfigMap }}
- name: CONFIG_FILE
value: /config/config.yaml
Expand All @@ -182,6 +186,10 @@ spec:
mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
- name: host-sys
mountPath: "/sys"
{{- if .Values.imex.enabled }}
- name: imex-nodes-config
mountPath: {{ .Values.imex.configFile | quote }}
{{- end }}
{{- if $options.hasConfigMap }}
- name: available-configs
mountPath: /available-configs
Expand All @@ -199,6 +207,11 @@ spec:
- name: host-sys
hostPath:
path: "/sys"
{{- if .Values.imex.enabled }}
- name: imex-nodes-config
hostPath:
path: {{ .Values.imex.configFile | quote }}
{{- end }}
{{- if $options.hasConfigMap }}
- name: available-configs
configMap:
Expand Down
3 changes: 3 additions & 0 deletions deployments/helm/nvidia-device-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ deviceIDStrategy: null
nvidiaDriverRoot: null
gdsEnabled: null
mofedEnabled: null
imex:
enabled: false
configFile: "/etc/nvidia-imex/nodes_config.cfg"
deviceDiscoveryStrategy: null

nameOverride: ""
Expand Down
71 changes: 71 additions & 0 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@
package lm

import (
"bufio"
"errors"
"fmt"
"math/rand"
"os"
"sort"
"strconv"
"strings"

Expand All @@ -28,6 +32,7 @@ import (

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
"github.com/google/uuid"
)

var errMPSSharingNotSupported = errors.New("MPS sharing is not supported")
Expand Down Expand Up @@ -80,13 +85,22 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return nil, fmt.Errorf("error creating resource labeler: %v", err)
}

var imexLabeler Labeler
if *config.Flags.GFD.ImexNodesConfig != "" {
imexLabeler, err = newImexDomainLabeler(*config.Flags.GFD.ImexNodesConfig)
if err != nil {
return nil, fmt.Errorf("error creating imex domain labeler: %v", err)
}
}

l := Merge(
machineTypeLabeler,
versionLabeler,
migCapabilityLabeler,
sharingLabeler,
resourceLabeler,
gpuModeLabeler,
imexLabeler,
)

return l, nil
Expand Down Expand Up @@ -218,6 +232,41 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
return labels, nil
}

// newImexDomainLabeler builds a labeler that publishes an identifier for the
// IMEX domain under the "nvidia.com/gpu.imex-domain" label.
//
// configFile is the path to the nvidia-imex nodes config file, which is read
// as one entry per line. Each entry is trimmed of surrounding whitespace,
// blank lines are dropped, and the remaining entries are sorted before being
// hashed, so the identifier depends only on the set of entries and not on
// their order, trailing newlines, or line-ending style.
//
// A config file that does not exist, or that contains no entries, yields an
// empty label set and a nil error. Any other failure to open or read the file
// is returned wrapped.
func newImexDomainLabeler(configFile string) (Labeler, error) {
	imexConfig, err := os.Open(configFile)
	if errors.Is(err, os.ErrNotExist) {
		// The path may come from a non-empty default rather than from the
		// user, so its absence means "no IMEX on this node", not a failure.
		return Labels{}, nil
	}
	if err != nil {
		return nil, fmt.Errorf("failed to read imex config file: %w", err)
	}
	defer imexConfig.Close()

	var ips []string
	scanner := bufio.NewScanner(imexConfig)
	for scanner.Scan() {
		// Normalize each line so formatting noise (CRLF, indentation, blank
		// lines) cannot produce a different identifier for the same nodes.
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		ips = append(ips, line)
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("failed to read imex config file: %w", err)
	}

	// Without any entries there is no domain to describe.
	if len(ips) == 0 {
		return Labels{}, nil
	}

	// Sorting makes the identifier independent of the order of the entries.
	sort.Strings(ips)
	domainID := generateUUIDs(strings.Join(ips, "\n"), 1)[0]

	return Labels{
		"nvidia.com/gpu.imex-domain": domainID,
	}, nil
}

func getModeForClasses(classes []uint32) string {
if len(classes) == 0 {
return "unknown"
Expand Down Expand Up @@ -254,3 +303,25 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
}
return classes, nil
}

func generateUUIDs(seed string, count int) []string {
rand := rand.New(rand.NewSource(hash(seed)))

Check failure on line 308 in internal/lm/nvml.go

View workflow job for this annotation

GitHub Actions / check

G404: Use of weak random number generator (math/rand or math/rand/v2 instead of crypto/rand) (gosec)

uuids := make([]string, count)
for i := 0; i < count; i++ {
charset := make([]byte, 16)
rand.Read(charset)
uuid, _ := uuid.FromBytes(charset)
uuids[i] = uuid.String()
}

return uuids
}

// hash reduces s to an int64 by folding its runes, in order, with the
// recurrence acc = 31*acc + rune, starting from zero. The arithmetic wraps on
// int64 overflow. The empty string hashes to 0.
func hash(s string) int64 {
	runes := []rune(s)

	var acc int64
	for i := 0; i < len(runes); i++ {
		acc = acc*31 + int64(runes[i])
	}
	return acc
}

0 comments on commit 53ca695

Please sign in to comment.