From 45051efeb67a6da4f34c233d3aad4ff7d0c8c0de Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Sat, 14 Sep 2024 11:10:26 +0800 Subject: [PATCH] add filter device when register node by uuid or index Signed-off-by: rongfu.leng --- .../templates/device-plugin/configmap.yaml | 6 +- cmd/device-plugin/nvidia/main.go | 3 +- cmd/device-plugin/nvidia/vgpucfg.go | 10 +- .../nvinternal/plugin/register.go | 1 + .../nvinternal/rm/nvml_manager.go | 7 + pkg/util/types.go | 21 ++- pkg/util/util.go | 29 ++++ pkg/util/util_test.go | 139 ++++++++++++++++++ 8 files changed, 207 insertions(+), 9 deletions(-) diff --git a/charts/hami/templates/device-plugin/configmap.yaml b/charts/hami/templates/device-plugin/configmap.yaml index 302252055..6d9055b71 100644 --- a/charts/hami/templates/device-plugin/configmap.yaml +++ b/charts/hami/templates/device-plugin/configmap.yaml @@ -13,7 +13,11 @@ data: "name": "m5-cloudinfra-online02", "devicememoryscaling": 1.8, "devicesplitcount": 10, - "migstrategy":"none" + "migstrategy":"none", + "filterdevices": { + "uuid": [], + "index": [] + } } ] } \ No newline at end of file diff --git a/cmd/device-plugin/nvidia/main.go b/cmd/device-plugin/nvidia/main.go index ab8e171f8..342b5b8fb 100644 --- a/cmd/device-plugin/nvidia/main.go +++ b/cmd/device-plugin/nvidia/main.go @@ -250,7 +250,8 @@ func startPlugins(c *cli.Context, flags []cli.Flag, restarting bool) ([]plugin.I //fmt.Println("NodeName=", config.NodeName) devConfig, err := generateDeviceConfigFromNvidia(config, c, flags) if err != nil { - fmt.Printf("failed to load config file %s", err.Error()) + klog.Errorf("failed to load config file %s", err.Error()) + return nil, false, err } // Update the configuration file with default resources. 
diff --git a/cmd/device-plugin/nvidia/vgpucfg.go b/cmd/device-plugin/nvidia/vgpucfg.go index 5ac45cc96..91b4f309f 100644 --- a/cmd/device-plugin/nvidia/vgpucfg.go +++ b/cmd/device-plugin/nvidia/vgpucfg.go @@ -107,7 +107,7 @@ func readFromConfigFile() error { } klog.Infof("Device Plugin Configs: %v", fmt.Sprintf("%v", deviceConfigs)) for _, val := range deviceConfigs.Nodeconfig { - if strings.Compare(os.Getenv(util.NodeNameEnvName), val.Name) == 0 { + if os.Getenv(util.NodeNameEnvName) == val.Name { klog.Infof("Reading config from file %s", val.Name) if val.Devicememoryscaling > 0 { *util.DeviceMemoryScaling = val.Devicememoryscaling @@ -118,6 +118,10 @@ func readFromConfigFile() error { if val.Devicesplitcount > 0 { *util.DeviceSplitCount = val.Devicesplitcount } + if val.FilterDevice != nil && (len(val.FilterDevice.UUID) > 0 || len(val.FilterDevice.Index) > 0) { + util.DevicePluginFilterDevice = val.FilterDevice + } + klog.Infof("FilterDevice: %v", val.FilterDevice) } } return nil @@ -149,6 +153,8 @@ func generateDeviceConfigFromNvidia(cfg *spec.Config, c *cli.Context, flags []cl } } } - readFromConfigFile() + if err := readFromConfigFile(); err != nil { + return devcfg, err + } return devcfg, nil } diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go index b7dafc491..5f14523b0 100644 --- a/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go +++ b/pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go @@ -94,6 +94,7 @@ func parseNvidiaNumaInfo(idx int, nvidiaTopoStr string) (int, error) { func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*api.DeviceInfo { devs := plugin.Devices() + klog.V(5).InfoS("getAPIDevices", "devices", devs) nvml.Init() res := make([]*api.DeviceInfo, 0, len(devs)) idx := 0 diff --git a/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go b/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go index 18977ad14..1e1fb659f 100644 
--- a/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go +++ b/pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go @@ -55,6 +55,13 @@ func NewNVMLResourceManagers(nvmllib nvml.Interface, config *util.DeviceConfig) if len(devices) == 0 { continue } + for key, value := range devices { + if util.FilterDeviceToRegister(value.ID, value.Index) { + klog.V(5).InfoS("Filtering device", "device", value.ID) + delete(devices, key) + continue + } + } r := &nvmlResourceManager{ resourceManager: resourceManager{ config: config, diff --git a/pkg/util/types.go b/pkg/util/types.go index ce0c23a9a..6361ddfc4 100644 --- a/pkg/util/types.go +++ b/pkg/util/types.go @@ -48,13 +48,21 @@ const ( NodeNameEnvName = "NODE_NAME" ) +type FilterDevice struct { + // UUID lists the UUIDs of the devices to filter out. + UUID []string `json:"uuid"` + // Index lists the indices of the devices to filter out. + Index []uint `json:"index"` +} + type DevicePluginConfigs struct { Nodeconfig []struct { - Name string `json:"name"` - Devicememoryscaling float64 `json:"devicememoryscaling"` - Devicecorescaling float64 `json:"devicecorescaling"` - Devicesplitcount uint `json:"devicesplitcount"` - Migstrategy string `json:"migstrategy"` + Name string `json:"name"` + Devicememoryscaling float64 `json:"devicememoryscaling"` + Devicecorescaling float64 `json:"devicecorescaling"` + Devicesplitcount uint `json:"devicesplitcount"` + Migstrategy string `json:"migstrategy"` + FilterDevice *FilterDevice `json:"filterdevices"` } `json:"nodeconfig"` } @@ -74,6 +82,9 @@ var ( NodeName string RuntimeSocketFlag string DisableCoreLimit *bool + + // DevicePluginFilterDevice holds the devices that the device-plugin must filter out instead of registering them. 
+	DevicePluginFilterDevice *FilterDevice
 )
 
 // type ContainerDevices struct {
diff --git a/pkg/util/util.go b/pkg/util/util.go
index 61e5ba9d1..8ce47e850 100644
--- a/pkg/util/util.go
+++ b/pkg/util/util.go
@@ -415,3 +415,32 @@ func MarkAnnotationsToDelete(devType string, nn string) error {
 	}
 	return PatchNodeAnnotations(n, tmppat)
 }
+
+// FilterDeviceToRegister reports whether the device with the given uuid or index is
+// listed in DevicePluginFilterDevice; a malformed or negative index never matches.
+func FilterDeviceToRegister(uuid, indexStr string) bool {
+	if DevicePluginFilterDevice == nil {
+		return false
+	}
+	if uuid != "" {
+		for _, u := range DevicePluginFilterDevice.UUID {
+			if u == uuid {
+				return true
+			}
+		}
+	}
+	if indexStr == "" || len(DevicePluginFilterDevice.Index) == 0 {
+		return false
+	}
+	index, err := strconv.ParseUint(indexStr, 10, 0)
+	if err != nil {
+		klog.Errorf("Error converting index to uint: %v", err)
+		return false
+	}
+	for _, i := range DevicePluginFilterDevice.Index {
+		if uint64(i) == index {
+			return true
+		}
+	}
+	return false
+}
diff --git a/pkg/util/util_test.go b/pkg/util/util_test.go
index 0c4b68090..08aec4f0c 100644
--- a/pkg/util/util_test.go
+++ b/pkg/util/util_test.go
@@ -282,3 +282,142 @@ func TestUnMarshalNodeDevices(t *testing.T) {
 		})
 	}
 }
+
+func Test_FilterDeviceToRegister(t *testing.T) {
+	tests := []struct {
+		name string
+		args struct {
+			uuid string
+			idx  string
+			*FilterDevice
+		}
+		want bool
+	}{
+		{
+			name: "filter is nil",
+			args: struct {
+				uuid string
+				idx  string
+				*FilterDevice
+			}{
+				uuid:         "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
+				idx:          "0",
+				FilterDevice: nil,
+			},
+			want: false,
+		},
+		{
+			name: "uuid is empty",
+			args: struct {
+				uuid string
+				idx  string
+				*FilterDevice
+			}{
+				uuid: "",
+				idx:  "0",
+				FilterDevice: &FilterDevice{
+					UUID: []string{"GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76"},
+				},
+			},
+			want: false,
+		},
+		{
+			name: "uuid is not in filter",
+			args: struct {
+				uuid string
+				idx  string
+				*FilterDevice
+			}{
+				uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
+				idx:  "0",
+				FilterDevice: &FilterDevice{
+					UUID: []string{"GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b77"},
+				},
+			},
+			want: false,
+		},
+		{
+			name: "uuid is in filter",
+			args: struct {
+				uuid string
+				idx  string
+				*FilterDevice
+			}{
+				uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
+				idx:  "0",
+				FilterDevice: &FilterDevice{
+					UUID: []string{"GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76"},
+				},
+			},
+			want: true,
+		},
+		{
+			name: "idx is empty",
+			args: struct {
+				uuid string
+				idx  string
+				*FilterDevice
+			}{
+				uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
+				idx:  "",
+				FilterDevice: &FilterDevice{
+					Index: []uint{0},
+				},
+			},
+			want: false,
+		},
+		{
+			name: "idx is not in filter",
+			args: struct {
+				uuid string
+				idx  string
+				*FilterDevice
+			}{
+				uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
+				idx:  "0",
+				FilterDevice: &FilterDevice{
+					Index: []uint{1},
+				},
+			},
+			want: false,
+		},
+		{
+			name: "idx is in filter",
+			args: struct {
+				uuid string
+				idx  string
+				*FilterDevice
+			}{
+				uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
+				idx:  "0",
+				FilterDevice: &FilterDevice{
+					Index: []uint{0},
+				},
+			},
+			want: true,
+		},
+		{
+			name: "idx is invalid",
+			args: struct {
+				uuid string
+				idx  string
+				*FilterDevice
+			}{
+				uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
+				idx:  "a",
+				FilterDevice: &FilterDevice{
+					Index: []uint{0},
+				},
+			},
+			want: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			DevicePluginFilterDevice = test.args.FilterDevice
+			got := FilterDeviceToRegister(test.args.uuid, test.args.idx)
+			assert.DeepEqual(t, test.want, got)
+		})
+	}
+}