Skip to content

Commit

Permalink
add filter device when register node by uuid or index
Browse files Browse the repository at this point in the history
Signed-off-by: rongfu.leng <[email protected]>
  • Loading branch information
lengrongfu authored and wawa0210 committed Sep 24, 2024
1 parent f2ea682 commit 45051ef
Show file tree
Hide file tree
Showing 8 changed files with 207 additions and 9 deletions.
6 changes: 5 additions & 1 deletion charts/hami/templates/device-plugin/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@ data:
"name": "m5-cloudinfra-online02",
"devicememoryscaling": 1.8,
"devicesplitcount": 10,
"migstrategy":"none"
"migstrategy":"none",
"filterdevices": {
"uuid": [],
"index": []
}
}
]
}
3 changes: 2 additions & 1 deletion cmd/device-plugin/nvidia/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,8 @@ func startPlugins(c *cli.Context, flags []cli.Flag, restarting bool) ([]plugin.I
//fmt.Println("NodeName=", config.NodeName)
devConfig, err := generateDeviceConfigFromNvidia(config, c, flags)
if err != nil {
fmt.Printf("failed to load config file %s", err.Error())
klog.Errorf("failed to load config file %s", err.Error())
return nil, false, err
}

// Update the configuration file with default resources.
Expand Down
10 changes: 8 additions & 2 deletions cmd/device-plugin/nvidia/vgpucfg.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ func readFromConfigFile() error {
}
klog.Infof("Device Plugin Configs: %v", fmt.Sprintf("%v", deviceConfigs))
for _, val := range deviceConfigs.Nodeconfig {
if strings.Compare(os.Getenv(util.NodeNameEnvName), val.Name) == 0 {
if os.Getenv(util.NodeNameEnvName) == val.Name {
klog.Infof("Reading config from file %s", val.Name)
if val.Devicememoryscaling > 0 {
*util.DeviceMemoryScaling = val.Devicememoryscaling
Expand All @@ -118,6 +118,10 @@ func readFromConfigFile() error {
if val.Devicesplitcount > 0 {
*util.DeviceSplitCount = val.Devicesplitcount
}
if val.FilterDevice != nil && (len(val.FilterDevice.UUID) > 0 || len(val.FilterDevice.Index) > 0) {
util.DevicePluginFilterDevice = val.FilterDevice
}
klog.Infof("FilterDevice: %v", val.FilterDevice)
}
}
return nil
Expand Down Expand Up @@ -149,6 +153,8 @@ func generateDeviceConfigFromNvidia(cfg *spec.Config, c *cli.Context, flags []cl
}
}
}
readFromConfigFile()
if err := readFromConfigFile(); err != nil {
return devcfg, err
}
return devcfg, nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ func parseNvidiaNumaInfo(idx int, nvidiaTopoStr string) (int, error) {

func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*api.DeviceInfo {
devs := plugin.Devices()
klog.V(5).InfoS("getAPIDevices", "devices", devs)
nvml.Init()
res := make([]*api.DeviceInfo, 0, len(devs))
idx := 0
Expand Down
7 changes: 7 additions & 0 deletions pkg/device-plugin/nvidiadevice/nvinternal/rm/nvml_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,13 @@ func NewNVMLResourceManagers(nvmllib nvml.Interface, config *util.DeviceConfig)
if len(devices) == 0 {
continue
}
for key, value := range devices {
if util.FilterDeviceToRegister(value.ID, value.Index) {
klog.V(5).InfoS("Filtering device", "device", value.ID)
delete(devices, key)
continue
}
}
r := &nvmlResourceManager{
resourceManager: resourceManager{
config: config,
Expand Down
21 changes: 16 additions & 5 deletions pkg/util/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,21 @@ const (
NodeNameEnvName = "NODE_NAME"
)

// FilterDevice describes the devices that should be left out when a node
// registers its devices. A device can be selected by UUID or by index.
type FilterDevice struct {
	// UUID holds the UUIDs of the devices to filter out.
	UUID []string `json:"uuid"`
	// Index holds the indices of the devices to filter out.
	Index []uint `json:"index"`
}

type DevicePluginConfigs struct {
Nodeconfig []struct {
Name string `json:"name"`
Devicememoryscaling float64 `json:"devicememoryscaling"`
Devicecorescaling float64 `json:"devicecorescaling"`
Devicesplitcount uint `json:"devicesplitcount"`
Migstrategy string `json:"migstrategy"`
Name string `json:"name"`
Devicememoryscaling float64 `json:"devicememoryscaling"`
Devicecorescaling float64 `json:"devicecorescaling"`
Devicesplitcount uint `json:"devicesplitcount"`
Migstrategy string `json:"migstrategy"`
FilterDevice *FilterDevice `json:"filterdevices"`
} `json:"nodeconfig"`
}

Expand All @@ -74,6 +82,9 @@ var (
NodeName string
RuntimeSocketFlag string
DisableCoreLimit *bool

// DevicePluginFilterDevice lists the devices (by UUID or index) that the device plugin must filter out and not register.
DevicePluginFilterDevice *FilterDevice
)

// type ContainerDevices struct {
Expand Down
29 changes: 29 additions & 0 deletions pkg/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,3 +415,32 @@ func MarkAnnotationsToDelete(devType string, nn string) error {
}
return PatchNodeAnnotations(n, tmppat)
}

// FilterDeviceToRegister reports whether the device identified by uuid or
// indexStr is listed in DevicePluginFilterDevice.
//
// uuid is compared against DevicePluginFilterDevice.UUID. indexStr is parsed
// as a decimal integer and compared against DevicePluginFilterDevice.Index.
// An empty uuid or indexStr skips the corresponding check. The result is
// false when no filter is configured, when neither value matches, or when
// indexStr is not a valid non-negative integer.
func FilterDeviceToRegister(uuid, indexStr string) bool {
	filter := DevicePluginFilterDevice
	if filter == nil || (len(filter.UUID) == 0 && len(filter.Index) == 0) {
		return false
	}
	if uuid != "" {
		for _, u := range filter.UUID {
			if u == uuid {
				return true
			}
		}
	}
	if indexStr == "" {
		return false
	}
	index, err := strconv.Atoi(indexStr)
	if err != nil {
		klog.Errorf("Error converting index to int: %v", err)
		return false
	}
	if index < 0 {
		// A negative value would wrap around when converted to uint and
		// could spuriously match a very large configured index.
		return false
	}
	for _, i := range filter.Index {
		if i == uint(index) {
			return true
		}
	}
	return false
}
139 changes: 139 additions & 0 deletions pkg/util/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,3 +282,142 @@ func TestUnMarshalNodeDevices(t *testing.T) {
})
}
}

func Test_FilterDeviceToRegister(t *testing.T) {
tests := []struct {
name string
args struct {
uuid string
idx string
*FilterDevice
}
want bool
}{
{
name: "filter is nil",
args: struct {
uuid string
idx string
*FilterDevice
}{
uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
idx: "0",
FilterDevice: nil,
},
want: false,
},
{
name: "uuid is empty",
args: struct {
uuid string
idx string
*FilterDevice
}{
uuid: "",
idx: "0",
FilterDevice: &FilterDevice{
UUID: []string{"GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76"},
},
},
want: false,
},
{
name: "uuid is not in filter",
args: struct {
uuid string
idx string
*FilterDevice
}{
uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
idx: "0",
FilterDevice: &FilterDevice{
UUID: []string{"GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b77"},
},
},
want: false,
},
{
name: "uuid is in filter",
args: struct {
uuid string
idx string
*FilterDevice
}{
uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
idx: "0",
FilterDevice: &FilterDevice{
UUID: []string{"GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76"},
},
},
want: true,
},
{
name: "idx is empty",
args: struct {
uuid string
idx string
*FilterDevice
}{
uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
idx: "",
FilterDevice: &FilterDevice{
Index: []uint{0},
},
},
want: false,
},
{
name: "idx is not in filter",
args: struct {
uuid string
idx string
*FilterDevice
}{
uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
idx: "0",
FilterDevice: &FilterDevice{
Index: []uint{1},
},
},
want: false,
},
{
name: "idx is in filter",
args: struct {
uuid string
idx string
*FilterDevice
}{
uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
idx: "0",
FilterDevice: &FilterDevice{
Index: []uint{0},
},
},
want: true,
},
{
name: "idx is invalid",
args: struct {
uuid string
idx string
*FilterDevice
}{
uuid: "GPU-8dcd427f-483b-b48f-d7e5-75fb19a52b76",
idx: "a",
FilterDevice: &FilterDevice{
Index: []uint{0},
},
},
want: false,
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
DevicePluginFilterDevice = test.args.FilterDevice
got := FilterDeviceToRegister(test.args.uuid, test.args.idx)
assert.DeepEqual(t, test.want, got)
})
}
}

0 comments on commit 45051ef

Please sign in to comment.