From 5b68fe2035878864faab7b8f7a57d1f5030b7c09 Mon Sep 17 00:00:00 2001 From: qmloong Date: Wed, 28 Jul 2021 15:39:04 +0800 Subject: [PATCH 1/4] fix #39: get pods from kubelet client rather than list cluster-scope pods from apiserver (#41) * fix: add .gitignore * fix: get pods from kubelet client rather than list cluster-scope pods from apiserver * feat: add query-kubelet flag for control the mothod of query pending podLists Co-authored-by: menglong.qi --- .gitignore | 1 + cmd/nvidia/main.go | 50 +++++++++-- cmd/podgetter/main.go | 57 +++++++++++++ device-plugin-rbac.yaml | 1 + pkg/gpu/nvidia/allocate.go | 10 ++- pkg/gpu/nvidia/gpumanager.go | 17 ++-- pkg/gpu/nvidia/podmanager.go | 120 +++++++++++++++++++------- pkg/gpu/nvidia/server.go | 7 +- pkg/kubelet/client/client.go | 134 ++++++++++++++++++++++++++++++ pkg/kubelet/client/client_test.go | 57 +++++++++++++ 10 files changed, 408 insertions(+), 46 deletions(-) create mode 100644 .gitignore create mode 100644 cmd/podgetter/main.go create mode 100644 pkg/kubelet/client/client.go create mode 100644 pkg/kubelet/client/client_test.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..723ef36f --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea \ No newline at end of file diff --git a/cmd/nvidia/main.go b/cmd/nvidia/main.go index 05747a7d..3cadbed2 100644 --- a/cmd/nvidia/main.go +++ b/cmd/nvidia/main.go @@ -2,21 +2,61 @@ package main import ( "flag" - + "fmt" "github.com/AliyunContainerService/gpushare-device-plugin/pkg/gpu/nvidia" + "github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client" log "github.com/golang/glog" + "io/ioutil" + "k8s.io/client-go/rest" + "time" ) var ( - mps = flag.Bool("mps", false, "Enable or Disable MPS") - healthCheck = flag.Bool("health-check", false, "Enable or disable Health check") - memoryUnit = flag.String("memory-unit", "GiB", "Set memoryUnit of the GPU Memroy, support 'GiB' and 'MiB'") + mps = flag.Bool("mps", false, "Enable or Disable MPS") + healthCheck = flag.Bool("health-check", false, "Enable or disable Health check") + memoryUnit = flag.String("memory-unit", "GiB", "Set memoryUnit of the GPU Memroy, support 'GiB' and 'MiB'") + queryFromKubelet = flag.Bool("query-kubelet", true, "Query pending pods from kubelet instead of kube-apiserver") + kubeletAddress = flag.String("kubelet-address", "0.0.0.0", "Kubelet IP Address") + kubeletPort = flag.Uint("kubelet-port", 10250, "Kubelet listened Port") + clientCert = flag.String("client-cert", "", "Kubelet TLS client certificate") + clientKey = flag.String("client-key", "", "Kubelet TLS client key") + token = flag.String("token", "", "Kubelet client bearer token") + timeout = flag.Int("timeout", 10, "Kubelet client http timeout duration") ) +func buildKubeletClient() *client.KubeletClient { + if *clientCert == "" && *clientKey == "" && *token == "" { + tokenByte, err := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token") + if err != nil { + panic(fmt.Errorf("in cluster mode, find token failed, error: %v", err)) + } + tokenStr := string(tokenByte) + token = &tokenStr + } + kubeletClient, err := client.NewKubeletClient(&client.KubeletClientConfig{ + Address: *kubeletAddress, + Port: *kubeletPort, + TLSClientConfig: rest.TLSClientConfig{ + Insecure: true, + ServerName: "gpushare-device-plugin", + CertFile: *clientCert, + KeyFile: *clientKey, + }, + BearerToken: *token, + HTTPTimeout: time.Duration(*timeout) * time.Second, + }) + if err != nil { + panic(err) + } + return kubeletClient +} + func main() { 
flag.Parse() log.V(1).Infoln("Start gpushare device plugin") - ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, translatememoryUnits(*memoryUnit)) + + kubeletClient := buildKubeletClient() + ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, *queryFromKubelet, translatememoryUnits(*memoryUnit), kubeletClient) err := ngm.Run() if err != nil { log.Fatalf("Failed due to %v", err) diff --git a/cmd/podgetter/main.go b/cmd/podgetter/main.go new file mode 100644 index 00000000..7bcababa --- /dev/null +++ b/cmd/podgetter/main.go @@ -0,0 +1,57 @@ +package main + +import ( + "flag" + "fmt" + "github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client" + "io/ioutil" + "k8s.io/client-go/rest" + "time" +) + +var ( + clientCert string + clientKey string + token string + timeout int +) + +func main() { + flag.StringVar(&clientCert, "client-cert", "", "") + flag.StringVar(&clientKey, "client-key", "", "") + flag.StringVar(&token, "token", "", "") + flag.IntVar(&timeout, "timeout", 10, "") + + flag.Parse() + + if clientCert == "" && clientKey == "" && token == "" { + tokenByte, err := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token") + if err != nil { + panic(fmt.Errorf("in cluster mode, find token failed, error: %v", err)) + } + token = string(tokenByte) + } + + c, err := client.NewKubeletClient(&client.KubeletClientConfig{ + Address: "127.0.0.1", + Port: 10250, + TLSClientConfig: rest.TLSClientConfig{ + Insecure: true, + ServerName: "kubelet", + CertFile: clientCert, + KeyFile: clientKey, + }, + BearerToken: token, + HTTPTimeout: time.Duration(timeout) * time.Second, + }) + if err != nil { + fmt.Println(err) + return + } + podsList, err := c.GetNodeRunningPods() + if err != nil { + fmt.Println(err) + return + } + fmt.Println(podsList) +} diff --git a/device-plugin-rbac.yaml b/device-plugin-rbac.yaml index 794611f6..edff811a 100644 --- a/device-plugin-rbac.yaml +++ b/device-plugin-rbac.yaml @@ -9,6 +9,7 @@ rules: - "" resources: - nodes + - nodes/proxy verbs: - get - list diff --git a/pkg/gpu/nvidia/allocate.go b/pkg/gpu/nvidia/allocate.go index 25a82c43..57cf665a 100644 --- a/pkg/gpu/nvidia/allocate.go +++ b/pkg/gpu/nvidia/allocate.go @@ -59,7 +59,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, m.Lock() defer m.Unlock() log.Infoln("checking...") - pods, err := getCandidatePods() + pods, err := getCandidatePods(m.queryKubelet, m.kubeletClient) if err != nil { log.Infof("invalid allocation requst: Failed to find candidate pods due to %v", err) return buildErrResponse(reqs, podReqGPU), nil @@ -155,8 +155,12 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, return buildErrResponse(reqs, podReqGPU), nil } - log.Infof("new allocated GPUs info %v", &responses) - log.Infoln("----Allocating GPU for gpu mem is ended----") + podName := "" + if assumePod != nil { + podName = assumePod.Name + } + log.Infof("pod %v, new allocated GPUs info %v", podName, &responses) + log.Infof("----Allocating GPU for gpu mem for %v is ended----", podName) // // Add this to make sure the container is created at least // currentTime := time.Now() diff --git a/pkg/gpu/nvidia/gpumanager.go b/pkg/gpu/nvidia/gpumanager.go index 5f2d78b6..0d76817a 100644 --- a/pkg/gpu/nvidia/gpumanager.go +++ b/pkg/gpu/nvidia/gpumanager.go @@ -2,6 +2,7 @@ package nvidia import ( "fmt" + "github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client" "syscall" "time" @@ -12,15 +13,19 @@ import ( ) type sharedGPUManager struct { - enableMPS bool - healthCheck bool + 
enableMPS bool + healthCheck bool + queryKubelet bool + kubeletClient *client.KubeletClient } -func NewSharedGPUManager(enableMPS, healthCheck bool, bp MemoryUnit) *sharedGPUManager { +func NewSharedGPUManager(enableMPS, healthCheck, queryKubelet bool, bp MemoryUnit, client *client.KubeletClient) *sharedGPUManager { metric = bp return &sharedGPUManager{ - enableMPS: enableMPS, - healthCheck: healthCheck, + enableMPS: enableMPS, + healthCheck: healthCheck, + queryKubelet: queryKubelet, + kubeletClient: client, } } @@ -61,7 +66,7 @@ L: devicePlugin.Stop() } - devicePlugin, err = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck) + devicePlugin, err = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck, ngm.queryKubelet, ngm.kubeletClient) if err != nil { log.Warningf("Failed to get device plugin due to %v", err) } else if err = devicePlugin.Serve(); err != nil { diff --git a/pkg/gpu/nvidia/podmanager.go b/pkg/gpu/nvidia/podmanager.go index 57547753..fa132dac 100644 --- a/pkg/gpu/nvidia/podmanager.go +++ b/pkg/gpu/nvidia/podmanager.go @@ -1,30 +1,29 @@ package nvidia import ( + "encoding/json" "fmt" - "os" - "sort" - "time" - + "github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client" log "github.com/golang/glog" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" - + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" nodeutil "k8s.io/kubernetes/pkg/util/node" + "os" + "sort" + "time" ) var ( clientset *kubernetes.Clientset nodeName string - retries = 5 + retries = 8 ) func kubeInit() { @@ -58,18 +57,18 @@ func kubeInit() { } func disableCGPUIsolationOrNot() (bool, error) { - disable := false - node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{}) - if err != nil { - return disable, err - } - labels := node.ObjectMeta.Labels - value, ok := labels[EnvNodeLabelForDisableCGPU] - if ok && value == "true" { - log.Infof("enable gpusharing mode and disable cgpu mode") - disable = true - } - return disable, nil + disable := false + node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{}) + if err != nil { + return disable, err + } + labels := node.ObjectMeta.Labels + value, ok := labels[EnvNodeLabelForDisableCGPU] + if ok && value == "true" { + log.Infof("enable gpusharing mode and disable cgpu mode") + disable = true + } + return disable, nil } func patchGPUCount(gpuCount int) error { @@ -99,31 +98,90 @@ func patchGPUCount(gpuCount int) error { return err } -func getPendingPodsInNode() ([]v1.Pod, error) { - // pods, err := m.lister.List(labels.Everything()) - // if err != nil { - // return nil, err - // } - pods := []v1.Pod{} +func getPodList(kubeletClient *client.KubeletClient) (*v1.PodList, error) { + podList, err := kubeletClient.GetNodeRunningPods() + if err != nil { + return nil, err + } - podIDMap := map[types.UID]bool{} + list, _ := json.Marshal(podList) + log.V(8).Infof("get pods list %v", string(list)) + + resultPodList := &v1.PodList{} + for _, metaPod := range podList.Items { + if metaPod.Status.Phase != v1.PodPending { + continue + } + resultPodList.Items = append(resultPodList.Items, metaPod) + } + + if len(resultPodList.Items) == 0 { + return nil, fmt.Errorf("not found pending pod") + } + + return resultPodList, nil +} +func getPodListsByQueryKubelet(kubeletClient *client.KubeletClient) (*v1.PodList, 
error) { + podList, err := getPodList(kubeletClient) + for i := 0; i < retries && err != nil; i++ { + podList, err = getPodList(kubeletClient) + log.Warningf("failed to get pending pod list, retry") + time.Sleep(100 * time.Millisecond) + } + if err != nil { + log.Warningf("not found from kubelet /pods api, start to list apiserver") + podList, err = getPodListsByListAPIServer() + if err != nil { + return nil, err + } + } + return podList, nil +} + +func getPodListsByListAPIServer() (*v1.PodList, error) { selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName, "status.phase": "Pending"}) podList, err := clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{ FieldSelector: selector.String(), LabelSelector: labels.Everything().String(), }) - for i := 0; i < retries && err != nil; i++ { + for i := 0; i < 3 && err != nil; i++ { podList, err = clientset.CoreV1().Pods(v1.NamespaceAll).List(metav1.ListOptions{ FieldSelector: selector.String(), LabelSelector: labels.Everything().String(), }) - time.Sleep(100 * time.Millisecond) + time.Sleep(1 * time.Second) } if err != nil { return nil, fmt.Errorf("failed to get Pods assigned to node %v", nodeName) } + return podList, nil +} + +func getPendingPodsInNode(queryKubelet bool, kubeletClient *client.KubeletClient) ([]v1.Pod, error) { + // pods, err := m.lister.List(labels.Everything()) + // if err != nil { + // return nil, err + // } + pods := []v1.Pod{} + + podIDMap := map[types.UID]bool{} + + var podList *v1.PodList + var err error + if queryKubelet { + podList, err = getPodListsByQueryKubelet(kubeletClient) + if err != nil { + return nil, err + } + } else { + podList, err = getPodListsByListAPIServer() + if err != nil { + return nil, err + } + } + log.V(5).Infof("all pod list %v", podList.Items) // if log.V(5) { @@ -154,9 +212,9 @@ func getPendingPodsInNode() ([]v1.Pod, error) { } // pick up the gpushare pod with assigned status is false, and -func getCandidatePods() ([]*v1.Pod, error) { +func getCandidatePods(queryKubelet bool, client *client.KubeletClient) ([]*v1.Pod, error) { candidatePods := []*v1.Pod{} - allPods, err := getPendingPodsInNode() + allPods, err := getPendingPodsInNode(queryKubelet, client) if err != nil { return candidatePods, err } diff --git a/pkg/gpu/nvidia/server.go b/pkg/gpu/nvidia/server.go index 89f30b1e..abf05dc0 100644 --- a/pkg/gpu/nvidia/server.go +++ b/pkg/gpu/nvidia/server.go @@ -1,6 +1,7 @@ package nvidia import ( + "github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client" "net" "os" "path" @@ -26,13 +27,15 @@ type NvidiaDevicePlugin struct { disableCGPUIsolation bool stop chan struct{} health chan *pluginapi.Device + queryKubelet bool + kubeletClient *client.KubeletClient server *grpc.Server sync.RWMutex } // NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin -func NewNvidiaDevicePlugin(mps, healthCheck bool) (*NvidiaDevicePlugin, error) { +func NewNvidiaDevicePlugin(mps, healthCheck, queryKubelet bool, client *client.KubeletClient) (*NvidiaDevicePlugin, error) { devs, devNameMap := getDevices() devList := []string{} @@ -61,6 +64,8 @@ func NewNvidiaDevicePlugin(mps, healthCheck bool) (*NvidiaDevicePlugin, error) { disableCGPUIsolation: disableCGPUIsolation, stop: make(chan struct{}), health: make(chan *pluginapi.Device), + queryKubelet: queryKubelet, + kubeletClient: client, }, nil } diff --git a/pkg/kubelet/client/client.go b/pkg/kubelet/client/client.go new file mode 100644 index 00000000..01dcead1 --- /dev/null +++ b/pkg/kubelet/client/client.go @@ -0,0 
+1,134 @@ +package client + +import ( + "encoding/json" + "fmt" + "io" + v1 "k8s.io/api/core/v1" + utilnet "k8s.io/apimachinery/pkg/util/net" + restclient "k8s.io/client-go/rest" + "k8s.io/client-go/transport" + "net/http" + "time" +) + +// KubeletClientConfig defines config parameters for the kubelet client +type KubeletClientConfig struct { + // Address specifies the kubelet address + Address string + + // Port specifies the default port - used if no information about Kubelet port can be found in Node.NodeStatus.DaemonEndpoints. + Port uint + + // TLSClientConfig contains settings to enable transport layer security + restclient.TLSClientConfig + + // Server requires Bearer authentication + BearerToken string + + // HTTPTimeout is used by the client to timeout http requests to Kubelet. + HTTPTimeout time.Duration +} + +type KubeletClient struct { + defaultPort uint + host string + client *http.Client +} + +func NewKubeletClient(config *KubeletClientConfig) (*KubeletClient, error) { + trans, err := makeTransport(config, true) + if err != nil { + return nil, err + } + client := &http.Client{ + Transport: trans, + Timeout: config.HTTPTimeout, + } + return &KubeletClient{ + host: config.Address, + defaultPort: config.Port, + client: client, + }, nil +} + +// transportConfig converts a client config to an appropriate transport config. +func (c *KubeletClientConfig) transportConfig() *transport.Config { + cfg := &transport.Config{ + TLS: transport.TLSConfig{ + CAFile: c.CAFile, + CAData: c.CAData, + CertFile: c.CertFile, + CertData: c.CertData, + KeyFile: c.KeyFile, + KeyData: c.KeyData, + }, + BearerToken: c.BearerToken, + } + if !cfg.HasCA() { + cfg.TLS.Insecure = true + } + return cfg +} + +// makeTransport creates a RoundTripper for HTTP Transport. +func makeTransport(config *KubeletClientConfig, insecureSkipTLSVerify bool) (http.RoundTripper, error) { + // do the insecureSkipTLSVerify on the pre-transport *before* we go get a potentially cached connection. + // transportConfig always produces a new struct pointer. + preTLSConfig := config.transportConfig() + if insecureSkipTLSVerify && preTLSConfig != nil { + preTLSConfig.TLS.Insecure = true + preTLSConfig.TLS.CAData = nil + preTLSConfig.TLS.CAFile = "" + } + + tlsConfig, err := transport.TLSConfigFor(preTLSConfig) + if err != nil { + return nil, err + } + + rt := http.DefaultTransport + if tlsConfig != nil { + // If SSH Tunnel is turned on + rt = utilnet.SetOldTransportDefaults(&http.Transport{ + TLSClientConfig: tlsConfig, + }) + } + + return transport.HTTPWrappersForConfig(config.transportConfig(), rt) +} + +func ReadAll(r io.Reader) ([]byte, error) { + b := make([]byte, 0, 512) + for { + if len(b) == cap(b) { + // Add more capacity (let append pick how much). 
+ b = append(b, 0)[:len(b)] + } + n, err := r.Read(b[len(b):cap(b)]) + b = b[:len(b)+n] + if err != nil { + if err == io.EOF { + err = nil + } + return b, err + } + } +} + +func (k *KubeletClient) GetNodeRunningPods() (*v1.PodList, error) { + resp, err := k.client.Get(fmt.Sprintf("https://%v:%d/pods/", k.host, k.defaultPort)) + if err != nil { + return nil, err + } + + body, err := ReadAll(resp.Body) + if err != nil { + return nil, err + } + podLists := &v1.PodList{} + if err = json.Unmarshal(body, &podLists); err != nil { + return nil, err + } + return podLists, err +} diff --git a/pkg/kubelet/client/client_test.go b/pkg/kubelet/client/client_test.go new file mode 100644 index 00000000..d5f51d49 --- /dev/null +++ b/pkg/kubelet/client/client_test.go @@ -0,0 +1,57 @@ +package client + +import ( + "flag" + "fmt" + "io/ioutil" + "k8s.io/client-go/rest" + "testing" + "time" +) + +var ( + clientCert string + clientKey string + token string + timeout int +) + +func TestNewKubeletClient(t *testing.T) { + flag.StringVar(&clientCert, "client-cert", "", "") + flag.StringVar(&clientKey, "client-key", "", "") + flag.StringVar(&token, "token", "", "") + flag.IntVar(&timeout, "timeout", 10, "") + + flag.Parse() + + if clientCert == "" && clientKey == "" && token == "" { + tokenByte, err := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token") + if err != nil { + panic(fmt.Errorf("in cluster mode, find token failed, error: %v", err)) + } + token = string(tokenByte) + } + + c, err := NewKubeletClient(&KubeletClientConfig{ + Address: "127.0.0.1", + Port: 10250, + TLSClientConfig: rest.TLSClientConfig{ + Insecure: true, + ServerName: "kubelet", + CertFile: clientCert, + KeyFile: clientKey, + }, + BearerToken: token, + HTTPTimeout: time.Duration(timeout) * time.Second, + }) + if err != nil { + fmt.Println(err) + return + } + podsList, err := c.GetNodeRunningPods() + if err != nil { + fmt.Println(err) + return + } + fmt.Println(podsList) +} From b0d42f5b4c1535851eced6cf2639193d9fd76ff1 Mon Sep 17 00:00:00 2001 From: happy2048 <2270020588@qq.com> Date: Wed, 13 Oct 2021 11:32:18 +0800 Subject: [PATCH 2/4] skip to search pod when count of gpu devices is only one (#46) --- cmd/nvidia/main.go | 7 ++++--- pkg/gpu/nvidia/allocate.go | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/cmd/nvidia/main.go b/cmd/nvidia/main.go index 3cadbed2..9ec969d2 100644 --- a/cmd/nvidia/main.go +++ b/cmd/nvidia/main.go @@ -3,19 +3,20 @@ package main import ( "flag" "fmt" + "io/ioutil" + "time" + "github.com/AliyunContainerService/gpushare-device-plugin/pkg/gpu/nvidia" "github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client" log "github.com/golang/glog" - "io/ioutil" "k8s.io/client-go/rest" - "time" ) var ( mps = flag.Bool("mps", false, "Enable or Disable MPS") healthCheck = flag.Bool("health-check", false, "Enable or disable Health check") memoryUnit = flag.String("memory-unit", "GiB", "Set memoryUnit of the GPU Memroy, support 'GiB' and 'MiB'") - queryFromKubelet = flag.Bool("query-kubelet", true, "Query pending pods from kubelet instead of kube-apiserver") + queryFromKubelet = flag.Bool("query-kubelet", false, "Query pending pods from kubelet instead of kube-apiserver") kubeletAddress = flag.String("kubelet-address", "0.0.0.0", "Kubelet IP Address") kubeletPort = flag.Uint("kubelet-port", 10250, "Kubelet listened Port") clientCert = flag.String("client-cert", "", "Kubelet TLS client certificate") diff --git a/pkg/gpu/nvidia/allocate.go 
b/pkg/gpu/nvidia/allocate.go index 57cf665a..6566fed8 100644 --- a/pkg/gpu/nvidia/allocate.go +++ b/pkg/gpu/nvidia/allocate.go @@ -148,6 +148,34 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, } } + } else if len(m.devNameMap) == 1 { + var devName string + var devIndex uint + for d, index := range m.devNameMap { + devName = d + devIndex = index + break + } + log.Infof("this node has only one gpu device,skip to search pod and directly specify the device %v(%v) for container", devIndex, devName) + for _, req := range reqs.ContainerRequests { + reqGPU := uint(len(req.DevicesIDs)) + response := pluginapi.ContainerAllocateResponse{ + Envs: map[string]string{ + envNVGPU: devName, + EnvResourceIndex: fmt.Sprintf("%d", devIndex), + EnvResourceByPod: fmt.Sprintf("%d", podReqGPU), + EnvResourceByContainer: fmt.Sprintf("%d", reqGPU), + EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()), + }, + } + if m.disableCGPUIsolation { + response.Envs["CGPU_DISABLE"] = "true" + } + responses.ContainerResponses = append(responses.ContainerResponses, &response) + } + log.Infof("get allocated GPUs info %v", responses) + return &responses, nil + } else { log.Warningf("invalid allocation requst: request GPU memory %d can't be satisfied.", podReqGPU) From dff44dc290041126311e191be7d361915c3a9bad Mon Sep 17 00:00:00 2001 From: happy2048 <2270020588@qq.com> Date: Thu, 25 Nov 2021 15:06:03 +0800 Subject: [PATCH 3/4] exit the device plugin when creating failed (#48) --- pkg/gpu/nvidia/gpumanager.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/gpu/nvidia/gpumanager.go b/pkg/gpu/nvidia/gpumanager.go index 0d76817a..119fe9a2 100644 --- a/pkg/gpu/nvidia/gpumanager.go +++ b/pkg/gpu/nvidia/gpumanager.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/AliyunContainerService/gpushare-device-plugin/pkg/kubelet/client" "syscall" + "os" "time" "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" @@ -69,8 +70,10 @@ L: devicePlugin, err = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck, ngm.queryKubelet, ngm.kubeletClient) if err != nil { log.Warningf("Failed to get device plugin due to %v", err) + os.Exit(1) } else if err = devicePlugin.Serve(); err != nil { log.Warningf("Failed to start device plugin due to %v", err) + os.Exit(2) } else { restart = false } From 45fb8b88692250cff2d53cb64b0a41864a5fcaf3 Mon Sep 17 00:00:00 2001 From: happy2048 <2270020588@qq.com> Date: Thu, 21 Apr 2022 15:31:52 +0800 Subject: [PATCH 4/4] set gpu index rather than gpu uuid for env NVIDIA_VISIBLE_DEVICES (#50) --- pkg/gpu/nvidia/allocate.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/gpu/nvidia/allocate.go b/pkg/gpu/nvidia/allocate.go index 6566fed8..cbcc0cc1 100644 --- a/pkg/gpu/nvidia/allocate.go +++ b/pkg/gpu/nvidia/allocate.go @@ -108,13 +108,13 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, if id < 0 { return buildErrResponse(reqs, podReqGPU), nil } - + log.Infof("gpu index %v,uuid: %v", id, candidateDevID) // 1. Create container requests for _, req := range reqs.ContainerRequests { reqGPU := uint(len(req.DevicesIDs)) response := pluginapi.ContainerAllocateResponse{ Envs: map[string]string{ - envNVGPU: candidateDevID, + envNVGPU: fmt.Sprintf("%v", id), EnvResourceIndex: fmt.Sprintf("%d", id), EnvResourceByPod: fmt.Sprintf("%d", podReqGPU), EnvResourceByContainer: fmt.Sprintf("%d", reqGPU),
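
The common thread of these patches is that the device plugin now asks the local kubelet for the node's pods instead of listing cluster-scoped pods from the kube-apiserver. The kubelet serves this as GET /pods on its secure port, authenticated with the plugin's service-account bearer token; the nodes/proxy rule added to device-plugin-rbac.yaml is what lets that token pass the kubelet's webhook authorization. Below is a minimal standalone sketch of that request, mirroring what pkg/kubelet/client and cmd/podgetter do in patch 1. It assumes in-cluster defaults (the mounted token path, 127.0.0.1:10250, and skipping TLS verification, as the patches themselves do); it is an illustration, not part of the patch series.

package main

import (
	"crypto/tls"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"time"

	v1 "k8s.io/api/core/v1"
)

func main() {
	// Service-account token mounted into every in-cluster pod.
	token, err := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token")
	if err != nil {
		panic(err)
	}

	// The patches set Insecure: true on the kubelet client, so skip certificate
	// verification here as well.
	httpClient := &http.Client{
		Timeout:   10 * time.Second,
		Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}},
	}

	// 10250 is the kubelet's default secure port; GET /pods returns only this node's pods.
	req, err := http.NewRequest("GET", "https://127.0.0.1:10250/pods/", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer "+string(token))

	resp, err := httpClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}

	// The response body is a JSON-encoded v1.PodList, which is why the patch can
	// json.Unmarshal it directly and then keep only the Pending pods.
	podList := &v1.PodList{}
	if err := json.Unmarshal(body, podList); err != nil {
		panic(err)
	}
	for _, p := range podList.Items {
		if p.Status.Phase == v1.PodPending {
			fmt.Printf("pending: %s/%s\n", p.Namespace, p.Name)
		}
	}
}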