From 76a7ccc6de6ffc2975d4e921306b6dedeb974b9d Mon Sep 17 00:00:00 2001 From: Snir Koppelman <43176669+snirkop89@users.noreply.github.com> Date: Sun, 10 Mar 2024 13:01:22 +0000 Subject: [PATCH] Sync repo with internal repo (#15) Co-authored-by: Tal Cohen --- Dockerfile | 61 +++++++++------ Makefile | 59 ++++++++++---- README.md | 144 +++++++--------------------------- examples/hl-smi-test.yml | 10 +-- go.mod | 9 +-- go.sum | 2 + habanalabs-device-plugin.yml | 57 -------------- habanalabs.go | 147 +++++++++++++++++------------------ main.go | 87 ++++++++++++++------- server.go | 122 +++++++++++++++-------------- versions.mk | 25 ++++++ watcher.go | 2 +- 12 files changed, 338 insertions(+), 387 deletions(-) delete mode 100644 habanalabs-device-plugin.yml create mode 100644 versions.mk diff --git a/Dockerfile b/Dockerfile index c3fbe8a..f5124d3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -# Copyright (c) 2019, HabanaLabs Ltd. All rights reserved. +# Copyright (c) 2022, HabanaLabs Ltd. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,26 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-ARG BASE_IMAGE -FROM ${BASE_IMAGE} as builder - -ENV GOLANG_VERSION 1.20 -RUN wget -nv -O - https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz \ +ARG VERSION=1.14.0 +ARG MINOR_VERSION=493 +ARG DIST=ubuntu22.04 +ARG REGISTRY=vault.habana.ai + +FROM ${REGISTRY}/gaudi-docker/${VERSION}/${DIST}/habanalabs/pytorch-installer-2.1.1:${VERSION}-${MINOR_VERSION} as builder + +RUN apt-get update && \ + apt-get install -y wget make git gcc \ + && \ + rm -rf /var/lib/apt/lists/* + +ARG GOLANG_VERSION=1.21.5 +RUN set -eux; \ + \ + arch="$(uname -m)"; \ + case "${arch##*-}" in \ + x86_64 | amd64) ARCH='amd64' ;; \ + ppc64el | ppc64le) ARCH='ppc64le' ;; \ + aarch64) ARCH='arm64' ;; \ + *) echo "unsupported architecture" ; exit 1 ;; \ + esac; \ + wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${ARCH}.tar.gz \ | tar -C /usr/local -xz ENV GOPATH /opt/habanalabs/go ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH -# go-hlml must be download before building the image, since it is hosted in gerrit, -# and gerrit doesn't support go modules in our version. Then it is copied as -# a sibling to the device plugin folder -WORKDIR /opt/habanalabs/go/src/go-hlml -COPY go-hlml/ . - WORKDIR /opt/habanalabs/go/src/habanalabs-device-plugin -COPY go.mod go.sum ./ -RUN go mod download COPY . . RUN go mod tidy @@ -39,25 +49,32 @@ RUN go mod tidy RUN go build -buildvcs=false -o bin/habanalabs-device-plugin . 
-FROM artifactory-kfs.habana-labs.com/docker-developers/base/ubuntu:focal ARG BUILD_DATE ARG BUILD_REF +FROM ${REGISTRY}/gaudi-docker/${VERSION}/${DIST}/habanalabs/pytorch-installer-2.1.1:${VERSION}-${MINOR_VERSION} + +# Remove Habana libs(compat etc) in favor of libs installed by the Habana driver +RUN apt-get --purge -y autoremove habana* + RUN apt update && apt install -y --no-install-recommends \ pciutils && \ rm -rf /var/lib/apt/lists/* COPY --from=builder /usr/lib/habanalabs /usr/lib/habanalabs COPY --from=builder /usr/include/habanalabs /usr/include/habanalabs +COPY --from=builder /opt/habanalabs/go/src/habanalabs-device-plugin/bin/habanalabs-device-plugin /usr/bin/habanalabs-device-plugin RUN echo "/usr/lib/habanalabs/" >> /etc/ld.so.conf.d/habanalabs.conf RUN ldconfig -COPY --from=builder /opt/habanalabs/go/src/habanalabs-device-plugin/bin/habanalabs-device-plugin /usr/bin/habanalabs-device-plugin -CMD ["habanalabs-device-plugin"] - - -LABEL image.created="${BUILD_DATE}" \ +LABEL io.k8s.display-name="HABANA Device Plugin" \ + vendor="HABANA" \ + version=${VERSION} \ + image.git-commit="${GIT_COMMIT}" \ + image.created="${BUILD_DATE}" \ image.revision="${BUILD_REF}" \ - image.title="habana-device-plugin" \ - image.author="Habana Labs Ltd" \ No newline at end of file + summary="HABANA device plugin for Kubernetes" \ + description="See summary" + +CMD ["habanalabs-device-plugin"] diff --git a/Makefile b/Makefile index 53480ae..eaa4343 100644 --- a/Makefile +++ b/Makefile @@ -1,19 +1,46 @@ -image ?= "artifactory-kfs.habana-labs.com/k8s-docker-dev/device_plugin/habana-device-plugin" -version ?= "test" -base_image ?= "artifactory-kfs.habana-labs.com/docker-local/1.13.0/ubuntu20.04/habanalabs/base-installer:1.13.0-10" - - -## build: build docker image in ci-cd process -.PHONY: build -build: - docker build \ - -t $(image):$(version) \ - --build-arg BASE_IMAGE=$(base_image) \ - --build-arg BUILD_REF=$(version) \ +# Copyright (c) 2020-2022, HabanaLabs Ltd. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DOCKER ?= docker + +include $(CURDIR)/versions.mk + + +ifeq ($(IMAGE_NAME),) +IMAGE_NAME := $(REGISTRY)/$(APP_NAME) +endif + +IMAGE_TAG ?= $(VERSION)-$(MINOR_VERSION) +IMAGE = $(IMAGE_NAME):$(IMAGE_TAG) + + +.PHONY: build push + +## build: build docker image +build: + $(DOCKER) build \ + -t $(IMAGE) \ + --build-arg BUILD_REF=$(IMAGE_TAG) \ + --build-arg REGISTRY=$(REGISTRY) \ + --build-arg VERSION="$(VERSION)" \ + --build-arg MINOR_VERSION="$(MINOR_VERSION)" \ + --build-arg DIST="$(DIST)" \ + --build-arg GOLANG_VERSION="$(GOLANG_VERSION)" \ + --build-arg GIT_COMMIT="$(GIT_COMMIT)" \ --build-arg BUILD_DATE=`date -u +"%Y-%m-%dT%H:%M:%SZ"` \ . -## push-image: push the image to the registry -.PHONY: push-image -push-image: - docker image push $(image):$(version) \ No newline at end of file +## push: push the image to the registry +push: + $(DOCKER) image push $(IMAGE) diff --git a/README.md b/README.md index f78ebf3..2ea272a 100644 --- a/README.md +++ b/README.md @@ -1,141 +1,55 @@ -# HABANA device plugin for Kubernetes +# Habana Device Plugin for Kubernetes -## Table of Contents +The Habana device plugin for Kubernetes, operating as a DaemonSet, enables the automatic registration of +Habana devices within your Kubernetes cluster, while also monitoring the health status of these devices. 
+This integration ensures seamless management and monitoring of Habana devices within the Kubernetes ecosystem, +enhancing operational efficiency and reliability. + +This repository contains Habana's official implementation of the [Kubernetes device plugin](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/). -- [HABANA device plugin for Kubernetes](#habana-device-plugin-for-kubernetes) +## Table of Contents +- [Habana Device Plugin for Kubernetes](#habana-device-plugin-for-kubernetes) - [Table of Contents](#table-of-contents) - - [Introduction](#introduction) - [Prerequisites](#prerequisites) - - [The below sections detail existing plugins](#the-below-sections-detail-existing-plugins) - - [Goya device plugin](#goya-device-plugin) - - [Running Jobs](#running-jobs) - - [Gaudi device plugin](#gaudi-device-plugin) - - [With Docker](#with-docker) - - [Build](#build) - - [Build in CD](#build-in-cd) - - [Deploy as Daemon Set:](#deploy-as-daemon-set) - - [Changelog](#changelog) - - [Version 0.9.1](#version-091) - - [Version 0.8.1-beta1](#version-081-beta1) -- [Issues](#issues) - - -## Introduction - -The HABANA device plugin for Kubernetes is a Daemonset that allows you to automatically: -- Enables the registration of HABANA devices in your Kubernetes cluster. 
-- Keep track of the health of your Device - -## Prerequisites -The list of prerequisites for running the HABANA device plugin is described below: -- HABANA drivers -- Kubernetes version >= 1.10 - -### The below sections detail existing plugins + - [Gaudi Device Registration](#gaudi-device-registration) + - [Building and Running Locally Using Docker](#building-and-running-locally-using-docker) -#### Goya device plugin -Once you have enabled this option on *all* the nodes you wish to use, -you can then enable support in your cluster by deploying the following Daemonset: +## Prerequisites -```shell -$ kubectl create -f habanalabs-device-plugin.yaml -``` +The following prerequisites are needed for running the Habana device plugin: +- Habana Drivers +- Kubernetes version >= 1.19 +- [Habana-container-runtime](https://github.com/HabanaAI/habana-container-runtime) -##### Running Jobs - -Can now be consumed via container level resource requirements using the resource name habana.com/goya: -```yaml -apiVersion: v1 -kind: Pod -metadata: - name: habanalabs-goya-demo0 -spec: - nodeSelector: - accelerator: habanalabs - containers: - - name: habana-ai-base-container - image: habanai/goya-demo:0.9.1-43-debian9.8 - workingDir: /home/user1 - securityContext: - capabilities: - add: ["SYS_RAWIO"] - command: ["sleep"] - args: ["10000"] - resources: - limits: - habana.ai/goya: 1 - imagePullSecrets: - - name: regcred -``` -#### Gaudi device plugin +## Gaudi Device Registration -Once you have enabled this option on *all* the nodes you wish to use, -you can then enable support in your cluster by deploying the following Daemonset: +Once the prerequisites mentioned earlier are in place on the nodes, +you can then activate support in your cluster by deploying the DaemonSet: ```shell $ kubectl create -f habanalabs-device-plugin-gaudi.yaml ``` -### With Docker -#### Build -Option 1, pull the prebuilt image from [Docker Hub](https://hub.docker.com/r/habanai/k8s-device-plugin): -```shell 
-$ docker pull habanai/k8s-device-plugin:0.9.1 -``` +## Building and Running Locally Using Docker -Option 2, build without cloning the repository: -```shell -$ docker build --network=host --no-cache -t habanai/k8s-device-plugin:0.9.1 habanalabs-k8s-device-plugin -``` +To build and run using Docker, choose one of the following options according to your specific scenario: -Option 3, if you want to modify the code: +- To pull the prebuilt image, run: ```shell -https://github.com/HabDevops/habanalabs-k8s-device-plugin -$ git clone https://github.com/HabDevops/habanalabs-k8s-device-plugin.git && cd habanalabs-k8s-device-plugin -$ git checkout v0.9.1 -$ docker build -t habanai/k8s-device-plugin:0.9.1 . +$ docker pull vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin:1.14.0 ``` -### Build in CD -Requirements: -- go-hlml repo must be first downloaded from gerrit into the habanalabs-device-plugin repo. - It is copied by the Dockerfile into the image during the build process. - _(this is due a lack of go modules support in Gerrit v2)_ - - -To build the image in the CD process use the `make build` build. -It accepts the following parameters: -- `base_image` -Image to use as the builder for the application -- `image` - Final full image name to deploy -- `version` - Image's tag - -Full example showing usage of current jenkins variables(or parameters): -``` -make build base_image=$baseDockerImage image=$pluginDockerImage version=$"{release_version}-${release_build_id}" +- To build without cloning the repository, run: +```shell +$ docker build --network=host --no-cache -t habanai/k8s-device-plugin:1.14.0 habanalabs-k8s-device-plugin ``` -#### Deploy as Daemon Set: +- To modify the code, run: ```shell -$ kubectl create -f habanalabs-device-plugin.yaml +$ git clone https://github.com/HabanaAI/habanalabs-k8s-device-plugin.git && cd habanalabs-k8s-device-plugin +$ docker build -t habanai/k8s-device-plugin:devel . 
``` -## Changelog - -### Version 0.9.1 -- New HLML SW 0.9.1-43 debian9.8 - -### Version 0.8.1-beta1 -- Support k8s plugin for Gaudi -- New HLML SW 0.8.1-55 debian9.8 -- Add new resource namespace e.g: goya/gaudi -- Refactor device plugin to eventually handle multiple resource types -- Move plugin error retry to event loop so we can exit with a signal - -# Issues -* You can report a bug by [filing a new issue](https://github.com/HabDevops/habanalabs-k8s-device-plugin/issues/new) -* You can contribute by opening a [pull request](https://help.github.com/articles/using-pull-requests/) - - diff --git a/examples/hl-smi-test.yml b/examples/hl-smi-test.yml index 9129c52..57144ab 100644 --- a/examples/hl-smi-test.yml +++ b/examples/hl-smi-test.yml @@ -3,19 +3,11 @@ kind: Pod metadata: name: hl-smi spec: - hostNetwork: true containers: - name: app - image: artifactory.habana-labs.com/docker-developers/base/u18/u18-base:bionic-20190307 + image: vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:1.14.0-493 command: ["/bin/bash"] args: ["-c", "hl-smi"] resources: limits: habana.ai/gaudi: 1 - volumeMounts: - - name: usr - mountPath: /usr/bin - volumes: - - name: usr - hostPath: - path: /usr/bin diff --git a/go.mod b/go.mod index 59f3008..6df4c4a 100644 --- a/go.mod +++ b/go.mod @@ -1,9 +1,11 @@ module github.com/HabanaAI/habanalabs-k8s-device-plugin -go 1.20 +go 1.21 + +toolchain go1.21.5 require ( - github.com/HabanaAI/gohlml v1.3.0 + github.com/HabanaAI/gohlml v1.14.0 github.com/fsnotify/fsnotify v1.4.9 google.golang.org/grpc v1.35.0 k8s.io/kubelet v0.19.7 @@ -18,6 +20,3 @@ require ( google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013 // indirect google.golang.org/protobuf v1.25.0 // indirect ) - -// uncomment below if developing with a local copy of gohlml -replace github.com/HabanaAI/gohlml v1.3.0 => ../go-hlml diff --git a/go.sum b/go.sum index ade283a..b3614e6 100644 --- a/go.sum +++ b/go.sum @@ -182,6 +182,7 @@ 
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= @@ -350,6 +351,7 @@ gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo= +gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/habanalabs-device-plugin.yml b/habanalabs-device-plugin.yml deleted file mode 100644 index d510b1a..0000000 --- a/habanalabs-device-plugin.yml +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2019, HabanaLabs Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: habanalabs-device-plugin-daemonset-gaudi - namespace: kube-system -spec: - selector: - matchLabels: - name: habanalabs-device-plugin-ds - updateStrategy: - type: RollingUpdate - template: - metadata: - # This annotation is deprecated. Kept here for backward compatibility - # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ - annotations: - scheduler.alpha.kubernetes.io/critical-pod: "" - labels: - name: habanalabs-device-plugin-ds - spec: - priorityClassName: "system-node-critical" - containers: - - image: habanai/k8s-device-plugin:1.2.0 - name: habanalabs-device-plugin-ctr - command: ["habanalabs-device-plugin"] - args: ["--dev_type", " gaudi"] - env: - - name: LD_LIBRARY_PATH - value: "/usr/lib/habanalabs" - securityContext: - privileged: true - volumeMounts: - - name: device-plugin - mountPath: /var/lib/kubelet/device-plugins - - name: habana-lib - mountPath: /usr/lib/habanalabs - volumes: - - name: device-plugin - hostPath: - path: /var/lib/kubelet/device-plugins - - name: habana-lib - hostPath: - path: /usr/lib/habanalabs diff --git a/habanalabs.go b/habanalabs.go index a803391..9a2ed80 100644 --- a/habanalabs.go +++ b/habanalabs.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, HabanaLabs Ltd. All rights reserved. + * Copyright (c) 2022, HabanaLabs Ltd. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,7 +18,7 @@ package main import ( "context" "fmt" - "log" + "log/slog" "strings" "time" @@ -26,61 +26,62 @@ import ( pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) -const ( - GOYA = "GOYA" - GAUDI = "GAUDI" -) - // ResourceManager interface type ResourceManager interface { - Devices() []*pluginapi.Device + Devices() ([]*pluginapi.Device, error) } // DeviceManager string devType: GOYA / GAUDI type DeviceManager struct { + log *slog.Logger devType string } // NewDeviceManager Init Manager -func NewDeviceManager(devType string) *DeviceManager { - return &DeviceManager{devType: devType} +func NewDeviceManager(log *slog.Logger, devType string) *DeviceManager { + return &DeviceManager{log: log, devType: devType} } // Devices Get Habana Device -func (dm *DeviceManager) Devices() []*pluginapi.Device { +func (dm *DeviceManager) Devices() ([]*pluginapi.Device, error) { NumOfDevices, err := hlml.DeviceCount() - mustErr(err) + if err != nil { + return nil, err + } var devs []*pluginapi.Device - log.Println("Finding devices...") + dm.log.Info("Discovering devices...") for i := uint(0); i < NumOfDevices; i++ { newDevice, err := hlml.DeviceHandleByIndex(i) - mustErr(err) + if err != nil { + return nil, err + } pciID, err := newDevice.PCIID() - mustErr(err) + if err != nil { + return nil, err + } serial, err := newDevice.SerialNumber() - mustErr(err) + if err != nil { + return nil, err + } uuid, err := newDevice.UUID() - mustErr(err) + if err != nil { + return nil, err + } pciBusID, _ := newDevice.PCIBusID() - dID := fmt.Sprintf("%x", pciID) - - log.Printf( - "device: %s,\tserial: %s,\tuuid: %s", - strings.ToUpper(dm.devType), - serial, - uuid, - ) - - log.Printf("pci id: %s\t pci bus id: %s", - dID, - pciBusID, + dm.log.Info( + "Device found", + "device", strings.ToUpper(dm.devType), + "serial", serial, + "uuid", uuid, + "id", dID, + "pci_bus_id", pciBusID, ) dev := pluginapi.Device{ @@ -89,10 +90,12 @@ func (dm *DeviceManager) Devices() []*pluginapi.Device { } 
cpuAffinity, err := newDevice.NumaNode() - mustErr(err) + if err != nil { + return nil, err + } if cpuAffinity != nil { - log.Printf("cpu affinity: %d", *cpuAffinity) + dm.log.Info("Device cpu affinity", "id", dID, "cpu_affinity", *cpuAffinity) dev.Topology = &pluginapi.TopologyInfo{ Nodes: []*pluginapi.NUMANode{{ID: int64(*cpuAffinity)}}, } @@ -100,13 +103,7 @@ func (dm *DeviceManager) Devices() []*pluginapi.Device { devs = append(devs, &dev) } - return devs -} - -func mustErr(err error) { - if err != nil { - log.Panicln("Fatal:", err) - } + return devs, nil } func getDevice(devs []*pluginapi.Device, id string) *pluginapi.Device { @@ -123,63 +120,59 @@ func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *plugi defer hlml.DeleteEventSet(eventSet) for _, d := range devs { - err := hlml.RegisterEventForDevice(eventSet, hlml.HlmlCriticalError, d.ID) if err != nil { - log.Printf("Failed to register critical events for %s, error %s. Marking it unhealthy", d.ID, err) - + slog.Error("Failed registering critial event for device. 
Marking it unhealthy", "device_id", d.ID, "error", err) xids <- d continue } } + // TODO: provide as flag + healthCheckInterval := time.NewTicker(10 * time.Second) + for { select { case <-ctx.Done(): return - default: - } - - // Wait between health checks - time.Sleep(5 * time.Second) - - e, err := hlml.WaitForEvent(eventSet, 1000) - if err != nil { - log.Println(err) - time.Sleep(2 * time.Second) - continue - } + case <-healthCheckInterval.C: + e, err := hlml.WaitForEvent(eventSet, 1000) + if err != nil { + slog.Error("hlml WaitForEvent failed", "errror", err.Error()) + time.Sleep(2 * time.Second) + continue + } - if e.Etype != hlml.HlmlCriticalError { - continue - } + if e.Etype != hlml.HlmlCriticalError { + continue + } - dev, err := hlml.DeviceHandleBySerial(e.Serial) - if err != nil { - log.Printf("XidCriticalError: Xid=%d, All devices will go unhealthy", e.Etype) - // All devices are unhealthy - for _, d := range devs { - xids <- d + dev, err := hlml.DeviceHandleBySerial(e.Serial) + if err != nil { + slog.Error("XidCriticalError: All devices will go unhealthy", "xid", e.Etype) + // All devices are unhealthy + for _, d := range devs { + xids <- d + } + continue } - continue - } - uuid, err := dev.UUID() - if err != nil || len(uuid) == 0 { - log.Printf("XidCriticalError: Xid=%d, All devices will go unhealthy", e.Etype) - // All devices are unhealthy - for _, d := range devs { - xids <- d + uuid, err := dev.UUID() + if err != nil || len(uuid) == 0 { + slog.Error("XidCriticalError: All devices will go unhealthy", "xid", e.Etype) + // All devices are unhealthy + for _, d := range devs { + xids <- d + } + continue } - continue - } - for _, d := range devs { - if d.ID == uuid { - log.Printf("XidCriticalError: Xid=%d on AIP=%s, the device will go unhealthy", e.Etype, d.ID) - xids <- d + for _, d := range devs { + if d.ID == uuid { + slog.Error("XidCriticalError: the device will go unhealthy", "xid", e.Etype, "aip", d.ID) + xids <- d + } } } - } } diff --git a/main.go 
b/main.go index c29e811..ad2e6df 100644 --- a/main.go +++ b/main.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, HabanaLabs Ltd. All rights reserved. + * Copyright (c) 2022, HabanaLabs Ltd. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,8 @@ package main import ( - "log" + "fmt" + "log/slog" "os" "strings" "syscall" @@ -28,38 +29,63 @@ import ( pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" ) +// build is overridden with an actual version in the build process. +var build = "develop" + func main() { - var devicePlugin *HabanalabsDevicePlugin - var err error + log := initLogger() + if err := run(log); err != nil { + log.Error(err.Error()) + os.Exit(1) + } +} + +func initLogger() *slog.Logger { + lvl := slog.LevelInfo + if os.Getenv("LOG_LEVEL") == slog.LevelDebug.String() { + lvl = slog.LevelDebug + } + attrs := []slog.Attr{ + slog.String("service", "habana-device-plugin"), + } + h := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: lvl}).WithAttrs(attrs) + return slog.New(h) +} + +func run(log *slog.Logger) error { restart := true + log.Info("Started Habana device plugin manager", "version", build) - log.Println("Habana device plugin manager") - log.Println("Loading HLML...") + log.Info("Initializing HLML...") if err := hlml.Initialize(); err != nil { - log.Printf("Failed to initialize HLML: %s", err) - return + return fmt.Errorf("failed to initialize HLML: %w", err) } - defer func() { log.Println("Shutdown of HLML returned:", hlml.Shutdown()) }() + defer func() { + log.Info("Shutting down hlml") + err := hlml.Shutdown() + if err != nil { + log.Error(err.Error()) + } + }() - log.Println("Starting FS watcher...") + log.Info("Starting FS watcher...") watcher, err := newFSWatcher(pluginapi.DevicePluginPath) if err != nil { - log.Println("Failed to created FS watcher") - os.Exit(1) + return fmt.Errorf("failed to create FS watcher: 
%w", err) } defer watcher.Close() - log.Println("Starting OS watcher...") + log.Info("Starting OS watcher...") sigs := newOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) dev, err := hlml.GetDeviceTypeName() if err != nil { - log.Println("failed detecting Habana's devices on the system", err) - os.Exit(1) + return fmt.Errorf("failed detecting Habana's devices on the system: %w", err) } - devicePlugin = NewHabanalabsDevicePlugin( - NewDeviceManager(strings.ToUpper(dev)), + devicePlugin := NewHabanalabsDevicePlugin( + log, + NewDeviceManager(log, strings.ToUpper(dev)), "habana.ai/"+dev, pluginapi.DevicePluginPath+dev+"_habanalabs.sock", ) @@ -67,12 +93,14 @@ func main() { L: for { if restart { - devicePlugin.Stop() + err = devicePlugin.Stop() + if err != nil { + log.Warn("Failed stopping device plugin gracefully", "error", err) + } numDevices, err := hlml.DeviceCount() if err != nil { - log.Fatalln("Could not get number of devices") - continue + return fmt.Errorf("failed getting number of devices: %w", err) } if numDevices == 0 { @@ -80,30 +108,33 @@ L: } if err := devicePlugin.Serve(); err != nil { - log.Println("Could not contact Kubelet, retrying. Did you enable the device plugin feature gate?") - } else { - restart = false + log.Error(err.Error()) + return fmt.Errorf("could not contact Kubelet, retrying. 
Did you enable the device plugin feature gate?") } + restart = false } select { case event := <-watcher.Events: if event.Name == pluginapi.KubeletSocket && event.Op&fsnotify.Create == fsnotify.Create { - log.Printf("inotify: %s created, restarting.", pluginapi.KubeletSocket) + log.Warn("Kubelet restart detected, restarting device plugin.") restart = true } case err := <-watcher.Errors: - log.Printf("inotify: %s", err) + log.Error("Watcher error received", "error", err) case s := <-sigs: switch s { case syscall.SIGHUP: - log.Println("Received SIGHUP, restarting.") + log.Info("Received SIGHUP, restarting.") restart = true default: - log.Printf("Received signal \"%v\", shutting down", s) - devicePlugin.Stop() + log.Info("Received OS signal. Shutting down", "signal", s) + if err := devicePlugin.Stop(); err != nil { + log.Error("Failed stopping device plugin gracefully", "error", err) + } break L } } } + return nil } diff --git a/server.go b/server.go index d8d8687..16cdeec 100644 --- a/server.go +++ b/server.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, HabanaLabs Ltd. All rights reserved. + * Copyright (c) 2022, HabanaLabs Ltd. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ import ( "context" "errors" "fmt" - "log" + "log/slog" "net" "os" "path" @@ -36,22 +36,29 @@ import ( // HabanalabsDevicePlugin implements the Kubernetes device plugin API type HabanalabsDevicePlugin struct { ResourceManager + log *slog.Logger + stop chan interface{} + health chan *pluginapi.Device + server *grpc.Server resourceName string socket string - - devs []*pluginapi.Device - stop chan interface{} - health chan *pluginapi.Device - server *grpc.Server + devs []*pluginapi.Device } +// GetPreferredAllocation returns a preferred set of devices to allocate +// from a list of available ones. 
The resulting preferred allocation is not +// guaranteed to be the allocation ultimately performed by the +// devicemanager. It is only designed to help the devicemanager make a more +// informed allocation decision when possible. +// NOT Implemented func (m *HabanalabsDevicePlugin) GetPreferredAllocation(ctx context.Context, request *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) { return nil, errors.New("GetPreferredAllocation should not be called as this device plugin doesn't implement it") } // NewHabanalabsDevicePlugin returns an initialized HabanalabsDevicePlugin. -func NewHabanalabsDevicePlugin(resourceManager ResourceManager, resourceName string, socket string) *HabanalabsDevicePlugin { +func NewHabanalabsDevicePlugin(log *slog.Logger, resourceManager ResourceManager, resourceName string, socket string) *HabanalabsDevicePlugin { return &HabanalabsDevicePlugin{ + log: log, ResourceManager: resourceManager, resourceName: resourceName, socket: socket, @@ -66,7 +73,9 @@ func NewHabanalabsDevicePlugin(resourceManager ResourceManager, resourceName str // GetDevicePluginOptions returns the device plugin options. func (m *HabanalabsDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { - return &pluginapi.DevicePluginOptions{}, nil + return &pluginapi.DevicePluginOptions{ + GetPreferredAllocationAvailable: false, // Indicate to kubelet we don't have an implementation. + }, nil } // dial establishes the gRPC communication with the registered device plugin. 
@@ -80,7 +89,6 @@ func dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error return net.DialTimeout("unix", s, timeout) }), ) - if err != nil { return nil, err } @@ -90,7 +98,8 @@ func dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error // Start starts the gRPC server of the device plugin func (m *HabanalabsDevicePlugin) Start() error { - if err := m.cleanup(); err != nil { + err := m.cleanup() + if err != nil { return err } @@ -99,17 +108,23 @@ func (m *HabanalabsDevicePlugin) Start() error { } // initialize Devices - m.devs = m.Devices() + m.devs, err = m.Devices() + if err != nil { + return err + } sock, err := net.Listen("unix", m.socket) if err != nil { return err } + // First start serving the gRPC connection before registering. + // It is required since kubernetes 1.26. Change is backward compatible. m.server = grpc.NewServer([]grpc.ServerOption{}...) pluginapi.RegisterDevicePluginServer(m.server, m) - go m.server.Serve(sock) + // Ignore error returns since the next block will fail if Serve fails. 
+ go func() { _ = m.server.Serve(sock) }() // Wait for server to start by launching a blocking connection conn, err := dial(m.socket, 5*time.Second) @@ -129,7 +144,7 @@ func (m *HabanalabsDevicePlugin) Stop() error { return nil } - log.Printf("Stopping to serve '%s' on %s", m.resourceName, m.socket) + m.log.Info("Stopping device plugin", "resource_name", m.resourceName, "socket", m.socket) m.server.Stop() m.server = nil close(m.stop) @@ -162,7 +177,10 @@ func (m *HabanalabsDevicePlugin) Register() error { // ListAndWatch lists devices and update that list according to the health status func (m *HabanalabsDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { - s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) + err := s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) + if err != nil { + return err + } for { select { @@ -170,8 +188,10 @@ func (m *HabanalabsDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.De return nil case d := <-m.health: d.Health = pluginapi.Unhealthy - log.Printf("'%s' device %s is unhealthy", m.resourceName, d.ID) - s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}) + m.log.Info("Device is unhealthy", "resource", m.resourceName, "id", d.ID) + if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}); err != nil { + m.log.Error("Failed sending ListAndWatch to kubelet", "error", err) + } } } } @@ -196,24 +216,34 @@ func (m *HabanalabsDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.A if device == nil { return nil, fmt.Errorf("invalid request for %q: device unknown: %s", m.resourceName, id) } - log.Printf("device == %s", device) + m.log.Info("Preparing device for registration", "device", device) + m.log.Info("Getting device handle from hlml") deviceHandle, err := hlml.DeviceHandleBySerial(id) - mustErr(err) + if err != nil { + m.log.Error(err.Error()) + return nil, err + } + m.log.Info("Getting device minor number") minor, err := 
deviceHandle.MinorNumber() - mustErr(err) + if err != nil { + m.log.Error(err.Error()) + return nil, err + } - moduleId, err := deviceHandle.ModuleID() - mustErr(err) + m.log.Info("Getting device module id") + moduleID, err := deviceHandle.ModuleID() + if err != nil { + m.log.Error(err.Error()) + return nil, err + } path := fmt.Sprintf("/dev/accel/accel%d", minor) paths = append(paths, path) uuids = append(uuids, id) netConfig = append(netConfig, fmt.Sprintf("%d", minor)) - visibleModule = append(visibleModule, fmt.Sprintf("%d", moduleId)) - - log.Printf("path == %s", path) + visibleModule = append(visibleModule, fmt.Sprintf("%d", moduleID)) ds := &pluginapi.DeviceSpec{ ContainerPath: path, @@ -222,26 +252,6 @@ func (m *HabanalabsDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.A } devicesList = append(devicesList, ds) path = fmt.Sprintf("/dev/accel/accel_controlD%d", minor) - log.Printf("path == %s", path) - - ds = &pluginapi.DeviceSpec{ - ContainerPath: path, - HostPath: path, - Permissions: "rw", - } - devicesList = append(devicesList, ds) - - path = fmt.Sprintf("/dev/hl%d", minor) - log.Printf("path == %s", path) - - ds = &pluginapi.DeviceSpec{ - ContainerPath: path, - HostPath: path, - Permissions: "rw", - } - devicesList = append(devicesList, ds) - path = fmt.Sprintf("/dev/hl_controlD%d", minor) - log.Printf("path == %s", path) ds = &pluginapi.DeviceSpec{ ContainerPath: path, @@ -252,13 +262,13 @@ func (m *HabanalabsDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.A } envMap := map[string]string{ - "HABANA_VISIBLE_DEVICES": strings.Join(netConfig[:], ","), - "HL_VISIBLE_DEVICES": strings.Join(paths[:], ","), - "HL_VISIBLE_DEVICES_UUID": strings.Join(uuids[:], ","), + "HABANA_VISIBLE_DEVICES": strings.Join(netConfig, ","), + "HL_VISIBLE_DEVICES": strings.Join(paths, ","), + "HL_VISIBLE_DEVICES_UUID": strings.Join(uuids, ","), } - if len(req.DevicesIDs) < int(len(m.devs)) { - envMap["HABANA_VISIBLE_MODULES"] = 
strings.Join(visibleModule[:], ",") + if len(req.DevicesIDs) < len(m.devs) { + envMap["HABANA_VISIBLE_MODULES"] = strings.Join(visibleModule, ",") } response.ContainerResponses = append(response.ContainerResponses, &pluginapi.ContainerAllocateResponse{ @@ -304,18 +314,16 @@ func (m *HabanalabsDevicePlugin) healthcheck() { func (m *HabanalabsDevicePlugin) Serve() error { err := m.Start() if err != nil { - log.Printf("Could not start device plugin: %s", err) - return err + return fmt.Errorf("could not start device plugin: %w", err) } - log.Println("Starting to serve on", m.socket) + m.log.Info("Starting to serve", "socket", m.socket) err = m.Register() if err != nil { - log.Printf("Could not register device plugin: %s", err) - m.Stop() - return err + _ = m.Stop() + return fmt.Errorf("could not register device plugin: %w", err) } - log.Println("Registered device plugin with Kubelet") + m.log.Info("Registered device plugin with Kubelet") return nil } diff --git a/versions.mk b/versions.mk new file mode 100644 index 0000000..751851f --- /dev/null +++ b/versions.mk @@ -0,0 +1,25 @@ +# Copyright (c) 2020-2022, HabanaLabs Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +APP_NAME := docker-k8s-device-plugin + +REGISTRY ?= vault.habana.ai + +VERSION ?= 1.14.0 +MINOR_VERSION ?= 493 +DIST ?= ubuntu22.04 + +GOLANG_VERSION ?= 1.21.5 + +GIT_COMMIT ?= $(shell git describe --match="" --dirty --long --always --abbrev=40 2> /dev/null || echo "") diff --git a/watcher.go b/watcher.go index 7ac853e..f8466ff 100644 --- a/watcher.go +++ b/watcher.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, HabanaLabs Ltd. All rights reserved. + * Copyright (c) 2022, HabanaLabs Ltd. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.