diff --git a/.github/workflows/ci-helm-deploy-nginx.yml b/.github/workflows/ci-helm-deploy-nginx.yml index ef11b24a..0a17845d 100644 --- a/.github/workflows/ci-helm-deploy-nginx.yml +++ b/.github/workflows/ci-helm-deploy-nginx.yml @@ -56,6 +56,8 @@ jobs: sleep 15 # wait for the scheduler to create pods kubectl -n kube-system wait pods -l app.kubernetes.io/name=kubenurse --for=condition=Ready kubectl -n kube-system get pods -l app.kubernetes.io/name=kubenurse + kubectl rollout restart daemonset kubenurse + kubectl rollout status daemonset kubenurse --timeout=1m sleep 60 # Wait to generate some checks etc. - name: Check deployment uses: ./.github/actions/check-deployment diff --git a/.github/workflows/ci-helm-deploy-traefik.yml b/.github/workflows/ci-helm-deploy-traefik.yml index 0e5d73e4..a27d8824 100644 --- a/.github/workflows/ci-helm-deploy-traefik.yml +++ b/.github/workflows/ci-helm-deploy-traefik.yml @@ -60,6 +60,8 @@ jobs: sleep 15 # wait for the scheduler to create pods kubectl -n kube-system wait pods -l app=kubenurse --for=condition=Ready kubectl -n kube-system get pods -l app=kubenurse + kubectl rollout restart daemonset kubenurse + kubectl rollout status daemonset kubenurse --timeout=1m sleep 60 # Wait to generate some checks etc. - name: Check deployment uses: ./.github/actions/check-deployment diff --git a/.github/workflows/ci-kustomize-deploy.yml b/.github/workflows/ci-kustomize-deploy.yml index 47cef8ff..e90a126f 100644 --- a/.github/workflows/ci-kustomize-deploy.yml +++ b/.github/workflows/ci-kustomize-deploy.yml @@ -52,6 +52,8 @@ jobs: sleep 15 # wait for the scheduler to create pods kubectl wait pods -l app.kubernetes.io/name=kubenurse --for=condition=Ready kubectl get pods -l app.kubernetes.io/name=kubenurse + kubectl rollout restart daemonset kubenurse + kubectl rollout status daemonset kubenurse --timeout=1m sleep 60 # Wait to generate some checks etc. 
- name: Check deployment uses: ./.github/actions/check-deployment diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b51a8cb7..543083a3 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -10,7 +10,7 @@ jobs: - uses: actions/checkout@v4 - uses: golangci/golangci-lint-action@v3 with: - version: v1.52 + version: v1.55 args: --timeout 5m lint-helm: runs-on: ubuntu-latest diff --git a/.golangci.yml b/.golangci.yml index ada0007f..55314a8a 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -42,13 +42,10 @@ linters-settings: - gocognit - funlen - gocyclo - linters: disable-all: true enable: - bodyclose - - deadcode - - depguard - dogsled - dupl - errcheck @@ -71,19 +68,18 @@ linters: - misspell - nakedret - prealloc + - protogetter - rowserrcheck - exportloopref - staticcheck - - structcheck - stylecheck + - sqlclosecheck - typecheck - unconvert - unparam - unused - - varcheck - whitespace - wsl -issues: exclude: # Very commonly not checked. - 'Error return value of .(l.Sync|.*Close|.*.Write|.*Flush|os\.Remove(All)?|os\.(Un)?Setenv). is not checked' diff --git a/Dockerfile b/Dockerfile index a2d8b9eb..fa2905e2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ FROM alpine:latest -MAINTAINER OpenSource PF +LABEL OpenSource="PF " RUN apk --no-cache add ca-certificates curl COPY kubenurse /bin/kubenurse diff --git a/internal/kubediscovery/kubediscovery.go b/internal/kubediscovery/kubediscovery.go index fe1483da..c36af220 100644 --- a/internal/kubediscovery/kubediscovery.go +++ b/internal/kubediscovery/kubediscovery.go @@ -5,6 +5,7 @@ import ( "context" "fmt" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -35,7 +36,8 @@ type Neighbour struct { HostIP string NodeName string NodeSchedulable NodeSchedulability - Phase string // Pod Phase + Phase v1.PodPhase + Terminating bool } // New creates a new kubediscovery client. 
The context is used to stop the k8s watchers/informers. @@ -92,8 +94,9 @@ func (c *Client) GetNeighbours(ctx context.Context, namespace, labelSelector str PodName: pod.Name, PodIP: pod.Status.PodIP, HostIP: pod.Status.HostIP, - Phase: string(pod.Status.Phase), + Phase: pod.Status.Phase, NodeName: pod.Spec.NodeName, + Terminating: pod.DeletionTimestamp != nil, NodeSchedulable: sched, } neighbours[idx] = n diff --git a/internal/kubenurse/server.go b/internal/kubenurse/server.go index 5bb586fb..a77417b5 100644 --- a/internal/kubenurse/server.go +++ b/internal/kubenurse/server.go @@ -46,6 +46,7 @@ type Server struct { // * KUBERNETES_SERVICE_PORT // * KUBENURSE_NAMESPACE // * KUBENURSE_NEIGHBOUR_FILTER +// * KUBENURSE_SHUTDOWN_DURATION // * KUBENURSE_CHECK_API_SERVER_DIRECT // * KUBENURSE_CHECK_API_SERVER_DNS // * KUBENURSE_CHECK_ME_INGRESS @@ -107,12 +108,24 @@ func New(ctx context.Context, k8s kubernetes.Interface) (*Server, error) { return nil, err } + shutdownDuration := 5 * time.Second + + if v, ok := os.LookupEnv("KUBENURSE_SHUTDOWN_DURATION"); ok { + var err error + shutdownDuration, err = time.ParseDuration(v) + + if err != nil { + return nil, err + } + } + chk.KubenurseIngressURL = os.Getenv("KUBENURSE_INGRESS_URL") chk.KubenurseServiceURL = os.Getenv("KUBENURSE_SERVICE_URL") chk.KubernetesServiceHost = os.Getenv("KUBERNETES_SERVICE_HOST") chk.KubernetesServicePort = os.Getenv("KUBERNETES_SERVICE_PORT") chk.KubenurseNamespace = os.Getenv("KUBENURSE_NAMESPACE") chk.NeighbourFilter = os.Getenv("KUBENURSE_NEIGHBOUR_FILTER") + chk.ShutdownDuration = shutdownDuration //nolint:goconst // No need to make "false" a constant in my opinion, readability is better like this. 
chk.SkipCheckAPIServerDirect = os.Getenv("KUBENURSE_CHECK_API_SERVER_DIRECT") == "false" @@ -198,6 +211,12 @@ func (s *Server) Shutdown(ctx context.Context) error { s.ready = false s.mu.Unlock() + // wait before actually shutting down the http/s server, as the updated + // endpoints for the kubenurse service might not have propagated everywhere + // (other kubenurse/ingress controller) yet, which will lead to + // me_ingress or path errors in other pods + time.Sleep(s.checker.ShutdownDuration) + // stop the scheduled checker s.checker.StopScheduled() diff --git a/internal/servicecheck/servicecheck.go b/internal/servicecheck/servicecheck.go index b285553c..d1c8d7ef 100644 --- a/internal/servicecheck/servicecheck.go +++ b/internal/servicecheck/servicecheck.go @@ -11,6 +11,7 @@ import ( "github.com/postfinance/kubenurse/internal/kubediscovery" "github.com/prometheus/client_golang/prometheus" + v1 "k8s.io/api/core/v1" ) const ( @@ -170,7 +171,7 @@ func (c *Checker) MeIngress() (string, error) { return skippedStr, nil } - return c.doRequest(c.KubenurseIngressURL + "/alwayshappy") + return c.doRequest(c.KubenurseIngressURL + "/alwayshappy") //nolint:goconst // readability } // MeService checks if the kubenurse is reachable at the /alwayshappy endpoint through the kubernetes service @@ -186,8 +187,10 @@ func (c *Checker) MeService() (string, error) { // which are not schedulable are excluded from this check to avoid possible false errors. 
func (c *Checker) checkNeighbours(nh []kubediscovery.Neighbour) { for _, neighbour := range nh { - neighbour := neighbour // pin - if c.allowUnschedulable || neighbour.NodeSchedulable == kubediscovery.NodeSchedulable { + neighbour := neighbour // pin + if neighbour.Phase == v1.PodRunning && // only query running pods (excludes pending ones) + !neighbour.Terminating && // exclude terminating pods + (c.allowUnschedulable || neighbour.NodeSchedulable == kubediscovery.NodeSchedulable) { check := func() (string, error) { if c.UseTLS { return c.doRequest("https://" + neighbour.PodIP + ":8443/alwayshappy") diff --git a/internal/servicecheck/transport.go b/internal/servicecheck/transport.go index cc665ee5..9bf8ef19 100644 --- a/internal/servicecheck/transport.go +++ b/internal/servicecheck/transport.go @@ -66,7 +66,7 @@ func generateRoundTripper(extraCA string, insecure bool) (http.RoundTripper, err // Append extra CA, if set if extraCA != "" { - caCert, err := os.ReadFile(extraCA) //nolint:gosec // Intentionally included by the user. + caCert, err := os.ReadFile(extraCA) // Intentionally included by the user. if err != nil { return nil, fmt.Errorf("could not load certificate %s: %w", extraCA, err) } diff --git a/internal/servicecheck/types.go b/internal/servicecheck/types.go index 45bfcf24..d08b1e14 100644 --- a/internal/servicecheck/types.go +++ b/internal/servicecheck/types.go @@ -16,6 +16,9 @@ type Checker struct { SkipCheckMeIngress bool SkipCheckMeService bool + // ShutdownDuration defines the time during which kubenurse will wait before stopping + ShutdownDuration time.Duration + // Kubernetes API KubernetesServiceHost string KubernetesServicePort string