Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix graceful shutdown #101

Merged
merged 6 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/ci-helm-deploy-nginx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ jobs:
sleep 15 # wait for the scheduler to create pods
kubectl -n kube-system wait pods -l app.kubernetes.io/name=kubenurse --for=condition=Ready
kubectl -n kube-system get pods -l app.kubernetes.io/name=kubenurse
kubectl rollout restart daemonset kubenurse
kubectl rollout status daemonset kubenurse --timeout=1m
sleep 60 # Wait to generate some checks etc.
- name: Check deployment
uses: ./.github/actions/check-deployment
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/ci-helm-deploy-traefik.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ jobs:
sleep 15 # wait for the scheduler to create pods
kubectl -n kube-system wait pods -l app=kubenurse --for=condition=Ready
kubectl -n kube-system get pods -l app=kubenurse
kubectl rollout restart daemonset kubenurse
kubectl rollout status daemonset kubenurse --timeout=1m
sleep 60 # Wait to generate some checks etc.
- name: Check deployment
uses: ./.github/actions/check-deployment
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/ci-kustomize-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ jobs:
sleep 15 # wait for the scheduler to create pods
kubectl wait pods -l app.kubernetes.io/name=kubenurse --for=condition=Ready
kubectl get pods -l app.kubernetes.io/name=kubenurse
kubectl rollout restart daemonset kubenurse
kubectl rollout status daemonset kubenurse --timeout=1m
sleep 60 # Wait to generate some checks etc.
- name: Check deployment
uses: ./.github/actions/check-deployment
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- uses: actions/checkout@v4
- uses: golangci/golangci-lint-action@v3
with:
version: v1.52
version: v1.55
args: --timeout 5m
lint-helm:
runs-on: ubuntu-latest
Expand Down
8 changes: 2 additions & 6 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,10 @@ linters-settings:
- gocognit
- funlen
- gocyclo

linters:
disable-all: true
enable:
- bodyclose
- deadcode
- depguard
- dogsled
- dupl
- errcheck
Expand All @@ -71,19 +68,18 @@ linters:
- misspell
- nakedret
- prealloc
- protogetter
- rowserrcheck
- exportloopref
- staticcheck
- structcheck
- stylecheck
- sqlclosecheck
- typecheck
- unconvert
- unparam
- unused
- varcheck
- whitespace
- wsl
issues:
exclude:
# Very commonly not checked.
- 'Error return value of .(l.Sync|.*Close|.*.Write|.*Flush|os\.Remove(All)?|os\.(Un)?Setenv). is not checked'
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
FROM alpine:latest
MAINTAINER OpenSource PF <[email protected]>
LABEL OpenSource="PF <[email protected]>"

RUN apk --no-cache add ca-certificates curl
COPY kubenurse /bin/kubenurse
Expand Down
7 changes: 5 additions & 2 deletions internal/kubediscovery/kubediscovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"context"
"fmt"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"k8s.io/client-go/kubernetes"
Expand Down Expand Up @@ -35,7 +36,8 @@ type Neighbour struct {
HostIP string
NodeName string
NodeSchedulable NodeSchedulability
Phase string // Pod Phase
Phase v1.PodPhase
Terminating bool
}

// New creates a new kubediscovery client. The context is used to stop the k8s watchers/informers.
Expand Down Expand Up @@ -92,8 +94,9 @@ func (c *Client) GetNeighbours(ctx context.Context, namespace, labelSelector str
PodName: pod.Name,
PodIP: pod.Status.PodIP,
HostIP: pod.Status.HostIP,
Phase: string(pod.Status.Phase),
Phase: pod.Status.Phase,
NodeName: pod.Spec.NodeName,
Terminating: pod.DeletionTimestamp != nil,
NodeSchedulable: sched,
}
neighbours[idx] = n
Expand Down
19 changes: 19 additions & 0 deletions internal/kubenurse/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ type Server struct {
// * KUBERNETES_SERVICE_PORT
// * KUBENURSE_NAMESPACE
// * KUBENURSE_NEIGHBOUR_FILTER
// * KUBENURSE_SHUTDOWN_DURATION
// * KUBENURSE_CHECK_API_SERVER_DIRECT
// * KUBENURSE_CHECK_API_SERVER_DNS
// * KUBENURSE_CHECK_ME_INGRESS
Expand Down Expand Up @@ -107,12 +108,24 @@ func New(ctx context.Context, k8s kubernetes.Interface) (*Server, error) {
return nil, err
}

shutdownDuration := 5 * time.Second

if v, ok := os.LookupEnv("KUBENURSE_SHUTDOWN_DURATION"); ok {
var err error
shutdownDuration, err = time.ParseDuration(v)

if err != nil {
return nil, err
}
}

chk.KubenurseIngressURL = os.Getenv("KUBENURSE_INGRESS_URL")
chk.KubenurseServiceURL = os.Getenv("KUBENURSE_SERVICE_URL")
chk.KubernetesServiceHost = os.Getenv("KUBERNETES_SERVICE_HOST")
chk.KubernetesServicePort = os.Getenv("KUBERNETES_SERVICE_PORT")
chk.KubenurseNamespace = os.Getenv("KUBENURSE_NAMESPACE")
chk.NeighbourFilter = os.Getenv("KUBENURSE_NEIGHBOUR_FILTER")
chk.ShutdownDuration = shutdownDuration

//nolint:goconst // No need to make "false" a constant in my opinion, readability is better like this.
chk.SkipCheckAPIServerDirect = os.Getenv("KUBENURSE_CHECK_API_SERVER_DIRECT") == "false"
Expand Down Expand Up @@ -198,6 +211,12 @@ func (s *Server) Shutdown(ctx context.Context) error {
s.ready = false
s.mu.Unlock()

// wait before actually shutting down the http/s server, as the updated
// endpoints for the kubenurse service might not have propagated everywhere
// (other kubenurse/ingress controller) yet, which will lead to
// me_ingress or path errors in other pods
time.Sleep(s.checker.ShutdownDuration)

// stop the scheduled checker
s.checker.StopScheduled()

Expand Down
9 changes: 6 additions & 3 deletions internal/servicecheck/servicecheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (

"github.com/postfinance/kubenurse/internal/kubediscovery"
"github.com/prometheus/client_golang/prometheus"
v1 "k8s.io/api/core/v1"
)

const (
Expand Down Expand Up @@ -170,7 +171,7 @@ func (c *Checker) MeIngress() (string, error) {
return skippedStr, nil
}

return c.doRequest(c.KubenurseIngressURL + "/alwayshappy")
return c.doRequest(c.KubenurseIngressURL + "/alwayshappy") //nolint:goconst // readability
}

// MeService checks if the kubenurse is reachable at the /alwayshappy endpoint through the kubernetes service
Expand All @@ -186,8 +187,10 @@ func (c *Checker) MeService() (string, error) {
// which are not schedulable are excluded from this check to avoid possible false errors.
func (c *Checker) checkNeighbours(nh []kubediscovery.Neighbour) {
for _, neighbour := range nh {
neighbour := neighbour // pin
if c.allowUnschedulable || neighbour.NodeSchedulable == kubediscovery.NodeSchedulable {
neighbour := neighbour // pin
if neighbour.Phase == v1.PodRunning && // only query running pods (excludes pending ones)
!neighbour.Terminating && // exclude terminating pods
(c.allowUnschedulable || neighbour.NodeSchedulable == kubediscovery.NodeSchedulable) {
check := func() (string, error) {
if c.UseTLS {
return c.doRequest("https://" + neighbour.PodIP + ":8443/alwayshappy")
Expand Down
2 changes: 1 addition & 1 deletion internal/servicecheck/transport.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ func generateRoundTripper(extraCA string, insecure bool) (http.RoundTripper, err

// Append extra CA, if set
if extraCA != "" {
caCert, err := os.ReadFile(extraCA) //nolint:gosec // Intentionally included by the user.
caCert, err := os.ReadFile(extraCA) // Intentionally included by the user.
if err != nil {
return nil, fmt.Errorf("could not load certificate %s: %w", extraCA, err)
}
Expand Down
3 changes: 3 additions & 0 deletions internal/servicecheck/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ type Checker struct {
SkipCheckMeIngress bool
SkipCheckMeService bool

// shutdownDuration defines the time during which kubenurse will wait before stopping
ShutdownDuration time.Duration

// Kubernetes API
KubernetesServiceHost string
KubernetesServicePort string
Expand Down
Loading