From 194e3313b31c9f0f314c025c3f82c922935273b2 Mon Sep 17 00:00:00 2001 From: Simon Beck Date: Thu, 5 Dec 2024 16:16:51 +0100 Subject: [PATCH] Fail whole prebackup if command exit != 0 This will fix issues where problems with pre-backup go undetected. K8up will now completely fail the whole prebackup pod if the commands exit with a code != 0. Also a small issue with concurrent `restic init` was fixed. Signed-off-by: Simon Beck --- cmd/operator/main.go | 1 + .../annotated-subject/deployment-error.yaml | 49 +++++++++++++++++++ e2e/definitions/operator/deploy.yaml | 4 -- e2e/definitions/operator/values.yaml | 2 + e2e/lib/k8up.bash | 23 +++++++++ e2e/test-12-annotated-failure.bats | 31 ++++++++++++ go.mod | 2 +- operator/backupcontroller/executor.go | 8 +-- operator/cfg/config.go | 1 + restic/kubernetes/pod_exec.go | 3 ++ 10 files changed, 116 insertions(+), 8 deletions(-) create mode 100644 e2e/definitions/annotated-subject/deployment-error.yaml delete mode 100644 e2e/definitions/operator/deploy.yaml create mode 100644 e2e/test-12-annotated-failure.bats diff --git a/cmd/operator/main.go b/cmd/operator/main.go index 399568f2b..f6a72495a 100644 --- a/cmd/operator/main.go +++ b/cmd/operator/main.go @@ -41,6 +41,7 @@ var ( &cli.StringFlag{Destination: &cfg.Config.FileExtensionAnnotation, Name: "fileextensionannotation", EnvVars: []string{"BACKUP_FILEEXTENSIONANNOTATION"}, Value: "k8up.io/file-extension", Usage: "set the annotation name where the file extension is stored for backup commands"}, &cli.IntFlag{Destination: &cfg.Config.GlobalKeepJobs, Hidden: true, Name: "globalkeepjobs", EnvVars: []string{"BACKUP_GLOBALKEEPJOBS"}, Value: -1, DefaultText: "unlimited", Usage: "set the number of old jobs to keep when cleaning up, applies to all job types"}, + &cli.IntFlag{Destination: &cfg.Config.GlobalBackoffLimit, Name: "global-backoff-limit", EnvVars: []string{"BACKUP_GLOBAL_BACKOFF_LIMIT"}, Value: 6, Usage: "set the backoff limit for all backup jobs"}, 
&cli.IntFlag{Destination: &cfg.Config.GlobalFailedJobsHistoryLimit, Name: "global-failed-jobs-history-limit", EnvVars: []string{"BACKUP_GLOBAL_FAILED_JOBS_HISTORY_LIMIT"}, Value: 3, Usage: "set the number of old, failed jobs to keep when cleaning up, applies to all job types"}, &cli.IntFlag{Destination: &cfg.Config.GlobalSuccessfulJobsHistoryLimit, Name: "global-successful-jobs-history-limit", EnvVars: []string{"BACKUP_GLOBAL_SUCCESSFUL_JOBS_HISTORY_LIMIT"}, Value: 3, Usage: "set the number of old, successful jobs to keep when cleaning up, applies to all job types"}, &cli.IntFlag{Destination: &cfg.Config.GlobalConcurrentArchiveJobsLimit, Name: "global-concurrent-archive-jobs-limit", EnvVars: []string{"BACKUP_GLOBAL_CONCURRENT_ARCHIVE_JOBS_LIMIT"}, DefaultText: "unlimited", Usage: "set the limit of concurrent archive jobs"}, diff --git a/e2e/definitions/annotated-subject/deployment-error.yaml b/e2e/definitions/annotated-subject/deployment-error.yaml new file mode 100644 index 000000000..59d1f3a36 --- /dev/null +++ b/e2e/definitions/annotated-subject/deployment-error.yaml @@ -0,0 +1,49 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: annotated-subject-deployment + namespace: k8up-e2e-subject +spec: + replicas: 1 + selector: + matchLabels: + app: subject + template: + metadata: + labels: + app: subject + annotations: + k8up.io/backupcommand: 'invalid' + k8up.io/backupcommand-container: subject-container + spec: + containers: + - image: busybox + imagePullPolicy: IfNotPresent + name: dummy-container-blocking-first-position + command: + - "/bin/sh" + - "-c" + - "sleep infinity" + - name: subject-container + image: quay.io/prometheus/busybox:latest + imagePullPolicy: IfNotPresent + args: + - sh + - -c + - | + sleep infinity + securityContext: + runAsUser: $ID + volumeMounts: + - name: volume + mountPath: /data + env: + - name: BACKUP_FILE_CONTENT + value: "" + - name: BACKUP_FILE_NAME + value: "" + volumes: + - name: volume + persistentVolumeClaim: + 
claimName: subject-pvc diff --git a/e2e/definitions/operator/deploy.yaml b/e2e/definitions/operator/deploy.yaml deleted file mode 100644 index 20eba28b1..000000000 --- a/e2e/definitions/operator/deploy.yaml +++ /dev/null @@ -1,4 +0,0 @@ -k8up: - envVars: - - name: K8UP_DEBUG - value: "true" diff --git a/e2e/definitions/operator/values.yaml b/e2e/definitions/operator/values.yaml index 566d074e3..7525341e4 100644 --- a/e2e/definitions/operator/values.yaml +++ b/e2e/definitions/operator/values.yaml @@ -9,3 +9,5 @@ k8up: envVars: - name: K8UP_DEBUG value: "true" + - name: BACKUP_GLOBAL_BACKOFF_LIMIT + value: "2" diff --git a/e2e/lib/k8up.bash b/e2e/lib/k8up.bash index d2716901e..774593c46 100755 --- a/e2e/lib/k8up.bash +++ b/e2e/lib/k8up.bash @@ -170,6 +170,15 @@ given_an_annotated_subject() { echo "✅ The annotated subject is ready" } +given_a_broken_annotated_subject() { + require_args 2 ${#} + + kubectl apply -f definitions/pv/pvc.yaml + yq e '.spec.template.spec.containers[1].securityContext.runAsUser='$(id -u)' ' definitions/annotated-subject/deployment-error.yaml | kubectl apply -f - + + echo "✅ The annotated subject is ready" +} + given_an_annotated_subject_pod() { require_args 2 ${#} @@ -457,6 +466,20 @@ wait_until() { kubectl -n "${ns}" wait --timeout 5m --for "condition=${condition}" "${object}" } +wait_for_until_jsonpath() { + require_args 3 ${#} + + local object condition ns + object=${1} + until=${2} + jsonpath=${3} + ns=${NAMESPACE=${DETIK_CLIENT_NAMESPACE}} + + echo "Waiting for '${object}' in namespace '${ns}' to become '${condition}' ..." 
+ kubectl -n "${ns}" wait --timeout "${until}" --for "${jsonpath}" "${object}" +} + + expect_file_in_container() { require_args 4 ${#} diff --git a/e2e/test-12-annotated-failure.bats b/e2e/test-12-annotated-failure.bats new file mode 100644 index 000000000..f4b26683b --- /dev/null +++ b/e2e/test-12-annotated-failure.bats @@ -0,0 +1,31 @@ +#!/usr/bin/env bats + +load "lib/utils" +load "lib/detik" +load "lib/k8up" + +# shellcheck disable=SC2034 +DETIK_CLIENT_NAME="kubectl" +# shellcheck disable=SC2034 +DETIK_CLIENT_NAMESPACE="k8up-e2e-subject" +# shellcheck disable=SC2034 +DEBUG_DETIK="true" + +@test "Annotated app, When creating a backup, Then expect Error" { + expected_content="expected content: $(timestamp)" + expected_filename="expected_filename.txt" + + given_a_running_operator + given_a_clean_ns + given_s3_storage + given_a_broken_annotated_subject "${expected_filename}" "${expected_content}" + + kubectl apply -f definitions/secrets + yq e '.spec.podSecurityContext.runAsUser='$(id -u)'' definitions/backup/backup.yaml | kubectl apply -f - + + try "at most 10 times every 5s to get backup named 'k8up-backup' and verify that '.status.started' is 'true'" + verify_object_value_by_label job 'k8up.io/owned-by=backup_k8up-backup' '.status.active' 1 true + + wait_for_until_jsonpath backup/k8up-backup 2m 'jsonpath={.status.conditions[?(@.type=="Completed")].reason}=Failed' + +} diff --git a/go.mod b/go.mod index 5632e9cab..062ca6693 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/k8up-io/k8up/v2 -go 1.21 +go 1.23 require ( github.com/firepear/qsplit/v2 v2.5.0 diff --git a/operator/backupcontroller/executor.go b/operator/backupcontroller/executor.go index 0cd83e112..dfbfeb99d 100644 --- a/operator/backupcontroller/executor.go +++ b/operator/backupcontroller/executor.go @@ -18,6 +18,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" controllerruntime 
"sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -236,7 +237,7 @@ func (b *BackupExecutor) startBackup(ctx context.Context) error { } index := 0 - for _, batchJob := range backupJobs { + for name, batchJob := range backupJobs { _, err = controllerruntime.CreateOrUpdate(ctx, b.Generic.Config.Client, batchJob.job, func() error { mutateErr := job.MutateBatchJob(ctx, batchJob.job, b.backup, b.Generic.Config, b.Client) if mutateErr != nil { @@ -262,16 +263,17 @@ func (b *BackupExecutor) startBackup(ctx context.Context) error { } // each job sleeps for index seconds to avoid concurrent restic repository creation. Not the prettiest way but it works and a repository // is created only once usually. - if index > 0 { + if name == "prebackup" || index != 0 { batchJob.job.Spec.Template.Spec.Containers[0].Env = append(batchJob.job.Spec.Template.Spec.Containers[0].Env, corev1.EnvVar{ Name: "SLEEP_DURATION", - Value: (time.Duration(index) * time.Second).String(), + Value: (3 * time.Second).String(), }) } b.backup.Spec.AppendEnvFromToContainer(&batchJob.job.Spec.Template.Spec.Containers[0]) batchJob.job.Spec.Template.Spec.Volumes = append(batchJob.job.Spec.Template.Spec.Volumes, batchJob.volumes...) batchJob.job.Spec.Template.Spec.Volumes = append(batchJob.job.Spec.Template.Spec.Volumes, utils.AttachTLSVolumes(b.backup.Spec.Volumes)...) batchJob.job.Spec.Template.Spec.Containers[0].VolumeMounts = append(b.newVolumeMounts(batchJob.volumes), b.attachTLSVolumeMounts()...) 
+ batchJob.job.Spec.BackoffLimit = ptr.To(int32(cfg.Config.GlobalBackoffLimit)) batchJob.job.Spec.Template.Spec.Containers[0].Args = b.setupArgs() diff --git a/operator/cfg/config.go b/operator/cfg/config.go index bf76d5665..7d0fa04ce 100644 --- a/operator/cfg/config.go +++ b/operator/cfg/config.go @@ -51,6 +51,7 @@ type Configuration struct { GlobalKeepJobs int GlobalFailedJobsHistoryLimit int GlobalSuccessfulJobsHistoryLimit int + GlobalBackoffLimit int GlobalRepoPassword string GlobalRestoreS3AccessKey string GlobalRestoreS3Bucket string diff --git a/restic/kubernetes/pod_exec.go b/restic/kubernetes/pod_exec.go index 274cf1f29..fe5c46b41 100644 --- a/restic/kubernetes/pod_exec.go +++ b/restic/kubernetes/pod_exec.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "io" + "os" "strings" "github.com/firepear/qsplit/v2" @@ -73,6 +74,8 @@ func PodExec(pod BackupPod, log logr.Logger) (*ExecData, error) { if err != nil { execLogger.Error(err, "streaming data failed", "namespace", pod.Namespace, "pod", pod.PodName) + // we just completely hard fail the whole backup pod + os.Exit(1) return } }()