diff --git a/cluster-autoscaler/processors/datadog/pods/filter_long_pending.go b/cluster-autoscaler/processors/datadog/pods/filter_long_pending.go index b43b7bb537a5..ae1c14aafe7a 100644 --- a/cluster-autoscaler/processors/datadog/pods/filter_long_pending.go +++ b/cluster-autoscaler/processors/datadog/pods/filter_long_pending.go @@ -33,6 +33,8 @@ package pods import ( "math/rand" + "os" + "strconv" "time" apiv1 "k8s.io/api/core/v1" @@ -42,13 +44,17 @@ import ( klog "k8s.io/klog/v2" ) -// we won't quarantine a workload's pods until they all went -// through (were considered for upscales) minAttempts times -const minAttempts = 3 - var ( now = time.Now // unit tests retriesIncrement = 2 * time.Minute + + // we won't quarantine a workload's pods until they all went + // through (were considered for upscales) minAttempts times + minAttempts = getEnv("QUARANTINE_MIN_ATTEMPTS", 2) + + // how many multiple of coolDownDelay will "long pending" + // pods be delayed (at most). + delayFactor = getEnv("QUARANTINE_DELAY_FACTOR", 3) ) type pendingTracker struct { @@ -159,7 +165,8 @@ func logSkipped(ref types.UID, tracker *pendingTracker, pods []*apiv1.Pod) { func buildDeadline(attempts int, coolDownDelay time.Duration) time.Time { increment := time.Duration(attempts-minAttempts) * retriesIncrement - delay := minDuration(increment, 2*coolDownDelay.Abs()) - jitterDuration(coolDownDelay.Abs()) + delay := minDuration(increment, time.Duration(delayFactor)*coolDownDelay.Abs()) + delay -= jitterDuration(coolDownDelay.Abs()) return now().Add(delay.Abs()) } @@ -174,3 +181,12 @@ func jitterDuration(duration time.Duration) time.Duration { jitter := time.Duration(rand.Int63n(int64(duration.Abs() + 1))) return jitter - duration/2 } + +func getEnv(key string, fallback int) int { + if value, ok := os.LookupEnv(key); ok { + if intVal, err := strconv.Atoi(value); err == nil { + return intVal + } + } + return fallback +} diff --git a/cluster-autoscaler/processors/datadog/pods/filter_long_pending_test.go b/cluster-autoscaler/processors/datadog/pods/filter_long_pending_test.go index d66ee398cca7..37ba39a7ae48 100644 --- a/cluster-autoscaler/processors/datadog/pods/filter_long_pending_test.go +++ b/cluster-autoscaler/processors/datadog/pods/filter_long_pending_test.go @@ -146,7 +146,7 @@ func TestBuildDeadline(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { now = func() time.Time { return time.Time{} } - maxExpected := now().Add(3 * test.coolDownDelay.Abs()) + maxExpected := now().Add(time.Duration(delayFactor+1) * test.coolDownDelay.Abs()) result := buildDeadline(test.attempts, test.coolDownDelay) if result.Before(now()) || result.After(maxExpected) { t.Errorf("buildDeadline(%v, %v) = %v, should be in [%v-%v] range",