Skip to content

Commit

Permalink
[local] Eager and longer long-pending quarantines
Browse files Browse the repository at this point in the history
Initial values were meant to be conservative and proven in the wild.

This was successful in the sense that no false positive delays were
detected. But less so in the sense that we missed opportunities to
protect overwhelmed autoscaler soon enough and for long enough. This
change makes it quarantine pods earlier, and for a longer time
(also securing more opportunities to get out of scale down cool down
periods).

I don't want to change code for future fine tuning, and I'd rather
avoid conflicting with command line flags during rebases (that
processor isn't available in upstream autoscaler), so changed
to support passing values from env.
  • Loading branch information
bpineau authored and domenicbozzuto committed Oct 31, 2023
1 parent bf4afce commit 910c80d
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 6 deletions.
26 changes: 21 additions & 5 deletions cluster-autoscaler/processors/datadog/pods/filter_long_pending.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ package pods

import (
"math/rand"
"os"
"strconv"
"time"

apiv1 "k8s.io/api/core/v1"
Expand All @@ -42,13 +44,17 @@ import (
klog "k8s.io/klog/v2"
)

// we won't quarantine a workload's pods until they all went
// through (were considered for upscales) minAttempts times
const minAttempts = 3

var (
now = time.Now // unit tests
retriesIncrement = 2 * time.Minute

// we won't quarantine a workload's pods until they all went
// through (were considered for upscales) minAttempts times
minAttempts = getEnv("QUARANTINE_MIN_ATTEMPTS", 2)

// how many multiple of coolDownDelay will "long pending"
// pods be delayed (at most).
delayFactor = getEnv("QUARANTINE_DELAY_FACTOR", 3)
)

type pendingTracker struct {
Expand Down Expand Up @@ -159,7 +165,8 @@ func logSkipped(ref types.UID, tracker *pendingTracker, pods []*apiv1.Pod) {

func buildDeadline(attempts int, coolDownDelay time.Duration) time.Time {
increment := time.Duration(attempts-minAttempts) * retriesIncrement
delay := minDuration(increment, 2*coolDownDelay.Abs()) - jitterDuration(coolDownDelay.Abs())
delay := minDuration(increment, time.Duration(delayFactor)*coolDownDelay.Abs())
delay -= jitterDuration(coolDownDelay.Abs())
return now().Add(delay.Abs())
}

Expand All @@ -174,3 +181,12 @@ func jitterDuration(duration time.Duration) time.Duration {
jitter := time.Duration(rand.Int63n(int64(duration.Abs() + 1)))
return jitter - duration/2
}

func getEnv(key string, fallback int) int {
if value, ok := os.LookupEnv(key); ok {
if intVal, err := strconv.Atoi(value); err == nil {
return intVal
}
}
return fallback
}
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ func TestBuildDeadline(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
now = func() time.Time { return time.Time{} }
maxExpected := now().Add(3 * test.coolDownDelay.Abs())
maxExpected := now().Add(time.Duration(delayFactor+1) * test.coolDownDelay.Abs())
result := buildDeadline(test.attempts, test.coolDownDelay)
if result.Before(now()) || result.After(maxExpected) {
t.Errorf("buildDeadline(%v, %v) = %v, should be in [%v-%v] range",
Expand Down

0 comments on commit 910c80d

Please sign in to comment.