From 3e19f30e76d969035cc9f8820df9341ccaf513ae Mon Sep 17 00:00:00 2001
From: "dom.bozzuto"
Date: Mon, 21 Oct 2024 14:30:39 -0400
Subject: [PATCH] Wait for initial cache to sync before starting autoscaling

---
 cluster-autoscaler/main.go | 43 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index 888c34ac2c2b..8fc29bcb5ec3 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -23,6 +23,7 @@ import (
 	"net/http"
 	"os"
 	"os/signal"
+	"reflect"
 	"strconv"
 	"strings"
 	"syscall"
@@ -608,10 +609,50 @@ func buildAutoscaler(debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter
 	// additional informers might have been registered in the factory during NewAutoscaler.
 	stop := make(chan struct{})
 	informerFactory.Start(stop)
-
+	klog.V(1).Info("Started shared informer factory, waiting for initial cache sync")
+
+	syncStart := time.Now()
+	allSynced := false
+	for !allSynced {
+		syncStatus := waitForCacheSyncWithTimeout(informerFactory, time.Second*10)
+		var missing []string
+		for t, synced := range syncStatus {
+			if !synced {
+				missing = append(missing, t.String())
+			}
+		}
+		if len(missing) > 0 {
+			klog.V(4).Infof("Still waiting to sync the following caches: %s", strings.Join(missing, ","))
+		} else {
+			allSynced = true
+		}
+	}
+	klog.V(1).Infof("Shared informer factory initialized, took %v", time.Since(syncStart))
 	return autoscaler, nil
 }
 
+// waitForCacheSyncWithTimeout waits for the informers started by
+// informerFactory to sync, giving up once timeout has elapsed. It returns the
+// per-type status map produced by WaitForCacheSync; the caller treats a false
+// entry as a cache that has not synced yet.
+func waitForCacheSyncWithTimeout(informerFactory informers.SharedInformerFactory, timeout time.Duration) map[reflect.Type]bool {
+	stopCh := make(chan struct{})
+
+	// Close stopCh on timeout instead of sending on it: a close never blocks
+	// and is seen by every receiver, whereas an unbuffered send deadlocks when
+	// WaitForCacheSync has already returned and nothing is receiving.
+	timer := time.AfterFunc(timeout, func() { close(stopCh) })
+
+	syncStatus := informerFactory.WaitForCacheSync(stopCh)
+
+	// Stop reports true only if the timer has not fired, so stopCh is closed
+	// exactly once on either path.
+	if timer.Stop() {
+		close(stopCh)
+	}
+	return syncStatus
+}
+
 func run(healthCheck *metrics.HealthCheck, debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter) {
 	metrics.RegisterAll(*emitPerNodeGroupMetrics)
 