From 396a348da9613931bdd5cd3d50beea9193900a75 Mon Sep 17 00:00:00 2001 From: Furst Roman Date: Wed, 27 Sep 2023 20:35:36 +0200 Subject: [PATCH] propagate current redis cluster state to redisfailover crd --- api/redisfailover/v1/defaults.go | 2 + api/redisfailover/v1/types.go | 9 +- api/redisfailover/v1/validate.go | 4 + api/redisfailover/v1/validate_test.go | 5 + ...atabases.spotahome.com_redisfailovers.yaml | 13 ++ mocks/operator/redisfailover/RedisFailover.go | 2 + mocks/service/k8s/Services.go | 2 + operator/redisfailover/checker.go | 130 ++++++++++++++++-- operator/redisfailover/checker_test.go | 3 + operator/redisfailover/handler.go | 4 +- service/k8s/redisfailover.go | 9 ++ 11 files changed, 171 insertions(+), 12 deletions(-) diff --git a/api/redisfailover/v1/defaults.go b/api/redisfailover/v1/defaults.go index cc8259141..5fb18c5eb 100644 --- a/api/redisfailover/v1/defaults.go +++ b/api/redisfailover/v1/defaults.go @@ -7,6 +7,8 @@ const ( defaultExporterImage = "quay.io/oliver006/redis_exporter:v1.43.0" defaultImage = "redis:6.2.6-alpine" defaultRedisPort = 6379 + HealthyState = "Healthy" + NotHealthyState = "NotHealthy" ) var ( diff --git a/api/redisfailover/v1/types.go b/api/redisfailover/v1/types.go index 45e801451..7c875c87e 100644 --- a/api/redisfailover/v1/types.go +++ b/api/redisfailover/v1/types.go @@ -17,7 +17,8 @@ import ( type RedisFailover struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - Spec RedisFailoverSpec `json:"spec"` + Spec RedisFailoverSpec `json:"spec"` + Status RedisFailoverStatus `json:"status,omitempty"` } // RedisFailoverSpec represents a Redis failover spec @@ -198,3 +199,9 @@ type RedisFailoverList struct { Items []RedisFailover `json:"items"` } + +type RedisFailoverStatus struct { + State string `json:"state,omitempty"` + LastChanged string `json:"lastChanged,omitempty"` + Message string `json:"message,omitempty"` +} diff --git a/api/redisfailover/v1/validate.go 
b/api/redisfailover/v1/validate.go index 5b4bcb4f8..c4a468aa0 100644 --- a/api/redisfailover/v1/validate.go +++ b/api/redisfailover/v1/validate.go @@ -61,6 +61,10 @@ func (r *RedisFailover) Validate() error { r.Spec.Sentinel.CustomConfig = defaultSentinelCustomConfig } + r.Status = RedisFailoverStatus{ + State: HealthyState, + } + return nil } diff --git a/api/redisfailover/v1/validate_test.go b/api/redisfailover/v1/validate_test.go index 30eede285..aa4742028 100644 --- a/api/redisfailover/v1/validate_test.go +++ b/api/redisfailover/v1/validate_test.go @@ -120,6 +120,11 @@ func TestValidate(t *testing.T) { }, BootstrapNode: test.expectedBootstrapNode, }, + Status: RedisFailoverStatus{ + State: HealthyState, + LastChanged: "", + Message: "", + }, } assert.Equal(expectedRF, rf) } else { diff --git a/charts/redisoperator/crds/databases.spotahome.com_redisfailovers.yaml b/charts/redisoperator/crds/databases.spotahome.com_redisfailovers.yaml index cc8ee11e8..72015cca2 100644 --- a/charts/redisoperator/crds/databases.spotahome.com_redisfailovers.yaml +++ b/charts/redisoperator/crds/databases.spotahome.com_redisfailovers.yaml @@ -12387,6 +12387,19 @@ spec: type: array type: object type: object + status: + description: CRD status defined by redisfailover cluster state + properties: + state: + description: state of redis failover cluster + type: string + lastChanged: + description: timestamp of last state change + type: string + message: + description: message for current state if needed + type: string + type: object required: - spec type: object diff --git a/mocks/operator/redisfailover/RedisFailover.go b/mocks/operator/redisfailover/RedisFailover.go index 33c086c26..67bc90ec8 100644 --- a/mocks/operator/redisfailover/RedisFailover.go +++ b/mocks/operator/redisfailover/RedisFailover.go @@ -71,6 +71,8 @@ func (_m *RedisFailover) WatchRedisFailovers(ctx context.Context, namespace stri return r0, r1 } +func (_m *RedisFailover) UpdateRedisFailoverStatus(ctx context.Context, 
namespace string, redisFailover *redisfailoverv1.RedisFailover, opts v1.UpdateOptions) {} + type mockConstructorTestingTNewRedisFailover interface { mock.TestingT Cleanup(func()) diff --git a/mocks/service/k8s/Services.go b/mocks/service/k8s/Services.go index e734f6f47..dde60801d 100644 --- a/mocks/service/k8s/Services.go +++ b/mocks/service/k8s/Services.go @@ -969,6 +969,8 @@ func (_m *Services) WatchRedisFailovers(ctx context.Context, namespace string, o return r0, r1 } +func (_m *Services) UpdateRedisFailoverStatus(ctx context.Context, namespace string, redisFailover *redisfailoverv1.RedisFailover, opts metav1.UpdateOptions) {} + type mockConstructorTestingTNewServices interface { mock.TestingT Cleanup(func()) diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index 1671da74d..eb93fc162 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -1,7 +1,10 @@ package redisfailover import ( + "context" "errors" + "github.com/spotahome/redis-operator/service/k8s" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "strconv" "time" @@ -85,6 +88,15 @@ func (r *RedisFailoverHandler) UpdateRedisesPods(rf *redisfailoverv1.RedisFailov // CheckAndHeal runs verifcation checks to ensure the RedisFailover is in an expected and healthy state. // If the checks do not match up to expectations, an attempt will be made to "heal" the RedisFailover into a healthy state. 
func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) error { + + oldState := rf.Status.State + + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.HealthyState, + } + + defer updateStatus(r.k8sservice, rf, oldState) + if rf.Bootstrapping() { return r.checkAndHealBootstrapMode(rf) } @@ -99,19 +111,33 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e // Sentinel knows the correct slave number if !r.rfChecker.IsRedisRunning(rf) { - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) + errorMsg := "not all replicas running" + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: errorMsg, + } + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New(errorMsg)) r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile") return nil } if !r.rfChecker.IsSentinelRunning(rf) { - setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) + errorMsg := "not all replicas running" + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: errorMsg, + } + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New(errorMsg)) r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") return nil } nMasters, err := r.rfChecker.GetNumberMasters(rf) if err != nil { + rf.Status = 
redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to get number of masters", + } return err } @@ -125,7 +151,12 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e err = r.rfHealer.SetOldestAsMaster(rf) setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err) if err != nil { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") + errorMsg := "Error in Setting oldest Pod as master" + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: errorMsg, + } + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf(errorMsg) return err } return nil @@ -138,6 +169,10 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of Masters running is 0") maxUptime, err := r.rfChecker.GetMaxRedisPodTime(rf) if err != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to get Redis POD time", + } return err } @@ -150,13 +185,22 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e err2 := r.rfHealer.SetOldestAsMaster(rf) setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err2) if err2 != nil { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") + errorMsg := "Error in Setting oldest Pod as master" + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: errorMsg, + } + r.logger.WithField("redisfailover", 
rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf(errorMsg) return err2 } } else { //sentinels are having a quorum to make a failover , but check if redis are not having local hostip (first boot) as master status, err2 := r.rfChecker.CheckIfMasterLocalhost(rf) if err2 != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to check if master localhost", + } r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("CheckIfMasterLocalhost failed retry later") return err2 } else if status { @@ -165,7 +209,12 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e err3 := r.rfHealer.SetOldestAsMaster(rf) setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err3) if err3 != nil { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") + errorMsg := "Error in Setting oldest Pod as master" + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: errorMsg, + } + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf(errorMsg) return err3 } @@ -183,11 +232,20 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil) default: setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("multiple masters detected")) - return errors.New("more than one master, fix manually") + errorMsg := "more than one master, fix manually" + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: errorMsg, + } + 
return errors.New(errorMsg) } master, err := r.rfChecker.GetMasterIP(rf) if err != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to get master IP", + } return err } @@ -196,6 +254,9 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e if err != nil { r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Slave not associated to master: %s", err.Error()) if err = r.rfHealer.SetMasterOnAll(master, rf); err != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + } return err } } @@ -203,16 +264,28 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e err = r.applyRedisCustomConfig(rf) setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_REDIS_CONFIG, metrics.NOT_APPLICABLE, err) if err != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to apply custom config", + } return err } err = r.UpdateRedisesPods(rf) if err != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to update redis PODs", + } return err } sentinels, err := r.rfChecker.GetSentinelsIPs(rf) if err != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to get sentinels IPs", + } return err } @@ -223,6 +296,9 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e if err != nil { r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error()) if err := r.rfHealer.NewSentinelMonitor(sip, master, rf); err != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, 
+ } return err } } @@ -233,18 +309,27 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.RedisFailover) error { if !r.rfChecker.IsRedisRunning(rf) { - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) + errorMsg := "not all replicas running" + r.k8sservice.UpdateRedisFailoverStatus(context.Background(), rf.Namespace, rf, metav1.UpdateOptions{}) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New(errorMsg)) r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile") return nil } err := r.UpdateRedisesPods(rf) if err != nil { - return err + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to update Redis PODs", + } } err = r.applyRedisCustomConfig(rf) setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_REDIS_CONFIG, metrics.NOT_APPLICABLE, err) if err != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to set Redis custom config", + } return err } @@ -252,18 +337,34 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red err = r.rfHealer.SetExternalMasterOnAll(bootstrapSettings.Host, bootstrapSettings.Port, rf) setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_EXTERNAL_MASTER, metrics.NOT_APPLICABLE, err) if err != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to set external master to all", + } return err } if rf.SentinelsAllowed() { if 
!r.rfChecker.IsSentinelRunning(rf) { - setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) + errorMsg := "not all replicas running" + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: errorMsg, + } + r.k8sservice.UpdateRedisFailoverStatus(context.Background(), rf.Namespace, rf, metav1.UpdateOptions{}) + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New(errorMsg)) r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") return nil + } else { + r.k8sservice.UpdateRedisFailoverStatus(context.Background(), rf.Namespace, rf, metav1.UpdateOptions{}) } sentinels, err := r.rfChecker.GetSentinelsIPs(rf) if err != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to get sentinels IPs", + } return err } for _, sip := range sentinels { @@ -272,6 +373,10 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red if err != nil { r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Fixing sentinel not monitoring expected master: %s", err.Error()) if err := r.rfHealer.NewSentinelMonitorWithPort(sip, bootstrapSettings.Host, bootstrapSettings.Port, rf); err != nil { + rf.Status = redisfailoverv1.RedisFailoverStatus{ + State: redisfailoverv1.NotHealthyState, + Message: "unable to check sentinel monitor", + } return err } } @@ -346,3 +451,10 @@ func setRedisCheckerMetrics(metricsClient metrics.Recorder, mode /* redis or sen } } } + +func updateStatus(k8sservice k8s.Services, rf *redisfailoverv1.RedisFailover, oldState string) { + if oldState != 
rf.Status.State { + rf.Status.LastChanged = time.Now().Format(time.RFC3339) + } + k8sservice.UpdateRedisFailoverStatus(context.Background(), rf.Namespace, rf, metav1.UpdateOptions{}) +} diff --git a/operator/redisfailover/checker_test.go b/operator/redisfailover/checker_test.go index 2e83a47b4..a6816b043 100644 --- a/operator/redisfailover/checker_test.go +++ b/operator/redisfailover/checker_test.go @@ -3,6 +3,7 @@ package redisfailover_test import ( "errors" "fmt" + v1 "github.com/spotahome/redis-operator/api/redisfailover/v1" "testing" "time" @@ -420,8 +421,10 @@ func TestCheckAndHeal(t *testing.T) { if expErr { assert.Error(err) + assert.Equal(v1.NotHealthyState, rf.Status.State) } else { assert.NoError(err) + assert.Equal(v1.HealthyState, rf.Status.State) } mrfc.AssertExpectations(t) mrfh.AssertExpectations(t) diff --git a/operator/redisfailover/handler.go b/operator/redisfailover/handler.go index 6a03afd56..0880360df 100644 --- a/operator/redisfailover/handler.go +++ b/operator/redisfailover/handler.go @@ -31,7 +31,7 @@ var ( // resources that a RF needs. 
type RedisFailoverHandler struct { config Config - k8sservice k8s.Service + k8sservice k8s.Services rfService rfservice.RedisFailoverClient rfChecker rfservice.RedisFailoverCheck rfHealer rfservice.RedisFailoverHeal @@ -40,7 +40,7 @@ type RedisFailoverHandler struct { } // NewRedisFailoverHandler returns a new RF handler -func NewRedisFailoverHandler(config Config, rfService rfservice.RedisFailoverClient, rfChecker rfservice.RedisFailoverCheck, rfHealer rfservice.RedisFailoverHeal, k8sservice k8s.Service, mClient metrics.Recorder, logger log.Logger) *RedisFailoverHandler { +func NewRedisFailoverHandler(config Config, rfService rfservice.RedisFailoverClient, rfChecker rfservice.RedisFailoverCheck, rfHealer rfservice.RedisFailoverHeal, k8sservice k8s.Services, mClient metrics.Recorder, logger log.Logger) *RedisFailoverHandler { return &RedisFailoverHandler{ config: config, rfService: rfService, diff --git a/service/k8s/redisfailover.go b/service/k8s/redisfailover.go index 9878390f6..08be9dd23 100644 --- a/service/k8s/redisfailover.go +++ b/service/k8s/redisfailover.go @@ -18,6 +18,7 @@ type RedisFailover interface { ListRedisFailovers(ctx context.Context, namespace string, opts metav1.ListOptions) (*redisfailoverv1.RedisFailoverList, error) // WatchRedisFailovers watches the redisfailovers on a cluster. WatchRedisFailovers(ctx context.Context, namespace string, opts metav1.ListOptions) (watch.Interface, error) + UpdateRedisFailoverStatus(ctx context.Context, namespace string, redisFailover *redisfailoverv1.RedisFailover, opts metav1.UpdateOptions) } // RedisFailoverService is the RedisFailover service implementation using API calls to kubernetes. 
@@ -50,3 +51,16 @@ func (r *RedisFailoverService) WatchRedisFailovers(ctx context.Context, namespac
 	recordMetrics(namespace, "RedisFailover", metrics.NOT_APPLICABLE, "WATCH", err, r.metricsRecorder)
 	return watcher, err
 }
+
+// UpdateRedisFailoverStatus writes redisFailover back through the RedisFailovers
+// client's Update call. It returns nothing: a failed update is logged here and
+// is not reported to the caller.
+func (r *RedisFailoverService) UpdateRedisFailoverStatus(ctx context.Context, namespace string, redisFailover *redisfailoverv1.RedisFailover, opts metav1.UpdateOptions) {
+	_, err := r.k8sCli.DatabasesV1().RedisFailovers(namespace).Update(ctx, redisFailover, opts)
+	// Record the outcome on success as well as on failure, as WatchRedisFailovers
+	// above does, so successful updates are counted too.
+	recordMetrics(namespace, "RedisFailover", metrics.NOT_APPLICABLE, "UPDATE", err, r.metricsRecorder)
+	if err != nil {
+		r.logger.Errorf("Error while updating RedisFailover %s/%s : %s", redisFailover.Namespace, redisFailover.Name, err.Error())
+	}
+}