From 7c6427c986ff6b68447cb289a368dbc52a0926f9 Mon Sep 17 00:00:00 2001 From: Natali Aharoni <105302293+NataliAharoniPayu@users.noreply.github.com> Date: Sun, 10 Jul 2022 13:47:56 +0300 Subject: [PATCH] Operator Horizontal Scale (#139) * Add horizontal scale feature * New design and architecture to support horizontal scale and recovery flows --- .gitignore | 2 + Dockerfile | 11 +- README.md | 38 + config/configfiles/operator.conf | 134 +- config/configfiles/redis.conf | 4 +- config/configfiles/users.acl | 2 +- config/configmap/kustomization.yaml | 6 +- config/manager/base/manager.yaml | 2 +- config/samples/local_cluster.yaml | 2 +- config/samples/updated_cluster.yaml | 2 +- controllers/config.go | 212 +- controllers/configmap_controller.go | 24 +- controllers/k8sresources.go | 546 ++-- controllers/operator_entrypoints_module.go | 328 +++ controllers/rediscli/cli.go | 148 +- controllers/rediscli/cli_test.go | 151 +- controllers/rediscli/info.go | 17 +- controllers/redisclient/redis_client.go | 163 ++ controllers/rediscluster.go | 2318 +++++++++++------ controllers/rediscluster_controller.go | 306 ++- controllers/testlab/test_lab.go | 527 ++++ controllers/view/cluster_view.go | 176 ++ data/clusterData.go | 32 - go.mod | 7 +- go.sum | 69 +- hack/cloud.yaml | 14 +- hack/dev.Dockerfile | 2 +- hack/gen_kind_config.py | 2 +- hack/install.sh | 18 + hack/redis-bin/Dockerfile | 2 +- .../rediscluster/configmap-redisconf.yaml | 2 +- main.go | 19 +- server/module.go | 85 - server/router.go | 17 +- server/server.go | 6 +- test/e2e/rediscluster_test.go | 9 +- test/framework/k8sresources.go | 7 +- test/framework/rediscluster.go | 9 +- tilt.Dockerfile | 2 +- 39 files changed, 4074 insertions(+), 1347 deletions(-) create mode 100644 controllers/operator_entrypoints_module.go create mode 100644 controllers/redisclient/redis_client.go create mode 100644 controllers/testlab/test_lab.go create mode 100644 controllers/view/cluster_view.go delete mode 100644 data/clusterData.go delete mode 100644 server/module.go diff --git a/.gitignore b/.gitignore index d80eea22..86ece502 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,8 @@ bin *.swp *.swo *~ +.metals* +.vscode* .vscode/* !.vscode/settings.json !.vscode/tasks.json diff --git a/Dockerfile b/Dockerfile index fb81195f..2b839fbc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Build the manager binary -FROM golang:1.15 as builder +FROM golang:1.16 as builder ARG DEBIAN_FRONTEND=noninteractive @@ -10,6 +10,7 @@ RUN apt-get update \ && apt-get install -y curl # install redis cli + RUN cd /tmp &&\ curl http://download.redis.io/redis-stable.tar.gz | tar xz &&\ make -C redis-stable &&\ @@ -28,18 +29,20 @@ COPY main.go main.go COPY api/ api/ COPY controllers/ controllers/ COPY server/ server/ -COPY data/ data/ # Build RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -a -o manager main.go # Use distroless as minimal base image to package the manager binary # Refer to https://github.com/GoogleContainerTools/distroless for more details -FROM gcr.io/distroless/base-debian10 +FROM gcr.io/distroless/base-debian11 WORKDIR / COPY --from=builder /workspace/manager . COPY --from=builder /bin/redis-cli . +COPY --from=builder /bin ./bin +COPY --from=builder /lib ./lib +COPY --from=builder /usr/bin/yes . 
USER nonroot:nonroot ENV PATH="./:${PATH}" -ENTRYPOINT ["/manager"] +ENTRYPOINT ["/manager"] \ No newline at end of file diff --git a/README.md b/README.md index 2cbb10ea..f540cc35 100644 --- a/README.md +++ b/README.md @@ -14,11 +14,22 @@ Requirements: * `kustomize` >= 4.0 * `docker`: latest version, at least 6.25 GB of memory limit +**Quick Start** + +```bash +sh ./hack/install.sh # you might need to run this as sudo if a regular user can't use docker +cd ./hack/redis-bin -> sh ./run.sh # in case it is the first run on the local machine +tilt up +``` + +**Set up on non-local env** + **1. Setting up a cluster** ```bash cd hack sh ./install.sh # you might need to run this as sudo if a regular user can't use docker +cd ./hack/redis-bin -> sh ./run.sh # in case it is the first run on the local machine ``` If the `.kube/config` file was not updated it can be populated using @@ -82,6 +93,33 @@ Run non cache `go test` command on specific path. for example: go test -count=1 ./controllers/rediscli/ ``` +### Use the test cluster feature + +The test cluster feature is a set of tests that run asynchronously to the operator manager loop. They simulate: +* Loss of a random follower in the cluster +* Loss of a random leader in the cluster +* Loss of a random follower and a random leader (that own different sets of slots) +* Loss of all followers +* Loss of all nodes besides one randomly chosen replica per slots range - sometimes the survivor is a follower and sometimes it is a leader (an actual scenario, for example, is the loss of all AZs besides one) +* Loss of a leader and all of its followers + +The test can fill the cluster nodes with mock data during the simulated "live site", keeping track of which and how many of the inserted keys succeeded; at the end of the recovery process it attempts to read the keys from the cluster and match each value against the value recorded while the writes were tracked. + +The report reflects: +* Whether the recovery process succeeded with a healthy and ready cluster before the test timeout expired (a configurable estimated value) +* How many writes succeeded, as a ratio of the keys that were intended to be inserted +* How many reads succeeded, as a ratio of the number of successful writes +* Both values in terms of: actual amount and success rate + +Run the test (a concrete port-forward/curl sketch follows this README section): +* Port forward the manager to some local port (8080 for example) +* ```curl -X POST localhost:/test``` (no mock data) +* ```curl -X POST localhost:/testData``` (with mock data) + +Note: +Running the test lab with mock data is considered a sensitive operation and is not allowed by default. +In order to enable it, the config param 'ExposeSensitiveEntryPoints' needs to be set to 'true' (please follow the config file documentation regarding this param before doing so). + ### Development using Tilt The recommended development flow is based on [Tilt](https://tilt.dev/) - it is used for quick iteration on code running in live containers.
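For reference, here is a minimal sketch of the test-lab flow described in the README above. The deployment name, namespace, and manager HTTP port are placeholders (assumptions, not values taken from this patch); only the `/test` and `/testData` routes, the example local port 8080, and the `ExposeSensitiveEntryPoints` setting come from the README and config files in this change.

```bash
# Forward the operator manager's HTTP port to local port 8080 (placeholders marked with <...>)
kubectl port-forward deploy/<operator-manager-deployment> 8080:<manager-http-port> -n <namespace>

# Trigger the test lab without mock data
curl -X POST localhost:8080/test

# Trigger the test lab with mock data (requires 'ExposeSensitiveEntryPoints: true' in operator.conf)
curl -X POST localhost:8080/testData
```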
diff --git a/config/configfiles/operator.conf b/config/configfiles/operator.conf index b3b04286..bac78ddf 100644 --- a/config/configfiles/operator.conf +++ b/config/configfiles/operator.conf @@ -1,31 +1,44 @@ + + +# The setters value defines a set of boolean indicators that tell the operator whether to apply or avoid a behaviour that is otherwise practiced by default + +# The operator exposes a set of entry points that can serve the user in case of need; some of them are sensitive and hold the potential to harm existing data if the operator is deployed in production. +# Those same entry points can serve the user for debugging, validation and testing if the operator is deployed in a development environment. +# The following indicator serves as a 'feature bit' that tells the operator to hide those sensitive entry points in order to avoid harm on sensitive environments; by default it is set to 'false' (recommended). +# ExposeSensitiveEntryPoints + +# The thresholds value sets definite boundaries for the operator when running concurrent operations +# and when making decisions based on the stated values + +# During new node initialization, a request for data replication is sent, and each new node is sampled and watched until the threshold is reached +# in order to make sure the sync process is performed properly +# SyncMatchThreshold + +# During the recovery process, missing pods will be recreated asynchronously; +# this value sets the maximum number of unhealthy nodes that will be recovered by the operator at once per reconcile loop +# MaxToleratedPodsRecoverAtOnce + +# During the update process, pods get failed over, removed, and recreated so the new ones reflect +# the newly requested spec; +# this value sets the maximum number of nodes to be deleted at once per update loop +# MaxToleratedPodsUpdateAtOnce + # The wait times are defined by an interval value - how often the check is done # and a timeout value, total amount of time to wait before considering the # operation failed. -# Wait duration for the SYNC operation start. After a new node connects to a leader -# there can be a delay before the sync operation starts. -# SyncStartCheckInterval -# SyncStartCheckTimeout - -# Wait duration for the SYNC operation. +# Wait values for the SYNC operation. # SyncCheckInterval # SyncCheckTimeout -# Wait duration for the LOAD operation start. -# LoadStartCheckInterval -# LoadStartCheckTimeout - -# Wait duration for the LOAD operation. This time should be set reasonably high -# because it depends on the size of the DB shards and network latency. Make sure -# the time is high enough to allow for the data transfer between two nodes. -# The LOAD and SYNC operations are important during the recreation of a lost -# node, when the data from a leader is loaded on a replica. -# https://redis.io/topics/replication -# The operator uses the INFO message from Redis to get information about the -# status of SYNC (master_sync_in_progress) and LOAD (loading_eta_seconds) -# https://redis.io/commands/info -# LoadCheckInterval -# LoadCheckTimeout +# Wait values for redis cluster configuration alignment. +# SleepDuringTablesAlignProcess +# RedisNodesAgreeAboutSlotsConfigCheckInterval +# RedisNodesAgreeAboutSlotsConfigTimeout + +# Wait duration of the '--cluster create' command. +# ClusterCreateInterval +# ClusterCreateTimeout # The estimated time it takes for volume mounted configmaps to be updated on the # pods.
After a configmap is changed, the configmap controller will update a @@ -35,10 +48,6 @@ # The estimated time it takes for Redis to load the new config map from disk. # ACLFileLoadDuration -# Wait duration of the '--cluster create' command. -# ClusterCreateInterval -# ClusterCreateTimeout - # Wait duration for a pod to be in ready state - pod is in Ready state and # the containers passed all conditions. # PodReadyCheckInterval @@ -52,6 +61,10 @@ # PodDeleteCheckInterval # PodDeleteCheckTimeout +# Wait duration for the removal of node id from other nodes tables +# RedisRemoveNodeCheckInterval +# RedisRemoveNodeTimeout + # Duration of the PING command. # RedisPingCheckInterval # RedisPingCheckTimeout @@ -60,6 +73,10 @@ # RedisClusterReplicationCheckInterval # RedisClusterReplicationCheckTimeout +# Wait duration for nodes to load dataset to their memory +# WaitForRedisLoadDataSetInMemoryCheckInterval +# WaitForRedisLoadDataSetInMemoryTimeout + # Wait duration of the MEET command. # RedisClusterMeetCheckInterval # RedisClusterMeetCheckTimeout @@ -74,32 +91,43 @@ # RedisAutoFailoverCheckInterval # RedisAutoFailoverCheckTimeout +# SleepIfForgetNodeFails +# If forget node function fails, sleep before taking any deletion or irreversible action + +setters: + ExposeSensitiveEntryPoints: false +thresholds: + SyncMatchThreshold: 90 + MaxToleratedPodsRecoverAtOnce: 15 + MaxToleratedPodsUpdateAtOnce: 5 times: - syncStartCheckInterval: 500ms - syncStartCheckTimeout: 15000ms - syncCheckInterval: 500ms - syncCheckTimeout: 15000ms - loadCheckInterval: 500ms - loadCheckTimeout: 180000ms - loadStartCheckInterval: 500ms - loadStartCheckTimeout: 180000ms - clusterCreateInterval: 5000ms - clusterCreateTimeout: 30000ms - aclFilePropagationDuration: 5000ms - aclFileLoadDuration: 500ms - podReadyCheckInterval: 2000ms - podReadyCheckTimeout: 30000ms - podNetworkCheckInterval: 2000ms - podNetworkCheckTimeout: 30000ms - podDeleteCheckInterval: 2000ms - podDeleteCheckTimeout: 30000ms - redisPingCheckInterval: 2000ms - redisPingCheckTimeout: 30000ms - redisClusterReplicationCheckInterval: 2000ms - redisClusterReplicationCheckTimeout: 30000ms - redisClusterMeetCheckInterval: 2000ms - redisClusterMeetCheckTimeout: 30000ms - redisManualFailoverCheckInterval: 2000ms - redisManualFailoverCheckTimeout: 30000ms - redisAutoFailoverCheckInterval: 2000ms - redisAutoFailoverCheckTimeout: 30000ms + SyncCheckInterval: 5000ms + SyncCheckTimeout: 30000ms + SleepDuringTablesAlignProcess: 12000ms + ClusterCreateInterval: 5000ms + ClusterCreateTimeout: 90000ms + ACLFilePropagationDuration: 5000ms + ACLFileLoadDuration: 5000ms + PodReadyCheckInterval: 3000ms + PodReadyCheckTimeout: 30000ms + PodNetworkCheckInterval: 3000ms + PodNetworkCheckTimeout: 60000ms + PodDeleteCheckInterval: 3000ms + PodDeleteCheckTimeout: 60000ms + RedisPingCheckInterval: 2000ms + RedisPingCheckTimeout: 20000ms + RedisClusterReplicationCheckInterval: 2000ms + RedisClusterReplicationCheckTimeout: 30000ms + RedisClusterMeetCheckInterval: 2000ms + RedisClusterMeetCheckTimeout: 10000ms + RedisManualFailoverCheckInterval: 5000ms + RedisManualFailoverCheckTimeout: 40000ms + RedisAutoFailoverCheckInterval: 5000ms + RedisAutoFailoverCheckTimeout: 40000ms + RedisNodesAgreeAboutSlotsConfigCheckInterval: 3000ms + RedisNodesAgreeAboutSlotsConfigTimeout: 12000ms + RedisRemoveNodeCheckInterval: 2000ms + RedisRemoveNodeTimeout: 20000ms + WaitForRedisLoadDataSetInMemoryCheckInterval: 2000ms + WaitForRedisLoadDataSetInMemoryTimeout: 10000ms + SleepIfForgetNodeFails: 20000ms \ No 
newline at end of file diff --git a/config/configfiles/redis.conf b/config/configfiles/redis.conf index c881c37b..a3d2cb14 100644 --- a/config/configfiles/redis.conf +++ b/config/configfiles/redis.conf @@ -1513,6 +1513,8 @@ cluster-require-full-coverage no # In order to setup your cluster make sure to read the documentation # available at https://redis.io web site. +enable-debug-command yes + ########################## CLUSTER DOCKER/NAT support ######################## # In certain deployments, Redis Cluster nodes address discovery fails, because @@ -2048,4 +2050,4 @@ jemalloc-bg-thread yes # by setting the following config which takes a space delimited list of warnings # to suppress # -# ignore-warnings ARM64-COW-BUG +# ignore-warnings ARM64-COW-BUG \ No newline at end of file diff --git a/config/configfiles/users.acl b/config/configfiles/users.acl index 18246f0c..af0f19e3 100644 --- a/config/configfiles/users.acl +++ b/config/configfiles/users.acl @@ -1,4 +1,4 @@ user default off nopass -@all user admin on #713bfda78870bf9d1b261f565286f85e97ee614efe5f0faf7c34e7ca4f65baca ~* &* +@all user testuser on #13d249f2cb4127b40cfa757866850278793f814ded3c587fe5889e889a7a9f6c ~testkey:* &* -@all +get +set -user rdcuser on #400f9f96b4a343f4766d29dbe7bee178d7de6e186464d22378214c0232fb38ca &* -@all +replconf +ping +psync +user rdcuser on #400f9f96b4a343f4766d29dbe7bee178d7de6e186464d22378214c0232fb38ca &* -@all +replconf +ping +psync \ No newline at end of file diff --git a/config/configmap/kustomization.yaml b/config/configmap/kustomization.yaml index 8c520a7a..8d86fee3 100644 --- a/config/configmap/kustomization.yaml +++ b/config/configmap/kustomization.yaml @@ -7,16 +7,16 @@ configMapgenerator: - ../configfiles/redis.conf options: labels: - redis-cluster: rdc-test + redis-cluster: dev-rdc - name: users-acl files: - ../configfiles/users.acl options: labels: - redis-cluster: rdc-test + redis-cluster: dev-rdc - name: operator-config files: - ../configfiles/operator.conf options: labels: - redis-operator: rdc-test + redis-operator: dev-rdc diff --git a/config/manager/base/manager.yaml b/config/manager/base/manager.yaml index 761aa3e2..803607f5 100644 --- a/config/manager/base/manager.yaml +++ b/config/manager/base/manager.yaml @@ -5,7 +5,7 @@ metadata: namespace: system labels: control-plane: controller-manager - redis-operator: rdc-test + redis-operator: dev-rdc spec: selector: matchLabels: diff --git a/config/samples/local_cluster.yaml b/config/samples/local_cluster.yaml index 125d0e7d..e92c0f2e 100644 --- a/config/samples/local_cluster.yaml +++ b/config/samples/local_cluster.yaml @@ -1,7 +1,7 @@ apiVersion: db.payu.com/v1 kind: RedisCluster metadata: - name: rdc-test + name: dev-rdc namespace: default spec: leaderCount: 3 diff --git a/config/samples/updated_cluster.yaml b/config/samples/updated_cluster.yaml index 92a47546..799af1d2 100644 --- a/config/samples/updated_cluster.yaml +++ b/config/samples/updated_cluster.yaml @@ -1,7 +1,7 @@ apiVersion: db.payu.com/v1 kind: RedisCluster metadata: - name: rdc-test + name: dev-rdc namespace: default spec: leaderCount: 3 diff --git a/controllers/config.go b/controllers/config.go index 2a3f6bac..77a2c93f 100644 --- a/controllers/config.go +++ b/controllers/config.go @@ -10,35 +10,50 @@ import ( "k8s.io/utils/inotify" ) +/* + +# The setters value defines a set of indicators for the operator to determine if to adjust or avoid a behaviour that can be practiced as default based on the booleanic value of the related indicator + +# The operator exposes set of 
entry points that can serve the user in case of need; some of them are sensitive and hold the potential to harm existing data if the operator is deployed in production. +# Those same entry points can serve the user for debugging, validation and testing if the operator is deployed in a development environment. +# The following indicator serves as a 'feature bit' that tells the operator to hide those sensitive entry points in order to avoid harm on sensitive environments; by default it is set to 'false' (recommended). +# ExposeSensitiveEntryPoints + +# The thresholds value sets definite boundaries for the operator when running concurrent operations +# and when making decisions based on the stated values + +# During new node initialization, a request for data replication is sent, and each new node is sampled and watched until the threshold is reached +# in order to make sure the sync process is performed properly +# SyncMatchThreshold + +# During the recovery process, missing pods will be recreated asynchronously; +# this value sets the maximum number of unhealthy nodes that will be recovered by the operator at once per reconcile loop +# MaxToleratedPodsRecoverAtOnce + +# During the update process, pods get failed over, removed, and recreated so the new ones reflect +# the newly requested spec; +# this value sets the maximum number of nodes to be deleted at once per update loop +# MaxToleratedPodsUpdateAtOnce + +*/ + /* # The wait times are defined by an interval value - how often the check is done # and a timeout value, total amount of time to wait before considering the # operation failed. -# Wait duration for the SYNC operation start. After a new node connects to a leader -# there can be a delay before the sync operation starts. -# SyncStartCheckInterval -# SyncStartCheckTimeout - -# Wait duration for the SYNC operation. +# Wait values for the SYNC operation. # SyncCheckInterval # SyncCheckTimeout -# Wait duration for the LOAD operation start. -# LoadStartCheckInterval -# LoadStartCheckTimeout - -# Wait duration for the LOAD operation. This time should be set reasonably high -# because it depends on the size of the DB shards and network latency. Make sure -# the time is high enough to allow for the data transfer between two nodes. -# The LOAD and SYNC operations are important during the recreation of a lost -# node, when the data from a leader is loaded on a replica. -# https://redis.io/topics/replication -# The operator uses the INFO message from Redis to get information about the -# status of SYNC (master_sync_in_progress) and LOAD (loading_eta_seconds) -# https://redis.io/commands/info -# LoadCheckInterval -# LoadCheckTimeout +# Wait values for redis cluster configuration alignment. +# SleepDuringTablesAlignProcess +# RedisNodesAgreeAboutSlotsConfigCheckInterval +# RedisNodesAgreeAboutSlotsConfigTimeout + +# Wait duration of the '--cluster create' command. +# ClusterCreateInterval +# ClusterCreateTimeout # The estimated time it takes for volume mounted configmaps to be updated on the # pods. After a configmap is changed, the configmap controller will update a @@ -48,10 +63,6 @@ import ( # The estimated time it takes for Redis to load the new config map from disk. # ACLFileLoadDuration -# Wait duration of the '--cluster create' command. -# ClusterCreateInterval -# ClusterCreateTimeout - # Wait duration for a pod to be in ready state - pod is in Ready state and # the containers passed all conditions.
# PodReadyCheckInterval @@ -65,6 +76,10 @@ import ( # PodDeleteCheckInterval # PodDeleteCheckTimeout +# Wait duration for the removal of node id from other nodes tables +# RedisRemoveNodeCheckInterval +# RedisRemoveNodeTimeout + # Duration of the PING command. # RedisPingCheckInterval # RedisPingCheckTimeout @@ -73,6 +88,10 @@ import ( # RedisClusterReplicationCheckInterval # RedisClusterReplicationCheckTimeout +# Wait duration for nodes to load dataset to their memory +# WaitForRedisLoadDataSetInMemoryCheckInterval +# WaitForRedisLoadDataSetInMemoryTimeout + # Wait duration of the MEET command. # RedisClusterMeetCheckInterval # RedisClusterMeetCheckTimeout @@ -86,6 +105,9 @@ import ( # failure. # RedisAutoFailoverCheckInterval # RedisAutoFailoverCheckTimeout + +# SleepIfForgetNodeFails +# If forget node function fails, sleep before taking any deletion or irreversible action */ type RedisOperatorConfig struct { @@ -94,39 +116,53 @@ type RedisOperatorConfig struct { Config OperatorConfig } +type OperatorSetters struct { + ExposeSensitiveEntryPoints bool `yaml:"ExposeSensitiveEntryPoints"` +} + +type OperatorConfigThresholds struct { + SyncMatchThreshold int `yaml:"SyncMatchThreshold"` + MaxToleratedPodsRecoverAtOnce int `yaml:"MaxToleratedPodsRecoverAtOnce"` + MaxToleratedPodsUpdateAtOnce int `yaml:"MaxToleratedPodsUpdateAtOnce"` +} + type OperatorConfigTimes struct { - SyncStartCheckInterval time.Duration `yaml:"syncStartCheckInterval"` - SyncStartCheckTimeout time.Duration `yaml:"syncStartCheckTimeout"` - SyncCheckInterval time.Duration `yaml:"syncCheckInterval"` - SyncCheckTimeout time.Duration `yaml:"syncCheckTimeout"` - LoadStartCheckInterval time.Duration `yaml:"loadStartCheckInterval"` - LoadStartCheckTimeout time.Duration `yaml:"loadStartCheckTimeout"` - LoadCheckInterval time.Duration `yaml:"loadCheckInterval"` - LoadCheckTimeout time.Duration `yaml:"loadCheckTimeout"` - ClusterCreateInterval time.Duration `yaml:"clusterCreateInterval"` - ClusterCreateTimeout time.Duration `yaml:"clusterCreateTimeout"` - ACLFilePropagationDuration time.Duration `yaml:"aclFilePropagationDuration"` - ACLFileLoadDuration time.Duration `yaml:"aclFileLoadDuration"` - PodReadyCheckInterval time.Duration `yaml:"podReadyCheckInterval"` - PodReadyCheckTimeout time.Duration `yaml:"podReadyCheckTimeout"` - PodNetworkCheckInterval time.Duration `yaml:"podNetworkCheckInterval"` - PodNetworkCheckTimeout time.Duration `yaml:"podNetworkCheckTimeout"` - PodDeleteCheckInterval time.Duration `yaml:"podDeleteCheckInterval"` - PodDeleteCheckTimeout time.Duration `yaml:"podDeleteCheckTimeout"` - RedisPingCheckInterval time.Duration `yaml:"redisPingCheckInterval"` - RedisPingCheckTimeout time.Duration `yaml:"redisPingCheckTimeout"` - RedisClusterReplicationCheckInterval time.Duration `yaml:"redisClusterReplicationCheckInterval"` - RedisClusterReplicationCheckTimeout time.Duration `yaml:"redisClusterReplicationCheckTimeout"` - RedisClusterMeetCheckInterval time.Duration `yaml:"redisClusterMeetCheckInterval"` - RedisClusterMeetCheckTimeout time.Duration `yaml:"redisClusterMeetCheckTimeout"` - RedisManualFailoverCheckInterval time.Duration `yaml:"redisManualFailoverCheckInterval"` - RedisManualFailoverCheckTimeout time.Duration `yaml:"redisManualFailoverCheckTimeout"` - RedisAutoFailoverCheckInterval time.Duration `yaml:"redisAutoFailoverCheckInterval"` - RedisAutoFailoverCheckTimeout time.Duration `yaml:"redisAutoFailoverCheckTimeout"` + SyncCheckInterval time.Duration `yaml:"SyncCheckInterval"` + SyncCheckTimeout 
time.Duration `yaml:"SyncCheckTimeout"` + SleepDuringTablesAlignProcess time.Duration `yaml:"SleepDuringTablesAlignProcess"` + ClusterCreateInterval time.Duration `yaml:"ClusterCreateInterval"` + ClusterCreateTimeout time.Duration `yaml:"ClusterCreateTimeout"` + ACLFilePropagationDuration time.Duration `yaml:"ACLFilePropagationDuration"` + ACLFileLoadDuration time.Duration `yaml:"ACLFileLoadDuration"` + PodReadyCheckInterval time.Duration `yaml:"PodReadyCheckInterval"` + PodReadyCheckTimeout time.Duration `yaml:"PodReadyCheckTimeout"` + PodNetworkCheckInterval time.Duration `yaml:"PodNetworkCheckInterval"` + PodNetworkCheckTimeout time.Duration `yaml:"PodNetworkCheckTimeout"` + PodDeleteCheckInterval time.Duration `yaml:"PodDeleteCheckInterval"` + PodDeleteCheckTimeout time.Duration `yaml:"PodDeleteCheckTimeout"` + RedisPingCheckInterval time.Duration `yaml:"RedisPingCheckInterval"` + RedisPingCheckTimeout time.Duration `yaml:"RedisPingCheckTimeout"` + RedisClusterReplicationCheckInterval time.Duration `yaml:"RedisClusterReplicationCheckInterval"` + RedisClusterReplicationCheckTimeout time.Duration `yaml:"RedisClusterReplicationCheckTimeout"` + RedisClusterMeetCheckInterval time.Duration `yaml:"RedisClusterMeetCheckInterval"` + RedisClusterMeetCheckTimeout time.Duration `yaml:"RedisClusterMeetCheckTimeout"` + RedisManualFailoverCheckInterval time.Duration `yaml:"RedisManualFailoverCheckInterval"` + RedisManualFailoverCheckTimeout time.Duration `yaml:"RedisManualFailoverCheckTimeout"` + RedisAutoFailoverCheckInterval time.Duration `yaml:"RedisAutoFailoverCheckInterval"` + RedisAutoFailoverCheckTimeout time.Duration `yaml:"RedisAutoFailoverCheckTimeout"` + RedisNodesAgreeAboutSlotsConfigTimeout time.Duration `yaml:"RedisNodesAgreeAboutSlotsConfigTimeout"` + RedisNodesAgreeAboutSlotsConfigCheckInterval time.Duration `yaml:"RedisNodesAgreeAboutSlotsConfigCheckInterval"` + RedisRemoveNodeCheckInterval time.Duration `yaml:"RedisRemoveNodeCheckInterval"` + RedisRemoveNodeTimeout time.Duration `yaml:"RedisRemoveNodeTimeout"` + WaitForRedisLoadDataSetInMemoryCheckInterval time.Duration `yaml:"WaitForRedisLoadDataSetInMemoryCheckInterval"` + WaitForRedisLoadDataSetInMemoryTimeout time.Duration `yaml:"WaitForRedisLoadDataSetInMemoryTimeout"` + SleepIfForgetNodeFails time.Duration `yaml:"SleepIfForgetNodeFails"` } type OperatorConfig struct { - Times OperatorConfigTimes `yaml:"times"` + Times OperatorConfigTimes `yaml:"times"` + Thresholds OperatorConfigThresholds `yaml:"thresholds"` + Setters OperatorSetters `yaml:"setters"` } func NewRedisOperatorConfig(configPath string, logger logr.Logger) (*RedisOperatorConfig, error) { @@ -142,40 +178,50 @@ func NewRedisOperatorConfig(configPath string, logger logr.Logger) (*RedisOperat } // Default constants used by the controllers for various tasks. -// It is used as fallback in case a configmap is not provided. +// It is used as fallback in case a "config/configfiles/operator.conf" configmap is not provided. 
func DefaultRedisOperatorConfig(logger logr.Logger) *RedisOperatorConfig { return &RedisOperatorConfig{ Log: logger, Config: OperatorConfig{ + Setters: OperatorSetters{ + ExposeSensitiveEntryPoints: false, + }, + Thresholds: OperatorConfigThresholds{ + SyncMatchThreshold: 90, + MaxToleratedPodsRecoverAtOnce: 15, + MaxToleratedPodsUpdateAtOnce: 5, + }, Times: OperatorConfigTimes{ - SyncStartCheckInterval: 500 * time.Millisecond, - SyncStartCheckTimeout: 15000 * time.Millisecond, - SyncCheckInterval: 500 * time.Millisecond, - SyncCheckTimeout: 15000 * time.Millisecond, - LoadStartCheckInterval: 500 * time.Millisecond, - LoadStartCheckTimeout: 180000 * time.Millisecond, - LoadCheckInterval: 500 * time.Millisecond, - LoadCheckTimeout: 180000 * time.Millisecond, - ClusterCreateInterval: 5000 * time.Millisecond, - ClusterCreateTimeout: 30000 * time.Millisecond, - ACLFilePropagationDuration: 5000 * time.Millisecond, - ACLFileLoadDuration: 500 * time.Millisecond, - PodReadyCheckInterval: 2000 * time.Millisecond, - PodReadyCheckTimeout: 30000 * time.Millisecond, - PodNetworkCheckInterval: 2000 * time.Millisecond, - PodNetworkCheckTimeout: 30000 * time.Millisecond, - PodDeleteCheckInterval: 2000 * time.Millisecond, - PodDeleteCheckTimeout: 30000 * time.Millisecond, - RedisPingCheckInterval: 2000 * time.Millisecond, - RedisPingCheckTimeout: 30000 * time.Millisecond, - RedisClusterReplicationCheckInterval: 2000 * time.Millisecond, - RedisClusterReplicationCheckTimeout: 30000 * time.Millisecond, - RedisClusterMeetCheckInterval: 2000 * time.Millisecond, - RedisClusterMeetCheckTimeout: 30000 * time.Millisecond, - RedisManualFailoverCheckInterval: 2000 * time.Millisecond, - RedisManualFailoverCheckTimeout: 30000 * time.Millisecond, - RedisAutoFailoverCheckInterval: 2000 * time.Millisecond, - RedisAutoFailoverCheckTimeout: 30000 * time.Millisecond, + SyncCheckInterval: 5 * 1000 * time.Millisecond, + SyncCheckTimeout: 30 * 1000 * time.Millisecond, + SleepDuringTablesAlignProcess: 12 * 1000 * time.Millisecond, + ClusterCreateInterval: 5 * 1000 * time.Millisecond, + ClusterCreateTimeout: 90 * 1000 * time.Millisecond, + ACLFilePropagationDuration: 5 * 1000 * time.Millisecond, + ACLFileLoadDuration: 5 * 1000 * time.Millisecond, + PodReadyCheckInterval: 3 * 1000 * time.Millisecond, + PodReadyCheckTimeout: 30 * 1000 * time.Millisecond, + PodNetworkCheckInterval: 3 * 1000 * time.Millisecond, + PodNetworkCheckTimeout: 60 * 1000 * time.Millisecond, + PodDeleteCheckInterval: 3 * 1000 * time.Millisecond, + PodDeleteCheckTimeout: 60 * 1000 * time.Millisecond, + RedisPingCheckInterval: 2 * 1000 * time.Millisecond, + RedisPingCheckTimeout: 20 * 1000 * time.Millisecond, + RedisClusterReplicationCheckInterval: 2 * 1000 * time.Millisecond, + RedisClusterReplicationCheckTimeout: 30 * 1000 * time.Millisecond, + RedisClusterMeetCheckInterval: 2 * 1000 * time.Millisecond, + RedisClusterMeetCheckTimeout: 10 * 1000 * time.Millisecond, + RedisManualFailoverCheckInterval: 5 * 1000 * time.Millisecond, + RedisManualFailoverCheckTimeout: 40 * 1000 * time.Millisecond, + RedisAutoFailoverCheckInterval: 5 * 1000 * time.Millisecond, + RedisAutoFailoverCheckTimeout: 40 * 1000 * time.Millisecond, + RedisNodesAgreeAboutSlotsConfigCheckInterval: 3 * 1000 * time.Millisecond, + RedisNodesAgreeAboutSlotsConfigTimeout: 12 * 1000 * time.Millisecond, + RedisRemoveNodeCheckInterval: 2 * 1000 * time.Millisecond, + RedisRemoveNodeTimeout: 20 * 1000 * time.Millisecond, + WaitForRedisLoadDataSetInMemoryCheckInterval: 2 * 1000 * time.Millisecond, + 
WaitForRedisLoadDataSetInMemoryTimeout: 10 * 1000 * time.Millisecond, + SleepIfForgetNodeFails: 20 * 1000 * time.Millisecond, }, }, } diff --git a/controllers/configmap_controller.go b/controllers/configmap_controller.go index 08b920fd..579ef5a3 100644 --- a/controllers/configmap_controller.go +++ b/controllers/configmap_controller.go @@ -5,6 +5,7 @@ import ( "crypto/sha256" "fmt" "reflect" + "strings" "sync" "time" @@ -59,9 +60,9 @@ const ACLFileLoadDuration time.Duration = time.Millisecond * 500 const redisConfigLabelKey string = "redis-cluster" const handleACLConfigErrorMessage = "Failed to handle ACL configuration" const operatorConfigLabelKey string = "redis-operator" +const RedisClusterStateMapName string = "redis-cluster-state-map" func (r *RedisConfigReconciler) syncConfig(latestConfigHash string, redisPods ...corev1.Pod) error { - time.Sleep(ACLFilePropagationDuration) for _, pod := range redisPods { @@ -141,10 +142,15 @@ func (r *RedisConfigReconciler) handleACLConfig(configMap *corev1.ConfigMap) err configMapACLHash := fmt.Sprintf("%x", sha256.Sum256([]byte(acl.String()))) r.Log.Info(fmt.Sprintf("Computed hash: %s", configMapACLHash)) + wg.Add(len(rdcPods.Items)) for i := range rdcPods.Items { - wg.Add(1) - go func(failSignal *bool, pod *corev1.Pod, wg *sync.WaitGroup) { + go func(failSignal *bool, pod *corev1.Pod) { defer wg.Done() + if _, e := r.RedisCLI.Ping(pod.Status.PodIP); e != nil { + r.Log.Info(fmt.Sprintf("[Warn] ACL config sync is not ready yet for pod: [%v]", pod.Name)) + //*failSignal = true + return + } redisNodeConfigHash, err := r.getACLConfigHash(pod) if err != nil { r.Log.Error(err, "Failed to get the config for %s(%s)", pod.Name, pod.Status.PodIP) @@ -193,7 +199,7 @@ func (r *RedisConfigReconciler) handleACLConfig(configMap *corev1.ConfigMap) err } } } - }(&syncFail, &rdcPods.Items[i], &wg) + }(&syncFail, &rdcPods.Items[i]) } wg.Wait() @@ -233,7 +239,11 @@ func (r *RedisConfigReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) var configMap corev1.ConfigMap if err := r.Get(context.Background(), req.NamespacedName, &configMap); err != nil { - r.Log.Error(err, "Failed to fetch configmap") + if strings.Contains(err.Error(), RedisClusterStateMapName) { + r.Log.Info("[Warn] Failed to fetch config map [" + RedisClusterStateMapName + "]") + } else { + r.Log.Error(err, "Failed to fetch configmap") + } } labels := configMap.GetObjectMeta().GetLabels() for label := range labels { @@ -241,7 +251,7 @@ func (r *RedisConfigReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) if _, ok := configMap.Data["users.acl"]; ok { if err := r.handleACLConfig(&configMap); err != nil { r.Log.Error(err, "Failed to reconcile ACL config") - return ctrl.Result{}, err + return ctrl.Result{RequeueAfter: 30 * time.Second}, err } return ctrl.Result{}, nil } @@ -249,7 +259,7 @@ func (r *RedisConfigReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) if _, ok := configMap.Data["operator.conf"]; ok { if err := r.handleOperatorConfig(&configMap); err != nil { r.Log.Error(err, "Failed to reconcile operator config") - return ctrl.Result{}, err + return ctrl.Result{RequeueAfter: 30 * time.Second}, err } return ctrl.Result{}, nil } diff --git a/controllers/k8sresources.go b/controllers/k8sresources.go index dcb96432..567e0d38 100644 --- a/controllers/k8sresources.go +++ b/controllers/k8sresources.go @@ -2,11 +2,14 @@ package controllers import ( "context" + "encoding/json" "fmt" - "sort" "strings" + "sync" dbv1 "github.com/PayU/redis-operator/api/v1" + rediscli 
"github.com/PayU/redis-operator/controllers/rediscli" + view "github.com/PayU/redis-operator/controllers/view" "github.com/go-logr/logr" "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" @@ -20,72 +23,134 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +type K8sManager struct { + client.Client + Log logr.Logger + Config *RedisOperatorConfig + Scheme *runtime.Scheme +} + +// Get/Read methods + func (r *RedisClusterReconciler) getRedisClusterPods(redisCluster *dbv1.RedisCluster, podType ...string) ([]corev1.Pod, error) { pods := &corev1.PodList{} matchingLabels := redisCluster.Spec.PodLabelSelector - if len(podType) > 0 && strings.TrimSpace(podType[0]) != "" { pt := strings.TrimSpace(podType[0]) if pt == "follower" || pt == "leader" { matchingLabels["redis-node-role"] = pt } } - err := r.List(context.Background(), pods, client.InNamespace(redisCluster.ObjectMeta.Namespace), client.MatchingLabels(matchingLabels)) if err != nil { return nil, err } + return pods.Items, nil +} - sortedPods := pods.Items - sort.Slice(sortedPods, func(i, j int) bool { - return pods.Items[i].Labels["node-number"] < pods.Items[j].Labels["node-number"] - }) - - return sortedPods, nil +func getSelectorRequirementFromPodLabelSelector(redisCluster *dbv1.RedisCluster) []metav1.LabelSelectorRequirement { + lsr := []metav1.LabelSelectorRequirement{} + for k, v := range redisCluster.Spec.PodLabelSelector { + lsr = append(lsr, metav1.LabelSelectorRequirement{Key: k, Operator: metav1.LabelSelectorOpIn, Values: []string{v}}) + } + return lsr } -func (r *RedisClusterReconciler) getPodByIP(namespace string, podIP string) (corev1.Pod, error) { - var podList corev1.PodList - err := r.List(context.Background(), &podList, client.InNamespace(namespace), client.MatchingFields{"status.podIP": podIP}) +func (r *RedisClusterReconciler) setClusterStateView(redisCluster *dbv1.RedisCluster) (error) { + if r.RedisClusterStateView == nil { + r.RedisClusterStateView = &view.RedisClusterStateView{Name: RedisClusterStateMapName} + } + configMapName := r.RedisClusterStateView.Name + configMapNamespace := redisCluster.ObjectMeta.Namespace + var configMap corev1.ConfigMap + var redisClusterStateView view.RedisClusterStateView + err := r.Get(context.Background(), client.ObjectKey{Name: configMapName, Namespace: configMapNamespace}, &configMap) if err != nil { - return corev1.Pod{}, err + return err } - if len(podList.Items) == 0 { - return corev1.Pod{}, apierrors.NewNotFound(corev1.Resource("Pod"), "") + view := configMap.Data["data"] + err = json.Unmarshal([]byte(view), &redisClusterStateView) + if err != nil { + return err } - return podList.Items[0], nil + r.RedisClusterStateView = &redisClusterStateView + return nil } -func (r *RedisClusterReconciler) deletePodsByIP(namespace string, ip ...string) ([]corev1.Pod, error) { - var deletedPods []corev1.Pod - for _, ip := range ip { - pod, err := r.getPodByIP(namespace, ip) - if err != nil { - if apierrors.IsNotFound(err) { - continue - } - return nil, err - } - if err := r.Delete(context.Background(), &pod); err != nil { - if apierrors.IsNotFound(err) { - continue - } - return nil, err - } - deletedPods = append(deletedPods, pod) +// Update methods + +func (r *RedisClusterReconciler) saveClusterStateView(redisCluster *dbv1.RedisCluster) { + r.Log.Info("Saving cluster state view") + configMapName := r.RedisClusterStateView.Name + configMapNamespace := redisCluster.ObjectMeta.Namespace + bytes, e := json.Marshal(r.RedisClusterStateView) + if e != nil { + r.Log.Error(e, "Error while 
attemting json marshal for cluster state view...") + return } - return deletedPods, nil + data := map[string]string{ + "data": string(bytes), + } + configMap := corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + Kind: "ConfigMap", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: configMapNamespace, + }, + Data: data, + } + e = r.Update(context.Background(), &configMap) + if e != nil { + r.Log.Error(e, "Error while attemting update for cluster state view...") + return + } + r.Log.Info("Cluster state view saved") } -func getSelectorRequirementFromPodLabelSelector(redisCluster *dbv1.RedisCluster) []metav1.LabelSelectorRequirement { - lsr := []metav1.LabelSelectorRequirement{} - for k, v := range redisCluster.Spec.PodLabelSelector { - lsr = append(lsr, metav1.LabelSelectorRequirement{Key: k, Operator: metav1.LabelSelectorOpIn, Values: []string{v}}) +// Create/Make/Write methods + +func (r *RedisClusterReconciler) postNewClusterStateView(redisCluster *dbv1.RedisCluster) error { + configMapName := r.RedisClusterStateView.Name + configMapNamespace := redisCluster.ObjectMeta.Namespace + bytes, e := json.Marshal(r.RedisClusterStateView) + if e != nil { + return e } - return lsr + data := map[string]string{ + "data": string(bytes), + } + configMap := corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + Kind: "ConfigMap", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: configMapNamespace, + }, + Data: data, + } + return r.Create(context.Background(), &configMap) } -func (r *RedisClusterReconciler) makeRedisPod(redisCluster *dbv1.RedisCluster, nodeRole string, leaderNumber string, nodeNumber string, preferredLabelSelectorRequirement []metav1.LabelSelectorRequirement) corev1.Pod { +func (r *K8sManager) WritePodAnnotations(annotations map[string]string, pods ...corev1.Pod) error { + annotationsString := "" + for key, val := range annotations { + annotationsString = fmt.Sprintf("\"%s\": \"%s\",%s", key, val, annotationsString) + } + patch := []byte(fmt.Sprintf(`{"metadata":{"annotations":{%s}}}`, annotationsString[:len(annotationsString)-1])) + for _, pod := range pods { + if err := r.Patch(context.Background(), &pod, client.RawPatch(types.StrategicMergePatchType, patch)); err != nil { + r.Log.Error(err, fmt.Sprintf("Failed to patch the annotations on pod %s (%s)", pod.Name, pod.Status.PodIP)) + } + } + return nil +} + +func (r *RedisClusterReconciler) makeRedisPod(redisCluster *dbv1.RedisCluster, nodeRole string, leaderName string, nodeName string, preferredLabelSelectorRequirement []metav1.LabelSelectorRequirement) corev1.Pod { var affinity corev1.Affinity podLabels := make(map[string]string) @@ -97,8 +162,8 @@ func (r *RedisClusterReconciler) makeRedisPod(redisCluster *dbv1.RedisCluster, n } podLabels["redis-node-role"] = nodeRole - podLabels["leader-number"] = leaderNumber - podLabels["node-number"] = nodeNumber + podLabels["leader-name"] = leaderName + podLabels["node-name"] = nodeName podLabels["redis-cluster"] = redisCluster.Name if redisCluster.Spec.EnableDefaultAffinity { @@ -116,7 +181,7 @@ func (r *RedisClusterReconciler) makeRedisPod(redisCluster *dbv1.RedisCluster, n LabelSelector: &metav1.LabelSelector{ MatchExpressions: getSelectorRequirementFromPodLabelSelector(redisCluster), }, - TopologyKey: "failure-domain.beta.kubernetes.io/node", + TopologyKey: "kubernetes.io/hostname", } if affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil { @@ -126,21 +191,21 @@ func (r 
*RedisClusterReconciler) makeRedisPod(redisCluster *dbv1.RedisCluster, n affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, requiredPodAffinityTerm) } - prefferedPodAffinityTerm := corev1.WeightedPodAffinityTerm{ + preferredPodAffinityTerm := corev1.WeightedPodAffinityTerm{ Weight: 100, PodAffinityTerm: corev1.PodAffinityTerm{ LabelSelector: &metav1.LabelSelector{ MatchExpressions: preferredLabelSelectorRequirement, }, - TopologyKey: "failure-domain.beta.kubernetes.io/zone", + TopologyKey: "topology.kubernetes.io/zone", }, } if affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution == nil { - affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution = []corev1.WeightedPodAffinityTerm{prefferedPodAffinityTerm} + affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution = []corev1.WeightedPodAffinityTerm{preferredPodAffinityTerm} } else { affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution = append( - affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution, prefferedPodAffinityTerm) + affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution, preferredPodAffinityTerm) } } @@ -150,7 +215,7 @@ func (r *RedisClusterReconciler) makeRedisPod(redisCluster *dbv1.RedisCluster, n pod := corev1.Pod{ TypeMeta: metav1.TypeMeta{APIVersion: "v1", Kind: "Pod"}, ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("redis-node-%s", nodeNumber), + Name: nodeName, Namespace: redisCluster.ObjectMeta.Namespace, Labels: podLabels, Annotations: redisCluster.Annotations, @@ -161,126 +226,223 @@ func (r *RedisClusterReconciler) makeRedisPod(redisCluster *dbv1.RedisCluster, n return pod } -func (r *RedisClusterReconciler) makeFollowerPod(redisCluster *dbv1.RedisCluster, nodeNumber string, leaderNumber string) (corev1.Pod, error) { - preferredLabelSelectorRequirement := []metav1.LabelSelectorRequirement{{Key: "leader-number", Operator: metav1.LabelSelectorOpIn, Values: []string{leaderNumber}}} - pod := r.makeRedisPod(redisCluster, "follower", leaderNumber, nodeNumber, preferredLabelSelectorRequirement) +func (r *RedisClusterReconciler) createRedisService(redisCluster *dbv1.RedisCluster) (*corev1.Service, error) { + svc, err := r.makeService(redisCluster) + if err != nil { + return nil, err + } + err = r.Create(context.Background(), &svc) + if !apierrors.IsAlreadyExists(err) { + return nil, err + } + return &svc, nil +} - if err := ctrl.SetControllerReference(redisCluster, &pod, r.Scheme); err != nil { - return pod, err +func (r *RedisClusterReconciler) makeService(redisCluster *dbv1.RedisCluster) (corev1.Service, error) { + service := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "redis-cluster-service", + Namespace: redisCluster.ObjectMeta.Namespace, + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{ + { + Name: "redis-client-port", + Port: 6379, + TargetPort: intstr.FromInt(6379), + }, + }, + Selector: redisCluster.Spec.PodLabelSelector, + }, } - return pod, nil + if err := ctrl.SetControllerReference(redisCluster, &service, r.Scheme); err != nil { + return service, err + } + + return service, nil } -func (r *RedisClusterReconciler) createRedisFollowerPods(redisCluster *dbv1.RedisCluster, nodeNumbers ...NodeNumbers) ([]corev1.Pod, error) { - if len(nodeNumbers) == 0 { - return nil, errors.Errorf("Failed to create Redis followers - no node numbers") +func (r *RedisClusterReconciler) makeAndCreateRedisPod(redisCluster *dbv1.RedisCluster, n *view.NodeStateView, createOpts 
[]client.CreateOption) (corev1.Pod, error) { + preferredLabelSelectorRequirement := []metav1.LabelSelectorRequirement{{Key: "leader-name", Operator: metav1.LabelSelectorOpIn, Values: []string{n.LeaderName}}} + var role string + if n.Name == n.LeaderName { + role = "leader" + } else { + role = "follower" } + pod := r.makeRedisPod(redisCluster, role, n.LeaderName, n.Name, preferredLabelSelectorRequirement) + err := ctrl.SetControllerReference(redisCluster, &pod, r.Scheme) + if err != nil { + r.Log.Error(err, fmt.Sprintf("Could not re create pod [%s]", n.Name)) + r.deletePod(pod) + return pod, err + } + err = r.Create(context.Background(), &pod, createOpts...) + if err != nil && !strings.Contains(err.Error(), "already exists") { + r.Log.Error(err, fmt.Sprintf("Could not re create pod [%s]", n.Name)) + r.deletePod(pod) + return pod, err + } + return pod, nil +} - var followerPods []corev1.Pod +func (r *RedisClusterReconciler) createMissingRedisPods(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) map[string]corev1.Pod { + var pods []corev1.Pod createOpts := []client.CreateOption{client.FieldOwner("redis-operator-controller")} - - for _, nodeNumber := range nodeNumbers { - pod, err := r.makeFollowerPod(redisCluster, nodeNumber[0], nodeNumber[1]) - if err != nil { - return nil, err + var wg sync.WaitGroup + handledNodesCounter := 0 + mutex := &sync.Mutex{} + for _, n := range r.RedisClusterStateView.Nodes { + if n.NodeState == view.DeleteNode || n.NodeState == view.ReshardNode || n.NodeState == view.DeleteNodeKeepInMap || n.NodeState == view.ReshardNodeKeepInMap { + continue } - err = r.Create(context.Background(), &pod, createOpts...) - if err != nil && !apierrors.IsAlreadyExists(err) { - return nil, err + if _, isLeaderReported := r.RedisClusterStateView.Nodes[n.LeaderName]; !isLeaderReported { + node, exists := v.Nodes[n.Name] + if exists && node != nil { + isMaster, err := r.checkIfMaster(node.Ip) + if err != nil || !isMaster { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.DeleteNode) + } else { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.ReshardNode) + } + } else { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.DeleteNode) + } + continue + } + node, exists := v.Nodes[n.Name] + if exists && node != nil { + ipsToNodes, err := r.ClusterNodesWaitForRedisLoadDataSetInMemory(node.Ip) + nodesTable, exists := ipsToNodes[node.Ip] + if !exists || err != nil || nodesTable == nil || len(*nodesTable) < 1 { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.DeleteNodeKeepInMap) + continue + } + if len(*nodesTable) == 1 { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.AddNode) + continue + } else { + if n.NodeState != view.ReplicateNode && n.NodeState != view.SyncNode { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.NodeOK) + } + } + continue + } + if handledNodesCounter >= r.Config.Thresholds.MaxToleratedPodsRecoverAtOnce { + continue } - followerPods = append(followerPods, pod) + handledNodesCounter++ + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.CreateNode) + wg.Add(1) + go func(n *view.NodeStateView) { + defer wg.Done() + pod, err := r.makeAndCreateRedisPod(redisCluster, n, createOpts) + if err != nil { + return + } + mutex.Lock() + pods = append(pods, pod) + mutex.Unlock() + }(n) } - - followerPods, err := r.waitForPodNetworkInterface(followerPods...) 
- if err != nil { - return nil, err + wg.Wait() + readyPods := map[string]corev1.Pod{} + if len(pods) > 0 { + newPods, err := r.waitForPodNetworkInterface(pods...) + if err != nil { + r.Log.Error(err, fmt.Sprintf("Could not re create missing pods")) + r.deletePods(pods) + return map[string]corev1.Pod{} + } + wg.Add(len(newPods)) + for _, p := range newPods { + go func(p corev1.Pod) { + defer wg.Done() + readyPod, err := r.waitForRedisPod(p) + if err != nil { + r.deletePod(p) + return + } + mutex.Lock() + readyPods[readyPod.Name] = readyPod + s := r.RedisClusterStateView.Nodes[readyPod.Name] + r.RedisClusterStateView.SetNodeState(s.Name, s.LeaderName, view.AddNode) + mutex.Unlock() + }(p) + } + wg.Wait() } - - r.Log.Info(fmt.Sprintf("New follower pods created: %v", nodeNumbers)) - return followerPods, nil + return readyPods } -func (r *RedisClusterReconciler) makeLeaderPod(redisCluster *dbv1.RedisCluster, nodeNumber string) (corev1.Pod, error) { - preferredLabelSelectorRequirement := []metav1.LabelSelectorRequirement{{Key: "redis-node-role", Operator: metav1.LabelSelectorOpIn, Values: []string{"leader"}}} - pod := r.makeRedisPod(redisCluster, "leader", nodeNumber, nodeNumber, preferredLabelSelectorRequirement) - - if err := ctrl.SetControllerReference(redisCluster, &pod, r.Scheme); err != nil { - return pod, err +func (r *RedisClusterReconciler) waitForRedisPod(p corev1.Pod) (corev1.Pod, error) { + r.Log.Info(fmt.Sprintf("Waiting for redis pod [%s]", p.Name)) + podArray, err := r.waitForPodReady(p) + if err != nil || len(podArray) == 0 { + r.Log.Error(err, fmt.Sprintf("Could not re create pod [%s]", p.Name)) + r.deletePod(p) + return corev1.Pod{}, err + } + pod := podArray[0] + err = r.waitForRedis(pod.Status.PodIP) + if err != nil { + r.Log.Error(err, fmt.Sprintf("Could not re create pod [%s]", p.Name)) + r.deletePod(p) + return corev1.Pod{}, err } return pod, nil } -// Creates one or more leader pods; waits for available IP before returing -func (r *RedisClusterReconciler) createRedisLeaderPods(redisCluster *dbv1.RedisCluster, nodeNumbers ...string) ([]corev1.Pod, error) { - - if len(nodeNumbers) == 0 { - return nil, errors.New("Failed to create leader pods - no node numbers") +func (r *RedisClusterReconciler) createRedisLeaderPods(redisCluster *dbv1.RedisCluster, nodeNames ...string) ([]corev1.Pod, error) { + if len(nodeNames) == 0 { + return nil, errors.New("Failed to create leader pods - no node names") } - var leaderPods []corev1.Pod - for _, nodeNumber := range nodeNumbers { - pod, err := r.makeLeaderPod(redisCluster, nodeNumber) + for _, nodeName := range nodeNames { + pod, err := r.makeLeaderPod(redisCluster, nodeName) if err != nil { return nil, err } + err = ctrl.SetControllerReference(redisCluster, &pod, r.Scheme) + if err != nil { + r.Log.Error(err, fmt.Sprintf("Could not re create pod [%s]", pod.Name)) + r.deletePod(pod) + return leaderPods, err + } leaderPods = append(leaderPods, pod) } applyOpts := []client.CreateOption{client.FieldOwner("redis-operator-controller")} - - for i := range leaderPods { - err := r.Create(context.Background(), &leaderPods[i], applyOpts...) + for _, pod := range leaderPods { + err := r.Create(context.Background(), &pod, applyOpts...) if err != nil && !apierrors.IsAlreadyExists(err) && !apierrors.IsConflict(err) { return nil, err } } - - leaderPods, err := r.waitForPodNetworkInterface(leaderPods...) 
- if err != nil { - return nil, err + newPods := []corev1.Pod{} + var err error + if len(leaderPods) > 0 { + newPods, err = r.waitForPodNetworkInterface(leaderPods...) + if err != nil { + return nil, err + } + r.Log.Info(fmt.Sprintf("New leader pods created: %v", nodeNames)) } - - r.Log.Info(fmt.Sprintf("New leader pods created: %v ", nodeNumbers)) - return leaderPods, nil + return newPods, nil } -func (r *RedisClusterReconciler) makeService(redisCluster *dbv1.RedisCluster) (corev1.Service, error) { - service := corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: "redis-cluster-service", - Namespace: redisCluster.ObjectMeta.Namespace, - }, - Spec: corev1.ServiceSpec{ - Ports: []corev1.ServicePort{ - { - Name: "redis-client-port", - Port: 6379, - TargetPort: intstr.FromInt(6379), - }, - }, - Selector: redisCluster.Spec.PodLabelSelector, - }, - } +func (r *RedisClusterReconciler) makeLeaderPod(redisCluster *dbv1.RedisCluster, nodeName string) (corev1.Pod, error) { + preferredLabelSelectorRequirement := []metav1.LabelSelectorRequirement{{Key: "redis-node-role", Operator: metav1.LabelSelectorOpIn, Values: []string{"leader"}}} + pod := r.makeRedisPod(redisCluster, "leader", nodeName, nodeName, preferredLabelSelectorRequirement) - if err := ctrl.SetControllerReference(redisCluster, &service, r.Scheme); err != nil { - return service, err + if err := ctrl.SetControllerReference(redisCluster, &pod, r.Scheme); err != nil { + return pod, err } - - return service, nil + return pod, nil } -func (r *RedisClusterReconciler) createRedisService(redisCluster *dbv1.RedisCluster) (*corev1.Service, error) { - svc, err := r.makeService(redisCluster) - if err != nil { - return nil, err - } - err = r.Create(context.Background(), &svc) - if !apierrors.IsAlreadyExists(err) { - return nil, err - } - return &svc, nil -} +// Wait methods func (r *RedisClusterReconciler) waitForPodReady(pods ...corev1.Pod) ([]corev1.Pod, error) { var readyPods []corev1.Pod @@ -289,11 +451,10 @@ func (r *RedisClusterReconciler) waitForPodReady(pods ...corev1.Pod) ([]corev1.P if err != nil { return nil, err } - r.Log.Info(fmt.Sprintf("Waiting for pod ready: %s(%s)", pod.Name, pod.Status.PodIP)) - if pollErr := wait.PollImmediate(r.Config.Times.PodReadyCheckInterval, r.Config.Times.PodReadyCheckTimeout, func() (bool, error) { + if pollErr := wait.PollImmediate(r.Config.Times.PodReadyCheckInterval, 4*r.Config.Times.PodReadyCheckTimeout, func() (bool, error) { err := r.Get(context.Background(), key, &pod) if err != nil { - return false, err + return true, err } if pod.Status.Phase != corev1.PodRunning { return false, nil @@ -312,7 +473,6 @@ func (r *RedisClusterReconciler) waitForPodReady(pods ...corev1.Pod) ([]corev1.P return readyPods, nil } -// Method used to wait for one or more pods to have an IP address func (r *RedisClusterReconciler) waitForPodNetworkInterface(pods ...corev1.Pod) ([]corev1.Pod, error) { r.Log.Info(fmt.Sprintf("Waiting for pod network interfaces...")) var readyPods []corev1.Pod @@ -337,47 +497,107 @@ func (r *RedisClusterReconciler) waitForPodNetworkInterface(pods ...corev1.Pod) return readyPods, nil } -// TODO should wait as long as delete grace period -func (r *RedisClusterReconciler) waitForPodDelete(pods ...corev1.Pod) error { +func (r *RedisClusterReconciler) waitForPodDelete(pods ...corev1.Pod) { + var wg sync.WaitGroup + wg.Add(len(pods)) for _, p := range pods { - key, err := client.ObjectKeyFromObject(&p) - if err != nil { - return err - } - r.Log.Info(fmt.Sprintf("Waiting for pod delete: %s", p.Name)) 
- if pollErr := wait.Poll(r.Config.Times.PodDeleteCheckInterval, r.Config.Times.PodDeleteCheckTimeout, func() (bool, error) { - err := r.Get(context.Background(), key, &p) + go func(p corev1.Pod) { + defer wg.Done() + key, err := client.ObjectKeyFromObject(&p) if err != nil { - if apierrors.IsNotFound(err) { - return true, nil + r.Log.Error(err, "Error while getting object key for deletion process") + return + } + r.Log.Info(fmt.Sprintf("Waiting for pod delete: %s", p.Name)) + if pollErr := wait.Poll(r.Config.Times.PodDeleteCheckInterval, r.Config.Times.PodDeleteCheckTimeout, func() (bool, error) { + err := r.Get(context.Background(), key, &p) + if err != nil { + if apierrors.IsNotFound(err) { + return true, nil + } + return false, err } - return false, err + return false, nil + }); pollErr != nil { + r.Log.Error(err, "Error while waiting for pod to be deleted") + return } - return false, nil - }); pollErr != nil { - return pollErr + }(p) + } + wg.Wait() +} + +// Delete methods + +func (r *RedisClusterReconciler) deleteAllRedisClusterPods() error { + pods, e := r.getRedisClusterPods(cluster) + if e != nil { + return e + } + deletedPods, err := r.deletePods(pods) + if err != nil { + return err + } + r.waitForPodDelete(deletedPods...) + return nil +} + +func (r *RedisClusterReconciler) deletePods(pods []corev1.Pod) ([]corev1.Pod, error) { + deletedPods := []corev1.Pod{} + for _, pod := range pods { + err := r.deletePod(pod) + if err != nil { + return deletedPods, err + } + deletedPods = append(deletedPods, pod) + } + return deletedPods, nil +} + +func (r *RedisClusterReconciler) deletePod(pod corev1.Pod) error { + if err := r.Delete(context.Background(), &pod); err != nil { + if !strings.Contains(err.Error(), "not found") { + r.Log.Error(err, "Could not delete pod: "+pod.Name) + return err } } return nil } -type K8sManager struct { - client.Client - Log logr.Logger - Config *RedisOperatorConfig - Scheme *runtime.Scheme +func (r *RedisClusterReconciler) deleteClusterStateView(redisCluster *dbv1.RedisCluster) error { + configMapName := r.RedisClusterStateView.Name + configMapNamespace := redisCluster.ObjectMeta.Namespace + var configMap corev1.ConfigMap + e := r.Get(context.Background(), client.ObjectKey{Name: configMapName, Namespace: configMapNamespace}, &configMap) + if e != nil { + return e + } + if len(configMap.Data) > 0 { + return r.Delete(context.Background(), &configMap) + } + return nil } -func (r *K8sManager) WritePodAnnotations(annotations map[string]string, pods ...corev1.Pod) error { - annotationsString := "" - for key, val := range annotations { - annotationsString = fmt.Sprintf("\"%s\": \"%s\",%s", key, val, annotationsString) +func (r *RedisClusterReconciler) ClusterNodesWaitForRedisLoadDataSetInMemory(ips ...string) (ipsToNodes map[string]*rediscli.RedisClusterNodes, err error) { + if len(ips) == 0 { + return nil, nil } - patch := []byte(fmt.Sprintf(`{"metadata":{"annotations":{%s}}}`, annotationsString[:len(annotationsString)-1])) - for i, pod := range pods { - if err := r.Patch(context.Background(), &pods[i], client.RawPatch(types.StrategicMergePatchType, patch)); err != nil { - r.Log.Error(err, fmt.Sprintf("Failed to patch the annotations on pod %s (%s)", pod.Name, pod.Status.PodIP)) + ipsToNodes = map[string]*rediscli.RedisClusterNodes{} + if pollErr := wait.PollImmediate(r.Config.Times.WaitForRedisLoadDataSetInMemoryCheckInterval, r.Config.Times.WaitForRedisLoadDataSetInMemoryTimeout, func() (bool, error) { + for _, ip := range ips { + nodes, std, err := 
r.RedisCLI.ClusterNodes(ip) + if err != nil { + if strings.Contains(err.Error(), "Redis is loading the dataset in memory") || strings.Contains(std, "Redis is loading the dataset in memory"){ + return false, nil + } + return false, err + } + ipsToNodes[ip] = nodes } + return true, err + }); pollErr != nil { + return nil, pollErr } - return nil + + return ipsToNodes, err } diff --git a/controllers/operator_entrypoints_module.go b/controllers/operator_entrypoints_module.go new file mode 100644 index 00000000..f001baf8 --- /dev/null +++ b/controllers/operator_entrypoints_module.go @@ -0,0 +1,328 @@ +package controllers + +import ( + "encoding/json" + "fmt" + "net/http" + "os" + "sync" + "time" + + rediscli "github.com/PayU/redis-operator/controllers/rediscli" + "github.com/PayU/redis-operator/controllers/redisclient" + "github.com/PayU/redis-operator/controllers/testlab" + view "github.com/PayU/redis-operator/controllers/view" + "github.com/labstack/echo/v4" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" +) + +/** +Gets cluster info in a form of cluster pods view +**/ +func ClusterInfo(c echo.Context) error { + if reconciler == nil || cluster == nil { + return c.String(http.StatusOK, "Could not get redis cluster info") + } + v, ok := reconciler.NewRedisClusterView(cluster) + if !ok { + return c.String(http.StatusInternalServerError, "Could not retrieve redis cluster view") + } + for _, n := range v.Nodes { + n.Pod = corev1.Pod{} + } + data, err := json.Marshal(v) + if err != nil { + return c.String(http.StatusInternalServerError, "Could not get redis cluster info: "+err.Error()) + } + return c.String(http.StatusOK, string(data)) +} + +/** +Get operator state and cluster state +**/ +func ClusterState(c echo.Context) error { + if reconciler == nil || cluster == nil { + return c.String(http.StatusInternalServerError, "Could not get redis cluster state") + } + operatorState := cluster.Status.ClusterState + clusterState := reconciler.RedisClusterStateView.ClusterState + return c.String(http.StatusOK, fmt.Sprintf("Operator state [%v], Cluster state [%v]", operatorState, clusterState)) +} + +/** +Set the operator state to RESET state. +In the next reconcile loop, the operator will enter a RESET mode, which will lead to the following steps: +1. Delete all redis cluster pods +2. Wait for all redis cluster pods to terminate +3. Create new redis cluster pods according to the spec +[WARN] This entry point is concidered sensitive, and is not allowed naturally. In order to enable it, the config param 'ExposeSensitiveEntryPoints' need to be set to 'true'. 
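+Example usage (a sketch, assuming the manager's HTTP server is port-forwarded to a local port such as 8080; the actual route path is whatever is registered for this handler in the operator's router):
+  curl -X POST localhost:8080/<reset-route>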
+**/
+func DoResetCluster(c echo.Context) error {
+	if reconciler == nil || cluster == nil {
+		return c.String(http.StatusInternalServerError, "Could not perform cluster reset action")
+	}
+	if reconciler.Config.Setters.ExposeSensitiveEntryPoints == false {
+		return c.String(http.StatusUnauthorized, "Sensitive operation - Not allowed")
+	}
+	reconciler.Log.Info("[WARN] Sensitive entry point, on the way to pre-prod / prod environments, the access should be removed from router list")
+	cluster.Status.ClusterState = string(Reset)
+	reconciler.saveOperatorState(cluster)
+	return c.String(http.StatusOK, "Set cluster state to reset mode")
+}
+
+/**
+Triggers the redis-cli command CLUSTER REBALANCE.
+In case of failure, the cluster state will be set to ClusterFix, which will trigger the redis-cli command CLUSTER FIX in the next reconcile loop.
+**/
+func ClusterRebalance(c echo.Context) error {
+	if reconciler == nil || cluster == nil {
+		return c.String(http.StatusInternalServerError, "Could not perform cluster rebalance action")
+	}
+	reconciler.saveClusterStateView(cluster)
+	v, ok := reconciler.NewRedisClusterView(cluster)
+	if !ok {
+		return c.String(http.StatusInternalServerError, "Could not retrieve redis cluster view")
+	}
+	reconciler.removeSoloLeaders(v)
+	healthyServerName, found := reconciler.findHealthyLeader(v)
+	if !found {
+		return c.String(http.StatusOK, "Could not find healthy server to serve the rebalance request")
+	}
+	mutex := &sync.Mutex{}
+	mutex.Lock()
+	reconciler.RedisClusterStateView.ClusterState = view.ClusterRebalance
+	healthyServerIp := v.Nodes[healthyServerName].Ip
+	reconciler.waitForAllNodesAgreeAboutSlotsConfiguration(v, nil)
+	_, _, err := reconciler.RedisCLI.ClusterRebalance(healthyServerIp, true)
+	if err != nil {
+		// Leave the state on ClusterFix so the next reconcile loop attempts a fix, as documented above.
+		reconciler.RedisClusterStateView.ClusterState = view.ClusterFix
+		reconciler.Log.Error(err, "Could not perform cluster rebalance")
+	} else {
+		reconciler.RedisClusterStateView.ClusterState = view.ClusterOK
+	}
+	mutex.Unlock()
+	reconciler.saveClusterStateView(cluster)
+	return c.String(http.StatusOK, "Cluster rebalance attempt executed")
+}
+
+/**
+Triggers the redis-cli command CLUSTER FIX.
+In case of failure, the cluster state will remain ClusterFix, which will trigger an additional fix attempt in the next reconcile loop.
+In case of success, the cluster state will be set to ClusterRebalance, which will trigger the redis-cli command CLUSTER REBALANCE in the next reconcile loop.
+**/
+func ClusterFix(c echo.Context) error {
+	if reconciler == nil || cluster == nil {
+		return c.String(http.StatusInternalServerError, "Could not perform cluster fix action")
+	}
+	reconciler.saveClusterStateView(cluster)
+	v, ok := reconciler.NewRedisClusterView(cluster)
+	if !ok {
+		return c.String(http.StatusInternalServerError, "Could not retrieve redis cluster view")
+	}
+	healthyServerName, found := reconciler.findHealthyLeader(v)
+	if !found {
+		return c.String(http.StatusInternalServerError, "Could not find healthy server to serve the fix request")
+	}
+	healthyServerIp := v.Nodes[healthyServerName].Ip
+	mutex := &sync.Mutex{}
+	mutex.Lock()
+	reconciler.RedisClusterStateView.ClusterState = view.ClusterFix
+	_, _, err := reconciler.RedisCLI.ClusterFix(healthyServerIp)
+	if err != nil {
+		// Keep the state on ClusterFix so the next reconcile loop retries the fix, as documented above.
+		reconciler.Log.Error(err, "Could not perform cluster fix")
+	} else {
+		reconciler.RedisClusterStateView.ClusterState = view.ClusterRebalance
+		reconciler.Log.Info("It is recommended to run rebalance after each cluster fix, changing state to [ClusterRebalance]")
+	}
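+	// Persist the updated state view below so the next reconcile loop acts on the new ClusterState.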
+	mutex.Unlock()
+	reconciler.saveClusterStateView(cluster)
+	return c.String(http.StatusOK, "Cluster fix attempt executed")
+}
+
+/**
+Triggers an atomic flow that forgets all lost redis cluster nodes: non-responsive nodes that still exist in the tables of some of the responsive ones.
+**/
+func ForgetLostNodes(c echo.Context) error {
+	if reconciler == nil || cluster == nil {
+		return c.String(http.StatusInternalServerError, "Could not perform cluster forget lost nodes action")
+	}
+	v, ok := reconciler.NewRedisClusterView(cluster)
+	if !ok {
+		return c.String(http.StatusInternalServerError, "Could not retrieve redis cluster view")
+	}
+	reconciler.forgetLostNodes(cluster, v)
+	return c.String(http.StatusOK, "Finished execution of the attempt to forget lost nodes")
+}
+
+/**
+Triggers a reconcile loop by manual request.
+Can be useful in case an event prevents the manager from proceeding with its cluster maintenance routine.
+[WARN] A direct reconcile trigger might run the loop without re-enqueuing it, causing the operator not to schedule another run within the requested interval.
+If needed, run the forced reconcile manually several times until recovery is complete, and restart the manager once the cluster is stable.
+**/
+func ForceReconcile(c echo.Context) error {
+	if reconciler == nil || cluster == nil {
+		return c.String(http.StatusInternalServerError, "Could not perform cluster reconcile action")
+	}
+	reconciler.saveClusterStateView(cluster)
+	_, err := reconciler.Reconcile(ctrl.Request{NamespacedName: types.NamespacedName{Name: cluster.Name, Namespace: cluster.Namespace}})
+	if err != nil {
+		reconciler.Log.Error(err, "Could not perform reconcile trigger")
+	}
+	return c.String(http.StatusOK, "Force Reconcile request triggered. A direct reconcile trigger might run the loop without re-enqueuing it, causing the operator not to schedule another run within the requested interval. "+
+		"\nIf needed, run the forced reconcile manually several times until recovery is complete, and restart the manager once the cluster is stable")
+}
+
+/**
+Sets the value of the parameter 'IsUpToDate' to false for each one of the reported nodes in the cluster state map.
+In the next healthy reconcile loop, the upgrade process will take place: each marked node will be failed over, forgotten, removed, deleted, re-created and marked again with an 'IsUpToDate' value of true.
+The process is throttled by a heuristic that relies on the cluster size and separates the upgrade steps of leaders from those of followers.
+**/
+func UpgradeCluster(c echo.Context) error {
+	if reconciler == nil || cluster == nil {
+		return c.String(http.StatusInternalServerError, "Could not perform cluster upgrade action")
+	}
+	for _, n := range reconciler.RedisClusterStateView.Nodes {
+		n.IsUpToDate = false
+	}
+	requestUpgrade = false
+	reconciler.saveClusterStateView(cluster)
+	return c.String(http.StatusOK, "Cluster upgrade request triggered")
+}
+
+/**
+Triggers a testing routine that induces events of different severities in order to challenge the operator with simulated disaster scenarios.
+**/
+func ClusterTest(c echo.Context) error {
+	if reconciler == nil || cluster == nil {
+		return c.String(http.StatusInternalServerError, "Could not perform cluster test")
+	}
+	return c.String(http.StatusOK, setAndStartTestLab(&c, false))
+}
+
+/**
+Triggers a testing routine that induces events of different severities in order to challenge the operator with simulated disaster scenarios.
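+Unlike ClusterTest, this variant also measures data durability across the induced failures.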
+The flow writes mock data to the redis cluster nodes and later reports the estimated data loss observed in each disaster scenario.
+[WARN] This entry point is considered sensitive and is disabled by default. In order to enable it, the config param 'ExposeSensitiveEntryPoints' needs to be set to 'true'.
+**/
+func ClusterTestWithData(c echo.Context) error {
+	if reconciler == nil || cluster == nil {
+		return c.String(http.StatusInternalServerError, "Could not perform cluster test")
+	}
+	if reconciler.Config.Setters.ExposeSensitiveEntryPoints == false {
+		return c.String(http.StatusUnauthorized, "Sensitive operation - Not allowed")
+	}
+	reconciler.Log.Info("[WARN] Sensitive entry point, on the way to pre-prod / prod environments, the access should be removed from router list")
+	return c.String(http.StatusOK, setAndStartTestLab(&c, true))
+}
+
+/**
+Populates the redis cluster nodes with mock data for debug purposes.
+[WARN] This entry point is considered sensitive and is disabled by default. In order to enable it, the config param 'ExposeSensitiveEntryPoints' needs to be set to 'true'.
+**/
+func PopulateClusterWithMockData(c echo.Context) error {
+	if reconciler == nil || cluster == nil {
+		return c.String(http.StatusInternalServerError, "Could not perform cluster populate data action")
+	}
+	if reconciler.Config.Setters.ExposeSensitiveEntryPoints == false {
+		return c.String(http.StatusUnauthorized, "Sensitive operation - Not allowed")
+	}
+	v, ok := reconciler.NewRedisClusterView(cluster)
+	if !ok || v == nil {
+		return c.String(http.StatusInternalServerError, "Could not perform cluster populate data action")
+	}
+	reconciler.Log.Info("[WARN] Sensitive entry point, on the way to pre-prod / prod environments, the access should be removed from router list")
+	redisCli := rediscli.NewRedisCLI(&reconciler.Log)
+	user := os.Getenv("REDIS_USERNAME")
+	if user != "" {
+		redisCli.Auth = &rediscli.RedisAuth{
+			User: user,
+		}
+	}
+	clusterCli := redisclient.GetRedisClusterClient(v, redisCli)
+	printUsedMemoryForAllNodes(v)
+
+	total := 5000000
+	init := 0
+	successfulWrites := 0
+
+	updateClientPer := 15000
+	loopsBeforeClientUpdate := 0
+
+	for i := init; i < init+total; i++ {
+		key := "key" + fmt.Sprintf("%v", i)
+		val := "val" + fmt.Sprintf("%v", i)
+		err := clusterCli.Set(key, val, 3)
+		if err == nil {
+			successfulWrites++
+		}
+		loopsBeforeClientUpdate++
+		// Refresh the cluster view and client periodically so writes keep following topology changes.
+		if loopsBeforeClientUpdate == updateClientPer {
+			v, ok := reconciler.NewRedisClusterView(cluster)
+			if ok && v != nil {
+				clusterCli = redisclient.GetRedisClusterClient(v, redisCli)
+				loopsBeforeClientUpdate = 0
+			}
+		}
+	}
+	printUsedMemoryForAllNodes(v)
+	return c.String(http.StatusOK, "Cluster populated with data")
+}
+
+/**
+Flushes all the data of the redis cluster nodes.
+[WARN] This entry point is considered sensitive and is disabled by default. In order to enable it, the config param 'ExposeSensitiveEntryPoints' needs to be set to 'true'.
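+Under the hood, the handler issues FLUSHALL through the cluster client on every known node and waits 10 seconds before sampling per-node memory usage.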
+**/ +func FlushClusterData(c echo.Context) error { + if reconciler == nil || cluster == nil { + return c.String(http.StatusInternalServerError, "Could not perform cluster flush data") + } + if reconciler.Config.Setters.ExposeSensitiveEntryPoints == false { + return c.String(http.StatusUnauthorized, "Sensitive operation - Not allowed") + } + var cl *redisclient.RedisClusterClient = nil + v, ok := reconciler.NewRedisClusterView(cluster) + if !ok || v == nil { + return c.String(http.StatusInternalServerError, "Could not perform cluster flush data") + } + reconciler.Log.Info("[WARN] Sensitive entry point, on the way to pre-prod / prod environments, the access should be removed from router list") + cl = redisclient.GetRedisClusterClient(v, reconciler.RedisCLI) + cl.FlushAllData() + time.Sleep(10 * time.Second) + printUsedMemoryForAllNodes(v) + return c.String(http.StatusOK, "Cluster data flushed") +} + +func setAndStartTestLab(c *echo.Context, data bool) string { + cli := rediscli.NewRedisCLI(&reconciler.Log) + user := os.Getenv("REDIS_USERNAME") + if user != "" { + cli.Auth = &rediscli.RedisAuth{ + User: user, + } + } + t := &testlab.TestLab{ + Client: reconciler.Client, + RedisCLI: cli, + Cluster: cluster, + RedisClusterClient: nil, + Log: reconciler.Log, + Report: "", + } + t.RunTest(&reconciler.RedisClusterStateView.Nodes, data) + return t.Report +} + +func printUsedMemoryForAllNodes(v *view.RedisClusterView) { + for _, n := range v.Nodes { + printUsedMemory(n.Name, n.Ip) + } +} + +func printUsedMemory(name string, ip string) { + info, _, err := reconciler.RedisCLI.Info(ip) + if err != nil || info == nil { + return + } + println(name + ": " + info.Memory["used_memory_human"]) +} diff --git a/controllers/rediscli/cli.go b/controllers/rediscli/cli.go index caceaf3a..a828563d 100644 --- a/controllers/rediscli/cli.go +++ b/controllers/rediscli/cli.go @@ -3,8 +3,10 @@ package rediscli import ( "bytes" "context" + "fmt" "os/exec" "regexp" + "strconv" "strings" "time" @@ -15,6 +17,7 @@ import ( const ( defaultRedisCliTimeout = 20 * time.Second REDIS_DEFAULT_PORT string = "6379" + MAX_SLOTS_PER_LEADER = 16384 ) type RedisAuth struct { @@ -23,7 +26,7 @@ type RedisAuth struct { type CommandHandler interface { buildCommand(routingPort string, args []string, auth *RedisAuth, opt ...string) ([]string, map[string]string) - executeCommand(args []string) (string, string, error) + executeCommand(pipedArgs []string, args []string, useBash bool, multipFactorForTimeout ...float64) (string, string, error) buildRedisInfoModel(stdoutInfo string) (*RedisInfo, error) buildRedisClusterInfoModel(stdoutInfo string) (*RedisClusterInfo, error) } @@ -69,17 +72,34 @@ func (h *RunTimeCommandHandler) buildCommand(routingPort string, args []string, return args, argListToArgMap(args) } -/* Executes command and returns cmd stdout, stderr and runtime error if appears - * args: arguments, flags and their values, in the order they should appear as if they were executed in the cli itself - */ -func (h *RunTimeCommandHandler) executeCommand(args []string) (string, string, error) { +func (h *RunTimeCommandHandler) executeCommand(pipedArgs []string, args []string, useBash bool, multipFactorForTimeout ...float64) (string, string, error) { var stdout, stderr bytes.Buffer - ctx, cancel := context.WithTimeout(context.Background(), defaultRedisCliTimeout) + multipFactor := 1.0 + if len(multipFactorForTimeout) > 0 { + multipFactor = multipFactorForTimeout[0] + } + ctx, cancel := context.WithTimeout(context.Background(), 
time.Duration(multipFactor)*defaultRedisCliTimeout) defer cancel() - cmd := exec.CommandContext(ctx, "redis-cli", args...) + var cmd *exec.Cmd + if useBash { + argLine := "" + if len(pipedArgs) > 0 { + for _, arg := range pipedArgs { + argLine += arg + " " + } + argLine += " | " + } + argLine += "redis-cli" + for _, arg := range args { + argLine += " " + arg + } + cmd = exec.CommandContext(ctx, "bash", "-c", argLine) + }else{ + cmd = exec.CommandContext(ctx, "redis-cli", args...) + } cmd.Stdout = &stdout cmd.Stderr = &stderr @@ -123,6 +143,7 @@ func (h *RunTimeCommandHandler) executeCommand(args []string) (string, string, e return stdOutput, errOutput, nil } + // Helpers func argLineToArgMap(argLine string, argMap map[string]string) { @@ -224,7 +245,7 @@ func (r *RedisCLI) ClusterCreate(leadersAddresses []string, opt ...string) (stri args := append([]string{"--cluster", "create"}, fullAddresses...) args = append(args, "--cluster-yes") // this will run the command non-interactively args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { return stdout, errors.Errorf("Failed to execute cluster create (%v): %s | %s | %v", fullAddresses, stdout, stderr, err) } @@ -234,7 +255,7 @@ func (r *RedisCLI) ClusterCreate(leadersAddresses []string, opt ...string) (stri func (r *RedisCLI) ClusterCheck(nodeAddr string, opt ...string) (string, error) { args := []string{"--cluster", "check", addressPortDecider(nodeAddr, r.Port)} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { return stdout, errors.Errorf("Cluster check result: (%s): %s | %s | %v", nodeAddr, stdout, stderr, err) } @@ -249,20 +270,35 @@ func (r *RedisCLI) ClusterCheck(nodeAddr string, opt ...string) (string, error) func (r *RedisCLI) AddFollower(newNodeAddr string, existingNodeAddr string, leaderID string, opt ...string) (string, error) { args := []string{"--cluster", "add-node", addressPortDecider(newNodeAddr, r.Port), addressPortDecider(existingNodeAddr, r.Port), "--cluster-slave", "--cluster-master-id", leaderID} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) 
-	stdout, stderr, err := r.Handler.executeCommand(args)
+	stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false)
 	if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) {
 		return stdout, errors.Errorf("Failed to execute cluster add node (%s, %s, %s): %s | %s | %v", newNodeAddr, existingNodeAddr, leaderID, stdout, stderr, err)
 	}
 	return stdout, nil
 }
 
+// AddLeader uses the '--cluster add-node' option of redis-cli to add a new leader (empty master) to the cluster
+// newNodeAddr: address of the new leader that will join the cluster, in the form of IP:port, IP: or IP
+// existingNodeAddr: address of a node that is already part of the cluster, in the same form
+// If no port is provided as part of a given address, the CLI default port is appended automatically
+func (r *RedisCLI) AddLeader(newNodeAddr string, existingNodeAddr string, opt ...string) (string, error) {
+	args := []string{"--cluster", "add-node", addressPortDecider(newNodeAddr, r.Port), addressPortDecider(existingNodeAddr, r.Port)}
+	args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...)
+	// Adding a node can take longer than a plain command, so the default CLI timeout is doubled.
+	stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false, 2)
+	if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) {
+		return stdout, errors.Errorf("Failed to execute cluster add node (%s, %s): %s | %s | %v", newNodeAddr, existingNodeAddr, stdout, stderr, err)
+	}
+	return stdout, nil
+}
+
 // DelNode uses the '--cluster del-node' option of redis-cli to remove a node from the cluster
 // nodeIP: any node of the cluster
 // nodeID: node that needs to be removed
 func (r *RedisCLI) DelNode(nodeIP string, nodeID string, opt ...string) (string, error) {
 	args := []string{"--cluster", "del-node", addressPortDecider(nodeIP, r.Port), nodeID}
 	args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...)
-	stdout, stderr, err := r.Handler.executeCommand(args)
+	stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false)
 	if err != nil || stderr != "" || IsError(strings.TrimSpace(stdout)) {
 		return stdout, errors.Errorf("Failed to execute cluster del-node (%s, %s): %s | %s | %v", nodeIP, nodeID, stdout, stderr, err)
 	}
@@ -274,7 +310,7 @@ func (r *RedisCLI) ClusterInfo(nodeIP string, opt ...string) (*RedisClusterInfo,
 	args := []string{"-h", nodeIP, "cluster", "info"}
 	args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...)
 
-	stdout, stderr, err := r.Handler.executeCommand(args)
+	stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false)
 	if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) {
 		return nil, "", errors.Errorf("Failed to execute CLUSTER INFO (%s): %s | %s | %v", nodeIP, stdout, stderr, err)
 	}
@@ -287,7 +323,7 @@ func (r *RedisCLI) Info(nodeIP string, opt ...string) (*RedisInfo, string, error
 	args := []string{"-h", nodeIP, "info"}
 	args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...)
 
- stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { return nil, "", errors.Errorf("Failed to execute INFO (%s): %s | %s | %v", nodeIP, stdout, stderr, err) } @@ -295,11 +331,22 @@ func (r *RedisCLI) Info(nodeIP string, opt ...string) (*RedisInfo, string, error return c, stdout, e } +func (r *RedisCLI) DBSIZE(nodeIP string, opt ...string) (int64, string, error) { + args := []string{"-h", nodeIP, "DBSIZE"} + args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) + if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { + return 0, stdout, errors.Errorf("Failed to execute INFO (%s): %s | %s | %v", nodeIP, stdout, stderr, err) + } + dbsize, err := strconv.ParseInt(stdout, 10, 64) + return dbsize, stdout, err +} + // https://redis.io/commands/ping func (r *RedisCLI) Ping(nodeIP string, message ...string) (string, error) { args := []string{"-h", nodeIP, "ping"} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, message...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { return stdout, errors.Errorf("Failed to execute INFO (%s): %s | %s | %v", nodeIP, stdout, stderr, err) } @@ -310,7 +357,7 @@ func (r *RedisCLI) Ping(nodeIP string, message ...string) (string, error) { func (r *RedisCLI) ClusterNodes(nodeIP string, opt ...string) (*RedisClusterNodes, string, error) { args := []string{"-h", nodeIP, "cluster", "nodes"} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { return nil, "", errors.Errorf("Failed to execute CLUSTER NODES(%s): %s | %s | %v", nodeIP, stdout, stderr, err) } @@ -321,7 +368,7 @@ func (r *RedisCLI) ClusterNodes(nodeIP string, opt ...string) (*RedisClusterNode func (r *RedisCLI) MyClusterID(nodeIP string, opt ...string) (string, error) { args := []string{"-h", nodeIP, "cluster", "myid"} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { return stdout, errors.Errorf("Failed to execute MYID(%s): %s | %s | %v", nodeIP, stdout, stderr, err) } @@ -334,7 +381,7 @@ func (r *RedisCLI) MyClusterID(nodeIP string, opt ...string) (string, error) { func (r *RedisCLI) ClusterForget(nodeIP string, forgetNodeID string, opt ...string) (string, error) { args := []string{"-h", nodeIP, "cluster", "forget", forgetNodeID} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) 
- stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || strings.TrimSpace(stdout) != "OK" { return stdout, errors.Errorf("Failed to execute CLUSTER FORGET (%s, %s): %s | %s | %v", nodeIP, forgetNodeID, stdout, stderr, err) } @@ -346,7 +393,7 @@ func (r *RedisCLI) ClusterForget(nodeIP string, forgetNodeID string, opt ...stri func (r *RedisCLI) ClusterReplicas(nodeIP string, leaderNodeID string, opt ...string) (*RedisClusterNodes, string, error) { args := []string{"-h", nodeIP, "cluster", "replicas", leaderNodeID} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { return nil, "", errors.Errorf("Failed to execute CLUSTER REPLICAS (%s, %s): %s | %s | %v", nodeIP, leaderNodeID, stdout, stderr, err) } @@ -357,7 +404,7 @@ func (r *RedisCLI) ClusterReplicas(nodeIP string, leaderNodeID string, opt ...st func (r *RedisCLI) ClusterFailover(nodeIP string, opt ...string) (string, error) { args := []string{"-h", nodeIP, "cluster", "failover"} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false, 5) if err != nil || strings.TrimSpace(stderr) != "" || strings.TrimSpace(stdout) != "OK" { return stdout, errors.Errorf("Failed to execute CLUSTER FAILOVER (%s, %v): %s | %s | %v", nodeIP, opt, stdout, stderr, err) } @@ -368,7 +415,7 @@ func (r *RedisCLI) ClusterFailover(nodeIP string, opt ...string) (string, error) func (r *RedisCLI) ClusterMeet(nodeIP string, newNodeIP string, newNodePort string, opt ...string) (string, error) { args := []string{"-h", nodeIP, "cluster", "meet", newNodeIP, newNodePort} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || strings.TrimSpace(stdout) != "OK" { return stdout, errors.Errorf("Failed to execute CLUSTER MEET (%s, %s, %s, %v): %s | %s | %v", nodeIP, newNodeIP, newNodePort, opt, stdout, stderr, err) } @@ -379,19 +426,50 @@ func (r *RedisCLI) ClusterMeet(nodeIP string, newNodeIP string, newNodePort stri func (r *RedisCLI) ClusterReset(nodeIP string, opt ...string) (string, error) { args := []string{"-h", nodeIP, "cluster", "reset"} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || strings.TrimSpace(stdout) != "OK" { return stdout, errors.Errorf("Failed to execute CLUSTER RESET (%s, %v): %s | %s | %v", nodeIP, opt, stdout, stderr, err) } return stdout, nil } +func (r *RedisCLI) ClusterRebalance(nodeIP string, useEmptyMasters bool, opt ...string) (bool, string, error) { + args := []string{"--cluster", "rebalance", addressPortDecider(nodeIP, r.Port)} + if useEmptyMasters { + args = append(args, "--cluster-use-empty-masters") + } + args = append(args, "--cluster-yes") + args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) 
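+	// Rebalancing can migrate many slots; the factor of 50 below scales the default redis-cli timeout accordingly.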
+ stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false, 50) + if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { + return false, stdout, errors.Errorf("Failed to execute cluster rebalance (%v): %s | %s | %v", nodeIP, stdout, stderr, err) + } + return true, stdout, nil +} + +func (r *RedisCLI) ClusterReshard(nodeIP string, sourceId string, targetId string, slots int, opt ...string) (bool, string, error) { + args := []string{ + "--cluster reshard", addressPortDecider(nodeIP, r.Port), + "--cluster-from", sourceId, + "--cluster-to", targetId, + "--cluster-slots", fmt.Sprint(slots), + "--cluster-yes", + } + useBash := true + args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, useBash, 50) + if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { + return false, stdout, errors.Errorf("Failed to execute cluster reshard (%v): from [%s] to [%s] stdout: %s | stderr : %s | err: %v", nodeIP, sourceId, targetId, stdout, stderr, err) + } + return true, stdout, nil +} + // https://redis.io/commands/flushall func (r *RedisCLI) Flushall(nodeIP string, opt ...string) (string, error) { args := []string{"-h", nodeIP, "flushall"} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { return stdout, errors.Errorf("Failed to execute FLUSHALL (%s, %v): %s | %s | %v", nodeIP, opt, stdout, stderr, err) } @@ -402,7 +480,7 @@ func (r *RedisCLI) Flushall(nodeIP string, opt ...string) (string, error) { func (r *RedisCLI) ClusterReplicate(nodeIP string, leaderID string, opt ...string) (string, error) { args := []string{"-h", nodeIP, "cluster", "replicate", leaderID} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || strings.TrimSpace(stdout) != "OK" { return stdout, errors.Errorf("Failed to execute CLUSTER REPLICATE (%s, %s): %s | %s | %v", nodeIP, leaderID, stdout, stderr, err) } @@ -414,7 +492,7 @@ func (r *RedisCLI) ACLLoad(nodeIP string, opt ...string) (string, error) { args := []string{"-h", nodeIP, "acl", "load"} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) - stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || strings.TrimSpace(stdout) != "OK" { return stdout, errors.Errorf("Failed to execute ACL LOAD (%s): %s | %s | %v", nodeIP, stdout, stderr, err) } @@ -426,7 +504,7 @@ func (r *RedisCLI) ACLList(nodeIP string, opt ...string) (*RedisACL, string, err args := []string{"-h", nodeIP, "acl", "list"} args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) 
- stdout, stderr, err := r.Handler.executeCommand(args) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { return nil, "", errors.Errorf("Failed to execute ACL LIST (%s): %s | %s | %v", nodeIP, stdout, stderr, err) } @@ -436,3 +514,23 @@ func (r *RedisCLI) ACLList(nodeIP string, opt ...string) (*RedisACL, string, err } return acl, stdout, nil } + +func (r *RedisCLI) ClusterFix(nodeIP string, opt ...string) (bool, string, error) { + args := []string{"--cluster", "fix", addressPortDecider(nodeIP, r.Port), "--cluster-fix-with-unreachable-masters", "--cluster-yes"} + args, _ = r.Handler.buildCommand(r.Port, args, r.Auth, opt...) + stdout, stderr, err := r.Handler.executeCommand([]string{"yes", "yes"}, args, true, 50) + if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { + return false, stdout, errors.Errorf("Failed to execute cluster fix (%v): %s | %s | %v", addressPortDecider(nodeIP, r.Port), stdout, stderr, err) + } + return true, stdout, nil +} + +func (r *RedisCLI) Role(nodeIP string) (string, error) { + args := []string{"-h", nodeIP, "role"} + args, _ = r.Handler.buildCommand(r.Port, args, r.Auth) + stdout, stderr, err := r.Handler.executeCommand([]string{}, args, false) + if err != nil || strings.TrimSpace(stderr) != "" || IsError(strings.TrimSpace(stdout)) { + return stdout, errors.Errorf("Failed to execute Role (%s): %s | %s | %v", nodeIP, stdout, stderr, err) + } + return stdout, nil +} diff --git a/controllers/rediscli/cli_test.go b/controllers/rediscli/cli_test.go index 7b7f3c29..9ca4ea2f 100644 --- a/controllers/rediscli/cli_test.go +++ b/controllers/rediscli/cli_test.go @@ -1,6 +1,7 @@ package rediscli import ( + "fmt" "strings" "testing" ) @@ -27,7 +28,7 @@ func (h *TestCommandHandler) buildCommand(routingPort string, args []string, aut return args, argListToArgMap(args) } -func (h *TestCommandHandler) executeCommand(args []string) (string, string, error) { +func (h *TestCommandHandler) executeCommand(pipedArgs []string, args []string, useBash bool, multipFactorForTimeout ...float64) (string, string, error) { executedCommand := "" for _, arg := range args { executedCommand += arg + " " @@ -68,6 +69,7 @@ func TestRedisCLI(test *testing.T) { testClusterCreate() testClusterCheck() testAddFollower() + testAddLeader() testDelNode() testClusterInfo() testInfo() @@ -79,10 +81,13 @@ func TestRedisCLI(test *testing.T) { testClusterFailOver() testClusterMeet() testClusterReset() + testClusterRebalance() + testClusterReshard() testFlushAll() testClusterReplicate() testACLLoad() testACLList() + testClusterFix() } func testClusterCreate() { @@ -142,6 +147,25 @@ func testAddFollower() { execAddFollowerTest("5", newNodeAddr, existingNodeAddr, leaderID, "-optArg1 optVal1", "-p 6379", "-optArg2 optVal2") } +func testAddLeader() { + // Test 1 : Routing port is not provided, newNodeAddr port is not provided, existingNodeAddr port is not provided, no optional args + newNodeAddr := "127.0.0.1" + existingNodeAddr := "128.1.1.2:" + execAddLeaderTest("1", newNodeAddr, existingNodeAddr) + // Test 2 : Routing port is provided, newNodeAddr port is provided, existingNodeAddr port is not provided, no optional args + newNodeAddr = "127.0.0.1:6565" + execAddLeaderTest("2", newNodeAddr, existingNodeAddr, "-p 8080") + // Test 3 : Routing port is not provided, newNodeAddr port is not provided, existingNodeAddr port is provided, optional arguments provided + 
newNodeAddr = "127.0.0.1:" + existingNodeAddr = "128.1.1.2:6377" + execAddLeaderTest("3", newNodeAddr, existingNodeAddr, "-optArg1 optVal1") + // Test 4 : Routing port is provided, newNodeAddr port is provided, existingNodeAddr port is provided, optional arguments provided + newNodeAddr = "127.0.0.1:6377" + execAddLeaderTest("4", newNodeAddr, existingNodeAddr, "-p 6379 -optArg1 optVal1 -optArg2 optVal2") + // Test 5 : Routing port is provided, newNodeAddr port is provided, existingNodeAddr port is provided, optional arguments provided as a parametrized arg list + execAddLeaderTest("5", newNodeAddr, existingNodeAddr, "-optArg1 optVal1", "-p 6379", "-optArg2 optVal2") +} + func testDelNode() { nodeIP := "127.0.0.1" nodeID := "abcde12345" @@ -301,6 +325,34 @@ func testClusterReset() { execClusterResetTest("5", nodeIP, "-p 6399", "-optArg1 optVal1") } +func testClusterRebalance() { + nodeIP := "127.0.0.1" + // Test 1 : Routing port is not provided, no optional arguments + execClusterRebalanceTest("1", nodeIP, true) + // Test 2 : Routing port is provided, no optional arguments + execClusterRebalanceTest("2", nodeIP, false, "-p 8383") + // Test 3 : Routing port is not provided, optional arguments are provided + execClusterRebalanceTest("3", nodeIP, true, "-optArg1 optVal1") + // Test 4 : Routing port is provided, optional arguments are provided + execClusterRebalanceTest("4", nodeIP, false, "-p 8384 -optArg1 optVal1") + // Test 5 : Routing port is provided, optional arguments are provided as parametrized arg list + execClusterRebalanceTest("5", nodeIP, true, "-p 6399", "-optArg1 optVal1") +} + +func testClusterReshard() { + nodeIP := "127.0.0.1" + // Test 1 : Routing port is not provided, no optional arguments + execClusterReshardTest("1", nodeIP, "abc", "edf", 16384) + // Test 2 : Routing port is provided, no optional arguments + execClusterReshardTest("2", nodeIP, "abc", "edf", 16384, "-p 8383") + // Test 3 : Routing port is not provided, optional arguments are provided + execClusterReshardTest("3", nodeIP, "abc", "edf", 16384, "-optArg1 optVal1") + // Test 4 : Routing port is provided, optional arguments are provided + execClusterReshardTest("4", nodeIP, "abc", "edf", 16384, "-p 8384 -optArg1 optVal1") + // Test 5 : Routing port is provided, optional arguments are provided as parametrized arg list + execClusterReshardTest("5", nodeIP, "abc", "edf", 16384, "-p 6399", "-optArg1 optVal1") +} + func testFlushAll() { nodeIP := "128.0.1.1" // Test 1 : Routing port is not provided, no optional arguments @@ -358,6 +410,20 @@ func testACLList() { execACLListTest("5", nodeIP, "-p 6381", "-optArg1 optVal1") } +func testClusterFix() { + nodeIP := "129.4.6.2" + // Test 1 : Routing port is not provided, no optional arguments + execClusterFixTest("1", nodeIP) + // Test 2 : Routing port is provided, no optional arguments + execClusterFixTest("2", nodeIP, "-p 6379") + // Test 3 : Routing port is not provided, optional arguments are provided + execClusterFixTest("3", nodeIP, "-optArg1 optVal1") + // Test 4 : Routing port is provided, optional arguments are provided + execClusterFixTest("4", nodeIP, "-p 6381 -optArg1 optVal1") + // Test 5 : Routing port is provided, optional arguments are provided as parametrized arg list + execClusterFixTest("5", nodeIP, "-p 6381", "-optArg1 optVal1") +} + // Test exec helpers func execClusterCreateTest(testCaseId string, addresses []string, opt ...string) { @@ -368,7 +434,7 @@ func execClusterCreateTest(testCaseId string, addresses []string, opt ...string) expectedArgList := 
append([]string{"--cluster", "create"}, updatedAddresses...) expectedArgList = append(expectedArgList, "--cluster-yes") expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Cluster Create "+testCaseId, argMap, expectedArgMap) } @@ -378,7 +444,7 @@ func execClusterCheckTest(testCaseId string, address string, opt ...string) { argLineToArgMap(result, argMap) expectedArgList := []string{"--cluster", "check", addressPortDecider(address, r.Port)} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Cluster Check "+testCaseId, argMap, expectedArgMap) } @@ -393,17 +459,27 @@ func execAddFollowerTest(testCaseId string, newNodeAddr string, existingNodeAddr expectedArgList := []string{"--cluster", "add-node"} expectedArgList = append(expectedArgList, newNodeAddr, existingNodeAddr, leadershipType, leaderIdFlag, leaderID) expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Add follower "+testCaseId, argMap, expectedArgMap) } +func execAddLeaderTest(testCaseId string, newNodeAddr string, existingNodeAddr string, opt ...string) { + result, _ := r.AddLeader(newNodeAddr, existingNodeAddr, opt...) + argMap := make(map[string]string) + argLineToArgMap(result, argMap) + expectedArgList := []string{"--cluster", "add-node", addressPortDecider(newNodeAddr, r.Port), addressPortDecider(existingNodeAddr, r.Port)} + expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) + resultHandler(expectedResult, result, "Add Leader "+testCaseId, argMap, expectedArgMap) +} + func execDelNodeTest(testCaseId string, nodeIP string, nodeID string, opt ...string) { result, _ := r.DelNode(nodeIP, nodeID, opt...) argMap := make(map[string]string) argLineToArgMap(result, argMap) expectedArgList := []string{"--cluster", "del-node", addressPortDecider(nodeIP, r.Port), nodeID} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Delete Node "+testCaseId, argMap, expectedArgMap) } @@ -413,7 +489,7 @@ func execClusterInfoTest(testCaseId string, nodeIP string, opt ...string) { argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "cluster", "info"} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) 
- expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Cluster Info "+testCaseId, argMap, expectedArgMap) } @@ -423,7 +499,7 @@ func execInfoTest(testCaseId string, nodeIP string, opt ...string) { argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "info"} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Info "+testCaseId, argMap, expectedArgMap) } @@ -433,7 +509,7 @@ func execPingTest(testCaseId string, nodeIP string, opt ...string) { argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "ping"} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Ping "+testCaseId, argMap, expectedArgMap) } @@ -443,7 +519,7 @@ func execClusterNodesTest(testCaseId string, nodeIP string, opt ...string) { argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "cluster", "nodes"} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Cluster Nodes "+testCaseId, argMap, expectedArgMap) } @@ -453,7 +529,7 @@ func execMyClusterIDTest(testCaseId string, nodeIP string, opt ...string) { argLineToArgMap(result, argMap) expectedArgLine := []string{"-h", nodeIP, "cluster", "myid"} expectedArgLine, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgLine, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgLine) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgLine, false) resultHandler(expectedResult, result, "My Cluster ID "+testCaseId, argMap, expectedArgMap) } @@ -463,7 +539,7 @@ func execClusterForgetTest(testCaseId string, nodeIP string, forgetNodeID string argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "cluster", "forget", forgetNodeID} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Cluster Forget "+testCaseId, argMap, expectedArgMap) } @@ -473,7 +549,7 @@ func execClusterReplicasTest(testCaseId string, nodeIP string, leaderNodeID stri argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "cluster", "replicas", leaderNodeID} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) 
- expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Cluster Replicas "+testCaseId, argMap, expectedArgMap) } @@ -483,7 +559,7 @@ func execClusterFailOverTest(testCaseId string, nodeIP string, opt ...string) { argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "cluster", "failover"} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Cluster Failover "+testCaseId, argMap, expectedArgMap) } @@ -493,7 +569,7 @@ func execClusterMeetTest(testCaseId string, nodeIP string, newNodeIP string, new argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "cluster", "meet", newNodeIP, newNodePort} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Cluster Meet "+testCaseId, argMap, expectedArgMap) } @@ -503,17 +579,43 @@ func execClusterResetTest(testCaseId string, nodeIP string, opt ...string) { argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "cluster", "reset"} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Cluster Reset "+testCaseId, argMap, expectedArgMap) } +func execClusterRebalanceTest(testCaseId string, nodeIP string, useEmptyMasters bool, opt ...string) { + _, result, _ := r.ClusterRebalance(nodeIP, useEmptyMasters, opt...) + argMap := make(map[string]string) + argLineToArgMap(fmt.Sprint(result), argMap) + expectedArgList := []string{"--cluster", "rebalance", addressPortDecider(nodeIP, r.Port)} + if useEmptyMasters { + expectedArgList = append(expectedArgList, "--cluster-use-empty-masters") + } + expectedArgList = append(expectedArgList, "--cluster-yes") + expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) + resultHandler(expectedResult, fmt.Sprint(result), "Cluster Rebalance "+testCaseId, argMap, expectedArgMap) +} + +func execClusterReshardTest(testCaseId string, nodeIP string, sourceId string, targetId string, slots int, opt ...string) { + _, result, _ := r.ClusterReshard(nodeIP, sourceId, targetId, slots, opt...) + argMap := make(map[string]string) + argLineToArgMap(fmt.Sprint(result), argMap) + expectedArgList := []string{"--cluster reshard", addressPortDecider(nodeIP, r.Port)} + expectedArgList = append(expectedArgList, "--cluster-from", sourceId, "--cluster-to", targetId) + expectedArgList = append(expectedArgList, "--cluster-slots", fmt.Sprint(slots), "--cluster-yes") + expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) 
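+	// The expected arguments mirror ClusterReshard, which joins "--cluster reshard" into a single token; the test handler ignores the bash and pipe options.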
+ expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) + resultHandler(expectedResult, fmt.Sprint(result), "Cluster Reshard "+testCaseId, argMap, expectedArgMap) +} + func execFlushAllTest(testCaseId string, nodeIP string, opt ...string) { result, _ := r.Flushall(nodeIP, opt...) argMap := make(map[string]string) argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "flushall"} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Flush All "+testCaseId, argMap, expectedArgMap) } @@ -523,7 +625,7 @@ func execClusterReplicateTest(testCaseId string, nodeIP string, leaderID string, argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "cluster", "replicate", leaderID} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "Cluster replicate "+testCaseId, argMap, expectedArgMap) } @@ -533,7 +635,7 @@ func execACLLoadTest(testcaseId string, nodeIP string, opt ...string) { argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "acl", "load"} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "ACLLoad "+testcaseId, argMap, expectedArgMap) } @@ -543,6 +645,17 @@ func execACLListTest(testCaseId string, nodeIP string, opt ...string) { argLineToArgMap(result, argMap) expectedArgList := []string{"-h", nodeIP, "acl", "list"} expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) - expectedResult, _, _ := r.Handler.executeCommand(expectedArgList) + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) resultHandler(expectedResult, result, "ACLList "+testCaseId, argMap, expectedArgMap) } + +func execClusterFixTest(testCaseId string, nodeIP string, opt ...string) { + _, result, _ := r.ClusterFix(nodeIP, opt...) + argMap := make(map[string]string) + argLineToArgMap(result, argMap) + expectedArgList := []string{"--cluster", "fix", addressPortDecider(nodeIP, r.Port), "--cluster-fix-with-unreachable-masters", "--cluster-yes"} + expectedArgList, expectedArgMap := r.Handler.buildCommand(r.Port, expectedArgList, r.Auth, opt...) 
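+	// The real ClusterFix pipes 'yes' confirmations through bash; the test handler only echoes the argument list, so those options are not exercised here.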
+ //pipeArgs := []string{"yes", "yes"} + expectedResult, _, _ := r.Handler.executeCommand([]string{}, expectedArgList, false) + resultHandler(expectedResult, result, "Cluster fix "+testCaseId, argMap, expectedArgMap) +} diff --git a/controllers/rediscli/info.go b/controllers/rediscli/info.go index 38f23589..f2d2ff8d 100644 --- a/controllers/rediscli/info.go +++ b/controllers/rediscli/info.go @@ -36,6 +36,7 @@ type RedisClusterNode struct { PingRecv string ConfigEpoch string LinkState string + Slots string } func validateRedisInfo(redisInfo *RedisInfo) error { @@ -150,7 +151,7 @@ func NewRedisClusterNodes(rawData string) *RedisClusterNodes { if strings.Contains(nodeInfo[0], ")") { // special case for CLUSTER REPLICAS output nodeInfo = nodeInfo[1:] } - if len(nodeInfo) >= 8 { + if len(nodeInfo) >= 9 { nodes = append(nodes, RedisClusterNode{ ID: nodeInfo[0], Addr: nodeInfo[1], @@ -160,8 +161,20 @@ func NewRedisClusterNodes(rawData string) *RedisClusterNodes { PingRecv: nodeInfo[5], ConfigEpoch: nodeInfo[6], LinkState: nodeInfo[7], + Slots: nodeInfo[8], + }) + } else if len(nodeInfo) >= 8 { + nodes = append(nodes, RedisClusterNode{ + ID: nodeInfo[0], + Addr: nodeInfo[1], + Flags: nodeInfo[2], + Leader: nodeInfo[3], + PingSend: nodeInfo[4], + PingRecv: nodeInfo[5], + ConfigEpoch: nodeInfo[6], + LinkState: nodeInfo[7], + Slots: "", }) - } } return &nodes diff --git a/controllers/redisclient/redis_client.go b/controllers/redisclient/redis_client.go new file mode 100644 index 00000000..e56df4c4 --- /dev/null +++ b/controllers/redisclient/redis_client.go @@ -0,0 +1,163 @@ +package redisclient + +import ( + "context" + "errors" + "fmt" + "regexp" + "strings" + "sync" + + "github.com/PayU/redis-operator/controllers/rediscli" + "github.com/PayU/redis-operator/controllers/view" + "github.com/go-redis/redis/v8" +) + +type RedisClusterClient struct { + clients map[string]*redis.Client +} + +var clusterClient *RedisClusterClient = nil + +var lookups int = 5 + +var format string = "MOVED\\s*\\d+\\s*(\\d+\\.\\d+\\.\\d+\\.\\d+:\\d+)" +var comp *regexp.Regexp = regexp.MustCompile(format) + +func GetRedisClusterClient(v *view.RedisClusterView, cli *rediscli.RedisCLI) *RedisClusterClient { + mutex := &sync.Mutex{} + mutex.Lock() + if clusterClient == nil { + clusterClient = &RedisClusterClient{ + clients: map[string]*redis.Client{}, + } + } + for _, n := range v.Nodes { + if n == nil { + continue + } + nodes, _, err := cli.ClusterNodes(n.Ip) + if err != nil || nodes == nil || len(*nodes) <= 1 { + continue + } + addr := n.Ip + ":" + cli.Port + clusterClient.clients[addr] = redis.NewClient(&redis.Options{ + Addr: addr, + Username: "admin", + Password: "adminpass", + }) + } + mutex.Unlock() + return clusterClient +} + +func (c *RedisClusterClient) Set(key string, val interface{}, retries int) error { + if retries == 0 { + return errors.New(fmt.Sprintf("Could not set key [%v], val [%v] into cluster, all nodes errored during attempt", key, val)) + } + ctx := context.Background() + mutex := &sync.Mutex{} + for addr := range c.clients { + e := c.set(ctx, key, val, addr, lookups, mutex) + if e == nil { + return nil + } + } + return c.Set(key, val, retries-1) +} + +func (c *RedisClusterClient) set(ctx context.Context, key string, val interface{}, addr string, lookups int, mutex *sync.Mutex) error { + if lookups == 0 { + return errors.New(fmt.Sprintf("Could not write data row [%v, %v]", key, val)) + } + mutex.Lock() + client, exists := c.clients[addr] + mutex.Unlock() + if !exists || client == nil { + return 
errors.New(fmt.Sprintf("Client [%v] doesnt exists", addr)) + } + mutex.Lock() + _, err := client.Set(ctx, key, val, 0).Result() + mutex.Unlock() + if err != nil { + if strings.Contains(err.Error(), "MOVED") { + a := c.extractAddress(err.Error()) + return c.set(ctx, key, val, a, lookups-1, mutex) + } + if strings.Contains(err.Error(), "i/o timeout") { + mutex.Lock() + c.clients[addr] = nil + mutex.Unlock() + } + } + return err +} + +func (c *RedisClusterClient) Get(key string, retries int) (value string, err error) { + if retries == 0 { + return "", errors.New(fmt.Sprintf("Could not extract key [%v]", key)) + } + mutex := &sync.Mutex{} + ctx := context.Background() + for addr := range c.clients { + v, e := c.get(ctx, key, addr, lookups, mutex) + if e == nil { + return v, nil + } + } + return c.Get(key, retries-1) +} + +func (c *RedisClusterClient) get(ctx context.Context, key string, addr string, lookups int, mutex *sync.Mutex) (value string, err error) { + if lookups == 0 { + return "", errors.New(fmt.Sprintf("Could not extract key [%v]", key)) + } + mutex.Lock() + client, exists := c.clients[addr] + mutex.Unlock() + if !exists || client == nil { + return "", errors.New(fmt.Sprintf("Client [%v] doesnt exists", addr)) + } + mutex.Lock() + value, err = client.Get(ctx, key).Result() + mutex.Unlock() + if err != nil { + if strings.Contains(err.Error(), "nil") { + err = nil + } else { + if strings.Contains(err.Error(), "MOVED") { + a := c.extractAddress(err.Error()) + return c.get(ctx, key, a, lookups-1, mutex) + } + if strings.Contains(err.Error(), "i/o timeout") { + mutex.Lock() + c.clients[addr] = nil + mutex.Unlock() + return value, err + } + return value, err + } + } + return value, err +} + +func (c *RedisClusterClient) extractAddress(msg string) string { + matchingStrings := comp.FindAllStringSubmatch(msg, -1) + for _, match := range matchingStrings { + if len(match) > 1 { + if len(match[1]) > 0 { + return match[1] + } + } + } + return "" +} + +func (c *RedisClusterClient) FlushAllData() { + ctx := context.Background() + for _, client := range c.clients { + if client != nil { + client.FlushAll(ctx) + } + } +} diff --git a/controllers/rediscluster.go b/controllers/rediscluster.go index 3f68d910..cb53af37 100644 --- a/controllers/rediscluster.go +++ b/controllers/rediscluster.go @@ -1,13 +1,14 @@ package controllers import ( - "encoding/json" "fmt" + "math" "reflect" - "sort" + "regexp" "strconv" "strings" "sync" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/wait" @@ -16,200 +17,45 @@ import ( dbv1 "github.com/PayU/redis-operator/api/v1" rediscli "github.com/PayU/redis-operator/controllers/rediscli" - clusterData "github.com/PayU/redis-operator/data" + view "github.com/PayU/redis-operator/controllers/view" ) -var EMPTY struct{} +type ScaleType int -// Representation of a cluster, each element contains information about a leader -type RedisClusterView []LeaderNode +const ( + ScaleUpLeaders ScaleType = iota + ScaleUpFollowers + ScaleDownLeaders + ScaleDownFollowers +) -type LeaderNode struct { - Pod *corev1.Pod - NodeNumber string - RedisID string - Failed bool - Terminating bool - Followers []FollowerNode -} +var ( + memorySizeFormat = "\\s*(\\d+\\.*\\d*)\\w+" + comp = regexp.MustCompile(memorySizeFormat) +) -type FollowerNode struct { - Pod *corev1.Pod - NodeNumber string - LeaderNumber string - RedisID string - Failed bool - Terminating bool +func (s ScaleType) String() string { + return [...]string{"ScaleUpLeaders", "ScaleUpFollowers", "ScaleDownLeaders", 
"ScaleDownFollowers"}[s] } -// In-place sort of a cluster view - ascending alphabetical order by node number -func (v *RedisClusterView) Sort() { - sort.Slice(*v, func(i, j int) bool { - return (*v)[i].NodeNumber < (*v)[j].NodeNumber - }) - for _, leader := range *v { - sort.Slice(leader.Followers, func(i, j int) bool { - return leader.Followers[i].NodeNumber < leader.Followers[j].NodeNumber - }) - } -} - -func (v *RedisClusterView) String() string { - result := "" - for _, leader := range *v { - leaderStatus := "ok" - leaderPodStatus := "up" - if leader.Pod == nil { - leaderPodStatus = "down" - } else if leader.Terminating { - leaderPodStatus = "terminating" - } - if leader.Failed { - leaderStatus = "fail" - } - result = result + fmt.Sprintf("Leader: %s(%s,%s)-[", leader.NodeNumber, leaderPodStatus, leaderStatus) - for _, follower := range leader.Followers { - status := "ok" - podStatus := "up" - if follower.Pod == nil { - podStatus = "down" - } else if follower.Terminating { - podStatus = "terminating" - } - if follower.Failed { - status = "fail" - } - result = result + fmt.Sprintf("%s(%s,%s)", follower.NodeNumber, podStatus, status) - } - result += "]" - } - return result -} - -func (r *RedisClusterReconciler) NewRedisClusterView(redisCluster *dbv1.RedisCluster) (*RedisClusterView, error) { - var cv RedisClusterView - - pods, err := r.getRedisClusterPods(redisCluster) - if err != nil { - return nil, err - } - - followerCounter := redisCluster.Spec.LeaderCount - for i := 0; i < redisCluster.Spec.LeaderCount; i++ { - leader := LeaderNode{Pod: nil, NodeNumber: strconv.Itoa(i), RedisID: "", Failed: true, Terminating: false, Followers: nil} - for j := 0; j < redisCluster.Spec.LeaderFollowersCount; j++ { - follower := FollowerNode{Pod: nil, NodeNumber: strconv.Itoa(followerCounter), LeaderNumber: strconv.Itoa(i), RedisID: "", Failed: true, Terminating: false} - leader.Followers = append(leader.Followers, follower) - followerCounter++ - } - cv = append(cv, leader) - } - - for i, pod := range pods { - nn, err := strconv.Atoi(pod.Labels["node-number"]) - if err != nil { - return nil, errors.Errorf("Failed to parse node-number label: %s (%s)", pod.Labels["node-number"], pod.Name) - } - ln, err := strconv.Atoi(pod.Labels["leader-number"]) - if err != nil { - return nil, errors.Errorf("Failed to parse leader-number label: %s (%s)", pod.Labels["leader-number"], pod.Name) - } - if nn == ln { - cv[ln].Pod = &pods[i] - cv[ln].NodeNumber = pod.Labels["node-number"] - if pod.ObjectMeta.DeletionTimestamp != nil { - cv[ln].Terminating = true - } else { - if pod.Status.PodIP != "" { - clusterInfo, _, err := r.RedisCLI.ClusterInfo(pod.Status.PodIP) - if err == nil && clusterInfo != nil && (*clusterInfo)["cluster_state"] == "ok" { - cv[ln].Failed = false - } - } - } +func (r *RedisClusterReconciler) NewRedisClusterView(redisCluster *dbv1.RedisCluster) (*view.RedisClusterView, bool) { + r.Log.Info("Getting cluster view...") + v := &view.RedisClusterView{} + pods, e := r.getRedisClusterPods(redisCluster) + if e != nil { + r.Log.Error(e, "Could not fetch cluster pods list for cluster view") + return v, false + } + e = v.CreateView(pods, r.RedisCLI) + if e != nil { + if strings.Contains(e.Error(), "Non reachable node found") { + r.Log.Info("[Warn] Non reachable nodes found during view creation, re-attempting reconcile loop...") } else { - index := (nn - redisCluster.Spec.LeaderCount) % redisCluster.Spec.LeaderFollowersCount - cv[ln].Followers[index].Pod = &pods[i] - cv[ln].Followers[index].NodeNumber = 
pod.Labels["node-number"] - cv[ln].Followers[index].LeaderNumber = pod.Labels["leader-number"] - if pod.ObjectMeta.DeletionTimestamp != nil { - cv[ln].Followers[index].Terminating = true - } else { - if pod.Status.PodIP != "" { - clusterInfo, _, err := r.RedisCLI.ClusterInfo(pod.Status.PodIP) - if err == nil && clusterInfo != nil && (*clusterInfo)["cluster_state"] == "ok" { - cv[ln].Followers[index].Failed = false - } - } - } - } - } - return &cv, nil -} - -// Returns a list with all the IPs of the Redis nodes -func (v *RedisClusterView) IPs() []string { - var ips []string - for _, leader := range *v { - if leader.Pod != nil { - ips = append(ips, leader.Pod.Status.PodIP) - } - for _, follower := range leader.Followers { - if follower.Pod != nil { - ips = append(ips, follower.Pod.Status.PodIP) - } - } - } - return ips -} - -// Returns a list with all the IPs of the Redis nodes that are healthy -// A nod eis healthy if Redis can be reached and is in cluster mode -func (r *RedisClusterView) HealthyNodeIPs() []string { - var ips []string - for _, leader := range *r { - if leader.Pod != nil && !(leader.Failed || leader.Terminating) { - ips = append(ips, leader.Pod.Status.PodIP) + r.Log.Info("[Warn] Could not get view for api view update, Error: %v", e.Error()) } - for _, follower := range leader.Followers { - if follower.Pod != nil && !(follower.Failed || follower.Terminating) { - ips = append(ips, follower.Pod.Status.PodIP) - } - } - } - return ips -} - -type NodeNumbers [2]string // 0: node number, 1: leader number - -func (r *RedisClusterReconciler) getLeaderIP(followerIP string) (string, error) { - info, _, err := r.RedisCLI.Info(followerIP) - if err != nil { - return "", err + return v, false } - return info.Replication["master_host"], nil -} - -// Returns the node number and leader number from a pod -func (r *RedisClusterReconciler) getRedisNodeNumbersFromIP(namespace string, podIP string) (string, string, error) { - pod, err := r.getPodByIP(namespace, podIP) - if err != nil { - return "", "", err - } - return pod.Labels["node-number"], pod.Labels["leader-number"], err -} - -// Returns a mapping between node numbers and IPs -func (r *RedisClusterReconciler) getNodeIPs(redisCluster *dbv1.RedisCluster) (map[string]string, error) { - nodeIPs := make(map[string]string) - pods, err := r.getRedisClusterPods(redisCluster) - if err != nil { - return nil, err - } - for _, pod := range pods { - nodeIPs[pod.Labels["node-number"]] = pod.Status.PodIP - } - return nodeIPs, nil + return v, true } func (r *RedisClusterReconciler) createNewRedisCluster(redisCluster *dbv1.RedisCluster) error { @@ -219,47 +65,25 @@ func (r *RedisClusterReconciler) createNewRedisCluster(redisCluster *dbv1.RedisC return err } - if err := r.initializeCluster(redisCluster); err != nil { + if err := r.initializeLeaders(redisCluster); err != nil { return err } r.Log.Info("[OK] Redis cluster initialized successfully") return nil } -func (r *RedisClusterReconciler) initializeFollowers(redisCluster *dbv1.RedisCluster) error { - r.Log.Info("Initializing followers...") - leaderPods, err := r.getRedisClusterPods(redisCluster, "leader") - if err != nil { - return err - } - - var nodeNumbers []NodeNumbers - nodeNumber := redisCluster.Spec.LeaderCount // first node numbers are reserved for leaders - for _, leaderPod := range leaderPods { - for i := 0; i < redisCluster.Spec.LeaderFollowersCount; i++ { - nodeNumbers = append(nodeNumbers, NodeNumbers{strconv.Itoa(nodeNumber), leaderPod.Labels["node-number"]}) - nodeNumber++ - } - } - - err = 
r.addFollowers(redisCluster, nodeNumbers...) - if err != nil { - return err - } - - r.Log.Info("[OK] Redis followers initialized successfully") - return nil -} - -func (r *RedisClusterReconciler) initializeCluster(redisCluster *dbv1.RedisCluster) error { - var leaderNumbers []string +func (r *RedisClusterReconciler) initializeLeaders(redisCluster *dbv1.RedisCluster) error { + var leaderNames []string // leaders are created first to increase the chance they get scheduled on different // AZs when using soft affinity rules - for leaderNumber := 0; leaderNumber < redisCluster.Spec.LeaderCount; leaderNumber++ { - leaderNumbers = append(leaderNumbers, strconv.Itoa(leaderNumber)) + + for _, n := range r.RedisClusterStateView.Nodes { + if n.Name == n.LeaderName { + leaderNames = append(leaderNames, n.Name) + } } - newLeaderPods, err := r.createRedisLeaderPods(redisCluster, leaderNumbers...) + newLeaderPods, err := r.createRedisLeaderPods(redisCluster, leaderNames...) if err != nil { return err } @@ -283,566 +107,1188 @@ func (r *RedisClusterReconciler) initializeCluster(redisCluster *dbv1.RedisClust return err } - return r.waitForClusterCreate(nodeIPs) -} - -// Make a new Redis node join the cluster as a follower and wait until data sync is complete -func (r *RedisClusterReconciler) replicateLeader(followerIP string, leaderIP string) error { - r.Log.Info(fmt.Sprintf("Replicating leader: %s->%s", followerIP, leaderIP)) - leaderID, err := r.RedisCLI.MyClusterID(leaderIP) - if err != nil { + if err := r.waitForClusterCreate(nodeIPs); err != nil { return err } - followerID, err := r.RedisCLI.MyClusterID(followerIP) - if err != nil { - return err + for _, n := range r.RedisClusterStateView.Nodes { + if n.Name == n.LeaderName { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.NodeOK) + } } + r.RedisClusterStateView.ClusterState = view.ClusterOK + return nil +} - if stdout, err := r.RedisCLI.AddFollower(followerIP, leaderIP, leaderID); err != nil { - if !strings.Contains(stdout, "All nodes agree about slots configuration") { - return err +func (r *RedisClusterReconciler) failOverToReplica(leaderName string, v *view.RedisClusterView) (promotedReplica *view.NodeView) { + for _, n := range v.Nodes { + if n != nil { + if n.LeaderName == leaderName && n.Name != leaderName { + err := r.attemptToFailOver(n.Ip) + if err != nil { + continue + } + return n + } } } + return nil +} - if err = r.waitForRedisMeet(leaderIP, followerIP); err != nil { - return err +func (r *RedisClusterReconciler) attemptToFailOver(followerIP string, opt ...string) error { + _, e := r.RedisCLI.Ping(followerIP) + if e != nil { + r.Log.Info(fmt.Sprintf("[Warning] Attempt to failover: ping to node ip [%s] failed", followerIP)) + return e } - - r.Log.Info(fmt.Sprintf("Replication successful")) - - if err = r.waitForRedisReplication(leaderIP, leaderID, followerID); err != nil { - return err + e = r.doFailover(followerIP, opt...) + if e != nil { + r.Log.Info(fmt.Sprintf("[Warning] Attempt to failover with node ip [%s] failed", followerIP)) + return e } - - return r.waitForRedisSync(followerIP) + r.Log.Info(fmt.Sprintf("[OK] Attempt to failover succeeded. 
[%s] is a leader", followerIP)) + return nil } -// Triggeres a failover command on the specified node and waits for the follower +// Triggers a failover command on the specified node and waits for the follower // to become leader -func (r *RedisClusterReconciler) doFailover(followerIP string, opt string) error { - r.Log.Info(fmt.Sprintf("Running 'cluster failover %s' on %s", opt, followerIP)) - _, err := r.RedisCLI.ClusterFailover(followerIP, opt) +func (r *RedisClusterReconciler) doFailover(promotedNodeIp string, opt ...string) error { + r.Log.Info(fmt.Sprintf("Running failover on [%s]", promotedNodeIp)) + _, err := r.RedisCLI.ClusterFailover(promotedNodeIp, opt...) if err != nil { return err } - if err := r.waitForManualFailover(followerIP); err != nil { - return err - } - return nil + return r.waitForManualFailover(promotedNodeIp) } -// Changes the role of a leader with one of its healthy followers -// Returns the IP of the promoted follower -// leaderIP: IP of leader that will be turned into a follower -// opt: the type of failover operation ('', 'force', 'takeover') -// followerIP (optional): followers that should be considered for the failover process -func (r *RedisClusterReconciler) doLeaderFailover(leaderIP string, opt string, followerIPs ...string) (string, error) { - var promotedFollowerIP string - leaderID, err := r.RedisCLI.MyClusterID(leaderIP) - if err != nil { - return "", err +func (r *RedisClusterReconciler) cleanMapFromNodesToRemove(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) { + r.Log.Info("Cleaning state map from nodes that shuold be removed...") + healthyLeaderName, found := r.findHealthyLeader(v) + if !found { + r.Log.Error(errors.New(""), "Could not find healthy leader ip, aborting remove and delete operation...") + return + } + healthyServerIp := v.Nodes[healthyLeaderName].Ip + podsToDelete := []corev1.Pod{} + toDeleteFromMap := []string{} + + // first remove followers (followers get errored when forgetting masters) + for _, n := range r.RedisClusterStateView.Nodes { + if n.Name != n.LeaderName { + node, exists := v.Nodes[n.Name] + if n.NodeState == view.DeleteNode { + if exists && node != nil { + r.removeNode(healthyServerIp, node) + podsToDelete = append(podsToDelete, node.Pod) + } + toDeleteFromMap = append(toDeleteFromMap, n.Name) + } + if n.NodeState == view.DeleteNodeKeepInMap { + if exists && node != nil { + r.removeNode(healthyServerIp, node) + podsToDelete = append(podsToDelete, node.Pod) + } + n.NodeState = view.CreateNode + } + } } - r.Log.Info(fmt.Sprintf("Starting manual failover on leader: %s(%s)", leaderIP, leaderID)) - - if len(followerIPs) != 0 { - for i, followerIP := range followerIPs { - if _, pingErr := r.RedisCLI.Ping(followerIP); pingErr == nil { - promotedFollowerIP = followerIPs[i] - break + // second remove leaders + for _, n := range r.RedisClusterStateView.Nodes { + if n.Name == n.LeaderName { + node, exists := v.Nodes[n.Name] + if n.NodeState == view.DeleteNode { + if exists && node != nil { + r.removeNode(healthyServerIp, node) + podsToDelete = append(podsToDelete, node.Pod) + } + toDeleteFromMap = append(toDeleteFromMap, n.Name) + } + if n.NodeState == view.DeleteNodeKeepInMap { + if exists && node != nil { + r.removeNode(healthyServerIp, node) + podsToDelete = append(podsToDelete, node.Pod) + } + n.NodeState = view.CreateNode } } - } else { - followers, _, err := r.RedisCLI.ClusterReplicas(leaderIP, leaderID) - if err != nil { - return "", err - } - if len(*followers) == 0 { - return "", errors.Errorf("Attempted 
FAILOVER on a leader (%s) with no followers. This case is not supported yet.", leaderIP) - } - for i := range *followers { - if !(*followers)[i].IsFailing() { - promotedFollowerIP = strings.Split((*followers)[i].Addr, ":")[0] + } + + // third delete pods + r.waitForAllNodesAgreeAboutSlotsConfiguration(v, redisCluster) + deletedPods, err := r.deletePods(podsToDelete) + if err != nil { + r.Log.Error(err, "Error while attempting to delete removed pods") + return + } + r.waitForPodDelete(deletedPods...) + + // four detect if there exists pods that are not reported in state map + for _, node := range v.Nodes { + if node != nil { + if _, existsInMap := r.RedisClusterStateView.Nodes[node.Name]; !existsInMap { + isMaster, err := r.checkIfMaster(node.Ip) + if err != nil || !isMaster { + r.deletePod(node.Pod) + return + } + r.waitForAllNodesAgreeAboutSlotsConfiguration(v, redisCluster) + r.scaleDownSingleUnit(node.Name, map[string]bool{node.Name: true}, v) + r.RedisClusterStateView.ClusterState = view.ClusterRebalance } } } - if err := r.doFailover(promotedFollowerIP, opt); err != nil { - return "", err + // five delete from map if necessary + for _, d := range toDeleteFromMap { + delete(r.RedisClusterStateView.Nodes, d) } - r.Log.Info(fmt.Sprintf("[OK] Leader failover successful for (%s). New leader: (%s)", leaderIP, promotedFollowerIP)) - return promotedFollowerIP, nil + r.Log.Info("Done processing nodes to be removed") } -// Recreates a leader based on a replica that took its place in a failover process; -// the old leader pod must be already deleted -func (r *RedisClusterReconciler) recreateLeader(redisCluster *dbv1.RedisCluster, promotedFollowerIP string) error { - nodeNumber, oldLeaderNumber, err := r.getRedisNodeNumbersFromIP(redisCluster.Namespace, promotedFollowerIP) - if err != nil { - return err - } - r.Log.Info(fmt.Sprintf("Recreating leader [%s] using node [%s]", oldLeaderNumber, nodeNumber)) - - newLeaderPods, err := r.createRedisLeaderPods(redisCluster, oldLeaderNumber) +func (r *RedisClusterReconciler) removeNode(healthyServerIp string, n *view.NodeView) error { + _, err := r.RedisCLI.Ping(n.Ip) if err != nil { return err } - newLeaderIP := newLeaderPods[0].Status.PodIP - - newLeaderPods, err = r.waitForPodReady(newLeaderPods...) 
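+	// Removal goes through the healthy node (RedisCLI.DelNode with the target's id); the poll below then
+	// waits until the removed node's own CLUSTER NODES table shrinks to a single entry (only itself).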
+ r.Log.Info(fmt.Sprintf("Removing node [%s] from all tables...", n.Id)) + _, err = r.RedisCLI.DelNode(healthyServerIp, n.Id) if err != nil { return err } - - if err := r.waitForRedis(newLeaderIP); err != nil { - return err - } - - if err = r.replicateLeader(newLeaderIP, promotedFollowerIP); err != nil { - return err - } - - r.Log.Info("Leader replication successful") - - if _, err = r.doLeaderFailover(promotedFollowerIP, "", newLeaderIP); err != nil { - return err + r.Log.Info(fmt.Sprintf("Waiting for node [%s:%s] removal to be completed...", n.Ip, n.Id)) + if pollErr := wait.PollImmediate(r.Config.Times.RedisRemoveNodeCheckInterval, r.Config.Times.RedisRemoveNodeTimeout, func() (bool, error) { + clusterNodes, _, err := r.RedisCLI.ClusterNodes(n.Ip) + if err != nil { + if strings.Contains(err.Error(), "Redis is loading the dataset in memory") { + return false, nil + } + return true, err + } + if len(*clusterNodes) == 1 { + return true, nil + } + return false, nil + }); pollErr != nil { + return pollErr } - - r.Log.Info(fmt.Sprintf("[OK] Leader [%s] recreated successfully; new IP: [%s]", oldLeaderNumber, newLeaderIP)) return nil } -// Adds one or more follower pods to the cluster -func (r *RedisClusterReconciler) addFollowers(redisCluster *dbv1.RedisCluster, nodeNumbers ...NodeNumbers) error { - if len(nodeNumbers) == 0 { - return errors.Errorf("Failed to add followers - no node numbers: (%s)", nodeNumbers) +func (r *RedisClusterReconciler) waitForAllNodesAgreeAboutSlotsConfiguration(v *view.RedisClusterView, redisCluster *dbv1.RedisCluster) { + r.Log.Info("Waiting for all cluster nodes to agree about slots configuration...") + if redisCluster != nil { + newView, ok := r.NewRedisClusterView(redisCluster) + if ok { + v = newView + } } - newFollowerPods, err := r.createRedisFollowerPods(redisCluster, nodeNumbers...) 
- if err != nil { - return err + mutex := &sync.Mutex{} + nonResponsive := map[string]bool{} + if pollErr := wait.PollImmediate(r.Config.Times.RedisNodesAgreeAboutSlotsConfigCheckInterval, r.Config.Times.RedisNodesAgreeAboutSlotsConfigTimeout, func() (bool, error) { + nameToTableSize := map[string]int{} + var wg sync.WaitGroup + for _, node := range v.Nodes { + if _, exclude := nonResponsive[node.Name]; exclude { + continue + } + wg.Add(1) + go func(node *view.NodeView) { + defer wg.Done() + stdout, err := r.RedisCLI.ClusterCheck(node.Ip) + if err != nil { + mutex.Lock() + nonResponsive[node.Name] = true + mutex.Unlock() + return + } + if strings.Contains(stdout, "[OK] All nodes agree about slots configuration") { + nodesTable, _, err := r.RedisCLI.ClusterNodes(node.Ip) + if err != nil || nodesTable == nil { + mutex.Lock() + nonResponsive[node.Name] = true + mutex.Unlock() + return + } + mutex.Lock() + nameToTableSize[node.Name] = len(*nodesTable) + mutex.Unlock() + } + }(node) + } + wg.Wait() + tableSize := -1 + for _, node := range v.Nodes { + if _, exclude := nonResponsive[node.Name]; exclude { + continue + } + if nodeTableSize, reported := nameToTableSize[node.Name]; !reported { + return false, nil + } else { + if tableSize == -1 { + tableSize = nodeTableSize + } else { + if nodeTableSize != tableSize { + return false, nil + } + } + } + } + return true, nil + }); pollErr != nil { + r.Log.Info("[Warn] Error occured during waiting for cluster nodes to agree about slots configuration, performing CLUSTER FIX might need to be followed by CLUSTER REBALANCE") } +} - nodeIPs, err := r.getNodeIPs(redisCluster) - if err != nil { - return err +func (r *RedisClusterReconciler) addLeaderNodes(redisCluster *dbv1.RedisCluster, healthyServerIp string, leaderNames []string, v *view.RedisClusterView) error { + if len(leaderNames) == 0 { + return nil } - - pods, err := r.waitForPodReady(newFollowerPods...) - if err != nil { - return err + leaders, e := r.createRedisLeaderPods(redisCluster, leaderNames...) 
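+	// If leader pod creation failed or returned no pods, delete whatever was created so the next reconcile loop can retry from a clean state.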
+ if e != nil || len(leaders) == 0 { + r.deletePods(leaders) + r.Log.Error(e, "Could not add new leaders") + return e } - - for _, followerPod := range pods { - if err := r.waitForRedis(followerPod.Status.PodIP); err != nil { - return err - } - r.Log.Info(fmt.Sprintf("Replicating: %s %s", followerPod.Name, "redis-node-"+followerPod.Labels["leader-number"])) - if err = r.replicateLeader(followerPod.Status.PodIP, nodeIPs[followerPod.Labels["leader-number"]]); err != nil { - return err - } + readyNodes := []corev1.Pod{} + var wg sync.WaitGroup + mutex := &sync.Mutex{} + wg.Add(len(leaders)) + for _, leader := range leaders { + go func(leader corev1.Pod) { + defer wg.Done() + if r.preperNewRedisNode(leader, mutex) { + mutex.Lock() + readyNodes = append(readyNodes, leader) + mutex.Unlock() + } + }(leader) } + wg.Wait() + for _, leader := range readyNodes { + r.waitForAllNodesAgreeAboutSlotsConfiguration(v, nil) + r.joindNewLeaderToCluster(leader, healthyServerIp, mutex) + } + r.waitForAllNodesAgreeAboutSlotsConfiguration(v, nil) return nil } -// Removes all nodes the cluster node table entries with IDs of nodes not available -// Recives the list of healthy cluster nodes (Redis is reachable and has cluster mode on) -func (r *RedisClusterReconciler) forgetLostNodes(redisCluster *dbv1.RedisCluster) error { +func (r *RedisClusterReconciler) preperNewRedisNode(pod corev1.Pod, mutex *sync.Mutex) bool { + newLeader, e := r.waitForPodReady(pod) + if e != nil || len(newLeader) == 0 { + message := fmt.Sprintf("Error while waiting for pod [%s] to be read", pod.Name) + r.handleCreateErrByDeleteGracefully(pod.Name, pod, mutex, message, e) + return false + } + leaderPod := newLeader[0] + e = r.waitForRedis(leaderPod.Status.PodIP) + if e != nil { + message := fmt.Sprintf("Error while waiting for pod [%s] to be ready", leaderPod.Name) + r.handleCreateErrByDeleteGracefully(pod.Name, leaderPod, mutex, message, e) + return false + } + r.RedisCLI.Flushall(leaderPod.Status.PodIP) + r.RedisCLI.ClusterReset(leaderPod.Status.PodIP) + return true +} - clusterView, err := r.NewRedisClusterView(redisCluster) - if err != nil { - return err +func (r *RedisClusterReconciler) joindNewLeaderToCluster(pod corev1.Pod, healthyServerIp string, mutex *sync.Mutex) { + r.Log.Info(fmt.Sprintf("Adding new leader: [%s]", pod.Name)) + _, e := r.RedisCLI.AddLeader(pod.Status.PodIP, healthyServerIp) + if e != nil { + message := fmt.Sprintf("Error while adding pod [%s] to redis cluster, healthy node ip: [%s]", pod.Name, healthyServerIp) + r.handleCreateErrByDeleteGracefully(pod.Name, pod, mutex, message, e) + return } + e = r.waitForRedisMeet(pod.Status.PodIP) + if e != nil { + message := fmt.Sprintf("Error while adding pod [%s] to redis cluster, healthy node ip: [%s]", pod.Name, healthyServerIp) + r.handleCreateErrByDeleteGracefully(pod.Name, pod, mutex, message, e) + return + } +} - lostNodeIDSet := make(map[string]struct{}) - nodeMap := make(map[string]string) +func (r *RedisClusterReconciler) handleCreateErrByDeleteGracefully(name string, leaderPod corev1.Pod, mutex *sync.Mutex, message string, e error) { + r.deletePod(leaderPod) + r.RedisClusterStateView.LockResourceAndRemoveFromMap(name, mutex) + r.Log.Error(e, message) +} +// Removes all nodes the cluster node table entries with IDs of nodes not available +// Recives the list of healthy cluster nodes (Redis is reachable and has cluster mode on) +func (r *RedisClusterReconciler) forgetLostNodes(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) bool { r.Log.Info("Forgetting 
lost nodes...") - healthyNodeIPs := clusterView.HealthyNodeIPs() - - for _, healthyNodeIP := range healthyNodeIPs { - healthyNodeID, err := r.RedisCLI.MyClusterID(healthyNodeIP) - if err != nil { - r.Log.Error(err, fmt.Sprintf("Could not reach node %s", healthyNodeIP)) - return err + healthyNodes := map[string]string{} + lostIds := map[string]bool{} + for _, node := range v.Nodes { + if node == nil { + continue } - nodeMap[healthyNodeIP] = healthyNodeID - } - - for healthyNodeIP := range nodeMap { - nodeTable, _, err := r.RedisCLI.ClusterNodes(healthyNodeIP) - if err != nil || nodeTable == nil || len(*nodeTable) == 0 { - r.Log.Info(fmt.Sprintf("[WARN] Could not forget lost nodes on node %s", healthyNodeIP)) + if _, declaredLost := lostIds[node.Id]; declaredLost { continue } - for _, node := range *nodeTable { - lost := true - for _, id := range nodeMap { - if node.ID == id { - lost = false - break + ipsToNodesTable, err := r.ClusterNodesWaitForRedisLoadDataSetInMemory(node.Ip) + nodesTable, exists := ipsToNodesTable[node.Ip] + if err != nil || !exists || nodesTable == nil { + continue + } + healthyNodes[node.Name] = node.Ip + for _, tableNode := range *nodesTable { + if strings.Contains(tableNode.Flags, "fail") { + lostIds[tableNode.ID] = true + } + } + } + r.waitForAllNodesAgreeAboutSlotsConfiguration(v, nil) + if len(lostIds) > 0 { + r.Log.Info(fmt.Sprintf("List of healthy nodes: %v", healthyNodes)) + r.Log.Info(fmt.Sprintf("List of lost nodes ids: %v", lostIds)) + failingForgets := r.runForget(lostIds, healthyNodes, map[string]string{}) + if len(failingForgets) > 0 { + for name, id := range failingForgets { + node, exists := v.Nodes[name] + if exists && node != nil { + _, err := r.RedisCLI.ClusterForget(node.Ip, id) + if err != nil { + r.RedisCLI.ClusterFailover(node.Ip) + r.waitForAllNodesAgreeAboutSlotsConfiguration(v, nil) + } } } - if lost { - lostNodeIDSet[node.ID] = EMPTY + time.Sleep(r.Config.Times.SleepIfForgetNodeFails) + for name, id := range failingForgets { + node, exists := v.Nodes[name] + if exists && node != nil { + _, err := r.RedisCLI.ClusterForget(node.Ip, id) + if err != nil && strings.Contains(err.Error(), "Can't forget my master") { + isMaster, err := r.checkIfMaster(node.Ip) + if err == nil && isMaster { + for name, _ := range healthyNodes { + if _, failing := failingForgets[name]; !failing { + if h, exists := v.Nodes[name]; exists { + r.reshardAndKeepInMap(node.Name, node.LeaderName, h.Ip, h.Id, v) + break + } + } + } + } + r.deletePod(node.Pod) + } + } } + r.runForget(lostIds, healthyNodes, failingForgets) } + r.Log.Info(fmt.Sprintf("Cluster FORGET sent for [%v] lost nodes", len(lostIds))) } + return len(lostIds) > 0 +} - for id := range lostNodeIDSet { - r.forgetNode(healthyNodeIPs, id) +func (r *RedisClusterReconciler) runForget(lostIds map[string]bool, healthyNodes map[string]string, ignore map[string]string) map[string]string { + podsToDelete := map[string]string{} + var wg sync.WaitGroup + waitIfFails := 20 * time.Second + mutex := &sync.Mutex{} + wg.Add(len(lostIds) * len(healthyNodes)) + for id, _ := range lostIds { + for name, ip := range healthyNodes { + go func(ip string, id string) { + defer wg.Done() + if _, toIgnore := ignore[name]; toIgnore { + return + } + _, err := r.RedisCLI.ClusterForget(ip, id) + if err != nil && strings.Contains(err.Error(), "Can't forget my master") { + mutex.Lock() + r.Log.Info(fmt.Sprintf("[Warn] node [%v:%v] is not able to forget [%v] properly, additional attempt to forget will be performed within [%v], additional failure 
to forget [%v] will lead to node [%v:%v] deletion", name, ip, id, waitIfFails, id, name, ip)) + podsToDelete[name] = id + mutex.Unlock() + } + }(ip, id) + } } - data, _ := json.MarshalIndent(clusterView, "", "") - clusterData.SaveRedisClusterView(data) - return nil + wg.Wait() + return podsToDelete } -// Removes a node from the cluster nodes table of all specified nodes -// nodeIPs: list of active node IPs -// removedID: ID of node to be removed -func (r *RedisClusterReconciler) forgetNode(nodeIPs []string, removedID string) error { - var wg sync.WaitGroup - errs := make(chan error, len(nodeIPs)) +func (r *RedisClusterReconciler) recoverCluster(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) error { - for _, nodeIP := range nodeIPs { - wg.Add(1) - go func(ip string, wg *sync.WaitGroup) { - defer wg.Done() - r.Log.Info(fmt.Sprintf("Running cluster FORGET with: %s %s", ip, removedID)) - if _, err := r.RedisCLI.ClusterForget(ip, removedID); err != nil { - // TODO we should chatch here the error thrown when the ID was already removed - errs <- err - } - }(nodeIP, &wg) + failingPodsCleared := r.waitForNonReachablePodsTermination(redisCluster, v) + if failingPodsCleared { + r.Log.Info("[Warn] Terminating pods detcted...") + return nil } - wg.Wait() - close(errs) + lostNodesDetcted := r.forgetLostNodes(redisCluster, v) + if lostNodesDetcted { + r.Log.Info("[Warn] Lost nodes detcted on some of cluster nodes...") + return nil + } - for err := range errs { - if err != nil { - return err - } + r.Log.Info("Validating cluster state...") + recoveryRequired, err := r.recoverRedisCluster(redisCluster, v) + if err != nil || recoveryRequired { + return err + } + + r.Log.Info("Recovering non healthy nodes...") + recoveryRequired = r.recoverNodes(redisCluster, v) + if recoveryRequired { + return nil + } + + complete, err := r.isClusterHealthy(redisCluster, v) + recoveryComplete := complete && err == nil + if err != nil { + r.Log.Error(err, "Could not perform cluster-complete validation") } + r.Log.Info(fmt.Sprintf("Recovery complete: %v", recoveryComplete)) + if recoveryComplete { + redisCluster.Status.ClusterState = string(Ready) + } + return nil } -func (r *RedisClusterReconciler) cleanupNodeList(podIPs []string) error { +func (r *RedisClusterReconciler) detectLossOfLeadersWithAllReplicas(v *view.RedisClusterView) []string { + missing := []string{} + reportedMissing := map[string]bool{} var wg sync.WaitGroup - errs := make(chan error, len(podIPs)) - - r.Log.Info(fmt.Sprintf("Cleanning up: %v", podIPs)) - - for _, podIP := range podIPs { + mutex := &sync.Mutex{} + for _, n := range r.RedisClusterStateView.Nodes { + if n.Name != n.LeaderName { + continue + } wg.Add(1) - go func(ip string, wg *sync.WaitGroup) { + go func(n *view.NodeStateView) { defer wg.Done() - clusterNodes, _, err := r.RedisCLI.ClusterNodes(ip) - if err != nil { - // TODO node is not reachable => nothing to clean; we could consider throwing an error instead + _, exists := v.Nodes[n.Name] + if exists || n.NodeState == view.DeleteNode || n.NodeState == view.ReshardNode || n.NodeState == view.DeleteNodeKeepInMap || n.NodeState == view.ReshardNodeKeepInMap { return } - for _, clusterNode := range *clusterNodes { - if clusterNode.IsFailing() { - // TODO opportunity for higher concurrency - spawn a routine for each ClusterForget command - if _, err := r.RedisCLI.ClusterForget(ip, clusterNode.ID); err != nil { - errs <- err - return - } - } + _, hasPromotedReplica := r.findPromotedMasterReplica(n.LeaderName, v) + if hasPromotedReplica { 
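+				// A promoted replica is already serving this shard's slots, so its data is still reachable and the leader is not reported as lost.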
+ return } - }(podIP, &wg) + mutex.Lock() + _, markedMissing := reportedMissing[n.LeaderName] + if markedMissing { + return + } + missing = append(missing, n.LeaderName) + r.RedisClusterStateView.SetNodeState(n.LeaderName, n.LeaderName, view.CreateNode) + mutex.Unlock() + }(n) } - wg.Wait() - close(errs) + return missing +} - for err := range errs { - if err != nil { - return err +func (r *RedisClusterReconciler) findPromotedMasterReplica(leaderName string, v *view.RedisClusterView) (*view.NodeView, bool) { + for _, node := range v.Nodes { + if node == nil { + continue + } + if leaderName != node.LeaderName { + continue + } + info, _, err := r.RedisCLI.Info(node.Ip) + if err != nil || info == nil || info.Replication["role"] != "master" { + continue + } + ipsToNodesTable, err := r.ClusterNodesWaitForRedisLoadDataSetInMemory(node.Ip) + nodesTable, exists := ipsToNodesTable[node.Ip] + if err != nil || !exists || nodesTable == nil { + continue } + if len(*nodesTable) == 1 { + continue + } + return node, true } - return nil + return nil, false } -// Handles the failover process for a leader. Waits for automatic failover, then -// attempts a forced failover and eventually a takeover -// Returns the ip of the promoted follower -func (r *RedisClusterReconciler) handleFailover(redisCluster *dbv1.RedisCluster, leader *LeaderNode) (string, error) { - var promotedPodIP string = "" - - promotedPodIP, err := r.waitForFailover(redisCluster, leader) - if err != nil || promotedPodIP == "" { - r.Log.Info(fmt.Sprintf("[WARN] Automatic failover failed for leader [%s]. Attempting forced failover.", leader.NodeNumber)) +func (r *RedisClusterReconciler) retrievePodForProcessing(name string, leaderName string, pods map[string]corev1.Pod, v *view.RedisClusterView) (corev1.Pod, bool) { + pod := corev1.Pod{} + newPod, justCreated := pods[name] + existingNode, exists := v.Nodes[name] + if justCreated { + pod = newPod } else { - return promotedPodIP, nil + if exists { + pod = existingNode.Pod + } else { + return pod, false + } } + return pod, true +} - // Automatic failover failed. Attempt to force failover on a healthy follower. 
- for _, follower := range leader.Followers { - if follower.Pod != nil && !follower.Failed { - if _, pingErr := r.RedisCLI.Ping(follower.Pod.Status.PodIP); pingErr == nil { - if forcedFailoverErr := r.doFailover(follower.Pod.Status.PodIP, "force"); forcedFailoverErr != nil { - if rediscli.IsFailoverNotOnReplica(forcedFailoverErr) { - r.Log.Info(fmt.Sprintf("Forced failover successful on [%s](%s)", follower.NodeNumber, follower.Pod.Status.PodIP)) - promotedPodIP = follower.Pod.Status.PodIP - break - } - r.Log.Error(forcedFailoverErr, fmt.Sprintf("[WARN] Failed forced attempt to make node [%s](%s) leader", follower.NodeNumber, follower.Pod.Status.PodIP)) - } else { - r.Log.Info(fmt.Sprintf("Forced failover successful on [%s](%s)", follower.NodeNumber, follower.Pod.Status.PodIP)) - promotedPodIP = follower.Pod.Status.PodIP - break - } - } +func (r *RedisClusterReconciler) checkStateMissAlignments(n *view.NodeStateView, v *view.RedisClusterView, mutex *sync.Mutex) bool { + skip := n.NodeState == view.DeleteNode || n.NodeState == view.ReshardNode || n.NodeState == view.DeleteNodeKeepInMap || n.NodeState == view.ReshardNodeKeepInMap + if skip { + return false + } + node, exists := v.Nodes[n.Name] + if !exists { + r.RedisClusterStateView.LockResourceAndSetNodeState(n.Name, n.LeaderName, view.CreateNode, mutex) + return false + } + ipsToNodesTable, err := r.ClusterNodesWaitForRedisLoadDataSetInMemory(node.Ip) + nodesTable, exists := ipsToNodesTable[node.Ip] + if err != nil || !exists || nodesTable == nil { + r.RedisClusterStateView.LockResourceAndSetNodeState(n.Name, n.LeaderName, view.DeleteNodeKeepInMap, mutex) + return false + } + if len(*nodesTable) == 1 { + r.RedisClusterStateView.LockResourceAndSetNodeState(n.Name, n.LeaderName, view.AddNode, mutex) + return false + } + if _, leaderInMap := r.RedisClusterStateView.Nodes[n.LeaderName]; !leaderInMap { + node, exists := v.Nodes[n.Name] + if !exists || node == nil { + r.RedisClusterStateView.LockResourceAndSetNodeState(n.Name, n.LeaderName, view.DeleteNode, mutex) + return false + } + isMaster, err := r.checkIfMaster(node.Ip) + if err != nil || !isMaster { + r.RedisClusterStateView.LockResourceAndSetNodeState(n.Name, n.LeaderName, view.DeleteNode, mutex) + return false + } + r.RedisClusterStateView.LockResourceAndSetNodeState(n.Name, n.LeaderName, view.ReshardNode, mutex) + return false + } + if n.Name == n.LeaderName { + isMaster, err := r.checkIfMaster(node.Ip) + if err != nil { + return false + } + if !isMaster { + r.RedisClusterStateView.LockResourceAndSetNodeState(n.Name, n.LeaderName, view.FailoverNode, mutex) + return false } } + return true +} - if promotedPodIP != "" { - return promotedPodIP, nil +func (r *RedisClusterReconciler) removeSoloLeaders(v *view.RedisClusterView) { + for _, node := range v.Nodes { + if node == nil { + continue + } + ipsToNodesTables, err := r.ClusterNodesWaitForRedisLoadDataSetInMemory(node.Ip) + nodesTable, exists := ipsToNodesTables[node.Ip] + if err != nil || !exists || nodesTable == nil || len(*nodesTable) == 1 { + r.deletePod(node.Pod) + } } +} - // Forced failover failed. Attempt to takeover on a healthy follower. 
- for _, follower := range leader.Followers { - if follower.Pod != nil && !follower.Failed { - if _, pingErr := r.RedisCLI.Ping(follower.Pod.Status.PodIP); pingErr == nil { - if forcedFailoverErr := r.doFailover(follower.Pod.Status.PodIP, "takeover"); forcedFailoverErr != nil { - if rediscli.IsFailoverNotOnReplica(forcedFailoverErr) { - r.Log.Info(fmt.Sprintf("Takeover successful on [%s](%s)", follower.NodeNumber, follower.Pod.Status.PodIP)) - promotedPodIP = follower.Pod.Status.PodIP - break - } - r.Log.Error(forcedFailoverErr, fmt.Sprintf("[WARN] Failed takeover attempt to make node [%s](%s) leader", follower.NodeNumber, follower.Pod.Status.PodIP)) - } else { - r.Log.Info(fmt.Sprintf("Takeover successful on [%s](%s)", follower.NodeNumber, follower.Pod.Status.PodIP)) - promotedPodIP = follower.Pod.Status.PodIP +func (r *RedisClusterReconciler) handleLossOfLeaderWithAllReplicas(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) bool { + r.Log.Info("Checking for potential data loss..") + missingLeadersWithLossOfReplicas := r.detectLossOfLeadersWithAllReplicas(v) + if len(missingLeadersWithLossOfReplicas) > 0 { + r.removeSoloLeaders(v) + r.Log.Info("[Warn] Loss of leader with all of his replica detected, mitigating with CLUSTER FIX...") + r.RedisClusterStateView.ClusterState = view.ClusterFix + healthyLeaderName, found := r.findHealthyLeader(v) + if !found { + return true + } + healthyLeader, exists := v.Nodes[healthyLeaderName] + if !exists { + return true + } + _, stdout, e := r.RedisCLI.ClusterFix(healthyLeader.Ip) + if e != nil && !strings.Contains(e.Error(), "[OK] All 16384 slots covered") && !strings.Contains(stdout, "[OK] All 16384 slots covered") { + return true + } + r.RedisClusterStateView.ClusterState = view.ClusterRebalance + r.waitForAllNodesAgreeAboutSlotsConfiguration(v, redisCluster) + rebalanced, _, e := r.RedisCLI.ClusterRebalance(healthyLeader.Ip, true) + if !rebalanced || e != nil { + r.RedisClusterStateView.ClusterState = view.ClusterFix + return true + } + r.addLeaderNodes(redisCluster, healthyLeader.Ip, missingLeadersWithLossOfReplicas, v) + return true + } + r.Log.Info("[OK] At least one leader exdists for each set of replicas") + return false +} + +func (r *RedisClusterReconciler) detectExisintgLeadersAndReplicasWithFullDataLoss(v *view.RedisClusterView) bool { + // If node is created thinking there exists other temp-master to replicate from, during its waiting for pod-create the other single replica fails + // it will get stuck in "CreateNode/AddNode" state forever, cluster will be in [recover] mode forever + // indication: + // If all the nodes with the same leader name has len of nodes == 1 + // reshard & delete them all, it will be detected later as case of loss leaders with all followers, and recovered by FIX -> REBALANCE + lost := []string{} + var wg sync.WaitGroup + mutex := &sync.Mutex{} + for _, n := range v.Nodes { + if n.Name != n.LeaderName { + continue + } + k, existsInMap := r.RedisClusterStateView.Nodes[n.Name] + if !existsInMap || k.NodeState == view.DeleteNode || k.NodeState == view.DeleteNodeKeepInMap || k.NodeState == view.ReshardNode || k.NodeState == view.ReshardNodeKeepInMap { + continue + } + wg.Add(1) + go func(n *view.NodeView) { + defer wg.Done() + setOK := false + nodes, _, err := r.RedisCLI.ClusterNodes(n.Ip) + if err != nil || nodes == nil { + return + } + if len(*nodes) > 2 { + return + } + for _, node := range v.Nodes { + if node.LeaderName != n.LeaderName || node.Name == n.Name { + continue + } + nodes, _, err := 
r.RedisCLI.ClusterNodes(node.Ip) + if err != nil || nodes == nil { + continue + } + if len(*nodes) > 2 { + setOK = true break } } + if !setOK { + // At this point: n has one recognized node in his table, all the existing nodes with same leader name has one recognized node in their table + // Mitigation: all of them need to be resharded, but kept in map + mutex.Lock() + lost = append(lost, n.LeaderName) + if r.RedisClusterStateView.ClusterState != view.ClusterFix { + r.RedisClusterStateView.ClusterState = view.ClusterFix + } + mutex.Unlock() + } + }(n) + } + wg.Wait() + for _, l := range lost { + r.Log.Info(fmt.Sprintf("[Warn] A case of existing nodes with complete loss of data detected, leader name: [%v], all replicas of this leader will be resharded and deleted", l)) + for _, node := range v.Nodes { + if node.LeaderName == l { + r.RedisClusterStateView.SetNodeState(node.Name, node.LeaderName, view.ReshardNodeKeepInMap) + } } } - return promotedPodIP, nil + return len(lost) > 0 } -func (r *RedisClusterReconciler) recoverCluster(redisCluster *dbv1.RedisCluster) error { - var runLeaderRecover bool = false - clusterView, err := r.NewRedisClusterView(redisCluster) - if err != nil { - return err +func (r *RedisClusterReconciler) recoverNodes(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) bool { + if r.handleLossOfLeaderWithAllReplicas(redisCluster, v) { + return true } - - r.Log.Info(clusterView.String()) - for i, leader := range *clusterView { - if leader.Failed { - runLeaderRecover = true - - if leader.Terminating { - if err = r.waitForPodDelete(*leader.Pod); err != nil { - return errors.Errorf("Failed to wait for leader pod to be deleted %s: %v", leader.NodeNumber, err) - } + if r.detectExisintgLeadersAndReplicasWithFullDataLoss(v) { + return true + } + // must be synchrounous + if r.handleInterruptedScaleFlows(redisCluster, v) { + return true + } + actionRequired := false + mutex := &sync.Mutex{} + r.Log.Info("Detecting missing nodes in cluster...") + pods := r.createMissingRedisPods(redisCluster, v) + if len(pods) == 0 { + r.Log.Info("[OK] No missing nodes detected") + } + var wg sync.WaitGroup + // can be asynchrounous + r.Log.Info("Mitigating failures and interrupted flows...") + for _, n := range r.RedisClusterStateView.Nodes { + if n.NodeState == view.NodeOK { + if r.checkStateMissAlignments(n, v, mutex) { + continue } - - promotedPodIP, err := r.handleFailover(redisCluster, &(*clusterView)[i]) - if err != nil { - return err + } + if n.NodeState == view.DeleteNode || n.NodeState == view.DeleteNodeKeepInMap || n.NodeState == view.ReshardNode || n.NodeState == view.ReshardNodeKeepInMap { + continue + } + wg.Add(1) + go func(n *view.NodeStateView) { + defer wg.Done() + pod, proceed := r.retrievePodForProcessing(n.Name, n.LeaderName, pods, v) + if !proceed { + return } - - if leader.Pod != nil && !leader.Terminating { - _, err := r.deletePodsByIP(redisCluster.Namespace, leader.Pod.Status.PodIP) - if err != nil { - return err - } - if err = r.waitForPodDelete(*leader.Pod); err != nil { - return err - } + op := r.handleInterruptedClusterHealthFlow(redisCluster, n, pod, v, mutex) + if op { + mutex.Lock() + actionRequired = true + mutex.Unlock() } - if err := r.forgetLostNodes(redisCluster); err != nil { - return err - } + }(n) + } + wg.Wait() + if !actionRequired { + r.Log.Info("[OK] Cluster nodes are healthy") + } + return actionRequired +} - if err := r.recreateLeader(redisCluster, promotedPodIP); err != nil { - return err - } - } +func (r *RedisClusterReconciler) 
handleInterruptedScaleFlows(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) bool { + actionRequired := false + r.Log.Info("Checking if sharding process have been interrupted for some nodes...") + for _, n := range r.RedisClusterStateView.Nodes { + switch n.NodeState { + case view.ReshardNode: + actionRequired = true + r.scaleDownLeader(n.Name, n.LeaderName, map[string]bool{n.Name: true}, v) + break + case view.ReshardNodeKeepInMap: + actionRequired = true + r.scaleDownLeaderKeepInMap(n.Name, n.LeaderName, map[string]bool{n.Name: true}, v) + break + case view.NewEmptyNode: + actionRequired = true + r.recoverFromNewEmptyNode(n.Name, v) + break + } + } + if actionRequired { + r.RedisClusterStateView.ClusterState = view.ClusterRebalance + } else { + r.Log.Info("[OK] Previous sharding requests ended successfully") } + return actionRequired +} - if runLeaderRecover { - // we fetch again the cluster view in case the state has changed - // since the last check (before handling the failed leaders) - clusterView, err = r.NewRedisClusterView(redisCluster) - if err != nil { - return err - } +func (r *RedisClusterReconciler) handleInterruptedClusterHealthFlow(redisCluster *dbv1.RedisCluster, n *view.NodeStateView, pod corev1.Pod, v *view.RedisClusterView, mutex *sync.Mutex) bool { + + promotedMasterReplica, hasPromotedReplica := r.findPromotedMasterReplica(n.LeaderName, v) + if !hasPromotedReplica || promotedMasterReplica == nil { + return true + } + + if promotedMasterReplica.Name == n.Name { + r.RedisClusterStateView.LockResourceAndSetNodeState(n.Name, n.LeaderName, view.NodeOK, mutex) + } + + m := &view.MissingNodeView{ + Name: n.Name, + LeaderName: n.LeaderName, + CurrentMasterName: promotedMasterReplica.Name, + CurrentMasterId: promotedMasterReplica.Id, + CurrentMasterIp: promotedMasterReplica.Ip, + } + var err error + actionRequired := false + + switch n.NodeState { + case view.AddNode: + actionRequired = true + mutex.Lock() + r.waitForAllNodesAgreeAboutSlotsConfiguration(v, nil) + mutex.Unlock() + err = r.recoverFromAddNode(pod, m, mutex) + break + case view.ReplicateNode: + actionRequired = true + err = r.recoverFromReplicateNode(pod.Status.PodIP, m, mutex) + break + case view.SyncNode: + actionRequired = true + err = r.recoverFromSyncNode(pod.Status.PodIP, m, mutex) + break + case view.FailoverNode: + actionRequired = true + err = r.recoverFromFailOver(pod.Status.PodIP, m, mutex) + break + } - r.Log.Info(clusterView.String()) + if err != nil { + r.forgetLostNodes(redisCluster, v) + r.RedisClusterStateView.ClusterState = view.ClusterFix + r.Log.Info("[Warn] " + err.Error()) } - for _, leader := range *clusterView { - var missingFollowers []NodeNumbers - var failedFollowerIPs []string - var terminatingFollowerIPs []string - var terminatingFollowerPods []corev1.Pod + return actionRequired +} - for _, follower := range leader.Followers { - if follower.Pod == nil { - missingFollowers = append(missingFollowers, NodeNumbers{follower.NodeNumber, follower.LeaderNumber}) - } else if follower.Terminating { - terminatingFollowerPods = append(terminatingFollowerPods, *follower.Pod) - terminatingFollowerIPs = append(terminatingFollowerIPs, follower.Pod.Status.PodIP) - missingFollowers = append(missingFollowers, NodeNumbers{follower.NodeNumber, follower.LeaderNumber}) - } else if follower.Failed { - failedFollowerIPs = append(failedFollowerIPs, follower.Pod.Status.PodIP) - missingFollowers = append(missingFollowers, NodeNumbers{follower.NodeNumber, follower.LeaderNumber}) +func (r 
*RedisClusterReconciler) recoverFromNewEmptyNode(name string, v *view.RedisClusterView) { + if n, exists := v.Nodes[name]; exists && n != nil { + ipsToNodesTables, err := r.ClusterNodesWaitForRedisLoadDataSetInMemory(n.Ip) + nodesTable, exists := ipsToNodesTables[n.Ip] + if err != nil || !exists || nodesTable == nil || len(*nodesTable) <= 1 { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.DeleteNodeKeepInMap) + return + } + for _, tableNode := range *nodesTable { + if tableNode.ID == n.Id { + if len(tableNode.Slots) > 0 { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.NodeOK) + } } } - deletedPods, err := r.deletePodsByIP(redisCluster.Namespace, failedFollowerIPs...) + } +} + +func (r *RedisClusterReconciler) recoverFromAddNode(p corev1.Pod, m *view.MissingNodeView, mutex *sync.Mutex) error { + masterIp := m.CurrentMasterIp + masterId := m.CurrentMasterId + newPodIp := p.Status.PodIP + + ipsToNodesTables, err := r.ClusterNodesWaitForRedisLoadDataSetInMemory(newPodIp) + nodesTable, exists := ipsToNodesTables[newPodIp] + if err != nil || !exists || nodesTable == nil { + return err + } + if len(*nodesTable) == 1 { + warnMsg := "[WARN] This failure might be an indication for additional failures that appeared in cluster during recovering process, try to wait for/induce FORGET of failing nodes and re-attempt reconcile loop" + r.Log.Info(fmt.Sprintf("Adding new redis node [%s], current master [%s]", m.Name, m.CurrentMasterName)) + mutex.Lock() + r.RedisClusterStateView.SetNodeState(m.Name, m.LeaderName, view.AddNode) + _, err = r.RedisCLI.AddFollower(newPodIp, masterIp, masterId) + mutex.Unlock() if err != nil { + r.Log.Error(err, fmt.Sprintf("Could not perform ADD NODE [%s] to cluster", m.Name)) + r.Log.Info(warnMsg) + r.deletePod(p) return err } - if err = r.waitForPodDelete(append(terminatingFollowerPods, deletedPods...)...); err != nil { + r.Log.Info(fmt.Sprintf("Waiting for master node [%s] to meet [%s]", m.CurrentMasterName, m.Name)) + err = r.waitForRedisMeet(newPodIp) + if err != nil { + r.Log.Error(err, fmt.Sprintf("Error while waiting for cluster to meet [%s]", m.Name)) + r.Log.Info(warnMsg) return err } - - if len(missingFollowers) > 0 { - if err := r.forgetLostNodes(redisCluster); err != nil { - return err - } - if err := r.addFollowers(redisCluster, missingFollowers...); err != nil { - return err - } - } } - - complete, err := r.isClusterComplete(redisCluster) - if err != nil || !complete { - return errors.Errorf("Cluster recovery not complete") - } - data, _ := json.MarshalIndent(clusterView, "", "") - clusterData.SaveRedisClusterView(data) - return nil + return r.recoverFromReplicateNode(p.Status.PodIP, m, mutex) } -func (r *RedisClusterReconciler) updateFollower(redisCluster *dbv1.RedisCluster, followerIP string) error { - pod, err := r.getPodByIP(redisCluster.Namespace, followerIP) +func (r *RedisClusterReconciler) recoverFromReplicateNode(podIp string, m *view.MissingNodeView, mutex *sync.Mutex) error { + if m.CurrentMasterIp == podIp { + r.RedisClusterStateView.LockResourceAndSetNodeState(m.Name, m.LeaderName, view.NodeOK, mutex) + return nil + } + newPodId, err := r.RedisCLI.MyClusterID(podIp) if err != nil { return err } - - deletedPods, err := r.deletePodsByIP(redisCluster.Namespace, followerIP) + err = r.waitForRedisReplication(m.CurrentMasterName, m.CurrentMasterIp, m.CurrentMasterId, m.Name, newPodId) if err != nil { + r.Log.Error(err, fmt.Sprintf("Error while waiting for node [%s] replication ", m.Name)) return err - } else { - if 
err := r.waitForPodDelete(deletedPods...); err != nil { - return err - } } + return r.recoverFromSyncNode(podIp, m, mutex) +} - if err := r.forgetLostNodes(redisCluster); err != nil { +func (r *RedisClusterReconciler) recoverFromSyncNode(podIp string, m *view.MissingNodeView, mutex *sync.Mutex) error { + r.RedisClusterStateView.LockResourceAndSetNodeState(m.Name, m.LeaderName, view.SyncNode, mutex) + err := r.waitForRedisSync(m, podIp) + if err != nil { + r.Log.Error(err, fmt.Sprintf("Error while waiting for node [%s] sync process ", m.Name)) return err } + return r.recoverFromFailOver(podIp, m, mutex) +} - r.Log.Info(fmt.Sprintf("Starting to add follower: (%s %s)", pod.Labels["node-number"], pod.Labels["leader-number"])) - if err := r.addFollowers(redisCluster, NodeNumbers{pod.Labels["node-number"], pod.Labels["leader-number"]}); err != nil { - return err +func (r *RedisClusterReconciler) recoverFromFailOver(podIp string, m *view.MissingNodeView, mutex *sync.Mutex) error { + if m.Name == m.LeaderName { + r.Log.Info(fmt.Sprintf("Performing failover for node [%s]", m.Name)) + r.RedisClusterStateView.LockResourceAndSetNodeState(m.Name, m.LeaderName, view.FailoverNode, mutex) + _, err := r.RedisCLI.Ping(podIp) + if err != nil { + return err + } + failOver := r.retryFailover(m.Name, podIp, 3) + if !failOver { + for _, n := range r.RedisClusterStateView.Nodes { + if n.LeaderName == m.LeaderName { + r.RedisClusterStateView.LockResourceAndSetNodeState(n.Name, n.LeaderName, view.ReshardNodeKeepInMap, mutex) + } + } + return nil + } } - + r.RedisClusterStateView.LockResourceAndSetNodeState(m.Name, m.LeaderName, view.NodeOK, mutex) return nil } -func (r *RedisClusterReconciler) updateLeader(redisCluster *dbv1.RedisCluster, leaderIP string) error { - // TODO handle the case where a leader has no followers - promotedFollowerIP, err := r.doLeaderFailover(leaderIP, "") +func (r *RedisClusterReconciler) retryFailover(name string, ip string, attempts int) bool { + if attempts == 0 { + return false + } + err := r.doFailover(ip, "") if err != nil { - return err + r.Log.Info(fmt.Sprintf("[Warning] Attempt to failover with node [%s:%s] failed, retries left: (%s)", name, ip, fmt.Sprint(attempts-1))) + return r.retryFailover(name, ip, attempts-1) + } + return true +} + +func (r *RedisClusterReconciler) detectNodeTableMissalignments(v *view.RedisClusterView) bool { + r.Log.Info("Detecting nodes table missalignments...") + missalignments := []string{} + for _, node := range v.Nodes { + if node == nil { + continue + } + if node.IsLeader { + continue + } + leaderNode, leaderPodExists := v.Nodes[node.LeaderName] + if !leaderPodExists || leaderNode == nil { + continue + } + ipsToNodeTables, err := r.ClusterNodesWaitForRedisLoadDataSetInMemory(node.Ip, leaderNode.Ip) + if err != nil { + continue + } + if followerNodeTable, exists := ipsToNodeTables[node.Ip]; !exists || followerNodeTable == nil || len(*followerNodeTable) <= 1 { + continue + } + if leaderNodeTable, exists := ipsToNodeTables[leaderNode.Ip]; !exists || leaderNodeTable == nil || len(*leaderNodeTable) <= 1 { + continue + } + isFollowerMaster, err := r.checkIfMaster(node.Ip) + if err != nil || !isFollowerMaster { + continue + } + isLeaderMaster, err := r.checkIfMaster(leaderNode.Ip) + if err != nil { + continue + } + if !isLeaderMaster { + r.RedisClusterStateView.SetNodeState(node.LeaderName, node.LeaderName, view.FailoverNode) + continue + } + // At this point: pod which we concider follower, serves as master in the context of the cluster, as well its 
leader exists and serves as master in the context of cluster: + // they are responsible for different sets of slots instead of being replicas of each other. + // This is an indication for corner case of rebalance request that accidentally included the follower into the rebalance before the decision about its role was made (probably interrupt during cluster meet) + missalignments = append(missalignments, node.Name) } - if deletedPods, err := r.deletePodsByIP(redisCluster.Namespace, leaderIP); err != nil { - return err + if len(missalignments) > 0 { + r.Log.Info(fmt.Sprintf("[Warn] Detected a case of missalignment between expected follower role to its part in cluster, nodes: %v", missalignments)) } else { - if err := r.waitForPodDelete(deletedPods...); err != nil { - return err - } + r.Log.Info("[OK] No missalignments been discovered") } - if err := r.forgetLostNodes(redisCluster); err != nil { - return err + for _, missAlignedNode := range missalignments { + r.scaleDownSingleUnit(missAlignedNode, map[string]bool{missAlignedNode: true}, v) } + return len(missalignments) > 0 +} - if err := r.recreateLeader(redisCluster, promotedFollowerIP); err != nil { - return err +func (r *RedisClusterReconciler) recoverRedisCluster(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) (bool, error) { + s := r.RedisClusterStateView.ClusterState + r.Log.Info(fmt.Sprintf("Cluster state: %v", s)) + switch s { + case view.ClusterOK: + return r.detectNodeTableMissalignments(v), nil + case view.ClusterFix: + healthyLeaderName, found := r.findHealthyLeader(v) + if !found { + return true, errors.New("Could not find healthy reachable leader to serve cluster fix request") + } + healthyLeaderIp := v.Nodes[healthyLeaderName].Ip + _, stdout, e := r.RedisCLI.ClusterFix(healthyLeaderIp) + if e != nil && !strings.Contains(e.Error(), "[OK] All 16384 slots covered") && !strings.Contains(stdout, "[OK] All 16384 slots covered") { + return true, e + } + r.RedisClusterStateView.ClusterState = view.ClusterRebalance + return true, nil + case view.ClusterRebalance: + r.removeSoloLeaders(v) + r.waitForAllNodesAgreeAboutSlotsConfiguration(v, nil) + healthyLeaderName, found := r.findHealthyLeader(v) + if !found { + return true, errors.New("Could not find healthy reachable leader to serve cluster rebalance request") + } + healthyLeaderIp := v.Nodes[healthyLeaderName].Ip + rebalanced, _, e := r.RedisCLI.ClusterRebalance(healthyLeaderIp, true) + if !rebalanced || e != nil { + r.RedisClusterStateView.ClusterState = view.ClusterFix + return true, e + } + r.RedisClusterStateView.ClusterState = view.ClusterOK + return true, nil } - return nil + return false, nil } -func (r *RedisClusterReconciler) updateCluster(redisCluster *dbv1.RedisCluster) error { - clusterView, err := r.NewRedisClusterView(redisCluster) - if err != nil { - return err +func (r *RedisClusterReconciler) waitForNonReachablePodsTermination(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) bool { + terminatingPods := []corev1.Pod{} + nonReachablePods := []corev1.Pod{} + for _, node := range v.Nodes { + if node == nil { + continue + } + pod := node.Pod + if pod.Status.Phase == "Terminating" { + terminatingPods = append(terminatingPods, pod) + continue + } + clusterInfo, _, e := r.RedisCLI.ClusterInfo(pod.Status.PodIP) + if e != nil || (*clusterInfo) == nil { + nonReachablePods = append(nonReachablePods, pod) + continue + } + } + if len(nonReachablePods) > 0 { + r.Log.Info(fmt.Sprintf("Removing non reachable pods...number of non reachable pods: %d", 
len(nonReachablePods))) + deletedPods, _ := r.deletePods(nonReachablePods) + if len(deletedPods) > 0 { + terminatingPods = append(terminatingPods, deletedPods...) + } + } + + if len(terminatingPods) > 0 { + r.Log.Info(fmt.Sprintf("Waiting for terminating pods...number of terminating pods: %d", len(terminatingPods))) + for _, terminatingPod := range terminatingPods { + r.waitForPodDelete(terminatingPod) + } } - r.Log.Info(clusterView.String()) - r.Log.Info("Updating...") - for _, leader := range *clusterView { - for _, follower := range leader.Followers { - podUpToDate, err := r.isPodUpToDate(redisCluster, follower.Pod) + return len(terminatingPods)+len(nonReachablePods) > 0 +} + +func (r *RedisClusterReconciler) getMaxUpdatedPodsPerUpdateBtach(v *view.RedisClusterView) int { + clusterSize := len(v.Nodes) + if clusterSize <= 6 { + return 1 + } else if clusterSize <= 12 { + return 3 + } else { + return r.Config.Thresholds.MaxToleratedPodsUpdateAtOnce + } +} + +func (r *RedisClusterReconciler) updateCluster(redisCluster *dbv1.RedisCluster) error { + requestUpgrade = false + r.Log.Info("Updating Cluster Pods...") + v, ok := r.NewRedisClusterView(redisCluster) + if !ok { + return errors.New("Could not perform redis cluster update") + } + hl, found := r.findHealthyLeader(v, map[string]bool{}) + if !found { + r.Log.Info("[Warn] Coud not find healthy leader to promote update process, re attempting in next healthy reconcile loop...") + return nil + } + healthyLeader, exists := v.Nodes[hl] + if !exists { + r.Log.Info("[Warn] Coud not find healthy leader to promote update process, re attempting in next healthy reconcile loop...") + return nil + } + maxUpdatePodsPerBatch := r.getMaxUpdatedPodsPerUpdateBtach(v) + updatedPodsCounter := 0 + deletedPods := []corev1.Pod{} + for _, n := range v.Nodes { + if updatedPodsCounter >= maxUpdatePodsPerBatch { + break + } + if n == nil { + continue + } + if n.Name == n.LeaderName && n.Name != hl { + podUpToDate, err := r.isPodUpToDate(redisCluster, n.Pod) + if err != nil || podUpToDate { + continue + } + isMaster, err := r.checkIfMaster(n.Ip) if err != nil { - return err + continue + } + if !isMaster { + continue } - if !podUpToDate { - if err = r.updateFollower(redisCluster, follower.Pod.Status.PodIP); err != nil { - return err + hasFollower := false + for _, node := range r.RedisClusterStateView.Nodes { + if node.LeaderName == n.LeaderName && node.Name != n.Name { + _, hasFollower = v.Nodes[node.Name] + if hasFollower { + break + } } - } else { - if _, pollErr := r.waitForPodReady(*follower.Pod); pollErr != nil { - return pollErr + } + if hasFollower { + promotedReplica := r.failOverToReplica(n.Name, v) + if promotedReplica == nil { + continue } - if pollErr := r.waitForRedis(follower.Pod.Status.PodIP); pollErr != nil { - return pollErr + } else { + r.Log.Info("[Warn] Update master with no defined followers will lead to repeated resharding and rebalancing attempts during leaders recreation") + success, _, err := r.RedisCLI.ClusterReshard(healthyLeader.Ip, n.Id, healthyLeader.Id, rediscli.MAX_SLOTS_PER_LEADER) + if err != nil || !success { + continue } + r.RedisClusterStateView.ClusterState = view.ClusterRebalance } + r.removeNode(healthyLeader.Ip, n) + r.deletePod(n.Pod) + deletedPods = append(deletedPods, n.Pod) + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.DeleteNodeKeepInMap) + r.RedisClusterStateView.Nodes[n.Name].IsUpToDate = true + updatedPodsCounter++ } - podUpToDate, err := r.isPodUpToDate(redisCluster, leader.Pod) - if err != nil { - 
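
The `getMaxUpdatedPodsPerUpdateBtach` helper above throttles how many pods a single rolling-update pass may replace. A standalone sketch of the same heuristic; the configured cap (`MaxToleratedPodsUpdateAtOnce`) is assumed to be 5 for the example output:

```go
// Batch-size heuristic for rolling updates: small clusters are updated one pod
// at a time, mid-size clusters three at a time, larger clusters use the cap.
package main

import "fmt"

func maxPodsPerUpdateBatch(clusterSize, configuredCap int) int {
	switch {
	case clusterSize <= 6:
		return 1
	case clusterSize <= 12:
		return 3
	default:
		return configuredCap
	}
}

func main() {
	for _, size := range []int{6, 9, 30} {
		fmt.Printf("cluster of %d pods -> update %d pod(s) per batch\n",
			size, maxPodsPerUpdateBatch(size, 5))
	}
}
```
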
return err + } + if updatedPodsCounter > 0 { + r.waitForPodDelete(deletedPods...) + return nil + } + for _, n := range v.Nodes { + if updatedPodsCounter >= maxUpdatePodsPerBatch { + break } - if !podUpToDate { - if err = r.updateLeader(redisCluster, leader.Pod.Status.PodIP); err != nil { - // >>> TODO the logic of checking if a leader pod (frst N pods) is indeed a Redis leader must be handled separately - if rediscli.IsNodeIsNotMaster(err) { - if _, errDel := r.deletePodsByIP(redisCluster.Namespace, leader.Pod.Status.PodIP); errDel != nil { - return errDel - } - } - return err + if n == nil { + continue + } + if n.Name != n.LeaderName { + podUpToDate, err := r.isPodUpToDate(redisCluster, n.Pod) + if err != nil { + continue + } + if podUpToDate { + continue + } + isMaster, err := r.checkIfMaster(n.Ip) + if err != nil { + continue } + if isMaster { + continue + } + r.removeNode(healthyLeader.Ip, n) + r.deletePod(n.Pod) + deletedPods = append(deletedPods, n.Pod) + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.DeleteNodeKeepInMap) + r.RedisClusterStateView.Nodes[n.Name].IsUpToDate = true + updatedPodsCounter++ } } - if err = r.cleanupNodeList(clusterView.HealthyNodeIPs()); err != nil { - return err - } - data, _ := json.MarshalIndent(clusterView, "", "") - clusterData.SaveRedisClusterView(data) + requestUpgrade = false + r.waitForPodDelete(deletedPods...) return nil } // TODO replace with a readyness probe on the redis container func (r *RedisClusterReconciler) waitForRedis(nodeIPs ...string) error { for _, nodeIP := range nodeIPs { - r.Log.Info("Waiting for Redis on " + nodeIP) if nodeIP == "" { return errors.Errorf("Missing IP") } if pollErr := wait.PollImmediate(r.Config.Times.RedisPingCheckInterval, r.Config.Times.RedisPingCheckTimeout, func() (bool, error) { reply, err := r.RedisCLI.Ping(nodeIP) if err != nil { - return false, err + return true, err } - if strings.ToLower(strings.TrimSpace(reply)) != "pong" { - return false, nil + if strings.Compare(reply, "PONG") == 0 { + return true, nil } - return true, nil + return false, nil }); pollErr != nil { return pollErr } @@ -863,6 +1309,9 @@ func (r *RedisClusterReconciler) waitForClusterCreate(leaderIPs []string) error } clusterNodes, _, err := r.RedisCLI.ClusterNodes(leaderIP) if err != nil { + if strings.Contains(err.Error(), "Redis is loading the dataset in memory") { + return false, nil + } return false, err } if len(*clusterNodes) != len(leaderIPs) { @@ -874,95 +1323,92 @@ func (r *RedisClusterReconciler) waitForClusterCreate(leaderIPs []string) error } // Safe to be called with both followers and leaders, the call on a leader will be ignored -func (r *RedisClusterReconciler) waitForRedisSync(nodeIP string) error { - r.Log.Info("Waiting for SYNC to start on " + nodeIP) - if err := wait.PollImmediate(r.Config.Times.SyncStartCheckInterval, r.Config.Times.SyncStartCheckTimeout, func() (bool, error) { - redisInfo, _, err := r.RedisCLI.Info(nodeIP) +func (r *RedisClusterReconciler) waitForRedisSync(m *view.MissingNodeView, nodeIP string) error { + r.Log.Info(fmt.Sprintf("Waiting for SYNC to start on [%s:%s]", m.Name, nodeIP)) + return wait.PollImmediate(r.Config.Times.SyncCheckInterval, r.Config.Times.SyncCheckTimeout, func() (bool, error) { + stdoutF, err := r.RedisCLI.Role(nodeIP) if err != nil { + if strings.Contains(err.Error(), "Redis is loading the dataset in memory") { + return false, nil + } return false, err } - - syncStatus := redisInfo.GetSyncStatus() - if syncStatus == "" { - return false, nil + stdoutL, err := 
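
The reworked ping check in `waitForRedis` above leans on the polling contract of the wait helper: the condition function is retried until it reports done, and returning a non-nil error aborts the wait immediately, so `return true, err` fails fast while `return false, nil` keeps polling. A stdlib-only sketch of that contract (this is not the apimachinery implementation, just an illustration of the semantics the code relies on):

```go
// Toy poller with the same condition semantics: retry on (false, nil),
// finish on (true, nil), abort on any error.
package main

import (
	"errors"
	"fmt"
	"time"
)

func poll(interval, timeout time.Duration, cond func() (bool, error)) error {
	deadline := time.Now().Add(timeout)
	for {
		done, err := cond()
		if err != nil {
			return err // abort immediately, as the ping check above does on CLI errors
		}
		if done {
			return nil
		}
		if time.Now().After(deadline) {
			return errors.New("timed out waiting for condition")
		}
		time.Sleep(interval)
	}
}

func main() {
	attempts := 0
	err := poll(10*time.Millisecond, time.Second, func() (bool, error) {
		attempts++
		return attempts >= 3, nil // succeeds on the third attempt
	})
	fmt.Println(attempts, err) // 3 <nil>
}
```
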
r.RedisCLI.Role(m.CurrentMasterIp) + if err != nil { + if strings.Contains(err.Error(), "Redis is loading the dataset in memory") { + return false, nil + } + return false, err } - - return true, nil - }); err != nil { - if err.Error() != wait.ErrWaitTimeout.Error() { - return err + if !strings.Contains(stdoutF, m.CurrentMasterIp) || !strings.Contains(stdoutL, nodeIP) { + return false, nil } - r.Log.Info(fmt.Sprintf("[WARN] Timeout waiting for SYNC process to start on %s", nodeIP)) - } - - return wait.PollImmediate(r.Config.Times.SyncCheckInterval, r.Config.Times.SyncCheckTimeout, func() (bool, error) { - redisInfo, _, err := r.RedisCLI.Info(nodeIP) + infoF, std, err := reconciler.RedisCLI.Info(nodeIP) if err != nil { + if strings.Contains(err.Error(), "Redis is loading the dataset in memory") || strings.Contains(std, "Redis is loading the dataset in memory") { + return false, nil + } return false, err } - syncStatus := redisInfo.GetSyncStatus() - if syncStatus != "" { - // after aquiring the ETA we should use it instead of a constant for waiting - loadStatusETA := redisInfo.GetLoadETA() - if loadStatusETA != "" { - r.Log.Info(fmt.Sprintf("Node %s LOAD ETA: %s", nodeIP, loadStatusETA)) - } else { - r.Log.Info(fmt.Sprintf("Node %s SYNC status: %s", nodeIP, syncStatus)) + infoL, std, err := reconciler.RedisCLI.Info(m.CurrentMasterIp) + if err != nil { + if strings.Contains(err.Error(), "Redis is loading the dataset in memory") || strings.Contains(std, "Redis is loading the dataset in memory") { + return false, nil } + return false, err + } + if infoF == nil || infoL == nil { return false, nil } + memorySizeF := infoF.Memory["used_memory_human"] + memorySizeL := infoL.Memory["used_memory_human"] - r.Log.Info(fmt.Sprintf("Node %s is synced", nodeIP)) - return true, nil - }) -} - -func (r *RedisClusterReconciler) waitForRedisLoad(nodeIP string) error { - r.Log.Info(fmt.Sprintf("Waiting for node %s to start LOADING", nodeIP)) - if err := wait.PollImmediate(r.Config.Times.LoadStartCheckInterval, r.Config.Times.LoadStartCheckTimeout, func() (bool, error) { - redisInfo, _, err := r.RedisCLI.Info(nodeIP) + mF := comp.FindAllStringSubmatch(memorySizeF, -1) + var memF float64 = 0 + for _, match := range mF { + if len(match) > 1 { + memF, _ = strconv.ParseFloat(match[1], 64) + } + } + mL := comp.FindAllStringSubmatch(memorySizeL, -1) + var memL float64 = 0 + for _, match := range mL { + if len(match) > 0 { + memL, _ = strconv.ParseFloat(match[1], 64) + } + } + dbsizeF, stdoutF, err := r.RedisCLI.DBSIZE(nodeIP) if err != nil { return false, err } - - loadStatusETA := redisInfo.GetLoadETA() - if loadStatusETA == "" { - return false, nil + dbsizeL, stdoutL, err := r.RedisCLI.DBSIZE(m.CurrentMasterIp) + if err != nil { + return false, err } - r.Log.Info(fmt.Sprintf("node %s started to load", nodeIP)) - return true, nil - }); err != nil { - if err.Error() != wait.ErrWaitTimeout.Error() { - return err - } - r.Log.Info(fmt.Sprintf("[WARN] timeout waiting for LOADING process to start on node %s", nodeIP)) - } + var dbSizeMatch int64 = 100 + var memSizeMatch float64 = 100.0 - // waiting for loading process to finish - return wait.PollImmediate(r.Config.Times.LoadCheckInterval, r.Config.Times.LoadCheckTimeout, func() (bool, error) { - redisInfo, _, err := r.RedisCLI.Info(nodeIP) - if err != nil { - return false, err + if dbsizeL > 0 { + dbSizeMatch = (dbsizeF * 100) / dbsizeL } - loadStatusETA := redisInfo.GetLoadETA() - if loadStatusETA != "" { - r.Log.Info(fmt.Sprintf("node %s LOAD ETA: %s", nodeIP, 
loadStatusETA)) - return false, nil + if memL > 0 { + memSizeMatch = (memF * 100) / memL + memSizeMatch = roundFloatToPercision(memSizeMatch, 3) } - r.Log.Info(fmt.Sprintf("node %s is fully loaded", nodeIP)) - return true, nil + r.Log.Info(fmt.Sprintf("Checking sync on master [%v] to replica [%v]: Memory size (%v, %v, %v%v), DB size (%v, %v, %v%v)", m.CurrentMasterIp, nodeIP, memorySizeL, memorySizeF, memSizeMatch, "% match", dbsizeL, dbsizeF, dbSizeMatch, "% match")) + return dbSizeMatch >= int64(r.Config.Thresholds.SyncMatchThreshold), nil }) } -func (r *RedisClusterReconciler) waitForRedisReplication(leaderIP string, leaderID string, followerID string) error { - r.Log.Info(fmt.Sprintf("Waiting for CLUSTER REPLICATION (%s, %s)", leaderIP, followerID)) +func (r *RedisClusterReconciler) waitForRedisReplication(leaderName string, leaderIP string, leaderID string, followerName string, followerID string) error { + r.Log.Info(fmt.Sprintf("Waiting for CLUSTER REPLICATION [%s:%s]->[%s:%s]", leaderName, leaderID, followerName, followerID)) return wait.PollImmediate(r.Config.Times.RedisClusterReplicationCheckInterval, r.Config.Times.RedisClusterReplicationCheckTimeout, func() (bool, error) { replicas, _, err := r.RedisCLI.ClusterReplicas(leaderIP, leaderID) if err != nil { - return false, err + return true, err } for _, replica := range *replicas { if replica.ID == followerID { @@ -973,17 +1419,17 @@ func (r *RedisClusterReconciler) waitForRedisReplication(leaderIP string, leader }) } -func (r *RedisClusterReconciler) waitForRedisMeet(nodeIP string, newNodeIP string) error { - r.Log.Info(fmt.Sprintf("Waiting for CLUSTER MEET (%s, %s)", nodeIP, newNodeIP)) +func (r *RedisClusterReconciler) waitForRedisMeet(newNodeIP string) error { return wait.PollImmediate(r.Config.Times.RedisClusterMeetCheckInterval, r.Config.Times.RedisClusterMeetCheckTimeout, func() (bool, error) { - clusterNodes, _, err := r.RedisCLI.ClusterNodes(nodeIP) + clusterNodes, _, err := r.RedisCLI.ClusterNodes(newNodeIP) if err != nil { - return false, err - } - for _, node := range *clusterNodes { - if strings.Split(node.Addr, ":")[0] == newNodeIP { - return true, nil + if strings.Contains(err.Error(), "Redis is loading the dataset in memory") { + return false, nil } + return true, err + } + if len(*clusterNodes) > 2 { + return true, nil } return false, nil }) @@ -993,47 +1439,28 @@ func (r *RedisClusterReconciler) waitForRedisMeet(nodeIP string, newNodeIP strin func (r *RedisClusterReconciler) waitForManualFailover(podIP string) error { r.Log.Info(fmt.Sprintf("Waiting for [%s] to become leader", podIP)) return wait.PollImmediate(r.Config.Times.RedisManualFailoverCheckInterval, r.Config.Times.RedisManualFailoverCheckTimeout, func() (bool, error) { - info, _, err := r.RedisCLI.Info(podIP) + isMaster, err := r.checkIfMaster(podIP) if err != nil { - return false, err - } - if info.Replication["role"] == "master" { - return true, nil + return true, err } - return false, nil + return isMaster, nil }) } // Waits for Redis to pick a new leader // Returns the IP of the promoted follower -func (r *RedisClusterReconciler) waitForFailover(redisCluster *dbv1.RedisCluster, leader *LeaderNode) (string, error) { - r.Log.Info(fmt.Sprintf("Waiting for leader [%s] failover", leader.NodeNumber)) - failedFollowers := 0 - var promotedFollowerIP string - - for _, follower := range leader.Followers { - if follower.Failed { - failedFollowers++ - } - } +func (r *RedisClusterReconciler) waitForFailover(redisCluster *dbv1.RedisCluster, leaderName string, 
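
The rewritten `waitForRedisSync` above no longer tracks the INFO sync/load fields; it declares the replica synced once its DBSIZE reaches a configurable percentage of its master's DBSIZE (the used-memory ratio is computed for the log line only). A simplified sketch of that gate; the 90% threshold is an assumed example value for `SyncMatchThreshold`:

```go
// Simplified sync gate: compare replica key count to master key count.
package main

import "fmt"

func syncComplete(followerDBSize, leaderDBSize, thresholdPercent int64) bool {
	match := int64(100) // an empty master means there is nothing left to copy
	if leaderDBSize > 0 {
		match = followerDBSize * 100 / leaderDBSize
	}
	return match >= thresholdPercent
}

func main() {
	fmt.Println(syncComplete(947, 1000, 90)) // true: ~94% of keys replicated
	fmt.Println(syncComplete(120, 1000, 90)) // false: keep polling
	fmt.Println(syncComplete(0, 0, 90))      // true: nothing to replicate
}
```
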
reachableFollowers []corev1.Pod) (corev1.Pod, error) { + r.Log.Info(fmt.Sprintf("Waiting for leader [%s] failover", leaderName)) + var promotedFollower corev1.Pod - if failedFollowers == redisCluster.Spec.LeaderFollowersCount { - return "", errors.Errorf("Failing leader [%s] lost all followers. Recovery unsupported.", leader.NodeNumber) - } - - return promotedFollowerIP, wait.PollImmediate(r.Config.Times.RedisAutoFailoverCheckInterval, r.Config.Times.RedisAutoFailoverCheckTimeout, func() (bool, error) { - for _, follower := range leader.Followers { - if follower.Failed { - continue - } - - info, _, err := r.RedisCLI.Info(follower.Pod.Status.PodIP) + return promotedFollower, wait.PollImmediate(r.Config.Times.RedisAutoFailoverCheckInterval, r.Config.Times.RedisAutoFailoverCheckTimeout, func() (bool, error) { + for _, follower := range reachableFollowers { + isMaster, err := r.checkIfMaster(follower.Status.PodIP) if err != nil { continue } - - if info.Replication["role"] == "master" { - promotedFollowerIP = follower.Pod.Status.PodIP + if isMaster { + promotedFollower = follower return true, nil } } @@ -1041,7 +1468,11 @@ func (r *RedisClusterReconciler) waitForFailover(redisCluster *dbv1.RedisCluster }) } -func (r *RedisClusterReconciler) isPodUpToDate(redisCluster *dbv1.RedisCluster, pod *corev1.Pod) (bool, error) { +func (r *RedisClusterReconciler) isPodUpToDate(redisCluster *dbv1.RedisCluster, pod corev1.Pod) (bool, error) { + node, existsInMap := r.RedisClusterStateView.Nodes[pod.Name] + if existsInMap && node != nil && !node.IsUpToDate { + return false, nil + } for _, container := range pod.Spec.Containers { for _, crContainer := range redisCluster.Spec.RedisPodSpec.Containers { if crContainer.Name == container.Name { @@ -1055,13 +1486,13 @@ func (r *RedisClusterReconciler) isPodUpToDate(redisCluster *dbv1.RedisCluster, } // Checks if the image declared by the custom resource is the same as the image in the pods -func (r *RedisClusterReconciler) isClusterUpToDate(redisCluster *dbv1.RedisCluster) (bool, error) { - pods, err := r.getRedisClusterPods(redisCluster) - if err != nil { - return false, err - } - for _, pod := range pods { - podUpdated, err := r.isPodUpToDate(redisCluster, &pod) +func (r *RedisClusterReconciler) isClusterUpToDate(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) (bool, error) { + for _, node := range v.Nodes { + if node == nil { + continue + } + pod := node.Pod + podUpdated, err := r.isPodUpToDate(redisCluster, pod) if err != nil { return false, err } @@ -1072,33 +1503,464 @@ func (r *RedisClusterReconciler) isClusterUpToDate(redisCluster *dbv1.RedisClust return true, nil } -func (r *RedisClusterReconciler) isClusterComplete(redisCluster *dbv1.RedisCluster) (bool, error) { - clusterView, err := r.NewRedisClusterView(redisCluster) - if err != nil { - return false, err +func (r *RedisClusterReconciler) isClusterHealthy(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) (bool, error) { + if len(v.Nodes) == 0 { + r.Log.Info("[WARN] Could not find redis cluster nodes, reseting cluster...") + redisCluster.Status.ClusterState = string(Reset) + return false, nil } - for _, leader := range *clusterView { - if leader.Terminating { - r.Log.Info("Found terminating leader: " + leader.NodeNumber) - return false, nil + r.Log.Info("Checking for non-healthy nodes...") + nonHealthyNodes := map[string]view.NodeState{} + for _, n := range r.RedisClusterStateView.Nodes { + if n.NodeState == view.NodeOK { + node, exists := v.Nodes[n.Name] + if exists { + if 
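
`isPodUpToDate` above now combines two signals: the `IsUpToDate` flag kept in the cluster state view and a comparison of each container image against the image requested in the custom resource. A reduced sketch with plain maps standing in for the pod and CR specs; container names and image tags below are purely illustrative:

```go
// Sketch of the up-to-date decision: state-view flag first, image diff second.
package main

import "fmt"

func podUpToDate(flaggedUpToDate bool, specImages, podImages map[string]string) bool {
	if !flaggedUpToDate {
		return false // the state view already marked this pod for replacement
	}
	for name, wanted := range specImages {
		if running, ok := podImages[name]; ok && running != wanted {
			return false // image drift between CR and running pod
		}
	}
	return true
}

func main() {
	spec := map[string]string{"redis": "redis:7.0"}
	fmt.Println(podUpToDate(true, spec, map[string]string{"redis": "redis:7.0"}))  // true
	fmt.Println(podUpToDate(true, spec, map[string]string{"redis": "redis:6.2"}))  // false
	fmt.Println(podUpToDate(false, spec, map[string]string{"redis": "redis:7.0"})) // false
}
```
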
r.checkIfNodeAligned(n, node, nonHealthyNodes) { + continue + } + } else { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.CreateNode) + } } - if leader.Failed { - r.Log.Info("Found failed leader: " + leader.NodeNumber) + nonHealthyNodes[n.Name] = n.NodeState + } + for _, node := range v.Nodes { + if _, reported := r.RedisClusterStateView.Nodes[node.Name]; !reported { + r.Log.Info(fmt.Sprintf("[Warn] Non reported node detected in view: [%v]", node.Name)) return false, nil } - for _, follower := range leader.Followers { - if follower.Terminating { - r.Log.Info("Found terminating follower: " + follower.NodeNumber) - return false, nil + if !requestUpgrade && len(node.Pod.Labels["leader-name"]) == 0 { + requestUpgrade = true + } + } + isComplete := r.RedisClusterStateView.ClusterState == view.ClusterOK && len(nonHealthyNodes) == 0 + + r.Log.Info(fmt.Sprintf("Is cluster complete: %v", isComplete)) + if !isComplete { + r.Log.Info(fmt.Sprintf("Missing/unhealthy nodes report: %+v", nonHealthyNodes)) + } + return isComplete, nil +} + +func (r *RedisClusterReconciler) checkIfNodeAligned(n *view.NodeStateView, node *view.NodeView, nonHealthyNodes map[string]view.NodeState) bool { + ipsToNodesTable, err := r.ClusterNodesWaitForRedisLoadDataSetInMemory(node.Ip) + nodesTable, exists := ipsToNodesTable[node.Ip] + if err != nil || !exists || nodesTable == nil { + return false + } + if len(*nodesTable) <= 1 { + return false + } + if node.IsLeader { + isMaster, err := r.checkIfMaster(node.Ip) + if err != nil || !isMaster { + return false + } + } + return true +} + +func (r *RedisClusterReconciler) findHealthyLeader(v *view.RedisClusterView, exclude ...map[string]bool) (name string, found bool) { + for _, node := range v.Nodes { + if node == nil { + continue + } + if len(exclude) > 0 { + skipNode := false + for _, excludeMap := range exclude { + if _, excludeNode := excludeMap[node.Name]; excludeNode { + skipNode = true + break + } } - if follower.Failed { - r.Log.Info("Found failed follower: " + follower.NodeNumber) - return false, nil + if skipNode { + continue + } + } + if n, exists := r.RedisClusterStateView.Nodes[node.Name]; exists && n.NodeState == view.NodeOK { + isMaster, err := r.checkIfMaster(node.Ip) + if err != nil { + continue + } + if isMaster { + ipsToNodesTable, err := r.ClusterNodesWaitForRedisLoadDataSetInMemory(node.Ip) + nodesTable, exists := ipsToNodesTable[node.Ip] + if err != nil || !exists || nodesTable == nil || len(*nodesTable) <= 1 { + continue + } + return node.Name, len(node.Name) > 0 } } } - data, _ := json.MarshalIndent(clusterView, "", "") - clusterData.SaveRedisClusterView(data) + return "", false +} - return true, nil +func (r *RedisClusterReconciler) isScaleRequired(redisCluster *dbv1.RedisCluster) (bool, ScaleType) { + leaders := 0 + followers := 0 + for _, n := range r.RedisClusterStateView.Nodes { + if n.Name == n.LeaderName { + leaders++ + } else { + followers++ + } + } + leadersBySpec := redisCluster.Spec.LeaderCount + followersBySpec := leadersBySpec * redisCluster.Spec.LeaderFollowersCount + isRequired := (leaders != leadersBySpec) || (followers != followersBySpec) + var scaleType ScaleType + if leaders < leadersBySpec { + scaleType = ScaleUpLeaders + } else if leaders > leadersBySpec { + scaleType = ScaleDownLeaders + } else if followers < followersBySpec { + scaleType = ScaleUpFollowers + } else if followers > followersBySpec { + scaleType = ScaleDownFollowers + } + return isRequired, scaleType +} + +func (r *RedisClusterReconciler) 
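
`isScaleRequired` above compares the node-state map against the spec and resolves leader mismatches before follower mismatches. A standalone sketch of that decision with a worked example (spec of 5 leaders with 2 followers each, current state of 3 leaders and 6 followers):

```go
// Scale decision sketch: leaders are reconciled before followers.
package main

import "fmt"

type ScaleType string

const (
	ScaleUpLeaders     ScaleType = "ScaleUpLeaders"
	ScaleDownLeaders   ScaleType = "ScaleDownLeaders"
	ScaleUpFollowers   ScaleType = "ScaleUpFollowers"
	ScaleDownFollowers ScaleType = "ScaleDownFollowers"
)

func scaleDecision(leaders, followers, leadersSpec, followersPerLeaderSpec int) (bool, ScaleType) {
	followersSpec := leadersSpec * followersPerLeaderSpec
	required := leaders != leadersSpec || followers != followersSpec
	switch {
	case leaders < leadersSpec:
		return required, ScaleUpLeaders
	case leaders > leadersSpec:
		return required, ScaleDownLeaders
	case followers < followersSpec:
		return required, ScaleUpFollowers
	case followers > followersSpec:
		return required, ScaleDownFollowers
	}
	return required, ""
}

func main() {
	fmt.Println(scaleDecision(3, 6, 5, 2))  // true ScaleUpLeaders
	fmt.Println(scaleDecision(5, 10, 5, 2)) // false: spec satisfied
}
```
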
scaleCluster(redisCluster *dbv1.RedisCluster) error { + v, ok := r.NewRedisClusterView(redisCluster) + if !ok { + return nil + } + lostNodesDetcted := r.forgetLostNodes(redisCluster, v) + if lostNodesDetcted { + r.Log.Info("[Warn] Lost nodes detcted on some of cluster nodes...") + return nil + } + + var err error + _, scaleType := r.isScaleRequired(redisCluster) + switch scaleType { + case ScaleUpLeaders: + err = r.scaleUpLeaders(redisCluster, v) + break + case ScaleDownLeaders: + err = r.scaleDownLeaders(redisCluster, v) + break + case ScaleUpFollowers: + err = r.scaleUpFollowers(redisCluster, v) + break + case ScaleDownFollowers: + err = r.scaleDownFollowers(redisCluster, v) + break + } + return err +} + +func (r *RedisClusterReconciler) scaleUpLeaders(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) error { + r.Log.Info("Scaling up leaders") + healthyLeaderName, found := r.findHealthyLeader(v) + if !found { + return errors.New("Could not find healthy reachable leader to serve scale up leaders request") + } + healthyLeaderIp := v.Nodes[healthyLeaderName].Ip + leaders := r.leadersCount() + leadersBySpec := redisCluster.Spec.LeaderCount + newLeadersNames := []string{} + for l := leaders; l < leadersBySpec; l++ { + name := "redis-node-" + fmt.Sprint(l) + newLeadersNames = append(newLeadersNames, name) + r.RedisClusterStateView.Nodes[name] = &view.NodeStateView{ + Name: name, + LeaderName: name, + IsUpToDate: true, + NodeState: view.NewEmptyNode, + } + for f := 1; f <= redisCluster.Spec.LeaderFollowersCount; f++ { + followerName := name + "-" + fmt.Sprint(f) + r.RedisClusterStateView.Nodes[followerName] = &view.NodeStateView{ + Name: followerName, + LeaderName: name, + IsUpToDate: true, + NodeState: view.CreateNode, + } + } + } + if len(newLeadersNames) > 0 { + r.RedisClusterStateView.ClusterState = view.ClusterRebalance + e := r.addLeaderNodes(redisCluster, healthyLeaderIp, newLeadersNames, v) + if e != nil { + return e + } + r.Log.Info("Leaders added successfully to redis cluster") + r.waitForAllNodesAgreeAboutSlotsConfiguration(v, nil) + } else { + r.Log.Info("[Warn] New leader names list appeard to be empty, no leaders been added to cluster") + } + return nil +} + +func (r *RedisClusterReconciler) scaleDownLeaders(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) error { + r.Log.Info("Scaling down leaders") + leaders := r.leadersCount() + leadersBySpec := redisCluster.Spec.LeaderCount + + leadersToReshard := map[string]bool{} + for l := leadersBySpec; l < leaders; l++ { + leaderName := "redis-node-" + fmt.Sprint(l) + leadersToReshard[leaderName] = true + n, exists := r.RedisClusterStateView.Nodes[leaderName] + if exists { + n.NodeState = view.ReshardNode + } + } + r.RedisClusterStateView.ClusterState = view.ClusterRebalance + for leaderName, _ := range leadersToReshard { + r.scaleDownLeader(leaderName, leaderName, leadersToReshard, v) + } + r.cleanMapFromNodesToRemove(redisCluster, v) + return nil +} + +func (r *RedisClusterReconciler) scaleDownLeader(name string, leaderName string, excludeList map[string]bool, v *view.RedisClusterView) { + healthyLeaderName, found := r.findHealthyLeader(v, excludeList) + if !found { + return + } + targetLeaderName, found := r.findHealthyLeader(v, excludeList, map[string]bool{healthyLeaderName: true}) + if !found { + return + } + healthyLeaderIp := v.Nodes[healthyLeaderName].Ip + targetLeaderId := v.Nodes[targetLeaderName].Id + r.reshardAndRemoveLeader(name, leaderName, healthyLeaderIp, targetLeaderId, v) +} + +func (r 
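
`scaleUpLeaders` above derives the new pod names from the current leader count: leaders are named `redis-node-<i>` and their followers `redis-node-<i>-<f>`. A small sketch showing which names are generated when growing from 3 to 5 leaders with 2 followers per leader:

```go
// Name generation for a leader scale-up, mirroring the loops above.
package main

import "fmt"

func newNodeNames(currentLeaders, targetLeaders, followersPerLeader int) []string {
	names := []string{}
	for l := currentLeaders; l < targetLeaders; l++ {
		leader := fmt.Sprintf("redis-node-%d", l)
		names = append(names, leader) // new leader starts as an empty node
		for f := 1; f <= followersPerLeader; f++ {
			names = append(names, fmt.Sprintf("%s-%d", leader, f)) // followers created next loop
		}
	}
	return names
}

func main() {
	fmt.Println(newNodeNames(3, 5, 2))
	// [redis-node-3 redis-node-3-1 redis-node-3-2 redis-node-4 redis-node-4-1 redis-node-4-2]
}
```
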
*RedisClusterReconciler) scaleDownLeaderKeepInMap(name string, leaderName string, excludeList map[string]bool, v *view.RedisClusterView) { + healthyLeaderName, found := r.findHealthyLeader(v, excludeList) + if !found { + return + } + targetLeaderName, found := r.findHealthyLeader(v, excludeList, map[string]bool{healthyLeaderName: true}) + if !found { + return + } + healthyLeaderIp := v.Nodes[healthyLeaderName].Ip + targetLeaderId := v.Nodes[targetLeaderName].Id + r.reshardAndKeepInMap(name, leaderName, healthyLeaderIp, targetLeaderId, v) +} + +func (r *RedisClusterReconciler) scaleDownSingleUnit(name string, excludeList map[string]bool, v *view.RedisClusterView) { + healthyLeaderName, found := r.findHealthyLeader(v, excludeList) + if !found { + return + } + targetLeaderName, found := r.findHealthyLeader(v, excludeList, map[string]bool{healthyLeaderName: true}) + if !found { + return + } + healthyLeaderIp := v.Nodes[healthyLeaderName].Ip + targetLeaderId := v.Nodes[targetLeaderName].Id + r.reshardAndRemoveSingleUnit(name, healthyLeaderIp, targetLeaderId, v) +} + +func (r *RedisClusterReconciler) reshardAndRemoveLeader(name string, leaderName string, healthyLeaderIp string, targetLeaderId string, v *view.RedisClusterView) { + if leaderToRemove, exists := v.Nodes[name]; exists && leaderToRemove != nil { + r.Log.Info(fmt.Sprintf("Resharding node: [%s]->all slots->[%s]", leaderToRemove.Id, targetLeaderId)) + err := r.reshardLeaderCheckCoverage(healthyLeaderIp, targetLeaderId, leaderToRemove, false) + if err != nil { + r.RedisClusterStateView.ClusterState = view.ClusterFix + r.Log.Error(err, fmt.Sprintf("Error during attempt to reshard node [%s]", name)) + return + } + r.Log.Info(fmt.Sprintf("Leader reshard successful between [%s]->[%s]", leaderToRemove.Id, targetLeaderId)) + } + + r.RedisClusterStateView.SetNodeState(name, leaderName, view.DeleteNode) + for _, n := range r.RedisClusterStateView.Nodes { + if n.LeaderName == leaderName { + if n.NodeState != view.ReshardNode { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.DeleteNode) + } + } + } +} + +func (r *RedisClusterReconciler) reshardAndRemoveSingleUnit(name string, healthyLeaderIp string, targetLeaderId string, v *view.RedisClusterView) { + if nodeToRemove, exists := v.Nodes[name]; exists && nodeToRemove != nil { + r.Log.Info(fmt.Sprintf("Resharding node: [%s]->all slots->[%s]", nodeToRemove.Id, targetLeaderId)) + err := r.reshardLeaderCheckCoverage(healthyLeaderIp, targetLeaderId, nodeToRemove, true) + if err != nil { + r.RedisClusterStateView.ClusterState = view.ClusterFix + r.Log.Error(err, fmt.Sprintf("Error during attempt to reshard node [%s]", name)) + return + } + r.waitForAllNodesAgreeAboutSlotsConfiguration(v, nil) + r.Log.Info(fmt.Sprintf("Leader reshard successful between [%s]->[%s]", nodeToRemove.Id, targetLeaderId)) + r.deletePod(nodeToRemove.Pod) + } +} + +func (r *RedisClusterReconciler) reshardAndKeepInMap(name string, leaderName, healthyLeaderIp string, targetLeaderId string, v *view.RedisClusterView) { + if leaderToRemove, exists := v.Nodes[name]; exists && leaderToRemove != nil { + r.Log.Info(fmt.Sprintf("Resharding node: [%s]->all slots->[%s]", leaderToRemove.Id, targetLeaderId)) + err := r.reshardLeaderCheckCoverage(healthyLeaderIp, targetLeaderId, leaderToRemove, true) + if err != nil { + r.RedisClusterStateView.ClusterState = view.ClusterFix + r.Log.Error(err, fmt.Sprintf("Error during attempt to reshard node [%s]", leaderName)) + return + } + r.Log.Info(fmt.Sprintf("Leader reshard successful 
between [%s]->[%s]", leaderToRemove.Id, targetLeaderId)) + } + + r.RedisClusterStateView.SetNodeState(name, leaderName, view.DeleteNodeKeepInMap) + for _, n := range r.RedisClusterStateView.Nodes { + if n.LeaderName == leaderName { + if n.NodeState != view.ReshardNodeKeepInMap { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.DeleteNodeKeepInMap) + } + } + } +} + +func (r *RedisClusterReconciler) scaleUpFollowers(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) error { + r.Log.Info("Scaling up followers") + leadersToFollowerCount := r.numOfFollowersPerLeader(v) + followersBySpec := redisCluster.Spec.LeaderFollowersCount + for leaderName, followerCount := range leadersToFollowerCount { + for f := followerCount + 1; f <= followersBySpec; f++ { + name := leaderName + "-" + fmt.Sprint(f) + r.RedisClusterStateView.Nodes[name] = &view.NodeStateView{ + Name: name, + LeaderName: leaderName, + IsUpToDate: true, + NodeState: view.CreateNode, + } + } + } + r.Log.Info("Scaling up followers: new followers will be created within the next reconcile loop") + return nil +} + +func (r *RedisClusterReconciler) scaleDownFollowers(redisCluster *dbv1.RedisCluster, v *view.RedisClusterView) error { + r.Log.Info("Scaling down followers") + leadersToFollowerCount := r.numOfFollowersPerLeader(v) + followersBySpec := redisCluster.Spec.LeaderFollowersCount + for leaderName, followerCount := range leadersToFollowerCount { + for f := followersBySpec + 1; f <= followerCount; f++ { + name := leaderName + "-" + fmt.Sprint(f) + if _, exists := r.RedisClusterStateView.Nodes[name]; exists { + r.RedisClusterStateView.Nodes[name].NodeState = view.DeleteNode + } + } + } + return nil +} + +func (r *RedisClusterReconciler) checkIfMaster(nodeIP string) (bool, error) { + info, _, err := r.RedisCLI.Info(nodeIP) + if err != nil || info == nil { + return false, err + } + if info.Replication["role"] == "master" { + return true, nil + } + return false, nil +} + +func (r *RedisClusterReconciler) reshardLeaderCheckCoverage(healthyLeaderIp string, targetLeaderId string, leaderToRemove *view.NodeView, keepInMap bool) error { + isMaster, e := r.checkIfMaster(leaderToRemove.Ip) + if e != nil { + return e + } + if !isMaster { + r.RedisClusterStateView.SetNodeState(leaderToRemove.Name, leaderToRemove.LeaderName, view.DeleteNode) + return nil + } + if !keepInMap { + r.RedisClusterStateView.SetNodeState(leaderToRemove.Name, leaderToRemove.LeaderName, view.ReshardNode) + } + + r.Log.Info(fmt.Sprintf("Performing resharding and coverage check on leader [%s]", leaderToRemove.Name)) + + success, _, e := r.RedisCLI.ClusterReshard(healthyLeaderIp, leaderToRemove.Id, targetLeaderId, rediscli.MAX_SLOTS_PER_LEADER) + if e != nil || !success { + return e + } + emptyLeadersIds, fullCoverage, e := r.CheckClusterAndCoverage(healthyLeaderIp) + if !fullCoverage || e != nil { + r.RedisClusterStateView.ClusterState = view.ClusterFix + return e + } + if _, leaderHasZeroSlots := emptyLeadersIds[leaderToRemove.Id]; !leaderHasZeroSlots { + return errors.New(fmt.Sprintf("Could not perform reshard operation for leader %s %s", leaderToRemove.Ip, leaderToRemove.Id)) + } + return nil +} + +func (r *RedisClusterReconciler) CheckClusterAndCoverage(nodeIp string) (emptyLeadersIds map[string]bool, fullyCovered bool, err error) { + emptyLeadersIds = map[string]bool{} + clusterCheckResult, err := r.RedisCLI.ClusterCheck(nodeIp) + if err != nil { + return emptyLeadersIds, false, err + } + slotsConfigurationFormat := "[OK] All nodes agree about slots 
configuration" + allSlotsCoveredFormat := "[OK] All 16384 slots covered" + zeroSlotsPerMasterFormat := "M:\\s*(\\w*|\\d*)\\s*\\d+\\.\\d+\\.\\d+\\.\\d+:\\d+\\s*slots:\\s*\\(0 slots\\)\\s*master" + c := regexp.MustCompile(zeroSlotsPerMasterFormat) + matchingSubstrings := c.FindAllStringSubmatch(clusterCheckResult, -1) + for _, match := range matchingSubstrings { + if len(match) > 1 { + captureId := match[1] + emptyLeadersIds[captureId] = true + } + } + + if strings.Contains(clusterCheckResult, slotsConfigurationFormat) && strings.Contains(clusterCheckResult, allSlotsCoveredFormat) { + r.Log.Info(fmt.Sprintf("[OK] All slots are covered, empty leaders list contains leaders that it is safe now to remove: %v", emptyLeadersIds)) + return emptyLeadersIds, true, nil + } + return emptyLeadersIds, false, errors.New(fmt.Sprintf("Cluster check validation failed, command stdout result: %v", clusterCheckResult)) +} + +func (r *RedisClusterReconciler) leadersCount() int { + leaders := 0 + for _, n := range r.RedisClusterStateView.Nodes { + if n.Name == n.LeaderName { + leaders++ + } + } + return leaders +} + +func (r *RedisClusterReconciler) numOfFollowersPerLeader(v *view.RedisClusterView) map[string]int { + followersPerLeader := map[string]int{} + for _, node := range v.Nodes { + if node == nil { + continue + } + if _, contained := followersPerLeader[node.LeaderName]; !contained { + followersPerLeader[node.LeaderName] = 0 + } + if !node.IsLeader { + followersPerLeader[node.LeaderName]++ + } + } + return followersPerLeader +} + +func roundFloatToPercision(num float64, percision int) float64 { + o := math.Pow(10, float64(percision)) + return float64(round(num*o)) / o +} + +func round(num float64) int { + return int(num + math.Copysign(0.5, num)) +} + +func (r *RedisClusterReconciler) logCurrentMastersList(v *view.RedisClusterView) { + masters := []string{} + for _, node := range v.Nodes { + isMaster, err := r.checkIfMaster(node.Ip) + if err != nil || !isMaster { + continue + } + masters = append(masters, node.Name) + } + + r.Log.Info(fmt.Sprintf("Num of masters in cluster: [%v] masters list: %v", len(masters), masters)) } diff --git a/controllers/rediscluster_controller.go b/controllers/rediscluster_controller.go index 5ea7d629..62a988d9 100644 --- a/controllers/rediscluster_controller.go +++ b/controllers/rediscluster_controller.go @@ -19,6 +19,14 @@ package controllers import ( "context" "fmt" + "os" + "os/signal" + "regexp" + "sync" + "syscall" + "time" + + "github.com/PayU/redis-operator/controllers/view" "github.com/go-logr/logr" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -31,7 +39,6 @@ import ( corev1 "k8s.io/api/core/v1" "github.com/PayU/redis-operator/controllers/rediscli" - clusterData "github.com/PayU/redis-operator/data" ) const ( @@ -40,10 +47,8 @@ const ( // InitializingCluster: ConfigMap, Service resources are created; the leader // pods are created and clusterized - InitializingCluster RedisClusterState = "InitializingCluster" - // InitializingFollowers: followers are added to the cluster - InitializingFollowers RedisClusterState = "InitializingFollowers" + Reset RedisClusterState = "Reset" // Ready: cluster is up & running as expected Ready RedisClusterState = "Ready" @@ -53,138 +58,234 @@ const ( // Updating: the cluster is in the middle of a rolling update Updating RedisClusterState = "Updating" + + Scale RedisClusterState = "Scale" ) type RedisClusterState string type RedisClusterReconciler struct { client.Client - Log logr.Logger - Scheme *runtime.Scheme - RedisCLI 
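
`CheckClusterAndCoverage` above extracts the IDs of masters that own zero slots using the `zeroSlotsPerMasterFormat` regular expression, and only treats the cluster as safe when both "all nodes agree" and "all 16384 slots covered" markers appear in the check output. The snippet below exercises that exact pattern against a made-up sample; the sample text is constructed to match the pattern and is not verbatim `redis-cli --cluster check` output:

```go
// Standalone check of the zero-slot-master pattern used above.
package main

import (
	"fmt"
	"regexp"
)

func main() {
	zeroSlotMaster := regexp.MustCompile(`M:\s*(\w*|\d*)\s*\d+\.\d+\.\d+\.\d+:\d+\s*slots:\s*\(0 slots\)\s*master`)
	sample := `
M: 8a7cc74a4876e8e152a2e9c291d97ae8e201b464 10.0.0.12:6379
   slots: (0 slots) master
M: 2b4fd0b0e6c12ac1c4c0d5a8ba75aa44d1be28cc 10.0.0.13:6379
   slots:[0-5460] (5461 slots) master
`
	for _, m := range zeroSlotMaster.FindAllStringSubmatch(sample, -1) {
		fmt.Println("empty leader id:", m[1]) // only the first, slot-less master matches
	}
}
```
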
*rediscli.RedisCLI - Config *OperatorConfig - State RedisClusterState + Log logr.Logger + Scheme *runtime.Scheme + RedisCLI *rediscli.RedisCLI + Config *OperatorConfig + State RedisClusterState + RedisClusterStateView *view.RedisClusterStateView } +var reconciler *RedisClusterReconciler +var cluster *dbv1.RedisCluster + +var requestUpgrade bool = false +var setChannelOnSigTerm bool = true + // +kubebuilder:rbac:groups=db.payu.com,resources=redisclusters,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=db.payu.com,resources=redisclusters/status,verbs=get;update;patch // +kubebuilder:rbac:groups=*,resources=pods;services;configmaps,verbs=create;update;patch;get;list;watch;delete -func getCurrentClusterState(redisCluster *dbv1.RedisCluster) RedisClusterState { +func (r *RedisClusterReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { + reconciler = r + r.Status() + var redisCluster dbv1.RedisCluster + var err error + + if err = r.Get(context.Background(), req.NamespacedName, &redisCluster); err != nil { + r.Log.Info("Unable to fetch RedisCluster resource") + return ctrl.Result{Requeue: true, RequeueAfter: 15 * time.Second}, client.IgnoreNotFound(err) + } + + r.State = RedisClusterState(redisCluster.Status.ClusterState) if len(redisCluster.Status.ClusterState) == 0 { - return NotExists + r.State = NotExists + } + + cluster = &redisCluster + + if r.State != NotExists && r.State != Reset { + err = r.setClusterStateView(&redisCluster) + if err != nil { + r.Log.Error(err, "Could not perform reconcile loop") + r.deriveStateViewOutOfExistingCluster(&redisCluster) + return ctrl.Result{Requeue: true, RequeueAfter: 20 * time.Second}, nil + } + } + + if setChannelOnSigTerm { + r.saveClusterStateOnSigTerm(&redisCluster) + setChannelOnSigTerm = false + } + + switch r.State { + case NotExists: + err = r.handleInitializingCluster(&redisCluster) + break + case Reset: + err = r.handleInitializingCluster(&redisCluster) + break + case Ready: + err = r.handleReadyState(&redisCluster) + break + case Recovering: + err = r.handleRecoveringState(&redisCluster) + break + case Updating: + err = r.handleUpdatingState(&redisCluster) + break + case Scale: + err = r.handleScaleState(&redisCluster) } - return RedisClusterState(redisCluster.Status.ClusterState) + if err != nil { + r.Log.Error(err, "Handling error") + } + + r.saveClusterView(&redisCluster) + return ctrl.Result{Requeue: true, RequeueAfter: 15 * time.Second}, err } -func (r *RedisClusterReconciler) handleInitializingCluster(redisCluster *dbv1.RedisCluster) error { - r.Log.Info("Handling initializing cluster...") - if err := r.createNewRedisCluster(redisCluster); err != nil { - return err +func (r *RedisClusterReconciler) saveOperatorState(redisCluster *dbv1.RedisCluster) { + r.Status().Update(context.Background(), redisCluster) + operatorState := redisCluster.Status.ClusterState + r.Client.Status() + r.Log.Info(fmt.Sprintf("Operator state: [%s], Cluster state: [%s]", operatorState, r.RedisClusterStateView.ClusterState)) +} + +func (r *RedisClusterReconciler) saveClusterView(redisCluster *dbv1.RedisCluster) { + if redisCluster.Status.ClusterState == string(Ready) && r.RedisClusterStateView.ClusterState == view.ClusterOK { + r.RedisClusterStateView.NumOfReconcileLoopsSinceHealthyCluster = 0 + } else { + r.RedisClusterStateView.NumOfReconcileLoopsSinceHealthyCluster++ } - redisCluster.Status.ClusterState = string(InitializingFollowers) - return nil + r.saveClusterStateView(redisCluster) + v, ok := 
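
The new `Reconcile` above is a small state machine: the CR's `ClusterState` selects one handler per loop, and both `NotExists` and `Reset` rebuild the cluster from scratch. A condensed, illustrative summary of that dispatch (not the operator's actual code):

```go
// Illustrative state-to-handler mapping for the reconcile loop above.
package main

import "fmt"

type state string

const (
	notExists  state = "NotExists"
	reset      state = "Reset"
	ready      state = "Ready"
	recovering state = "Recovering"
	updating   state = "Updating"
	scale      state = "Scale"
)

func handlerFor(s state) string {
	switch s {
	case notExists, reset:
		return "handleInitializingCluster" // wipe pods, recreate the state view, create a new cluster
	case ready:
		return "handleReadyState" // health, up-to-date and scale checks
	case recovering:
		return "handleRecoveringState" // recoverCluster plus node-map cleanup
	case updating:
		return "handleUpdatingState" // rolling update, then back through Recovering
	case scale:
		return "handleScaleState" // horizontal scale, then back to Ready
	}
	return "unknown"
}

func main() {
	for _, s := range []state{notExists, reset, ready, recovering, updating, scale} {
		fmt.Printf("%-10s -> %s\n", s, handlerFor(s))
	}
}
```
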
r.NewRedisClusterView(redisCluster) + if !ok { + return + } + for _, n := range v.Nodes { + if n != nil { + n.Pod = corev1.Pod{} + } + } + r.saveOperatorState(redisCluster) } -func (r *RedisClusterReconciler) handleInitializingFollowers(redisCluster *dbv1.RedisCluster) error { - r.Log.Info("Handling initializing followers...") - if err := r.initializeFollowers(redisCluster); err != nil { +func (r *RedisClusterReconciler) handleInitializingCluster(redisCluster *dbv1.RedisCluster) error { + r.Log.Info("Clear all cluster pods...") + e := r.deleteAllRedisClusterPods() + if e != nil { + return e + } + r.Log.Info("Clear cluster state map...") + r.deleteClusterStateView(redisCluster) + r.RedisClusterStateView.CreateStateView(redisCluster.Spec.LeaderCount, redisCluster.Spec.LeaderFollowersCount) + r.Log.Info("Handling initializing cluster...") + if err := r.createNewRedisCluster(redisCluster); err != nil { + redisCluster.Status.ClusterState = string(Reset) return err } redisCluster.Status.ClusterState = string(Ready) + r.postNewClusterStateView(redisCluster) + r.saveClusterView(redisCluster) return nil } func (r *RedisClusterReconciler) handleReadyState(redisCluster *dbv1.RedisCluster) error { - complete, err := r.isClusterComplete(redisCluster) + r.Log.Info("Handling ready state...") + v, ok := r.NewRedisClusterView(redisCluster) + if !ok { + r.RedisClusterStateView.NumOfReconcileLoopsSinceHealthyCluster++ + r.RedisClusterStateView.NumOfHealthyReconcileLoopsInRow = 0 + redisCluster.Status.ClusterState = string(Recovering) + return nil + } + lostNodesDetected := r.forgetLostNodes(redisCluster, v) + if lostNodesDetected { + r.RedisClusterStateView.NumOfHealthyReconcileLoopsInRow = 0 + r.Log.Info("[Warn] Lost nodes detcted on some of the nodes tables...") + return nil + } + healthy, err := r.isClusterHealthy(redisCluster, v) if err != nil { - r.Log.Info("Could not check if cluster is complete") + r.RedisClusterStateView.NumOfHealthyReconcileLoopsInRow = 0 + r.Log.Info("Could not check if cluster is healthy") return err } - if !complete { + if !healthy { + r.RedisClusterStateView.NumOfHealthyReconcileLoopsInRow = 0 redisCluster.Status.ClusterState = string(Recovering) return nil } - - uptodate, err := r.isClusterUpToDate(redisCluster) + r.logCurrentMastersList(v) + uptodate, err := r.isClusterUpToDate(redisCluster, v) if err != nil { + r.RedisClusterStateView.NumOfHealthyReconcileLoopsInRow = 0 r.Log.Info("Could not check if cluster is updated") redisCluster.Status.ClusterState = string(Recovering) return err } if !uptodate { + r.RedisClusterStateView.NumOfHealthyReconcileLoopsInRow = 0 redisCluster.Status.ClusterState = string(Updating) return nil } + scale, scaleType := r.isScaleRequired(redisCluster) + if scale { + r.RedisClusterStateView.NumOfHealthyReconcileLoopsInRow = 0 + r.Log.Info(fmt.Sprintf("Scale is required, scale type: [%v]", scaleType.String())) + redisCluster.Status.ClusterState = string(Scale) + } r.Log.Info("Cluster is healthy") - return nil -} - -func (r *RedisClusterReconciler) handleRecoveringState(redisCluster *dbv1.RedisCluster) error { - r.Log.Info("Handling cluster recovery...") - if err := r.recoverCluster(redisCluster); err != nil { - r.Log.Info("Cluster recovery failed") - return err + if r.RedisClusterStateView.NumOfHealthyReconcileLoopsInRow < 10 { + r.RedisClusterStateView.NumOfHealthyReconcileLoopsInRow++ + } else { + r.Log.Info("[OK] Cluster is in finalized state") + } + if requestUpgrade { + r.Log.Info("[Warn] Cluster upgrade is required, upgrade can be triggered by 
using entry point /upgrade") } - redisCluster.Status.ClusterState = string(Ready) return nil } -func (r *RedisClusterReconciler) handleUpdatingState(redisCluster *dbv1.RedisCluster) error { - r.Log.Info("Handling rolling update...") - if err := r.updateCluster(redisCluster); err != nil { - r.Log.Info("Rolling update failed") - redisCluster.Status.ClusterState = string(Recovering) - return err +func (r *RedisClusterReconciler) handleScaleState(redisCluster *dbv1.RedisCluster) error { + r.Log.Info("Handling cluster scale...") + e := r.scaleCluster(redisCluster) + if e != nil { + r.Log.Error(e, "Could not perform cluster scale") } redisCluster.Status.ClusterState = string(Ready) return nil } -func (r *RedisClusterReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { - r.Status() - - var redisCluster dbv1.RedisCluster - var err error - - if err = r.Get(context.Background(), req.NamespacedName, &redisCluster); err != nil { - r.Log.Info("Unable to fetch RedisCluster resource") - return ctrl.Result{}, client.IgnoreNotFound(err) +func (r *RedisClusterReconciler) handleRecoveringState(redisCluster *dbv1.RedisCluster) error { + r.Log.Info("Handling cluster recovery...") + v, ok := r.NewRedisClusterView(redisCluster) + if !ok { + return nil } - - r.State = getCurrentClusterState(&redisCluster) - - switch r.State { - case NotExists: - redisCluster.Status.ClusterState = string(InitializingCluster) - err = r.handleInitializingCluster(&redisCluster) - break - case InitializingCluster: - err = r.handleInitializingCluster(&redisCluster) - break - case InitializingFollowers: - err = r.handleInitializingFollowers(&redisCluster) - break - case Ready: - err = r.handleReadyState(&redisCluster) - break - case Recovering: - err = r.handleRecoveringState(&redisCluster) - break - case Updating: - err = r.handleUpdatingState(&redisCluster) - break + e := r.recoverCluster(redisCluster, v) + r.cleanMapFromNodesToRemove(redisCluster, v) + if e != nil { + return e } + return nil +} - clusterData.SaveRedisClusterState(string(r.State)) +func (r *RedisClusterReconciler) handleUpdatingState(redisCluster *dbv1.RedisCluster) error { + var err error = nil + r.Log.Info("Handling rolling update...") + r.updateCluster(redisCluster) + redisCluster.Status.ClusterState = string(Recovering) + reconciler.saveOperatorState(cluster) + return err +} - if err != nil { - r.Log.Error(err, "Handling error") +func (r *RedisClusterReconciler) validateStateUpdated(redisCluster *dbv1.RedisCluster) (ctrl.Result, error) { + clusterState := RedisClusterState(redisCluster.Status.ClusterState) + if len(redisCluster.Status.ClusterState) == 0 { + clusterState = NotExists } - - clusterState := getCurrentClusterState(&redisCluster) if clusterState != r.State { - err := r.Status().Update(context.Background(), &redisCluster) + err := r.Status().Update(context.Background(), redisCluster) if err != nil && !apierrors.IsConflict(err) { r.Log.Info("Failed to update state to " + string(clusterState)) return ctrl.Result{}, err @@ -197,7 +298,6 @@ func (r *RedisClusterReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error r.State = clusterState r.Log.Info(fmt.Sprintf("Updated state to: [%s]", clusterState)) } - return ctrl.Result{}, nil } @@ -214,3 +314,49 @@ func (r *RedisClusterReconciler) SetupWithManager(mgr ctrl.Manager) error { WithOptions(controller.Options{MaxConcurrentReconciles: 1}). 
Complete(r) } + +func (r *RedisClusterReconciler) deriveStateViewOutOfExistingCluster(redisCluster *dbv1.RedisCluster) { + r.RedisClusterStateView.CreateStateView(redisCluster.Spec.LeaderCount, redisCluster.Spec.LeaderFollowersCount) + v, ok := r.NewRedisClusterView(redisCluster) + if ok && v != nil { + if len(v.Nodes) > 0 { + r.RedisClusterStateView.ClusterState = view.ClusterOK + } + leaderFormat := "redis-node-(\\d+)" + followerFormat := "redis-node-(\\d+)-(\\d+)" + for _, n := range v.Nodes { + isMaster, err := r.checkIfMaster(n.Ip) + if err == nil { + continue + } + if isMaster { + match, e := regexp.MatchString(leaderFormat, n.Name) + if e != nil && match { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.NodeOK) + } + } else { + match, e := regexp.MatchString(followerFormat, n.Name) + if e != nil && match { + r.RedisClusterStateView.SetNodeState(n.Name, n.LeaderName, view.NodeOK) + } + } + } + r.postNewClusterStateView(redisCluster) + } +} + +func (r *RedisClusterReconciler) saveClusterStateOnSigTerm(redisCluster *dbv1.RedisCluster) { + if setChannelOnSigTerm && r.RedisClusterStateView != nil { + mutex := &sync.Mutex{} + saveStatusOnQuit := make(chan os.Signal, 1) + signal.Notify(saveStatusOnQuit, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGKILL) + go func() { + <-saveStatusOnQuit + mutex.Lock() + close(saveStatusOnQuit) + r.Log.Info("[WARN] reconcile loop interrupted by os signal, saving cluster state view...") + r.saveClusterStateView(redisCluster) + mutex.Unlock() + }() + } +} diff --git a/controllers/testlab/test_lab.go b/controllers/testlab/test_lab.go new file mode 100644 index 00000000..02e9a079 --- /dev/null +++ b/controllers/testlab/test_lab.go @@ -0,0 +1,527 @@ +package testlab + +import ( + "context" + "fmt" + "math/rand" + "strings" + "sync" + "time" + + dbv1 "github.com/PayU/redis-operator/api/v1" + "github.com/PayU/redis-operator/controllers/rediscli" + "github.com/PayU/redis-operator/controllers/redisclient" + "github.com/PayU/redis-operator/controllers/view" + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/wait" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +type TestLab struct { + Client client.Client + RedisCLI *rediscli.RedisCLI + Cluster *dbv1.RedisCluster + RedisClusterClient *redisclient.RedisClusterClient + Log logr.Logger + Report string +} + +var fetchViewInterval = 10 * time.Second +var fetchViewTimeOut = 1 * time.Minute + +var clusterHealthCheckInterval = 10 * time.Second +var clusterHealthCheckTimeOutLimit = 5 * time.Minute + +var randomChoiceRetries int = 4 + +var sleepPerTest time.Duration = 2 * time.Second +var sleepPerPodCheck time.Duration = 2 * time.Second +var sleepPerHealthCheck time.Duration = 5 * time.Second + +var dataWriteRetries int = 5 +var dataReadRetries int = 5 + +var intervalsBetweenWrites time.Duration = 500 * time.Millisecond + +var totalDataWrites int = 200 + +var mutex = &sync.Mutex{} + +func (t *TestLab) RunTest(nodes *map[string]*view.NodeStateView, withData bool) { + t.Report = "\n[TEST LAB] Cluster test report:\n\n" + isReady := t.waitForHealthyCluster(nodes) + if !isReady { + return + } + if withData { + t.testSuitWithData(nodes) + } else { + t.testSuit(nodes) + } +} + +func (t *TestLab) testSuit(nodes *map[string]*view.NodeStateView) { + test_1 := t.runTest(nodes, 1) + if !test_1 { + return + } + test_2 := t.runTest(nodes, 2) + if !test_2 { + return + } + test_3 := t.runTest(nodes, 3) + if !test_3 { + return + } + test_4 := t.runTest(nodes, 4) + if !test_4 { + 
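
`saveClusterStateOnSigTerm` above registers a signal handler that persists the cluster state view when the manager is terminated. One caveat worth noting: SIGKILL cannot be trapped by a process, so only SIGTERM and SIGQUIT will actually reach the handler. A minimal standalone version with the persistence call stubbed out by a print:

```go
// Minimal save-on-termination hook; the real handler calls the operator's
// saveClusterStateView instead of printing.
package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, syscall.SIGTERM, syscall.SIGQUIT)

	go func() {
		sig := <-quit
		fmt.Printf("received %v, saving cluster state view before exit...\n", sig)
		os.Exit(0)
	}()

	select {} // block forever; in the operator this is the manager's run loop
}
```
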
return + } + test_5 := t.runTest(nodes, 5) + if !test_5 { + return + } + test_6 := t.runTest(nodes, 6) + if !test_6 { + return + } +} + +func (t *TestLab) testSuitWithData(nodes *map[string]*view.NodeStateView) { + test_1 := t.runTestWithData(nodes, 1) + if !test_1 { + return + } + test_2 := t.runTestWithData(nodes, 2) + if !test_2 { + return + } + test_3 := t.runTestWithData(nodes, 3) + if !test_3 { + return + } + test_4 := t.runTestWithData(nodes, 4) + if !test_4 { + return + } + test_5 := t.runTestWithData(nodes, 5) + if !test_5 { + return + } + test_6 := t.runTestWithData(nodes, 6) + if !test_6 { + return + } +} + +func (t *TestLab) testDataWrites(total int) (successfulWrites int, data map[string]string) { + data = map[string]string{} + successfulWrites = 0 + for i := 0; i < total; i++ { + key := "key" + fmt.Sprintf("%v", i) + val := "val" + fmt.Sprintf("%v", i) + err := t.RedisClusterClient.Set(key, val, dataWriteRetries) + if err == nil { + successfulWrites++ + data[key] = val + } + time.Sleep(intervalsBetweenWrites) + } + return successfulWrites, data +} + +func (t *TestLab) testDataReads(data map[string]string) (successfulReads int) { + successfulReads = 0 + for k, expected_v := range data { + actual_v, err := t.RedisClusterClient.Get(k, dataReadRetries) + if err == nil { + if expected_v == actual_v { + successfulReads++ + } + } + } + return successfulReads +} + +func (t *TestLab) analyzeDataResults(total int, successfulWrites int, successfulReads int) { + writeSuccessRate := (successfulWrites * 100.0) / (total) + readSuccessRate := 0 + if successfulWrites > 0 { + readSuccessRate = (successfulReads * 100.0) / (successfulWrites) + } + t.Report += fmt.Sprintf("[TEST LAB] Total : [%v]\n", total) + t.Report += fmt.Sprintf("[TEST LAB] Successful writes : [%v]\n", successfulWrites) + t.Report += fmt.Sprintf("[TEST LAB] Successful reads : [%v]\n", successfulReads) + t.Report += fmt.Sprintf("[TEST LAB] Writes success rate : [%v%v]\n", writeSuccessRate, "%") + t.Report += fmt.Sprintf("[TEST LAB] Reads success rate : [%v%v]\n", readSuccessRate, "%") +} + +func (t *TestLab) runTest(nodes *map[string]*view.NodeStateView, testNum int) bool { + if t.RedisClusterClient == nil { + return false + } + result := false + switch testNum { + case 1: + result = t.test_delete_follower(nodes, testNum) + break + case 2: + result = t.test_delete_leader(nodes, testNum) + break + case 3: + result = t.test_delete_leader_and_follower(nodes, testNum) + break + case 4: + result = t.test_delete_all_followers(nodes, testNum) + break + case 5: + result = t.test_delete_all_azs_beside_one(nodes, testNum) + break + case 6: + result = t.test_delete_leader_and_all_its_followers(nodes, testNum) + break + } + return result +} + +func (t *TestLab) runTestWithData(nodes *map[string]*view.NodeStateView, testNum int) bool { + if t.RedisClusterClient == nil { + return false + } + var wg sync.WaitGroup + result := false + sw := 0 + sr := 0 + data := map[string]string{} + wg.Add(2) + go func() { + defer wg.Done() + switch testNum { + case 1: + result = t.test_delete_follower(nodes, testNum) + break + case 2: + result = t.test_delete_leader(nodes, testNum) + break + case 3: + result = t.test_delete_leader_and_follower(nodes, testNum) + break + case 4: + result = t.test_delete_all_followers(nodes, testNum) + break + case 5: + result = t.test_delete_all_azs_beside_one(nodes, testNum) + break + case 6: + result = t.test_delete_leader_and_all_its_followers(nodes, testNum) + break + } + }() + go func() { + defer wg.Done() + sw, data = 
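
`analyzeDataResults` above reports write success against the number of attempted writes and read success against the number of writes that actually landed. A worked example of those two ratios for the default 200 writes:

```go
// Success-rate arithmetic used by the test-lab report.
package main

import "fmt"

func rates(total, successfulWrites, successfulReads int) (writeRate, readRate int) {
	writeRate = successfulWrites * 100 / total
	if successfulWrites > 0 {
		readRate = successfulReads * 100 / successfulWrites
	}
	return writeRate, readRate
}

func main() {
	w, r := rates(200, 180, 176)
	fmt.Printf("writes: %d%%, reads: %d%%\n", w, r) // writes: 90%, reads: 97%
}
```
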
t.testDataWrites(totalDataWrites) + }() + wg.Wait() + sr = t.testDataReads(data) + t.analyzeDataResults(totalDataWrites, sw, sr) + return result +} + +func (t *TestLab) test_delete_follower(nodes *map[string]*view.NodeStateView, testNum int) bool { + time.Sleep(sleepPerTest) + t.Log.Info("[TEST LAB] Running test: Delete follower...") + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v delete follower...", testNum) + randomFollower := t.PickRandomeFollower(map[string]bool{}, nodes, randomChoiceRetries) + result := t.test_delete_pods([]string{randomFollower}, nodes) + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v: delete follower result [%v]\n", testNum, result) + return result +} + +func (t *TestLab) test_delete_leader(nodes *map[string]*view.NodeStateView, testNum int) bool { + time.Sleep(sleepPerTest) + t.Log.Info("[TEST LAB] Running test: Delete leader...") + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v delete leader...", testNum) + randomLeader := t.PickRandomeLeader(map[string]bool{}, nodes, randomChoiceRetries) + result := t.test_delete_pods([]string{randomLeader}, nodes) + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v: delete leader result [%v]\n", testNum, result) + return result +} + +func (t *TestLab) test_delete_leader_and_follower(nodes *map[string]*view.NodeStateView, testNum int) bool { + time.Sleep(sleepPerTest) + t.Log.Info("[TEST LAB] Running test: Delete leader and follower...") + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v delete leader and follower...", testNum) + randomFollower := t.PickRandomeFollower(map[string]bool{}, nodes, randomChoiceRetries) + f, exists := (*nodes)[randomFollower] + if !exists { + return false + } + randomLeader := t.PickRandomeLeader(map[string]bool{f.LeaderName: true}, nodes, randomChoiceRetries) + result := t.test_delete_pods([]string{randomFollower, randomLeader}, nodes) + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v: delete leader and follower result [%v]\n", testNum, result) + return result +} + +func (t *TestLab) test_delete_all_followers(nodes *map[string]*view.NodeStateView, testNum int) bool { + time.Sleep(sleepPerTest) + t.Log.Info("[TEST LAB] Running test: Delete all followers...") + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v delete all followers...", testNum) + followers := []string{} + for _, n := range *nodes { + if n.Name != n.LeaderName { + followers = append(followers, n.Name) + } + } + result := t.test_delete_pods(followers, nodes) + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v: delete all followers result [%v]\n", testNum, result) + return result +} + +func (t *TestLab) test_delete_leader_and_all_its_followers(nodes *map[string]*view.NodeStateView, testNum int) bool { + time.Sleep(sleepPerTest) + t.Log.Info("[TEST LAB] Running test: Delete leader and all its folowers...") + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v delete leader and all his followers...", testNum) + toDelete := []string{} + randomFollower := t.PickRandomeFollower(map[string]bool{}, nodes, randomChoiceRetries) + f, exists := (*nodes)[randomFollower] + if !exists { + return false + } + for _, n := range *nodes { + if n.LeaderName == f.LeaderName { + toDelete = append(toDelete, n.Name) + } + } + result := t.test_delete_pods(toDelete, nodes) + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v: delete leader and all his followers result [%v]\n", testNum, result) + return result +} + +func (t *TestLab) test_delete_all_azs_beside_one(nodes *map[string]*view.NodeStateView, testNum int) bool { + time.Sleep(sleepPerTest) + t.Log.Info("[TEST LAB] Running test: 
Simulating loss of all az's beside one...") + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v delete all pods beside one replica foreach set...", testNum) + del := map[string]bool{} + keep := map[string]bool{} + d := false + for _, n := range *nodes { + if n.Name == n.LeaderName { + del[n.Name] = d + d = !d + } + } + for _, n := range *nodes { + if n.Name != n.LeaderName { + delLeader, leaderInMap := del[n.LeaderName] + if !leaderInMap { + del[n.Name] = false + keep[n.LeaderName] = true + } else { + if delLeader { + _, hasReplicaToKeep := keep[n.LeaderName] + if hasReplicaToKeep { + del[n.Name] = true + } else { + del[n.Name] = false + keep[n.LeaderName] = true + } + } else { + del[n.Name] = true + } + } + } + } + toDelete := []string{} + for n, d := range del { + if d { + toDelete = append(toDelete, n) + } + } + result := t.test_delete_pods(toDelete, nodes) + t.Report += fmt.Sprintf("\n[TEST LAB] Test %v: delete all pods beside one replica foreach set result [%v]\n", testNum, result) + return result +} + +func (t *TestLab) PickRandomeFollower(exclude map[string]bool, nodes *map[string]*view.NodeStateView, retry int) string { + k := rand.Intn(t.Cluster.Spec.LeaderFollowersCount*t.Cluster.Spec.LeaderCount - len(exclude)) + i := 0 + for _, n := range *nodes { + if _, ex := exclude[n.Name]; ex || n.Name == n.LeaderName { + continue + } + if i == k { + return n.Name + } + i++ + } + if retry == 0 { + return "" + } + return t.PickRandomeFollower(exclude, nodes, retry-1) +} + +func (t *TestLab) PickRandomeLeader(exclude map[string]bool, nodes *map[string]*view.NodeStateView, retry int) string { + k := rand.Intn(t.Cluster.Spec.LeaderCount - len(exclude)) + i := 0 + for _, n := range *nodes { + if _, ex := exclude[n.Name]; ex || n.Name != n.LeaderName { + continue + } + if i == k { + return n.Name + } + i++ + } + if retry == 0 { + return "" + } + return t.PickRandomeLeader(exclude, nodes, retry-1) +} + +func (t *TestLab) checkIfMaster(nodeIP string) (bool, error) { + mutex.Lock() + info, _, err := t.RedisCLI.Info(nodeIP) + mutex.Unlock() + if err != nil || info == nil { + return false, err + } + if info.Replication["role"] == "master" { + return true, nil + } + return false, nil +} + +func (t *TestLab) test_delete_pods(podsToDelete []string, nodes *map[string]*view.NodeStateView) bool { + v := t.WaitForClusterView() + if v == nil { + return false + } + for _, toDelete := range podsToDelete { + node, exists := v.Nodes[toDelete] + if !exists || node == nil { + continue + } + time.Sleep(1 * time.Second) + t.deletePod(node.Pod) + } + return t.waitForHealthyCluster(nodes) +} + +func (t *TestLab) WaitForClusterView() *view.RedisClusterView { + var v *view.RedisClusterView = nil + var ok bool = false + if pollErr := wait.PollImmediate(fetchViewInterval, fetchViewTimeOut, func() (bool, error) { + v, ok = t.newRedisClusterView() + if !ok { + return false, nil + } + return true, nil + }); pollErr != nil { + t.Report += fmt.Sprintf("\n[TEST LAB] Error: Could not fetch cluster view, prob intervals: [%v], probe timeout: [%v]\n", fetchViewInterval, fetchViewTimeOut) + } + return v +} + +func (t *TestLab) waitForHealthyCluster(nodes *map[string]*view.NodeStateView) bool { + time.Sleep(sleepPerHealthCheck) + t.Report += fmt.Sprintf("\n[TEST LAB] Waiting for cluster to be declared ready...") + isHealthyCluster := false + if pollErr := wait.PollImmediate(clusterHealthCheckInterval, clusterHealthCheckTimeOutLimit, func() (bool, error) { + if t.isClusterAligned(nodes) { + isHealthyCluster = true + return true, nil + } + 
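
`test_delete_all_azs_beside_one` above keeps exactly one surviving replica per slot range: leaders alternate between kept and deleted, a deleted leader keeps one of its followers, and a kept leader loses all of them. Because the selection iterates Go maps, which replica survives is effectively random from run to run. A deterministic sketch of the survivor selection for a 3-leader, 2-follower cluster:

```go
// Deterministic illustration of the "lose every AZ but one" survivor set.
package main

import "fmt"

func survivors(leaders []string) []string {
	kept := []string{}
	deleteLeader := false
	for _, l := range leaders {
		if deleteLeader {
			kept = append(kept, fmt.Sprintf("%s-1", l)) // leader goes down, one follower survives
		} else {
			kept = append(kept, l) // leader survives, all of its followers go down
		}
		deleteLeader = !deleteLeader
	}
	return kept
}

func main() {
	fmt.Println(survivors([]string{"redis-node-0", "redis-node-1", "redis-node-2"}))
	// [redis-node-0 redis-node-1-1 redis-node-2]
}
```
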
return false, nil + }); pollErr != nil { + t.Report += fmt.Sprintf("\n[TEST LAB] Error while waiting for cluster to heal, probe intervals: [%v], probe timeout: [%v]", clusterHealthCheckInterval, clusterHealthCheckTimeOutLimit) + return false + } + return isHealthyCluster +} + +func (t *TestLab) isClusterAligned(expectedNodes *map[string]*view.NodeStateView) bool { + v := t.WaitForClusterView() + if v == nil { + return false + } + t.Log.Info("[TEST LAB] Checking if cluster is ready...") + for _, n := range *expectedNodes { + time.Sleep(sleepPerPodCheck) + node, exists := v.Nodes[n.Name] + if !exists || node == nil { + return false + } + if node.IsLeader { + if !t.isLeaderAligned(node) { + return false + } + } else { + if !t.isFollowerAligned(node, expectedNodes) { + return false + } + } + } + totalExpectedNodes := t.Cluster.Spec.LeaderCount * (t.Cluster.Spec.LeaderFollowersCount + 1) + clusterOK := len(v.Nodes) == totalExpectedNodes && len(*expectedNodes) == totalExpectedNodes + if clusterOK { + t.RedisClusterClient = redisclient.GetRedisClusterClient(v, t.RedisCLI) + } + return clusterOK +} + +func (t *TestLab) isFollowerAligned(n *view.NodeView, nodes *map[string]*view.NodeStateView) bool { + _, leaderExists := (*nodes)[n.LeaderName] + if !leaderExists { + return false + } + isMaster, e := t.checkIfMaster(n.Ip) + if e != nil || isMaster { + return false + } + return true +} + +func (t *TestLab) isLeaderAligned(n *view.NodeView) bool { + isMaster, e := t.checkIfMaster(n.Ip) + if e != nil || !isMaster { + return false + } + mutex.Lock() + nodes, _, e := t.RedisCLI.ClusterNodes(n.Ip) + mutex.Unlock() + if e != nil || nodes == nil || len(*nodes) <= 1 { + return false + } + return true +} + +func (t *TestLab) deletePod(pod corev1.Pod) error { + if err := t.Client.Delete(context.Background(), &pod); err != nil && !strings.Contains(err.Error(), "not found") { + return err + } + return nil +} + +func (t *TestLab) newRedisClusterView() (*view.RedisClusterView, bool) { + v := &view.RedisClusterView{} + pods, e := t.getRedisClusterPods() + if e != nil { + return v, false + } + e = v.CreateView(pods, t.RedisCLI) + if e != nil { + return v, false + } + return v, true +} + +func (t *TestLab) getRedisClusterPods() ([]corev1.Pod, error) { + pods := &corev1.PodList{} + matchingLabels := t.Cluster.Spec.PodLabelSelector + err := t.Client.List(context.Background(), pods, client.InNamespace(t.Cluster.ObjectMeta.Namespace), client.MatchingLabels(matchingLabels)) + if err != nil { + return nil, err + } + return pods.Items, nil +} diff --git a/controllers/view/cluster_view.go b/controllers/view/cluster_view.go new file mode 100644 index 00000000..7f020bdd --- /dev/null +++ b/controllers/view/cluster_view.go @@ -0,0 +1,176 @@ +package view + +import ( + "errors" + "fmt" + "sync" + + "github.com/PayU/redis-operator/controllers/rediscli" + corev1 "k8s.io/api/core/v1" +) + +type ClusterState string +type NodeState string + +const ( + ClusterCreate ClusterState = "ClusterCreate" + ClusterFix ClusterState = "ClusterFix" + ClusterRebalance ClusterState = "ClusterRebalance" + ClusterOK ClusterState = "ClusterOK" +) + +const ( + CreateNode NodeState = "CreateNode" + AddNode NodeState = "AddNode" + ReplicateNode NodeState = "ReplicateNode" + SyncNode NodeState = "SyncNode" + FailoverNode NodeState = "FailoverNode" + ReshardNode NodeState = "ReshardNode" + ReshardNodeKeepInMap NodeState = "ReshardNodeKeepInMap" + NewEmptyNode NodeState = "NewEmptyNode" + DeleteNode NodeState = "DeleteNode" + DeleteNodeKeepInMap NodeState = 
"DeleteNodeKeepInMap" + NodeOK NodeState = "NodeOK" +) + +type RedisClusterView struct { + Nodes map[string]*NodeView +} + +type RedisClusterStateView struct { + Name string + ClusterState ClusterState + NumOfReconcileLoopsSinceHealthyCluster int + NumOfHealthyReconcileLoopsInRow int + Nodes map[string]*NodeStateView +} + +type NodeView struct { + Name string + Id string + Namespace string + Ip string + LeaderName string + IsLeader bool + Pod corev1.Pod +} + +type NodeStateView struct { + Name string + LeaderName string + IsUpToDate bool + NodeState NodeState +} + +type MissingNodeView struct { + Name string + LeaderName string + CurrentMasterName string + CurrentMasterId string + CurrentMasterIp string +} + +func (sv *RedisClusterStateView) CreateStateView(leaderCount int, followersPerLeaderCount int) { + sv.ClusterState = ClusterCreate + sv.NumOfReconcileLoopsSinceHealthyCluster = 0 + sv.NumOfHealthyReconcileLoopsInRow = 0 + sv.Nodes = make(map[string]*NodeStateView) + for l := 0; l < leaderCount; l++ { + name := "redis-node-" + fmt.Sprint(l) + sv.Nodes[name] = &NodeStateView{ + Name: name, + LeaderName: name, + IsUpToDate: true, + NodeState: CreateNode, + } + } + for _, leader := range sv.Nodes { + if leader.Name == leader.LeaderName { + for f := 1; f <= followersPerLeaderCount; f++ { + name := leader.Name + "-" + fmt.Sprint(f) + sv.Nodes[name] = &NodeStateView{ + Name: name, + LeaderName: leader.Name, + IsUpToDate: true, + NodeState: CreateNode, + } + } + } + } +} + +func (sv *RedisClusterStateView) SetNodeState(name string, leaderName string, nodeState NodeState) { + n, exists := sv.Nodes[name] + if exists { + n.NodeState = nodeState + } else { + sv.Nodes[name] = &NodeStateView{ + Name: name, + LeaderName: leaderName, + IsUpToDate: true, + NodeState: nodeState, + } + } +} + +func (sv *RedisClusterStateView) LockResourceAndSetNodeState(name string, leaderName string, nodeState NodeState, mutex *sync.Mutex) { + mutex.Lock() + n, exists := sv.Nodes[name] + if exists { + n.NodeState = nodeState + } else { + sv.Nodes[name] = &NodeStateView{ + Name: name, + LeaderName: leaderName, + IsUpToDate: true, + NodeState: nodeState, + } + } + mutex.Unlock() +} + +func (sv *RedisClusterStateView) LockResourceAndRemoveFromMap(name string, mutex *sync.Mutex) { + mutex.Lock() + delete(sv.Nodes, name) + mutex.Unlock() +} + +func (v *RedisClusterView) CreateView(pods []corev1.Pod, redisCli *rediscli.RedisCLI) error { + v.Nodes = make(map[string]*NodeView) + for _, pod := range pods { + redisNode := &NodeView{ + Name: pod.Name, + Id: "", + Namespace: pod.Namespace, + Ip: pod.Status.PodIP, + LeaderName: getLeaderName(pod), + IsLeader: pod.Labels["redis-node-role"] == "leader", + Pod: pod, + } + if !isReachableNode(redisNode, redisCli) { + return errors.New("Non reachable node found") + } + v.Nodes[pod.Name] = redisNode + } + return nil +} + +func getLeaderName(pod corev1.Pod) string { + leaderName := pod.Labels["leader-name"] + if len(leaderName) > 0 { + return leaderName + } + nodeNumber := pod.Labels["leader-number"] + return "redis-node-" + nodeNumber +} + +func isReachableNode(n *NodeView, redisCli *rediscli.RedisCLI) bool { + var e error + if n.Id, e = redisCli.MyClusterID(n.Ip); e != nil { + return false + } + if clusterInfo, _, e := redisCli.ClusterInfo(n.Ip); e != nil || clusterInfo == nil { + return false + } + return true +} diff --git a/data/clusterData.go b/data/clusterData.go deleted file mode 100644 index 62a371a6..00000000 --- a/data/clusterData.go +++ /dev/null @@ -1,32 +0,0 @@ -package data 
- -import ( - "io/ioutil" - "os" -) - -func SaveRedisClusterView(data []byte) { - fileName := os.Getenv("CLUSTER_VIEW_FILE") - - _ = ioutil.WriteFile(fileName, data, 0644) -} - -func GetClusterView() ([]byte, error) { - fileName := os.Getenv("CLUSTER_VIEW_FILE") - - return ioutil.ReadFile(fileName) -} - -func SaveRedisClusterState(s string) { - fileName := os.Getenv("CLUSTER_STATE_FILE") - _ = ioutil.WriteFile(fileName, []byte(s), 0644) -} - -func GetRedisClusterState() string { - fileName := os.Getenv("CLUSTER_STATE_FILE") - byteValue, err := ioutil.ReadFile(fileName) - if err != nil { - return "NotExists" - } - return string(byteValue) -} diff --git a/go.mod b/go.mod index 6e602ce4..84d986a3 100644 --- a/go.mod +++ b/go.mod @@ -1,14 +1,15 @@ module github.com/PayU/redis-operator -go 1.15 +go 1.16 require ( github.com/go-logr/logr v0.1.0 + github.com/go-redis/redis/v8 v8.11.5 github.com/go-test/deep v1.0.7 github.com/labstack/echo/v4 v4.6.1 github.com/pkg/errors v0.9.1 - golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e - gopkg.in/yaml.v2 v2.3.0 + golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9 + gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.18.6 k8s.io/apiextensions-apiserver v0.18.6 k8s.io/apimachinery v0.18.6 diff --git a/go.sum b/go.sum index ba23fe3d..d8285348 100644 --- a/go.sum +++ b/go.sum @@ -35,7 +35,12 @@ github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+Ce github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/blang/semver v3.5.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= +github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chai2010/gettext-go v0.0.0-20160711120539-c6fed771bfd5/go.mod h1:/iP1qXHoty45bqomnu2LM+VVyAEdWN+vtSHGlQgyxbw= +github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= +github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cockroachdb/datadriven v0.0.0-20190809214429-80d97fb3cbaa/go.mod h1:zn76sxSg3SzpJ0PPJaLDCu+Bu0Lg3sKTORVIj19EIF8= github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= @@ -54,6 +59,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/daviddengcn/go-colortext v0.0.0-20160507010035-511bcaf42ccd/go.mod h1:dv4zxwHi5C/8AeI+4gX4dCWOIvNi7I6JCSX0HvlKPgE= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/docker/distribution v2.7.1+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= github.com/docker/docker v0.7.3-0.20190327010347-be7ac8be2ae0 h1:w3NnFcKR5241cfmQU5ZZAsf0xcpId6mWOupTvJlUX2U= github.com/docker/docker 
v0.7.3-0.20190327010347-be7ac8be2ae0/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= @@ -140,7 +147,10 @@ github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh github.com/go-openapi/validate v0.18.0/go.mod h1:Uh4HdOzKt19xGIGm1qHf/ofbX1YQ4Y+MYsct2VUrAJ4= github.com/go-openapi/validate v0.19.2/go.mod h1:1tRCw7m3jtI8eNWEEliiAqUIcBztB2KDnRCRMUi7GTA= github.com/go-openapi/validate v0.19.5/go.mod h1:8DJv2CVJQ6kGNpFW6eV9N3JviE1C85nY1c2z52x1Gk4= +github.com/go-redis/redis/v8 v8.11.5 h1:AcZZR7igkdvfVmQTPnu9WE37LRrO/YrBH5zWyjDC0oI= +github.com/go-redis/redis/v8 v8.11.5/go.mod h1:gREzHqY1hg6oD9ngVRbLStwAWKhA0FEgq8Jd4h5lpwo= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= github.com/go-test/deep v1.0.7 h1:/VSMRlnY/JSyqxQUzQLKVMAskpY/NZKFA5j2P+0pP2M= github.com/go-test/deep v1.0.7/go.mod h1:QV8Hv/iy04NyLBxAdO9njL0iVPN1S4d/A3NVv1V36o8= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= @@ -163,8 +173,10 @@ github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:x github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= -github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golangplus/bytes v0.0.0-20160111154220-45c989fe5450/go.mod h1:Bk6SMAONeMXrxql8uvOKuAZSu8aM5RUGv+1C6IJaEho= github.com/golangplus/fmt v0.0.0-20150411045040-2a5d6d7d2995/go.mod h1:lJgMEyOkYFkPcDKwRXegd+iM6E7matEszMG5HhwytU8= github.com/golangplus/testing v0.0.0-20180327235837-af21d9c3145e/go.mod h1:0AA//k/eakGydO4jKRoRL2j92ZKSzTgj9tclaCrvXHk= @@ -174,13 +186,15 @@ github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod 
h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.1.1 h1:Gkbcsh/GbpXz7lPftLA3P6TYMwjCLYm83jiFQZF/3gY= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -203,6 +217,7 @@ github.com/hashicorp/golang-lru v0.5.4 h1:YDjusn29QI/Das2iO9M0BHnIbxPeyuCHsjMW+l github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= +github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= github.com/imdario/mergo v0.3.9 h1:UauaLniWCFHWd+Jp9oCEkTBj8VO/9DKg3PV3VCNMDIg= github.com/imdario/mergo v0.3.9/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= @@ -228,8 +243,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/labstack/echo v1.4.4 h1:1bEiBNeGSUKxcPDGfZ/7IgdhJJZx8wV/pICJh4W2NJI= -github.com/labstack/echo v3.3.10+incompatible h1:pGRcYk231ExFAyoAjAfD85kQzRJCRI8bbnE7CX5OEgg= github.com/labstack/echo/v4 v4.6.1 h1:OMVsrnNFzYlGSdaiYGHbgWQnr+JM7NG+B9suCPie14M= github.com/labstack/echo/v4 v4.6.1/go.mod h1:RnjgMWNDB9g/HucVWhQYNQP9PvbYf6adqftqryo7s9k= github.com/labstack/gommon v0.3.0 h1:JEeO0bvc78PKdyHxloTKiF8BD5iGrH8T6MSeGvSgob0= @@ -272,19 +285,25 @@ github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8m github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= -github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= +github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= +github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/olekukonko/tablewriter v0.0.0-20170122224234-a0225b3f23b5/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo= github.com/onsi/ginkgo v0.0.0-20170829012221-11459a886d9c/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.11.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= -github.com/onsi/ginkgo v1.12.1 h1:mFwc4LvZ0xpSvDZ3E+k8Yte0hLOMxXUlP+yXtJqkYfQ= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= +github.com/onsi/ginkgo v1.16.4/go.mod h1:dX+/inL/fNMqNlz0e9LfyB9TswhZpCVdJM/Z6Vvnwo0= +github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= +github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= +github.com/onsi/ginkgo/v2 v2.0.0/go.mod h1:vw5CSIxN1JObi/U8gcbwft7ZxR2dgaR70JSE3/PpL4c= github.com/onsi/gomega 
v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= -github.com/onsi/gomega v1.10.1 h1:o0+MgICZLuZ7xjH7Vx6zS/zcu93/BEp1VwkIW1mEXCE= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= +github.com/onsi/gomega v1.17.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAlGdZY= +github.com/onsi/gomega v1.18.1 h1:M1GfJqGRrBrrGGsbxzV5dqM2U2ApXefZCQpkukxYRLE= +github.com/onsi/gomega v1.18.1/go.mod h1:0q+aL8jAiMXy9hbwj2mr5GziHiwhAIQpFmmtT5hitRs= github.com/opencontainers/go-digest v1.0.0-rc1/go.mod h1:cMLVZDEM3+U2I4VmLI6N8jQYUd2OVphdqWwCJHrFt2s= github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= @@ -337,8 +356,9 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= @@ -352,6 +372,7 @@ github.com/vektah/gqlparser v1.1.2/go.mod h1:1ycwN7Ij5njmMkPPAOaRFY4rET2Enx7IkVv github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/xlab/handysort v0.0.0-20150421192137-fb3537ed64a1/go.mod h1:QcJo0QPSfTONNIgpN5RA8prR7fF8nkF6cTWTcNerRO8= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.etcd.io/etcd v0.0.0-20191023171146-3cf2f69b5738/go.mod h1:dnLIgRNXwCJa5e+c6mIZCrds/GIG4ncV9HhK5PX7jPg= go.mongodb.org/mongo-driver v1.0.3/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qLUO4lqsUM= @@ -372,8 +393,9 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20190320223903-b7391e95e576/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190617133340-57b3e21c3d56/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200220183623-bac4c82f6975 h1:/Tl7pH94bvbAAHBdZJT947M/+gp0+CqQXDtMRC0fseo= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200220183623-bac4c82f6975/go.mod 
h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210817164053-32db794688a5 h1:HWj/xjIHfjYU5nVXpTM0s39J9CbLn7Cc5a7IC5rwsMQ= golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -381,6 +403,7 @@ golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTk golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20170114055629-f2499483f923/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -398,9 +421,10 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191004110552-13f9640d40b9/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7 h1:AeiKBIuRw3UomYXSbLy0Mc2dDLfdtbT/IVn4keq83P0= golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= golang.org/x/net v0.0.0-20210913180222-943fd674d43e h1:+b/22bPvDYt4NPDcy4xAGCmON713ONAWFeY3Z7I3tR8= golang.org/x/net v0.0.0-20210913180222-943fd674d43e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= @@ -412,8 +436,9 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e h1:vcxGaoTs7kV8m5Np9uUNQin4BrLOthgV7252N8V+FwY= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9 h1:SQFwaSi55rU7vdNs9Yr0Z324VNlrF+0wMqRXT4St8ck= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20170830134202-bb24a47a89ea/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys 
v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -434,31 +459,32 @@ golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191022100944-742c48ecaeb7/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd h1:xhmwyvizuTgC2qz7ZlMluP20uW+C3Rm0FD/WLDX8884= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0 h1:xrCZDmdtoloIiooiA9q0OQb9r8HejIHYoHGhGCe1pGg= golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e h1:fLOSk5Q00efkSvAm+4xcoXD+RRmLmmulPn5I3Y9F2EM= +golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1 h1:v+OssWQX+hTHEmOBgwxdZxK4zHq3yOs8F9J7mk0PY8E= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.0.0-20160726164857-2910a502d2bf/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= -golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time 
v0.0.0-20190308202827-9d24e82272b4 h1:SvFZT6jyqRaOeXpc5h/JSfZenJ2O330aBsf7JfSUXmQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20201208040808-7e3f01d25324 h1:Hir2P/De0WpUhtrKGGjvSb2YxUgyZ7EFOSLIcSSpiwE= golang.org/x/time v0.0.0-20201208040808-7e3f01d25324/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -475,9 +501,13 @@ golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBn golang.org/x/tools v0.0.0-20190614205625-5aca471b1d59/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190617190820-da514acc4774/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190920225731-5eefd052ad72/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.0.1 h1:xyiBuvkD2g5n7cYzx6u2sxQvsAy4QJsZFCzGVdzOXZ0= gomodules.xyz/jsonpatch/v2 v2.0.1/go.mod h1:IhYNNY4jnS53ZnfE4PAmpKtDpTCj1JFXc+3mwe7XcUU= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= @@ -498,8 +528,10 @@ google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= -google.golang.org/protobuf v1.23.0 h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0 h1:bxAC2xTBsZGibn2RTntX0oH50xLsqy1OxA9tTL3p/lk= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -519,8 +551,9 @@ gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU= 
gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo= gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/hack/cloud.yaml b/hack/cloud.yaml index 76ccd351..7018acb9 100644 --- a/hack/cloud.yaml +++ b/hack/cloud.yaml @@ -2,9 +2,9 @@ kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 nodes: - role: control-plane - image: kindest/node:v1.18.19@sha256:7af1492e19b3192a79f606e43c35fb741e520d195f96399284515f077b3b622c + image: kindest/node:1.19.11@sha256:07db187ae84b4b7de440a73886f008cf903fcf5764ba8106a9fd5243d6f32729 - role: worker - image: kindest/node:v1.18.19@sha256:7af1492e19b3192a79f606e43c35fb741e520d195f96399284515f077b3b622c + image: kindest/node:1.19.11@sha256:07db187ae84b4b7de440a73886f008cf903fcf5764ba8106a9fd5243d6f32729 kubeadmConfigPatches: - | kind: JoinConfiguration @@ -12,7 +12,7 @@ nodes: kubeletExtraArgs: node-labels: "failure-domain.beta.kubernetes.io/zone=eu-central-1a" - role: worker - image: kindest/node:v1.18.19@sha256:7af1492e19b3192a79f606e43c35fb741e520d195f96399284515f077b3b622c + image: kindest/node:1.19.11@sha256:07db187ae84b4b7de440a73886f008cf903fcf5764ba8106a9fd5243d6f32729 kubeadmConfigPatches: - | kind: JoinConfiguration @@ -20,7 +20,7 @@ nodes: kubeletExtraArgs: node-labels: "failure-domain.beta.kubernetes.io/zone=eu-central-1a" - role: worker - image: kindest/node:v1.18.19@sha256:7af1492e19b3192a79f606e43c35fb741e520d195f96399284515f077b3b622c + image: kindest/node:1.19.11@sha256:07db187ae84b4b7de440a73886f008cf903fcf5764ba8106a9fd5243d6f32729 kubeadmConfigPatches: - | kind: JoinConfiguration @@ -28,7 +28,7 @@ nodes: kubeletExtraArgs: node-labels: "failure-domain.beta.kubernetes.io/zone=eu-central-1b" - role: worker - image: kindest/node:v1.18.19@sha256:7af1492e19b3192a79f606e43c35fb741e520d195f96399284515f077b3b622c + image: kindest/node:1.19.11@sha256:07db187ae84b4b7de440a73886f008cf903fcf5764ba8106a9fd5243d6f32729 kubeadmConfigPatches: - | kind: JoinConfiguration @@ -36,7 +36,7 @@ nodes: kubeletExtraArgs: node-labels: "failure-domain.beta.kubernetes.io/zone=eu-central-1b" - role: worker - image: kindest/node:v1.18.19@sha256:7af1492e19b3192a79f606e43c35fb741e520d195f96399284515f077b3b622c + image: kindest/node:1.19.11@sha256:07db187ae84b4b7de440a73886f008cf903fcf5764ba8106a9fd5243d6f32729 kubeadmConfigPatches: - | kind: JoinConfiguration @@ -44,7 +44,7 @@ nodes: kubeletExtraArgs: node-labels: "failure-domain.beta.kubernetes.io/zone=eu-central-1c" - role: worker - image: kindest/node:v1.18.19@sha256:7af1492e19b3192a79f606e43c35fb741e520d195f96399284515f077b3b622c + image: kindest/node:1.19.11@sha256:07db187ae84b4b7de440a73886f008cf903fcf5764ba8106a9fd5243d6f32729 kubeadmConfigPatches: - | kind: JoinConfiguration diff --git a/hack/dev.Dockerfile b/hack/dev.Dockerfile index 9344ac43..00ac61dd 100644 --- a/hack/dev.Dockerfile +++ b/hack/dev.Dockerfile @@ -1,5 +1,5 @@ # Build the manager binary -FROM golang:1.15 as builder +FROM golang:1.16 as builder # Package used to track changes to the source code and autocompile RUN go get github.com/githubnemo/CompileDaemon diff --git a/hack/gen_kind_config.py b/hack/gen_kind_config.py index e8f1a919..46cd96e6 
100644 --- a/hack/gen_kind_config.py +++ b/hack/gen_kind_config.py @@ -5,7 +5,7 @@ class ClusterParams: NODE_LAYOUT = [['eu-central-1a']*2, ['eu-central-1b']*2, ['eu-central-1c']*2] ZONE_KEY = 'failure-domain.beta.kubernetes.io/zone' KIND_API_VERSION = 'kind.x-k8s.io/v1alpha4' - KUBERNETES_VERSION = 'v1.18.19@sha256:7af1492e19b3192a79f606e43c35fb741e520d195f96399284515f077b3b622c' + KUBERNETES_VERSION = '1.19.11@sha256:07db187ae84b4b7de440a73886f008cf903fcf5764ba8106a9fd5243d6f32729' def get_node_count(): return len([zone for group in ClusterParams.NODE_LAYOUT for zone in group]) diff --git a/hack/install.sh b/hack/install.sh index cb03bf65..6d4402d8 100755 --- a/hack/install.sh +++ b/hack/install.sh @@ -12,3 +12,21 @@ if [ "$current_context" = "kind-$CLUSTER_NAME" ]; then else echo "Please set the current cluster context to kind-$CLUSTER_NAME and re-run the install script" fi + +# increasing inotify max users in order to avoid 'kind' too many open files errors +# more info can be found here: https://github.com/kubernetes-sigs/kind/issues/2586 +KIND_DOCKER_IDS=$(docker ps -a -q) +KIND_DOCKER_IDS_ARRAY=($KIND_DOCKER_IDS) + +for dockerID in "${KIND_DOCKER_IDS_ARRAY[@]}" +do + : + export dockerName=$(docker inspect $dockerID | jq .[0].Name) + if [[ "$dockerName" == *"redis-test"* ]]; then + echo "increase inotify max users for docker: $dockerName" + docker exec -t $dockerID bash -c "echo 'fs.inotify.max_user_watches=1048576' >> /etc/sysctl.conf" + docker exec -t $dockerID bash -c "echo 'fs.inotify.max_user_instances=512' >> /etc/sysctl.conf" + docker exec -i $dockerID bash -c "sysctl -p /etc/sysctl.conf" + fi + +done \ No newline at end of file diff --git a/hack/redis-bin/Dockerfile b/hack/redis-bin/Dockerfile index c0667763..d78888cb 100644 --- a/hack/redis-bin/Dockerfile +++ b/hack/redis-bin/Dockerfile @@ -1,5 +1,5 @@ # should build the redis binary with the same image that the operator will run -FROM golang:1.15 +FROM golang:1.16 ARG DEBIAN_FRONTEND=noninteractive diff --git a/helm/templates/rediscluster/configmap-redisconf.yaml b/helm/templates/rediscluster/configmap-redisconf.yaml index e4634934..eb8d8740 100644 --- a/helm/templates/rediscluster/configmap-redisconf.yaml +++ b/helm/templates/rediscluster/configmap-redisconf.yaml @@ -1,4 +1,4 @@ -{{ if and .Values.redisCluster.enabled }} +{{- if and .Values.redisCluster.enabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/main.go b/main.go index 6bfe8f7c..62219bcb 100644 --- a/main.go +++ b/main.go @@ -15,6 +15,7 @@ import ( dbv1 "github.com/PayU/redis-operator/api/v1" "github.com/PayU/redis-operator/controllers" "github.com/PayU/redis-operator/controllers/rediscli" + "github.com/PayU/redis-operator/controllers/view" "github.com/PayU/redis-operator/server" "github.com/go-logr/logr" // +kubebuilder:scaffold:imports @@ -41,7 +42,7 @@ func getRedisCLI(log *logr.Logger) *rediscli.RedisCLI { func main() { go server.StartServer() - + startManager() } @@ -91,15 +92,17 @@ func startManager() { k8sManager := controllers.K8sManager{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), - Log: configLogger} + Log: configLogger, + } if err = (&controllers.RedisClusterReconciler{ - Client: mgr.GetClient(), - Log: rdcLogger, - Scheme: mgr.GetScheme(), - RedisCLI: getRedisCLI(&rdcLogger), - Config: &operatorConfig.Config, - State: controllers.NotExists, + Client: mgr.GetClient(), + Log: rdcLogger, + Scheme: mgr.GetScheme(), + RedisCLI: getRedisCLI(&rdcLogger), + Config: &operatorConfig.Config, + State: controllers.NotExists, + 
RedisClusterStateView: &view.RedisClusterStateView{Name: controllers.RedisClusterStateMapName}, }).SetupWithManager(mgr); err != nil { setupLogger.Error(err, "unable to create controller", "controller", "RedisCluster") os.Exit(1) diff --git a/server/module.go b/server/module.go deleted file mode 100644 index 182e9526..00000000 --- a/server/module.go +++ /dev/null @@ -1,85 +0,0 @@ -package server - -import ( - "encoding/json" - "net/http" - - . "github.com/PayU/redis-operator/controllers" - clusterData "github.com/PayU/redis-operator/data" - "github.com/labstack/echo/v4" - v1 "k8s.io/api/core/v1" -) - -type ResponseRedisClusterView struct { - State string - Nodes []ResponseLeaderNode -} - -type ResponseLeaderNode struct { - PodIp string - NodeNumber string - Failed bool - Terminating bool - Followers []ResponseFollowerNode -} - -type ResponseFollowerNode struct { - PodIp string - NodeNumber string - LeaderNumber string - Failed bool - Terminating bool -} - -func clusterInfo(c echo.Context) error { - byteValue, err := clusterData.GetClusterView() - if err != nil { - return c.String(http.StatusNotFound, "Cluster info not available") - } - - var result RedisClusterView - json.Unmarshal([]byte(byteValue), &result) - - s := clusterData.GetRedisClusterState() - ResponseRedisClusterView := ResponseRedisClusterView{ - State: s, - Nodes: make([]ResponseLeaderNode, len(result)), - } - - for i, leaderNode := range result { - ip := getIP(leaderNode.Pod) - - ResponseRedisClusterView.Nodes[i] = ResponseLeaderNode{ - Followers: make([]ResponseFollowerNode, len(leaderNode.Followers)), - PodIp: ip, - NodeNumber: leaderNode.NodeNumber, - Failed: leaderNode.Failed, - Terminating: leaderNode.Terminating, - } - for j, follower := range leaderNode.Followers { - followerIp := getIP(follower.Pod) - ResponseRedisClusterView.Nodes[i].Followers[j] = ResponseFollowerNode{ - PodIp: followerIp, - NodeNumber: follower.NodeNumber, - LeaderNumber: follower.LeaderNumber, - Failed: follower.Failed, - Terminating: leaderNode.Terminating, - } - } - } - - return c.JSON(http.StatusOK, ResponseRedisClusterView) -} - -func getIP(pod *v1.Pod) string { - if pod == nil { - return "" - } else { - return pod.Status.PodIP - } -} - -func clusterState(c echo.Context) error { - s := clusterData.GetRedisClusterState() - return c.String(http.StatusOK, s) -} diff --git a/server/router.go b/server/router.go index 767c36f9..6ddc4879 100644 --- a/server/router.go +++ b/server/router.go @@ -1,10 +1,21 @@ package server import ( + "github.com/PayU/redis-operator/controllers" "github.com/labstack/echo/v4" ) -func Register(e *echo.Echo) { - e.GET("/state", clusterState) - e.GET("/info", clusterInfo) +func register(e *echo.Echo) { + e.GET("/state", controllers.ClusterState) + e.GET("/info", controllers.ClusterInfo) + e.POST("/rebalance", controllers.ClusterRebalance) + e.POST("/fix", controllers.ClusterFix) + e.POST("/forgetLostNodes", controllers.ForgetLostNodes) + e.POST("/forceReconcile", controllers.ForceReconcile) + e.POST("/upgrade", controllers.UpgradeCluster) + e.POST("/test", controllers.ClusterTest) + e.POST("/reset", controllers.DoResetCluster) + e.POST("/testData", controllers.ClusterTestWithData) + e.POST("/populateMockData", controllers.PopulateClusterWithMockData) + e.POST("/flushAllData", controllers.FlushClusterData) } diff --git a/server/server.go b/server/server.go index 12b85fcb..2146f27e 100644 --- a/server/server.go +++ b/server/server.go @@ -5,11 +5,11 @@ import ( ) func StartServer() { - e := echo.New() + echo := echo.New() // 
Routes - Register(e) + register(echo) // Start server - go e.Logger.Fatal(e.Start("0.0.0.0:8080")) + go echo.Logger.Fatal(echo.Start("0.0.0.0:8080")) } diff --git a/test/e2e/rediscluster_test.go b/test/e2e/rediscluster_test.go index 811dfb23..48d304a0 100644 --- a/test/e2e/rediscluster_test.go +++ b/test/e2e/rediscluster_test.go @@ -1,3 +1,4 @@ +//go:build e2e_redis_op // +build e2e_redis_op package e2e @@ -117,23 +118,23 @@ func makeAZMap(ctx *framework.TestCtx, t *testing.T, config TestConfig) map[stri } for _, pod := range leaders.Items { - group, found := az[pod.Labels["leader-number"]] + group, found := az[pod.Labels["leader-name"]] if !found { group = []*corev1.Pod{&pod} } else { group = append(group, &pod) } - az[pod.Labels["leader-number"]] = group + az[pod.Labels["leader-name"]] = group } for _, pod := range followers.Items { - group, found := az[pod.Labels["leader-number"]] + group, found := az[pod.Labels["leader-name"]] if !found { group = []*corev1.Pod{&pod} } else { group = append(group, &pod) } - az[pod.Labels["leader-number"]] = group + az[pod.Labels["leader-name"]] = group } azMap[node.Labels[zoneLabel]] = az diff --git a/test/framework/k8sresources.go b/test/framework/k8sresources.go index 122fdde5..c3c13d64 100644 --- a/test/framework/k8sresources.go +++ b/test/framework/k8sresources.go @@ -1,3 +1,4 @@ +//go:build e2e_redis_op // +build e2e_redis_op package framework @@ -66,7 +67,7 @@ func (f *Framework) CreateResources(ctx *TestCtx, timeout time.Duration, objs .. } fmt.Printf("Waiting on resource %v...\n", key) - err = wait.PollImmediate(2*time.Second, timeout, func() (bool, error) { + err = wait.PollImmediate(2*time.Second, 5*timeout, func() (bool, error) { if err = f.RuntimeClient.Get(context.TODO(), key, existingResource); err != nil { return false, err } @@ -114,7 +115,7 @@ func (f *Framework) DeleteResource(obj runtime.Object, timeout time.Duration) er return errors.Wrap(err, "Could not check delete resource - object key error") } - if pollErr := wait.PollImmediate(2*time.Second, timeout, func() (bool, error) { + if pollErr := wait.PollImmediate(2*time.Second, 5*timeout, func() (bool, error) { err = f.RuntimeClient.Get(context.TODO(), key, obj) switch { case apierrors.IsNotFound(err): @@ -222,7 +223,7 @@ func (f *Framework) CordonNode(nodeName string, unschedule bool, timeout time.Du fmt.Printf("Failed to cordon/uncordon: %v\n", err) return err } - if pollErr := wait.PollImmediate(time.Second, timeout, func() (bool, error) { + if pollErr := wait.PollImmediate(2*time.Second, 5*timeout, func() (bool, error) { node, err := f.KubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) if err != nil { return false, err diff --git a/test/framework/rediscluster.go b/test/framework/rediscluster.go index 5a261768..e5d09196 100644 --- a/test/framework/rediscluster.go +++ b/test/framework/rediscluster.go @@ -1,3 +1,4 @@ +//go:build e2e_redis_op // +build e2e_redis_op package framework @@ -82,7 +83,7 @@ func (f *Framework) WaitForState(redisCluster *dbv1.RedisCluster, state string, if len(timeout) > 0 { t = timeout[0] } - return wait.PollImmediate(2*time.Second, t, func() (bool, error) { + return wait.PollImmediate(2*time.Second, 5*t, func() (bool, error) { key, err := client.ObjectKeyFromObject(redisCluster) if err != nil { return false, err @@ -125,7 +126,7 @@ func (f *Framework) PopulateDatabase(keyCount int, keyName string, keySize int) errs := make(chan error, len(leaderPods.Items)) - for i, leaderPod := range leaderPods.Items { + for _, leaderPod := 
range leaderPods.Items { if leaderPod.Status.PodIP != "" { wg.Add(1) go func(keyCount string, keyName string, keySize string, pod corev1.Pod, wg *sync.WaitGroup) { @@ -135,9 +136,9 @@ func (f *Framework) PopulateDatabase(keyCount int, keyName string, keySize int) fmt.Printf("Failed to run container shell: %s | %s\n", stdout, stderr) errs <- err } - }(strconv.Itoa(keyCount), keyName, strconv.Itoa(keySize), leaderPods.Items[i], &wg) + }(strconv.Itoa(keyCount), keyName, strconv.Itoa(keySize), leaderPod, &wg) } else { - fmt.Printf("Node %s had no IP\n", leaderPod.Labels["node-number"]) + fmt.Printf("Node %s had no IP\n", leaderPod.Labels["node-name"]) } } diff --git a/tilt.Dockerfile b/tilt.Dockerfile index fe4c1b0a..cc1b2a7f 100644 --- a/tilt.Dockerfile +++ b/tilt.Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.15 +FROM golang:1.16 WORKDIR / ADD hack/redis-bin/build/redis-cli /bin/redis-cli ADD bin/manager /manager