Update slotmap when redisClusterCommandToNode() fails (#185)

Since a failure can be a timeout, we don't want the slotmap update to take more time in the same call. The update is scheduled to be done at the next command instead.
Nordix · Sep 1, 2023 · 7c18baf · 7c18baf
1 parent 16e6ee8
commit 7c18baf
Show file tree

Hide file tree

Showing 6 changed files with 159 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -276,6 +276,14 @@ If the cluster topology has changed the Redis node might respond with a redirect
 which the client will handle, update its slotmap and resend the command to correct node.
 The reply will in this case arrive from the correct node.
 
+If a node is unreachable, for example if the command times out or if the connect
+times out, it can indicated that there has been a failover and the node is no
+longer part of the cluster. In this case, `redisClusterCommand` returns NULL and
+sets `err` and `errstr` on the cluster context, but additionally, hiredis
+cluster schedules a slotmap update to be performed when the next command is
+sent. That means that if you try the same command again, there is a good chance
+the command will be sent to another node and the command may succeed.
+
 ### Sending multi-key commands
 
 Hiredis-cluster supports mget/mset/del multi-key commands.
@@ -297,6 +305,11 @@ reply = redisClusterCommandToNode(clustercontext, node, "DBSIZE");
 This function handles printf like arguments similar to `redisClusterCommand()`, but will
 only attempt to send the command to the given node and will not perform redirects or retries.
 
+If the command times out or the connection to the node fails, a slotmap update
+is scheduled to be performed when the next command is sent.
+`redisClusterCommandToNode` also performs a slotmap update if it has previously
+been scheduled.
+
 ### Teardown
 
 To disconnect and free the context the following function can be used:
@@ -547,6 +560,21 @@ set of master nodes. There is no bookkeeping for already iterated nodes when a r
 which means that a node can be iterated over more than once depending on when the slotmap update happened
 and the change of cluster nodes.
 
+Note that when `redisClusterCommandToNode` is called, a slotmap update can
+happen if it has been scheduled by the previous command, for example if the
+previous call to `redisClusterCommandToNode` timed out or the node wasn't
+reachable.
+
+To detect when the slotmap has been updated, you can check if the iterator's
+slotmap version (`iter.route_version`) is equal to the current cluster context's
+slotmap version (`cc->route_version`). If it isn't, it means that the slotmap
+has been updated and the iterator will restart itself at the next call to
+`redisClusterNodeNext`.
+
+Another way to detect that the slotmap has been updated is to [register an event
+callback](#events-per-cluster-context) and look for the event
+`HIRCLUSTER_EVENT_SLOTMAP_UPDATED`.
+
 ### Random number generator
 
 This library uses [random()](https://linux.die.net/man/3/random) while selecting

diff --git a/hircluster.c b/hircluster.c
@@ -3037,6 +3037,7 @@ void *redisClusterCommandToNode(redisClusterContext *cc, redisClusterNode *node,
     va_list ap;
     int ret;
     void *reply;
+    int updating_slotmap = 0;
 
     c = ctx_get_by_node(cc, node);
     if (c == NULL) {
@@ -3046,6 +3047,11 @@ void *redisClusterCommandToNode(redisClusterContext *cc, redisClusterNode *node,
         return NULL;
     }
 
+    if (cc->err) {
+        cc->err = 0;
+        memset(cc->errstr, '\0', sizeof(cc->errstr));
+    }
+
     va_start(ap, format);
     ret = redisvAppendCommand(c, format, ap);
     va_end(ap);
@@ -3055,11 +3061,29 @@ void *redisClusterCommandToNode(redisClusterContext *cc, redisClusterNode *node,
         return NULL;
     }
 
+    if (cc->need_update_route) {
+        /* Pipeline slotmap update on the same connection. */
+        if (clusterUpdateRouteSendCommand(cc, c) == REDIS_OK) {
+            updating_slotmap = 1;
+        }
+    }
+
     if (redisGetReply(c, &reply) != REDIS_OK) {
         __redisClusterSetError(cc, c->err, c->errstr);
+        if (c->err != REDIS_ERR_OOM)
+            cc->need_update_route = 1;
         return NULL;
     }
 
+    if (updating_slotmap) {
+        /* Handle reply from pipelined CLUSTER SLOTS or CLUSTER NODES. */
+        if (clusterUpdateRouteHandleReply(cc, c) != REDIS_OK) {
+            /* Ignore error. Update will be triggered on the next command. */
+            cc->err = 0;
+            cc->errstr[0] = '\0';
+        }
+    }
+
     return reply;
 }
 

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -195,6 +195,10 @@ add_test(NAME dbsize-to-all-nodes-test-async
                  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/tests/scripts/")
 add_test(NAME dbsize-to-all-nodes-during-scaledown-test
          COMMAND "${CMAKE_SOURCE_DIR}/tests/scripts/dbsize-to-all-nodes-during-scaledown-test.sh"
+                 "$<TARGET_FILE:clusterclient>"
+                 WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/tests/scripts/")
+add_test(NAME dbsize-to-all-nodes-during-scaledown-test-async
+         COMMAND "${CMAKE_SOURCE_DIR}/tests/scripts/dbsize-to-all-nodes-during-scaledown-test-async.sh"
                  "$<TARGET_FILE:clusterclient_async>"
                  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/tests/scripts/")
 add_test(NAME reconnect-test

diff --git a/tests/clusterclient.c b/tests/clusterclient.c
@@ -118,6 +118,10 @@ int main(int argc, char **argv) {
                     printReply(reply);
                 }
                 freeReplyObject(reply);
+                if (ni.route_version != cc->route_version) {
+                    /* Updated slotmap resets the iterator. Abort iteration. */
+                    break;
+                }
             }
         } else {
             redisReply *reply = redisClusterCommand(cc, command);

diff --git a/tests/scripts/dbsize-to-all-nodes-during-scaledown-test-async.sh b/tests/scripts/dbsize-to-all-nodes-during-scaledown-test-async.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+#
+# Verify that commands can be sent using the low-level API to specific
+# nodes. The testcase will send each command to all known nodes and
+# verify the behaviour when a node is removed from the cluster.
+#
+# First the command DBSIZE is sent to all (two) nodes successfully,
+# then the second node is shutdown. The next DBSIZE command that is sent
+# triggers a slotmap update due to the lost node and all following commands
+# will therefor only be sent to a single node.
+#
+# Usage: $0 /path/to/clusterclient-binary
+
+clientprog=${1:-./clusterclient_async}
+testname=dbsize-to-all-nodes-during-scaledown-test-async
+
+# Sync processes waiting for CONT signals.
+perl -we 'use sigtrap "handler", sub{exit}, "CONT"; sleep 1; die "timeout"' &
+syncpid1=$!;
+perl -we 'use sigtrap "handler", sub{exit}, "CONT"; sleep 1; die "timeout"' &
+syncpid2=$!;
+
+# Start simulated redis node #1
+timeout 5s ./simulated-redis.pl -p 7401 -d --sigcont $syncpid1 <<'EOF' &
+EXPECT CONNECT
+EXPECT ["CLUSTER", "SLOTS"]
+SEND [[0, 8383, ["127.0.0.1", 7401, "nodeid7401"]], [8384, 16383, ["127.0.0.1", 7402, "nodeid7402"]]]
+EXPECT CLOSE
+EXPECT CONNECT
+EXPECT ["DBSIZE"]
+SEND 10
+EXPECT ["DBSIZE"]
+SEND 11
+# The second command to node2 fails which triggers a slotmap update.
+EXPECT ["CLUSTER", "SLOTS"]
+SEND [[0, 16383, ["127.0.0.1", 7401, "nodeid7401"]]]
+EXPECT ["DBSIZE"]
+SEND 12
+EXPECT CLOSE
+EOF
+server1=$!
+
+# Start simulated redis node #2
+timeout 5s ./simulated-redis.pl -p 7402 -d --sigcont $syncpid2 <<'EOF' &
+EXPECT CONNECT
+EXPECT ["DBSIZE"]
+SEND 20
+CLOSE
+EOF
+server2=$!
+
+# Wait until both nodes are ready to accept client connections
+wait $syncpid1 $syncpid2;
+
+# Run client
+timeout 5s "$clientprog" 127.0.0.1:7401 > "$testname.out" <<'EOF'
+!all
+DBSIZE
+DBSIZE
+# Allow slotmap update to finish.
+!sleep
+DBSIZE
+EOF
+clientexit=$?
+
+# Wait for servers to exit
+wait $server1; server1exit=$?
+wait $server2; server2exit=$?
+
+# Check exit statuses
+if [ $server1exit -ne 0 ]; then
+    echo "Simulated server #1 exited with status $server1exit"
+    exit $server1exit
+fi
+if [ $server2exit -ne 0 ]; then
+    echo "Simulated server #2 exited with status $server2exit"
+    exit $server2exit
+fi
+if [ $clientexit -ne 0 ]; then
+    echo "$clientprog exited with status $clientexit"
+    exit $clientexit
+fi
+
+# Check the output from clusterclient
+expected="10
+20
+error: Server closed the connection
+11
+12"
+
+echo "$expected" | diff -u - "$testname.out" || exit 99
+
+# Clean up
+rm "$testname.out"
diff --git a/tests/scripts/dbsize-to-all-nodes-during-scaledown-test.sh b/tests/scripts/dbsize-to-all-nodes-during-scaledown-test.sh
@@ -31,11 +31,12 @@ EXPECT ["DBSIZE"]
 SEND 10
 EXPECT ["DBSIZE"]
 SEND 11
-# The second command to node2 fails which triggers a slotmap update.
-EXPECT ["CLUSTER", "SLOTS"]
-SEND [[0, 16383, ["127.0.0.1", 7401, "nodeid7401"]]]
 EXPECT ["DBSIZE"]
 SEND 12
+# The second command to node2 fails which triggers a slotmap update pipelined
+# onto the 3rd DBSIZE to this node.
+EXPECT ["CLUSTER", "SLOTS"]
+SEND [[0, 16383, ["127.0.0.1", 7401, "nodeid7401"]]]
 EXPECT CLOSE
 EOF
 server1=$!
@@ -84,8 +85,8 @@ fi
 # Check the output from clusterclient
 expected="10
 20
-error: Server closed the connection
 11
+error: Server closed the connection
 12"
 
 echo "$expected" | diff -u - "$testname.out" || exit 99