Add chaos monkey test (disabled)
sbernauer committed Oct 12, 2023
1 parent 56fcd3b commit 0b94a07
Showing 3 changed files with 95 additions and 15 deletions.
24 changes: 12 additions & 12 deletions tests/templates/kuttl/smoke/30-assert.yaml
@@ -14,8 +14,8 @@ spec:
spec:
terminationGracePeriodSeconds: 1200
status:
- readyReplicas: 1
- replicas: 1
+ readyReplicas: 2
+ replicas: 2
---
apiVersion: apps/v1
kind: StatefulSet
@@ -26,8 +26,8 @@ spec:
spec:
terminationGracePeriodSeconds: 3600
status:
- readyReplicas: 1
- replicas: 1
+ readyReplicas: 2
+ replicas: 2
---
apiVersion: apps/v1
kind: StatefulSet
@@ -38,32 +38,32 @@ spec:
spec:
terminationGracePeriodSeconds: 300
status:
- readyReplicas: 1
- replicas: 1
+ readyReplicas: 2
+ replicas: 2
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: test-hbase-master
status:
- expectedPods: 1
- currentHealthy: 1
+ expectedPods: 2
+ currentHealthy: 2
disruptionsAllowed: 1
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: test-hbase-regionserver
status:
- expectedPods: 1
- currentHealthy: 1
+ expectedPods: 2
+ currentHealthy: 2
disruptionsAllowed: 1
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: test-hbase-restserver
status:
- expectedPods: 1
- currentHealthy: 1
+ expectedPods: 2
+ currentHealthy: 2
disruptionsAllowed: 1
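With two replicas per role, each PodDisruptionBudget can now tolerate the loss of one Pod (disruptionsAllowed: 1), which is what makes the chaos-monkey deletions introduced in this commit survivable. For illustration only, a PDB spec that would produce the asserted status could look like the sketch below; the maxUnavailable value and the label selector are assumptions and are not part of this commit (the operator creates these PDBs itself):

apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: test-hbase-master
spec:
  # Assumed: at most one Pod per role may be voluntarily disrupted,
  # which yields disruptionsAllowed: 1 once both replicas are healthy.
  maxUnavailable: 1
  selector:
    matchLabels:
      # Hypothetical labels, shown for illustration only.
      app.kubernetes.io/name: hbase
      app.kubernetes.io/component: master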
6 changes: 3 additions & 3 deletions tests/templates/kuttl/smoke/30-install-hbase.yaml.j2
@@ -23,7 +23,7 @@ spec:
hbase-site.xml:
phoenix.log.saltBuckets: "2"
hbase.regionserver.wal.codec: "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec"
- replicas: 1
+ replicas: 2
regionServers:
config:
logging:
@@ -34,7 +34,7 @@ spec:
hbase-site.xml:
phoenix.log.saltBuckets: "2"
hbase.regionserver.wal.codec: "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec"
- replicas: 1
+ replicas: 2
restServers:
config:
logging:
@@ -45,4 +45,4 @@ spec:
hbase-site.xml:
phoenix.log.saltBuckets: "2"
hbase.regionserver.wal.codec: "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec"
- replicas: 1
+ replicas: 2
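Because the rendered diff above flattens the YAML indentation, here is a sketch of how one of these role sections fits together. Only the replicas value comes from this commit; the roleGroups/default nesting is an assumption based on the usual Stackable CRD layout, and regionServers and restServers follow the same pattern:

masters:
  config:
    logging:
      enableVectorAgent: false  # placeholder; the real file templates this value
  roleGroups:
    default:
      configOverrides:
        hbase-site.xml:
          phoenix.log.saltBuckets: "2"
          hbase.regionserver.wal.codec: "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec"
      replicas: 2  # raised from 1 so a Pod can be killed without losing the role entirely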
80 changes: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
# WARNING
# This test is disabled as everything is broken:
# * HBase 2.4 returns random DNS failures, such as
# 2023-10-11 13:27:58,532 INFO [master/test-hbase-master-default-0:16000:becomeActiveMaster] retry.RetryInvocationHandler: java.net.UnknownHostException: Invalid host name: local host is: (unknown); destination host is: "test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local":8020; java.net.UnknownHostException; For more details see: http://wiki.apache.org/hadoop/UnknownHost, while invoking ClientNamenodeProtocolTranslatorPB.getFileInfo over test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 after 13 failover attempts. Trying to failover after sleeping for 21829ms.
# or
# 2023-10-11 13:29:01,311 WARN [master/test-hbase-master-default-1:16000:becomeActiveMaster] ipc.Client: Address change detected. Old: test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020 New: test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.208:8020
# 2023-10-11 13:29:21,341 WARN [master/test-hbase-master-default-1:16000:becomeActiveMaster] ipc.Client: Address change detected. Old: test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020 New: test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.210:8020
# 2023-10-11 13:29:42,657 INFO [master/test-hbase-master-default-1:16000:becomeActiveMaster] retry.RetryInvocationHandler: org.apache.hadoop.net.ConnectTimeoutException: Call From test-hbase-master-default-1/10.244.0.201 to test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 failed on socket timeout exception: org.apache.hadoop.net.ConnectTimeoutException: 20000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020]; For more details see: http://wiki.apache.org/hadoop/SocketTimeout, while invoking ClientNamenodeProtocolTranslatorPB.setSafeMode over test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020 after 2 failover attempts. Trying to failover after sleeping for 2803ms.
# 2023-10-11 13:29:21,342 INFO [master/test-hbase-master-default-1:16000:becomeActiveMaster] retry.RetryInvocationHandler: org.apache.hadoop.net.ConnectTimeoutException: Call From test-hbase-master-default-1/10.244.0.201 to test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 failed on socket timeout exception: org.apache.hadoop.net.ConnectTimeoutException: 20000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020]; For more details see: http://wiki.apache.org/hadoop/SocketTimeout, while invoking ClientNamenodeProtocolTranslatorPB.setSafeMode over test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020 after 1 failover attempts. Trying to failover after sleeping for 1296ms.
#
# * HBase 2.5 causes the Phoenix test to be flaky: it works half of the time and otherwise fails with timeouts
# WARNING

# Tribute to https://github.com/Netflix/chaosmonkey

# We need to reduce the number of monkeys, otherwise the tests literally take days.
# We therefore only run them on some hand-picked test cases
{% if test_scenario['values']['listener-class'] == 'cluster-internal' and test_scenario['values']['hdfs'] == test_scenario['values']['hdfs-latest'] and test_scenario['values']['zookeeper'] == test_scenario['values']['zookeeper-latest'] %}

# We need to force-delete the Pods, because IONOS is sometimes unable to delete them otherwise (they get stuck in Terminating for > 20 minutes)
---
apiVersion: kuttl.dev/v1beta1
kind: TestStep
timeout: 3600
commands:
# First, let's delete the first pod of every HBase service
# This should trigger a failover of the active HBase master (to master pod 1)
- script: kubectl -n $NAMESPACE delete pod --force test-hbase-master-default-0 test-hbase-regionserver-default-0 test-hbase-restserver-default-0
timeout: 600
- script: sleep 10
- script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
timeout: 600

# Also delete the last pod of every HBase service
# This should trigger a failover of the active HBase master (back to master pod 0)
- script: kubectl -n $NAMESPACE delete pod --force test-hbase-master-default-1 test-hbase-regionserver-default-1 test-hbase-restserver-default-1
timeout: 600
- script: sleep 10
- script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
timeout: 600

# Also delete the ZooKeeper server Pod
- script: kubectl -n $NAMESPACE delete pod --force test-zk-server-default-0
timeout: 600
- script: sleep 10
- script: kubectl -n $NAMESPACE wait --for=condition=Available zookeepercluster test-zk --timeout 10m
timeout: 600

# Also delete some HDFS Pods
- script: kubectl -n $NAMESPACE delete pod --force test-hdfs-namenode-default-0 test-hdfs-datanode-default-0
timeout: 600
- script: sleep 10
- script: kubectl -n $NAMESPACE wait --for=condition=Available hdfs test-hdfs --timeout 10m
timeout: 600

# And now delete everything at once, several times in a row
{% for n in range(3) %}
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hbase
timeout: 600
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hdfs
timeout: 600
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=zookeeper
timeout: 600
- script: sleep 10
# Delete the Pods again right after they have come back up, just to make things worse
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hbase
timeout: 600
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hdfs
timeout: 600
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=zookeeper
timeout: 600
- script: sleep 10
- script: kubectl -n $NAMESPACE wait --for=condition=Available zookeepercluster test-zk --timeout 10m
timeout: 600
- script: kubectl -n $NAMESPACE wait --for=condition=Available hdfs test-hdfs --timeout 10m
timeout: 600
- script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
timeout: 600
{% endfor %}
{% endif %}
