diff --git a/tests/templates/kuttl/smoke/30-assert.yaml b/tests/templates/kuttl/smoke/30-assert.yaml
index 89b2a029..cae4a019 100644
--- a/tests/templates/kuttl/smoke/30-assert.yaml
+++ b/tests/templates/kuttl/smoke/30-assert.yaml
@@ -14,8 +14,8 @@ spec:
     spec:
       terminationGracePeriodSeconds: 1200
 status:
-  readyReplicas: 1
-  replicas: 1
+  readyReplicas: 2
+  replicas: 2
 ---
 apiVersion: apps/v1
 kind: StatefulSet
@@ -26,8 +26,8 @@ spec:
     spec:
       terminationGracePeriodSeconds: 3600
 status:
-  readyReplicas: 1
-  replicas: 1
+  readyReplicas: 2
+  replicas: 2
 ---
 apiVersion: apps/v1
 kind: StatefulSet
@@ -38,16 +38,16 @@ spec:
     spec:
       terminationGracePeriodSeconds: 300
 status:
-  readyReplicas: 1
-  replicas: 1
+  readyReplicas: 2
+  replicas: 2
 ---
 apiVersion: policy/v1
 kind: PodDisruptionBudget
 metadata:
   name: test-hbase-master
 status:
-  expectedPods: 1
-  currentHealthy: 1
+  expectedPods: 2
+  currentHealthy: 2
   disruptionsAllowed: 1
 ---
 apiVersion: policy/v1
@@ -55,8 +55,8 @@ kind: PodDisruptionBudget
 metadata:
   name: test-hbase-regionserver
 status:
-  expectedPods: 1
-  currentHealthy: 1
+  expectedPods: 2
+  currentHealthy: 2
   disruptionsAllowed: 1
 ---
 apiVersion: policy/v1
@@ -64,6 +64,6 @@ kind: PodDisruptionBudget
 metadata:
   name: test-hbase-restserver
 status:
-  expectedPods: 1
-  currentHealthy: 1
+  expectedPods: 2
+  currentHealthy: 2
   disruptionsAllowed: 1
diff --git a/tests/templates/kuttl/smoke/30-install-hbase.yaml.j2 b/tests/templates/kuttl/smoke/30-install-hbase.yaml.j2
index 214c3fdc..ec30caef 100644
--- a/tests/templates/kuttl/smoke/30-install-hbase.yaml.j2
+++ b/tests/templates/kuttl/smoke/30-install-hbase.yaml.j2
@@ -23,7 +23,7 @@ spec:
           hbase-site.xml:
             phoenix.log.saltBuckets: "2"
             hbase.regionserver.wal.codec: "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec"
-        replicas: 1
+        replicas: 2
   regionServers:
     config:
       logging:
@@ -34,7 +34,7 @@ spec:
           hbase-site.xml:
             phoenix.log.saltBuckets: "2"
             hbase.regionserver.wal.codec: "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec"
-        replicas: 1
+        replicas: 2
   restServers:
     config:
       logging:
@@ -45,4 +45,4 @@ spec:
           hbase-site.xml:
             phoenix.log.saltBuckets: "2"
             hbase.regionserver.wal.codec: "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec"
-        replicas: 1
+        replicas: 2
diff --git a/tests/templates/kuttl/smoke/70-unleash-the-chaosmonkey.yaml.j2.DISABLED b/tests/templates/kuttl/smoke/70-unleash-the-chaosmonkey.yaml.j2.DISABLED
new file mode 100644
index 00000000..341acb12
--- /dev/null
+++ b/tests/templates/kuttl/smoke/70-unleash-the-chaosmonkey.yaml.j2.DISABLED
@@ -0,0 +1,80 @@
+# WARNING
+# This test is disabled as everything is broken:
+# * HBase 2.4 returns random DNS failures, such as
+# 2023-10-11 13:27:58,532 INFO [master/test-hbase-master-default-0:16000:becomeActiveMaster] retry.RetryInvocationHandler: java.net.UnknownHostException: Invalid host name: local host is: (unknown); destination host is: "test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local":8020; java.net.UnknownHostException; For more details see: http://wiki.apache.org/hadoop/UnknownHost, while invoking ClientNamenodeProtocolTranslatorPB.getFileInfo over test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 after 13 failover attempts. Trying to failover after sleeping for 21829ms.
+# or
+# 2023-10-11 13:29:01,311 WARN [master/test-hbase-master-default-1:16000:becomeActiveMaster] ipc.Client: Address change detected. Old: test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020 New: test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.208:8020
+# 2023-10-11 13:29:21,341 WARN [master/test-hbase-master-default-1:16000:becomeActiveMaster] ipc.Client: Address change detected. Old: test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020 New: test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.210:8020
+# 2023-10-11 13:29:42,657 INFO [master/test-hbase-master-default-1:16000:becomeActiveMaster] retry.RetryInvocationHandler: org.apache.hadoop.net.ConnectTimeoutException: Call From test-hbase-master-default-1/10.244.0.201 to test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 failed on socket timeout exception: org.apache.hadoop.net.ConnectTimeoutException: 20000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020]; For more details see: http://wiki.apache.org/hadoop/SocketTimeout, while invoking ClientNamenodeProtocolTranslatorPB.setSafeMode over test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020 after 2 failover attempts. Trying to failover after sleeping for 2803ms.
+# 2023-10-11 13:29:21,342 INFO [master/test-hbase-master-default-1:16000:becomeActiveMaster] retry.RetryInvocationHandler: org.apache.hadoop.net.ConnectTimeoutException: Call From test-hbase-master-default-1/10.244.0.201 to test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 failed on socket timeout exception: org.apache.hadoop.net.ConnectTimeoutException: 20000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020]; For more details see: http://wiki.apache.org/hadoop/SocketTimeout, while invoking ClientNamenodeProtocolTranslatorPB.setSafeMode over test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020 after 1 failover attempts. Trying to failover after sleeping for 1296ms.
+#
+# * HBase 2.5 causes the Phoenix test to be flaky: it works half of the time and otherwise fails with timeouts
+# WARNING
+
+# Tribute to https://github.com/Netflix/chaosmonkey
+
+# We need to reduce the number of monkeys; otherwise the tests literally take days
+# We only run them on some hand-picked test cases
+{% if test_scenario['values']['listener-class'] == 'cluster-internal' and test_scenario['values']['hdfs'] == test_scenario['values']['hdfs-latest'] and test_scenario['values']['zookeeper'] == test_scenario['values']['zookeeper-latest'] %}
+
+# We need to force-delete the Pods, because IONOS is sometimes unable to delete them (they get stuck in Terminating for > 20 minutes)
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+timeout: 3600
+commands:
+  # First, let's delete the first Pod of every HBase service
+  # This should trigger a failover of the active HBase master to master-1
+  - script: kubectl -n $NAMESPACE delete pod --force test-hbase-master-default-0 test-hbase-regionserver-default-0 test-hbase-restserver-default-0
+    timeout: 600
+  - script: sleep 10
+  - script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
+    timeout: 600
+
+  # Also delete the last Pod of every HBase service
+  # This should trigger a failover of the active master back to master-0
+  - script: kubectl -n $NAMESPACE delete pod --force test-hbase-master-default-1 test-hbase-regionserver-default-1 test-hbase-restserver-default-1
+    timeout: 600
+  - script: sleep 10
+  - script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
+    timeout: 600
+
+  # Also delete the ZooKeeper Pod
+  - script: kubectl -n $NAMESPACE delete pod --force test-zk-server-default-0
+    timeout: 600
+  - script: sleep 10
+  - script: kubectl -n $NAMESPACE wait --for=condition=Available zookeepercluster test-zk --timeout 10m
+    timeout: 600
+
+  # Also delete some HDFS Pods
+  - script: kubectl -n $NAMESPACE delete pod --force test-hdfs-namenode-default-0 test-hdfs-datanode-default-0
+    timeout: 600
+  - script: sleep 10
+  - script: kubectl -n $NAMESPACE wait --for=condition=Available hdfs test-hdfs --timeout 10m
+    timeout: 600
+
+  # And now delete everything
+{% for n in range(3) %}
+  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hbase
+    timeout: 600
+  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hdfs
+    timeout: 600
+  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=zookeeper
+    timeout: 600
+  - script: sleep 10
+  # Delete them again just after they have started up, just to make things worse
+  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hbase
+    timeout: 600
+  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hdfs
+    timeout: 600
+  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=zookeeper
+    timeout: 600
+  - script: sleep 10
+  - script: kubectl -n $NAMESPACE wait --for=condition=Available zookeepercluster test-zk --timeout 10m
+    timeout: 600
+  - script: kubectl -n $NAMESPACE wait --for=condition=Available hdfs test-hdfs --timeout 10m
+    timeout: 600
+  - script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
+    timeout: 600
+{% endfor %}
+{% endif %}