80 changes: 80 additions & 0 deletions
tests/templates/kuttl/smoke/70-unleash-the-chaosmonkey.yaml.j2.DISABLED
@@ -0,0 +1,80 @@
# WARNING
# This test is disabled as everything is broken:
# * HBase 2.4 returns random DNS failures, such as
# 2023-10-11 13:27:58,532 INFO [master/test-hbase-master-default-0:16000:becomeActiveMaster] retry.RetryInvocationHandler: java.net.UnknownHostException: Invalid host name: local host is: (unknown); destination host is: "test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local":8020; java.net.UnknownHostException; For more details see: http://wiki.apache.org/hadoop/UnknownHost, while invoking ClientNamenodeProtocolTranslatorPB.getFileInfo over test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 after 13 failover attempts. Trying to failover after sleeping for 21829ms.
# or
# 2023-10-11 13:29:01,311 WARN [master/test-hbase-master-default-1:16000:becomeActiveMaster] ipc.Client: Address change detected. Old: test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020 New: test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.208:8020
# 2023-10-11 13:29:21,341 WARN [master/test-hbase-master-default-1:16000:becomeActiveMaster] ipc.Client: Address change detected. Old: test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020 New: test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.210:8020
# 2023-10-11 13:29:42,657 INFO [master/test-hbase-master-default-1:16000:becomeActiveMaster] retry.RetryInvocationHandler: org.apache.hadoop.net.ConnectTimeoutException: Call From test-hbase-master-default-1/10.244.0.201 to test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 failed on socket timeout exception: org.apache.hadoop.net.ConnectTimeoutException: 20000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020]; For more details see: http://wiki.apache.org/hadoop/SocketTimeout, while invoking ClientNamenodeProtocolTranslatorPB.setSafeMode over test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020 after 2 failover attempts. Trying to failover after sleeping for 2803ms.
# 2023-10-11 13:29:21,342 INFO [master/test-hbase-master-default-1:16000:becomeActiveMaster] retry.RetryInvocationHandler: org.apache.hadoop.net.ConnectTimeoutException: Call From test-hbase-master-default-1/10.244.0.201 to test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 failed on socket timeout exception: org.apache.hadoop.net.ConnectTimeoutException: 20000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020]; For more details see: http://wiki.apache.org/hadoop/SocketTimeout, while invoking ClientNamenodeProtocolTranslatorPB.setSafeMode over test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020 after 1 failover attempts. Trying to failover after sleeping for 1296ms.
#
# * HBase 2.5 causes the Phoenix test to be flaky: it works half of the time and otherwise fails with timeouts
# WARNING

# Tribute to https://github.com/Netflix/chaosmonkey

# We need to reduce the number of monkeys, otherwise the tests literally take days
# We only run them on some hand-picked test cases
{% if test_scenario['values']['listener-class'] == 'cluster-internal' and test_scenario['values']['hdfs'] == test_scenario['values']['hdfs-latest'] and test_scenario['values']['zookeeper'] == test_scenario['values']['zookeeper-latest'] %}
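# In other words, the chaos monkey only runs for the cluster-internal listener class
# combined with the latest HDFS and ZooKeeper versions of the test matrix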

# We need to force-delete the Pods, because IONOS is sometimes unable to delete the pod (it's stuck in Terminating for > 20 minutes)
---
apiVersion: kuttl.dev/v1beta1
kind: TestStep
timeout: 3600
commands:
  # First, let's delete the first pod of every HBase service
  # Should trigger failover of the HBase master to instance 1
  - script: kubectl -n $NAMESPACE delete pod --force test-hbase-master-default-0 test-hbase-regionserver-default-0 test-hbase-restserver-default-0
    timeout: 600
  - script: sleep 10
  - script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
    timeout: 600
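  # Note (editorial): `kubectl delete --force` bypasses graceful deletion, so the Pod object
  # is removed from the API server without waiting for the kubelet to confirm shutdown.
  # The `sleep 10` presumably gives the operator time to notice the missing Pods before we
  # wait for the Available condition, so the wait does not pass against a stale status.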

  # Also delete the last pod of every HBase service
  # Should trigger failover of the HBase master back to instance 0
  - script: kubectl -n $NAMESPACE delete pod --force test-hbase-master-default-1 test-hbase-regionserver-default-1 test-hbase-restserver-default-1
    timeout: 600
  - script: sleep 10
  - script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
    timeout: 600

  # Also delete the ZooKeeper server pod
  - script: kubectl -n $NAMESPACE delete pod --force test-zk-server-default-0
    timeout: 600
  - script: sleep 10
  - script: kubectl -n $NAMESPACE wait --for=condition=Available zookeepercluster test-zk --timeout 10m
    timeout: 600
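  # Note: hbase, hdfs and zookeepercluster here are the operators' custom resources; the wait
  # returns once the respective operator reports the Available status condition again
  # (assumption about how these operators surface cluster health)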

  # Also delete some HDFS Pods
  - script: kubectl -n $NAMESPACE delete pod --force test-hdfs-namenode-default-0 test-hdfs-datanode-default-0
    timeout: 600
  - script: sleep 10
  - script: kubectl -n $NAMESPACE wait --for=condition=Available hdfs test-hdfs --timeout 10m
    timeout: 600

  # And now everything
{% for n in range(3) %}
  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hbase
    timeout: 600
  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hdfs
    timeout: 600
  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=zookeeper
    timeout: 600
  - script: sleep 10
  # Delete them again right after they have started up, just to make things worse
  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hbase
    timeout: 600
  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hdfs
    timeout: 600
  - script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=zookeeper
    timeout: 600
  - script: sleep 10
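  # Wait for the stack to settle bottom-up: ZooKeeper first, then HDFS, then HBase,
  # matching the dependency order of the layers (HBase needs HDFS and ZooKeeper,
  # HDFS HA needs ZooKeeper)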
  - script: kubectl -n $NAMESPACE wait --for=condition=Available zookeepercluster test-zk --timeout 10m
    timeout: 600
  - script: kubectl -n $NAMESPACE wait --for=condition=Available hdfs test-hdfs --timeout 10m
    timeout: 600
  - script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
    timeout: 600
{% endfor %}
{% endif %}
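
If the HBase 2.4 DNS failures and the Phoenix flakiness described in the header ever get fixed, the step can presumably be re-enabled by dropping the .DISABLED suffix so it is rendered and picked up like the other kuttl steps (hypothetical, assuming the test tooling only considers files ending in .yaml.j2):

mv tests/templates/kuttl/smoke/70-unleash-the-chaosmonkey.yaml.j2.DISABLED \
   tests/templates/kuttl/smoke/70-unleash-the-chaosmonkey.yaml.j2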