Add chaos monkey test (disabled)
sbernauer committed Oct 12, 2023
1 parent 56fcd3b commit 0b94a07
Showing 3 changed files with 95 additions and 15 deletions.
24 changes: 12 additions & 12 deletions tests/templates/kuttl/smoke/30-assert.yaml
@@ -14,8 +14,8 @@ spec:
spec:
terminationGracePeriodSeconds: 1200
status:
- readyReplicas: 1
- replicas: 1
+ readyReplicas: 2
+ replicas: 2
---
apiVersion: apps/v1
kind: StatefulSet
@@ -26,8 +26,8 @@ spec:
spec:
terminationGracePeriodSeconds: 3600
status:
- readyReplicas: 1
- replicas: 1
+ readyReplicas: 2
+ replicas: 2
---
apiVersion: apps/v1
kind: StatefulSet
@@ -38,32 +38,32 @@ spec:
spec:
terminationGracePeriodSeconds: 300
status:
- readyReplicas: 1
- replicas: 1
+ readyReplicas: 2
+ replicas: 2
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: test-hbase-master
status:
- expectedPods: 1
- currentHealthy: 1
+ expectedPods: 2
+ currentHealthy: 2
disruptionsAllowed: 1
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: test-hbase-regionserver
status:
- expectedPods: 1
- currentHealthy: 1
+ expectedPods: 2
+ currentHealthy: 2
disruptionsAllowed: 1
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: test-hbase-restserver
status:
- expectedPods: 1
- currentHealthy: 1
+ expectedPods: 2
+ currentHealthy: 2
disruptionsAllowed: 1
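With two replicas per role, each PodDisruptionBudget can now tolerate the loss of one Pod (disruptionsAllowed: 1), which is what makes the chaos-monkey deletions introduced in this commit survivable. For illustration only, a PDB spec that would produce the asserted status could look like the sketch below; the maxUnavailable value and the label selector are assumptions and are not part of this commit (the operator creates these PDBs itself):

apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: test-hbase-master
spec:
  # Assumed: at most one Pod per role may be voluntarily disrupted,
  # which yields disruptionsAllowed: 1 once both replicas are healthy.
  maxUnavailable: 1
  selector:
    matchLabels:
      # Hypothetical labels, shown for illustration only.
      app.kubernetes.io/name: hbase
      app.kubernetes.io/component: master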
6 changes: 3 additions & 3 deletions tests/templates/kuttl/smoke/30-install-hbase.yaml.j2
@@ -23,7 +23,7 @@ spec:
hbase-site.xml:
phoenix.log.saltBuckets: "2"
hbase.regionserver.wal.codec: "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec"
- replicas: 1
+ replicas: 2
regionServers:
config:
logging:
@@ -34,7 +34,7 @@ spec:
hbase-site.xml:
phoenix.log.saltBuckets: "2"
hbase.regionserver.wal.codec: "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec"
- replicas: 1
+ replicas: 2
restServers:
config:
logging:
@@ -45,4 +45,4 @@ spec:
hbase-site.xml:
phoenix.log.saltBuckets: "2"
hbase.regionserver.wal.codec: "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec"
- replicas: 1
+ replicas: 2
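Because the rendered diff above flattens the YAML indentation, here is a sketch of how one of these role sections fits together. Only the replicas value comes from this commit; the roleGroups/default nesting is an assumption based on the usual Stackable CRD layout, and regionServers and restServers follow the same pattern:

masters:
  config:
    logging:
      enableVectorAgent: false  # placeholder; the real file templates this value
  roleGroups:
    default:
      configOverrides:
        hbase-site.xml:
          phoenix.log.saltBuckets: "2"
          hbase.regionserver.wal.codec: "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec"
      replicas: 2  # raised from 1 so a Pod can be killed without losing the role entirely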
80 changes: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
# WARNING
# This test is disabled as everything is broken:
# * HBase 2.4 returns random DNS failures, such as
# 2023-10-11 13:27:58,532 INFO [master/test-hbase-master-default-0:16000:becomeActiveMaster] retry.RetryInvocationHandler: java.net.UnknownHostException: Invalid host name: local host is: (unknown); destination host is: "test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local":8020; java.net.UnknownHostException; For more details see: http://wiki.apache.org/hadoop/UnknownHost, while invoking ClientNamenodeProtocolTranslatorPB.getFileInfo over test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 after 13 failover attempts. Trying to failover after sleeping for 21829ms.
# or
# 2023-10-11 13:29:01,311 WARN [master/test-hbase-master-default-1:16000:becomeActiveMaster] ipc.Client: Address change detected. Old: test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020 New: test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.208:8020
# 2023-10-11 13:29:21,341 WARN [master/test-hbase-master-default-1:16000:becomeActiveMaster] ipc.Client: Address change detected. Old: test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020 New: test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.210:8020
# 2023-10-11 13:29:42,657 INFO [master/test-hbase-master-default-1:16000:becomeActiveMaster] retry.RetryInvocationHandler: org.apache.hadoop.net.ConnectTimeoutException: Call From test-hbase-master-default-1/10.244.0.201 to test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 failed on socket timeout exception: org.apache.hadoop.net.ConnectTimeoutException: 20000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020]; For more details see: http://wiki.apache.org/hadoop/SocketTimeout, while invoking ClientNamenodeProtocolTranslatorPB.setSafeMode over test-hdfs-namenode-default-0.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.188:8020 after 2 failover attempts. Trying to failover after sleeping for 2803ms.
# 2023-10-11 13:29:21,342 INFO [master/test-hbase-master-default-1:16000:becomeActiveMaster] retry.RetryInvocationHandler: org.apache.hadoop.net.ConnectTimeoutException: Call From test-hbase-master-default-1/10.244.0.201 to test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local:8020 failed on socket timeout exception: org.apache.hadoop.net.ConnectTimeoutException: 20000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020]; For more details see: http://wiki.apache.org/hadoop/SocketTimeout, while invoking ClientNamenodeProtocolTranslatorPB.setSafeMode over test-hdfs-namenode-default-1.test-hdfs-namenode-default.kuttl-test-joint-sloth.svc.cluster.local/10.244.0.173:8020 after 1 failover attempts. Trying to failover after sleeping for 1296ms.
#
# * HBase 2.5 causes the Phoenix test to be flaky: it works half of the time and otherwise fails with timeouts
# WARNING

# Tribute to https://github.com/Netflix/chaosmonkey

# We need to reduce the number of monkeys, otherwise the tests literally take days.
# We therefore only run them on some hand-picked test cases
{% if test_scenario['values']['listener-class'] == 'cluster-internal' and test_scenario['values']['hdfs'] == test_scenario['values']['hdfs-latest'] and test_scenario['values']['zookeeper'] == test_scenario['values']['zookeeper-latest'] %}

# We need to force-delete the Pods, because IONOS is sometimes unable to delete them otherwise (they get stuck in Terminating for > 20 minutes)
---
apiVersion: kuttl.dev/v1beta1
kind: TestStep
timeout: 3600
commands:
# First, let's delete the first pod of every HBase service
# This should trigger a failover of the active HBase master (to master pod 1)
- script: kubectl -n $NAMESPACE delete pod --force test-hbase-master-default-0 test-hbase-regionserver-default-0 test-hbase-restserver-default-0
timeout: 600
- script: sleep 10
- script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
timeout: 600

# Also delete the last pod of every HBase service
# This should trigger a failover of the active HBase master (back to master pod 0)
- script: kubectl -n $NAMESPACE delete pod --force test-hbase-master-default-1 test-hbase-regionserver-default-1 test-hbase-restserver-default-1
timeout: 600
- script: sleep 10
- script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
timeout: 600

# Also delete the ZooKeeper server Pod
- script: kubectl -n $NAMESPACE delete pod --force test-zk-server-default-0
timeout: 600
- script: sleep 10
- script: kubectl -n $NAMESPACE wait --for=condition=Available zookeepercluster test-zk --timeout 10m
timeout: 600

# Also delete some HDFS Pods
- script: kubectl -n $NAMESPACE delete pod --force test-hdfs-namenode-default-0 test-hdfs-datanode-default-0
timeout: 600
- script: sleep 10
- script: kubectl -n $NAMESPACE wait --for=condition=Available hdfs test-hdfs --timeout 10m
timeout: 600

# And now delete everything at once, several times in a row
{% for n in range(3) %}
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hbase
timeout: 600
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hdfs
timeout: 600
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=zookeeper
timeout: 600
- script: sleep 10
# Delete the Pods again right after they have come back up, just to make things worse
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hbase
timeout: 600
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=hdfs
timeout: 600
- script: kubectl -n $NAMESPACE delete pod --force -l app.kubernetes.io/name=zookeeper
timeout: 600
- script: sleep 10
- script: kubectl -n $NAMESPACE wait --for=condition=Available zookeepercluster test-zk --timeout 10m
timeout: 600
- script: kubectl -n $NAMESPACE wait --for=condition=Available hdfs test-hdfs --timeout 10m
timeout: 600
- script: kubectl -n $NAMESPACE wait --for=condition=Available hbase test-hbase --timeout 10m
timeout: 600
{% endfor %}
{% endif %}
