From f39c8e29dd80280d324d9586a3a2e26e6db91d13 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Tue, 8 Aug 2023 15:51:13 +0800 Subject: [PATCH] ci: add terraform setup retry mechanism Signed-off-by: Yang Chiu --- test_framework/scripts/cleanup.sh | 32 ++++++---- .../scripts/download-support-bundle.sh | 31 +--------- test_framework/scripts/kubeconfig.sh | 13 ++++ test_framework/scripts/longhorn-setup.sh | 31 +--------- test_framework/scripts/terraform-setup.sh | 59 ++++++++++++------- .../terraform/aws/sles/k3s_instances.tf | 2 +- .../terraform/aws/sles/rke2_instances.tf | 2 +- .../terraform/aws/sles/rke_instances.tf | 2 +- .../provision_k3s_server.sh.tpl | 6 ++ .../provision_rke2_server.sh.tpl | 6 ++ 10 files changed, 92 insertions(+), 92 deletions(-) create mode 100755 test_framework/scripts/kubeconfig.sh diff --git a/test_framework/scripts/cleanup.sh b/test_framework/scripts/cleanup.sh index 340748db55..742e522612 100755 --- a/test_framework/scripts/cleanup.sh +++ b/test_framework/scripts/cleanup.sh @@ -1,18 +1,24 @@ #!/usr/bin/env bash -# terminate any terraform processes -TERRAFORM_PIDS=( `ps aux | grep -i terraform | grep -v grep | awk '{printf("%s ",$1)}'` ) -if [[ -n ${TERRAFORM_PIDS[@]} ]] ; then - for PID in "${TERRAFORM_PIDS[@]}"; do - kill "${TERRAFORM_PIDS}" - done -fi +cleanup(){ + # terminate any terraform processes + TERRAFORM_PIDS=( `ps aux | grep -i terraform | grep -v grep | grep -v terraform-setup | awk '{printf("%s ",$1)}'` ) + if [[ -n ${TERRAFORM_PIDS[@]} ]] ; then + for PID in "${TERRAFORM_PIDS[@]}"; do + kill "${TERRAFORM_PIDS}" + done + fi -# wait 30 seconds for graceful terraform termination -sleep 30 + # wait 30 seconds for graceful terraform termination + sleep 30 -if [[ ${TF_VAR_k8s_distro_name} == "aks" ]] || [[ ${TF_VAR_k8s_distro_name} == "eks" ]]; then - DISTRO=${TF_VAR_k8s_distro_name} -fi + if [[ ${TF_VAR_k8s_distro_name} == "aks" ]] || [[ ${TF_VAR_k8s_distro_name} == "eks" ]]; then + DISTRO=${TF_VAR_k8s_distro_name} + fi -terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} destroy -auto-approve -no-color + terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} destroy -auto-approve -no-color +} + +if [[ "${BASH_SOURCE[0]}" -ef "$0" ]]; then + cleanup +fi diff --git a/test_framework/scripts/download-support-bundle.sh b/test_framework/scripts/download-support-bundle.sh index d7d5a04a2f..05932d401f 100755 --- a/test_framework/scripts/download-support-bundle.sh +++ b/test_framework/scripts/download-support-bundle.sh @@ -2,38 +2,13 @@ set -ex +source test_framework/scripts/kubeconfig.sh + SUPPORT_BUNDLE_FILE_NAME=${1:-"lh-support-bundle.zip"} SUPPORT_BUNDLE_ISSUE_URL=${2:-""} SUPPORT_BUNDLE_ISSUE_DESC=${3:-"Auto-generated support buundle"} -set_kubeconfig_envvar(){ - local ARCH=${1} - local BASEDIR=${2} - - if [[ ${ARCH} == "amd64" ]] ; then - if [[ ${TF_VAR_k8s_distro_name} == [rR][kK][eE] ]]; then - export KUBECONFIG="${BASEDIR}/kube_config_rke.yml" - elif [[ ${TF_VAR_k8s_distro_name} == [rR][kK][eE]2 ]]; then - export KUBECONFIG="${BASEDIR}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO}/rke2.yaml" - elif [[ ${TF_VAR_k8s_distro_name} == "aks" ]]; then - export KUBECONFIG="${BASEDIR}/aks.yml" - elif [[ ${TF_VAR_k8s_distro_name} == "eks" ]]; then - export KUBECONFIG="${BASEDIR}/eks.yml" - else - export KUBECONFIG="${BASEDIR}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO}/k3s.yaml" - fi - elif [[ ${ARCH} == "arm64" ]]; then - if [[ ${TF_VAR_k8s_distro_name} == "aks" ]]; then - export KUBECONFIG="${BASEDIR}/aks.yml" - elif [[ ${TF_VAR_k8s_distro_name} == "eks" ]]; then - export KUBECONFIG="${BASEDIR}/eks.yml" - else - export KUBECONFIG="${BASEDIR}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO}/k3s.yaml" - fi - fi -} - -set_kubeconfig_envvar ${TF_VAR_arch} ${TF_VAR_tf_workspace} +set_kubeconfig LH_FRONTEND_ADDR=`kubectl get svc -n longhorn-system longhorn-frontend -o json | jq -r '.spec.clusterIP + ":" + (.spec.ports[0].port|tostring)'` diff --git a/test_framework/scripts/kubeconfig.sh b/test_framework/scripts/kubeconfig.sh new file mode 100755 index 0000000000..cc519a4ca4 --- /dev/null +++ b/test_framework/scripts/kubeconfig.sh @@ -0,0 +1,13 @@ +set_kubeconfig(){ + if [[ "${TF_VAR_k8s_distro_name}" == "rke" ]]; then + export KUBECONFIG="test_framework/kube_config_rke.yml" + elif [[ "${TF_VAR_k8s_distro_name}" == "rke2" ]]; then + export KUBECONFIG="test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO}/rke2.yaml" + elif [[ "${TF_VAR_k8s_distro_name}" == "k3s" ]]; then + export KUBECONFIG="test_framework/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO}/k3s.yaml" + elif [[ ${TF_VAR_k8s_distro_name} == "aks" ]]; then + export KUBECONFIG="test_framework/aks.yml" + elif [[ ${TF_VAR_k8s_distro_name} == "eks" ]]; then + export KUBECONFIG="test_framework/eks.yml" + fi +} \ No newline at end of file diff --git a/test_framework/scripts/longhorn-setup.sh b/test_framework/scripts/longhorn-setup.sh index 5e1ea51ae6..e39c04924a 100755 --- a/test_framework/scripts/longhorn-setup.sh +++ b/test_framework/scripts/longhorn-setup.sh @@ -2,6 +2,8 @@ set -x +source test_framework/scripts/kubeconfig.sh + # create and clean tmpdir TMPDIR="/tmp/longhorn" mkdir -p ${TMPDIR} @@ -21,33 +23,6 @@ LONGHORN_MANIFEST_URL="https://raw.githubusercontent.com/longhorn/longhorn/${LON LONGHORN_REPO_URL="https://github.com/longhorn/longhorn" LONGHORN_REPO_DIR="${TMPDIR}/longhorn" -set_kubeconfig_envvar(){ - ARCH=${1} - BASEDIR=${2} - - if [[ ${ARCH} == "amd64" ]] ; then - if [[ ${TF_VAR_k8s_distro_name} == [rR][kK][eE] ]]; then - export KUBECONFIG="${BASEDIR}/kube_config_rke.yml" - elif [[ ${TF_VAR_k8s_distro_name} == [rR][kK][eE]2 ]]; then - export KUBECONFIG="${BASEDIR}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO}/rke2.yaml" - elif [[ ${TF_VAR_k8s_distro_name} == "aks" ]]; then - export KUBECONFIG="${BASEDIR}/aks.yml" - elif [[ ${TF_VAR_k8s_distro_name} == "eks" ]]; then - export KUBECONFIG="${BASEDIR}/eks.yml" - else - export KUBECONFIG="${BASEDIR}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO}/k3s.yaml" - fi - elif [[ ${ARCH} == "arm64" ]]; then - if [[ ${TF_VAR_k8s_distro_name} == "aks" ]]; then - export KUBECONFIG="${BASEDIR}/aks.yml" - elif [[ ${TF_VAR_k8s_distro_name} == "eks" ]]; then - export KUBECONFIG="${BASEDIR}/eks.yml" - else - export KUBECONFIG="${BASEDIR}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO}/k3s.yaml" - fi - fi -} - create_admin_service_account(){ kubectl apply -f "${TF_VAR_tf_workspace}/templates/kubeconfig_service_account.yaml" @@ -429,7 +404,7 @@ run_longhorn_tests(){ main(){ - set_kubeconfig_envvar ${TF_VAR_arch} ${TF_VAR_tf_workspace} + set_kubeconfig if [[ ${DISTRO} == "rhel" ]] || [[ ${DISTRO} == "rockylinux" ]] || [[ ${DISTRO} == "oracle" ]]; then apply_selinux_workaround diff --git a/test_framework/scripts/terraform-setup.sh b/test_framework/scripts/terraform-setup.sh index 6b47568431..6c7beee307 100755 --- a/test_framework/scripts/terraform-setup.sh +++ b/test_framework/scripts/terraform-setup.sh @@ -2,35 +2,54 @@ set -x -if [[ ${TF_VAR_k8s_distro_name} == "aks" ]] || [[ ${TF_VAR_k8s_distro_name} == "eks" ]]; then - DISTRO=${TF_VAR_k8s_distro_name} -fi +source test_framework/scripts/kubeconfig.sh +source test_framework/scripts/cleanup.sh + +terraform_setup(){ + if [[ ${TF_VAR_k8s_distro_name} == "aks" ]] || [[ ${TF_VAR_k8s_distro_name} == "eks" ]]; then + DISTRO=${TF_VAR_k8s_distro_name} + fi -if [[ ${TF_VAR_arch} == "amd64" ]]; then terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} init terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} apply -auto-approve -no-color - if [[ ${TF_VAR_k8s_distro_name} =~ [rR][kK][eE] ]]; then + + if [[ ${TF_VAR_k8s_distro_name} == "rke" ]]; then terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} apply -auto-approve -no-color -refresh-only terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw rke_config > ${TF_VAR_tf_workspace}/rke.yml sleep 30 rke up --config ${TF_VAR_tf_workspace}/rke.yml fi -else - terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} init - terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} apply -auto-approve -no-color -fi -if [[ ${TF_VAR_k8s_distro_name} == "aks" ]]; then - terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw kubeconfig > ${TF_VAR_tf_workspace}/aks.yml - sleep 120 -fi + if [[ ${TF_VAR_k8s_distro_name} == "aks" ]]; then + terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw kubeconfig > ${TF_VAR_tf_workspace}/aks.yml + sleep 120 + fi -if [[ ${TF_VAR_k8s_distro_name} == "eks" ]]; then - terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw kubeconfig > ${TF_VAR_tf_workspace}/eks.yml -fi + if [[ ${TF_VAR_k8s_distro_name} == "eks" ]]; then + terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw kubeconfig > ${TF_VAR_tf_workspace}/eks.yml + fi + + if [[ "${TF_VAR_create_load_balancer}" == true ]]; then + terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw load_balancer_url > ${TF_VAR_tf_workspace}/load_balancer_url + fi +} -if [[ "${TF_VAR_create_load_balancer}" == true ]]; then - terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw load_balancer_url > ${TF_VAR_tf_workspace}/load_balancer_url -fi -exit $? +if [[ "${BASH_SOURCE[0]}" -ef "$0" ]]; then + CLUSTER_READY=false + MAX_RETRY=3 + RETRY=0 + while [[ "${CLUSTER_READY}" == false ]] && [[ ${RETRY} -lt ${MAX_RETRY} ]]; do + terraform_setup + set_kubeconfig + if ! kubectl get pods -A | grep -q 'Running'; then + cleanup + RETRY=$((RETRY+1)) + else + CLUSTER_READY=true + fi + done + if [[ "${CLUSTER_READY}" == false ]]; then + exit 1 + fi +fi diff --git a/test_framework/terraform/aws/sles/k3s_instances.tf b/test_framework/terraform/aws/sles/k3s_instances.tf index 714986e5f2..f17afeec77 100644 --- a/test_framework/terraform/aws/sles/k3s_instances.tf +++ b/test_framework/terraform/aws/sles/k3s_instances.tf @@ -117,7 +117,7 @@ resource "null_resource" "rsync_kubeconfig_file" { inline = [ "cloud-init status --wait", "if [ \"`cloud-init status | grep error`\" ]; then sudo cat /var/log/cloud-init-output.log; fi", - "until([ -f /etc/rancher/k3s/k3s.yaml ] && [ `sudo /usr/local/bin/kubectl get node -o jsonpath='{.items[*].status.conditions}' | jq '.[] | select(.type == \"Ready\").status' | grep -ci true` -eq $((${var.lh_aws_instance_count_controlplane} + ${var.lh_aws_instance_count_worker})) ]); do echo \"waiting for k3s cluster nodes to be running\"; sleep 2; done" + "RETRY=0; MAX_RETRY=450; until([ -f /etc/rancher/k3s/k3s.yaml ] && [ `sudo /usr/local/bin/kubectl get node -o jsonpath='{.items[*].status.conditions}' | jq '.[] | select(.type == \"Ready\").status' | grep -ci true` -eq $((${var.lh_aws_instance_count_controlplane} + ${var.lh_aws_instance_count_worker})) ]); do echo \"waiting for k3s cluster nodes to be running\"; sleep 2; if [ $RETRY -eq $MAX_RETRY ]; then break; fi; RETRY=$((RETRY+1)); done" ] connection { diff --git a/test_framework/terraform/aws/sles/rke2_instances.tf b/test_framework/terraform/aws/sles/rke2_instances.tf index e10a2f03e4..4521bb44b8 100644 --- a/test_framework/terraform/aws/sles/rke2_instances.tf +++ b/test_framework/terraform/aws/sles/rke2_instances.tf @@ -113,7 +113,7 @@ resource "null_resource" "rsync_kubeconfig_file_rke2" { ] provisioner "remote-exec" { - inline = ["until([ -f /etc/rancher/rke2/rke2.yaml ] && [ `sudo KUBECONFIG=/etc/rancher/rke2/rke2.yaml /var/lib/rancher/rke2/bin/kubectl get node -o jsonpath='{.items[*].status.conditions}' | jq '.[] | select(.type == \"Ready\").status' | grep -ci true` -eq $((${var.lh_aws_instance_count_controlplane} + ${var.lh_aws_instance_count_worker})) ]); do echo \"waiting for rke2 cluster nodes to be running\"; sleep 2; done"] + inline = ["RETRY=0; MAX_RETRY=450; until([ -f /etc/rancher/rke2/rke2.yaml ] && [ `sudo KUBECONFIG=/etc/rancher/rke2/rke2.yaml /var/lib/rancher/rke2/bin/kubectl get node -o jsonpath='{.items[*].status.conditions}' | jq '.[] | select(.type == \"Ready\").status' | grep -ci true` -eq $((${var.lh_aws_instance_count_controlplane} + ${var.lh_aws_instance_count_worker})) ]); do echo \"waiting for rke2 cluster nodes to be running\"; sleep 2; if [ $RETRY -eq $MAX_RETRY ]; then break; fi; RETRY=$((RETRY+1)); done"] connection { diff --git a/test_framework/terraform/aws/sles/rke_instances.tf b/test_framework/terraform/aws/sles/rke_instances.tf index 8aa7cd85d1..fe84244bfe 100644 --- a/test_framework/terraform/aws/sles/rke_instances.tf +++ b/test_framework/terraform/aws/sles/rke_instances.tf @@ -45,7 +45,7 @@ resource "null_resource" "wait_for_docker_start_controlplane" { provisioner "remote-exec" { - inline = ["until( systemctl is-active docker.service ); do echo \"waiting for docker to start \"; sleep 2; done"] + inline = ["until( systemctl is-active docker.service ); do echo \"waiting for docker to start \"; sleep 2; done"] connection { type = "ssh" diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl index d5e8fe2266..c04dd4cab8 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl @@ -14,8 +14,14 @@ until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --node-taint "nod sleep 2 done +RETRY=0 +MAX_RETRY=180 until (kubectl get pods -A | grep 'Running'); do echo 'Waiting for k3s startup' sleep 5 + if [ $RETRY -eq $MAX_RETRY ]; then + break + fi + RETRY=$((RETRY+1)) done diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl index 682a004125..9f5add5c49 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl @@ -43,7 +43,13 @@ systemctl start rke2-server.service # TODO: It looks like "set -e" will break the intended functionality of the remaining code. Consider a refactor. set +e +RETRY=0 +MAX_RETRY=180 until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml /var/lib/rancher/rke2/bin/kubectl get pods -A | grep 'Running'); do echo 'Waiting for rke2 startup' sleep 5 + if [ $RETRY -eq $MAX_RETRY ]; then + break + fi + RETRY=$((RETRY+1)) done