From 0598d0d1b6ff599feef5354342b7c136494c1834 Mon Sep 17 00:00:00 2001 From: Artur Troian Date: Thu, 1 Feb 2024 22:40:09 -0500 Subject: [PATCH] chore(scripts): add retry option for setup-kube commands Signed-off-by: Artur Troian --- .github/workflows/integration-tests.yaml | 5 +- _run/.envrc | 5 ++ _run/common-kube.mk | 34 +++++----- _run/kube/Makefile | 4 +- _run/ssh/.envrc | 6 +- _run/ssh/Makefile | 3 +- operator/psutil.go | 34 ++++++++-- script/setup-kube.sh | 79 +++++++++++++++++++----- 8 files changed, 126 insertions(+), 44 deletions(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index 97258d4c..e601d980 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -4,7 +4,7 @@ on: workflow_call: env: - KUBE_SSH_NODE_NAME: kind + KUBE_SSH_NODES: kind defaults: run: @@ -113,6 +113,9 @@ jobs: run: | make -s -C _run/kube kube-deployment-rollout-operator-inventory make -s -C _run/kube kube-wait-inventory-available + make -s -C _run/kube kube-logs-operator-inventory + kubectl -n akash-services logs operator-inventory-hardware-discovery-kube-control-plane + exit 1 - name: Run E2E Tests working-directory: ${{ env.GOPATH }}/src/github.com/akash-network/provider run: | diff --git a/_run/.envrc b/_run/.envrc index d16a20e2..2cd978c8 100644 --- a/_run/.envrc +++ b/_run/.envrc @@ -4,4 +4,9 @@ if ! has grpcurl ; then echo -e "\033[31mgrpcurl is not installed"; exit 1 fi + +if ! has tqdm ; then + echo -e "\033[31mtqdm is not installed. https://github.com/tqdm/tqdm"; exit 1 +fi + export AKASH_KEYRING_BACKEND=test diff --git a/_run/common-kube.mk b/_run/common-kube.mk index 74dcdf23..e9447dfd 100644 --- a/_run/common-kube.mk +++ b/_run/common-kube.mk @@ -6,15 +6,17 @@ include ../common-kind.mk include ../common-helm.mk KUBE_UPLOAD_AKASH_IMAGE ?= false -KUBE_CLUSTER_CREATE_TARGET ?= default KUBE_ROLLOUT_TIMEOUT ?= 180 - INGRESS_CONFIG_PATH ?= ../ingress-nginx.yaml CALICO_MANIFEST ?= https://github.com/projectcalico/calico/blob/v3.25.0/manifests/calico.yaml CRD_FILE ?= $(AP_ROOT)/pkg/apis/akash.network/crd.yaml -ifeq ($(KUBE_SSH_NODE_NAME),) -$(error "KUBE_SSH_NODE_NAME is not set") +ifeq ($(KUBE_SSH_NODES),) +$(error "KUBE_SSH_NODES is not set") +endif + +ifeq ($(KUBE_CLUSTER_CREATE_TYPE),) +$(error "KUBE_CLUSTER_CREATE_TYPE is not set") endif # when image is built locally, for example on M1 (arm64) and kubernetes cluster is running on amd64 @@ -74,26 +76,26 @@ endif endif .PHONY: kube-upload-images -kube-upload-images: kube-upload-images-$(KUBE_CLUSTER_CREATE_TARGET) +kube-upload-images: kube-upload-images-$(KUBE_CLUSTER_CREATE_TYPE) .PHONY: kube-upload-images-kind kube-upload-images-kind: $(KIND) - $(AP_ROOT)/script/setup-kube.sh load-images docker2kind "$(KIND_NAME)" "$(DOCKER_LOAD_IMAGES)" + $(AP_ROOT)/script/setup-kube.sh upload images docker2kind "$(KIND_NAME)" "$(DOCKER_LOAD_IMAGES)" -.PHONY: kube-upload-images-default -kube-upload-images-default: - $(AP_ROOT)/script/setup-kube.sh load-images docker2ctr "$(KUBE_SSH_NODE_NAME)" "$(DOCKER_LOAD_IMAGES)" +.PHONY: kube-upload-images-ssh +kube-upload-images-ssh: + $(AP_ROOT)/script/setup-kube.sh upload images docker2ctr "$(KUBE_SSH_NODES)" "$(DOCKER_LOAD_IMAGES)" .PHONY: kube-upload-crd kube-upload-crd: - $(SETUP_KUBE) --crd=$(CRD_FILE) $(KUBE_SSH_NODE_NAME) init + $(SETUP_KUBE) --crd=$(CRD_FILE) upload crd -$(KUBE_CREATE): $(AP_RUN_DIR) kube-cluster-create-$(KUBE_CLUSTER_CREATE_TARGET) - $(SETUP_KUBE) --crd=$(CRD_FILE) $(KUBE_SSH_NODE_NAME) init +$(KUBE_CREATE): $(AP_RUN_DIR) kube-cluster-create-$(KUBE_CLUSTER_CREATE_TYPE) + $(SETUP_KUBE) --crd=$(CRD_FILE) $(KUBE_CLUSTER_CREATE_TYPE) init "$(KUBE_SSH_NODES)" touch $@ -.INTERMEDIATE: kube-cluster-create-default -kube-cluster-create-default: $(KUBE_CREATE) +.INTERMEDIATE: kube-cluster-create-ssh +kube-cluster-create-ssh: .PHONY: kube-cluster-check-alive kube-cluster-check-info: @@ -129,7 +131,7 @@ kube-cluster-setup-e2e-ci: \ kube-install-helm-charts .PHONY: kube-cluster-delete -kube-cluster-delete: kube-cluster-delete-$(KUBE_SSH_NODE_NAME) +kube-cluster-delete: kube-cluster-delete-$(KUBE_SSH_NODES) .PHONY: kube-setup-ingress kube-setup-ingress: kube-setup-ingress-$(KIND_CONFIG) @@ -195,4 +197,4 @@ kube-logs-operator-inventory: .PHONY: kube-wait-inventory-available kube-wait-inventory-available: - $(SETUP_KUBE) wait inventory-available + $(SETUP_KUBE) --retries=10 wait inventory-available diff --git a/_run/kube/Makefile b/_run/kube/Makefile index b10bbe20..ec50387a 100644 --- a/_run/kube/Makefile +++ b/_run/kube/Makefile @@ -1,7 +1,7 @@ KUBE_SETUP_PREREQUISITES ?= \ -KUBE_CLUSTER_CREATE_TARGET := kind -KUBE_SSH_NODE_NAME ?= kind +KUBE_CLUSTER_CREATE_TYPE := kind +KUBE_SSH_NODES := kind KUSTOMIZE_INSTALLS ?= \ akash-operator-hostname \ diff --git a/_run/ssh/.envrc b/_run/ssh/.envrc index cd4d9fdd..99ebb719 100644 --- a/_run/ssh/.envrc +++ b/_run/ssh/.envrc @@ -2,12 +2,8 @@ source_up .envrc dotenv_if_exists dev.env -#source_env ~/projects/akash/gpu +source_env ~/projects/akash/gpu export AKASH_HOME=$DEVCACHE_RUN/ssh/.akash export AKASH_KUBECONFIG=$KUBECONFIG export AP_KUBECONFIG=$KUBECONFIG - -if ! has tqdm ; then - echo -e "\033[31mtqdm is not installed. https://github.com/tqdm/tqdm"; exit 1 -fi diff --git a/_run/ssh/Makefile b/_run/ssh/Makefile index 72eb97a6..2e363a84 100644 --- a/_run/ssh/Makefile +++ b/_run/ssh/Makefile @@ -1,6 +1,7 @@ KUBE_SETUP_PREREQUISITES ?= \ -KUBE_UPLOAD_AKASH_IMAGE ?= true +KUBE_UPLOAD_AKASH_IMAGE ?= true +KUBE_CLUSTER_CREATE_TYPE := ssh KUBE_DOCKER_IMAGE_ARCH := amd64 diff --git a/operator/psutil.go b/operator/psutil.go index 22a70273..faaa13d7 100644 --- a/operator/psutil.go +++ b/operator/psutil.go @@ -3,6 +3,7 @@ package operator import ( "context" "encoding/json" + "errors" "fmt" "net" "net/http" @@ -15,6 +16,7 @@ import ( "github.com/jaypipes/ghw/pkg/pci" "github.com/spf13/cobra" "github.com/spf13/viper" + "golang.org/x/sync/errgroup" ) const ( @@ -55,8 +57,8 @@ func cmdPsutilServe() *cobra.Command { SilenceUsage: true, RunE: func(cmd *cobra.Command, args []string) error { router := mux.NewRouter() - router.Methods(http.MethodGet).HandlerFunc(infoHandler) + router.HandleFunc("/", infoHandler).Methods(http.MethodGet) router.HandleFunc("/cpu", cpuInfoHandler).Methods(http.MethodGet) router.HandleFunc("/gpu", gpuHandler).Methods(http.MethodGet) router.HandleFunc("/memory", memoryHandler).Methods(http.MethodGet) @@ -64,16 +66,40 @@ func cmdPsutilServe() *cobra.Command { port := viper.GetUint16(flagAPIPort) + group, ctx := errgroup.WithContext(cmd.Context()) + + endpoint := fmt.Sprintf(":%d", port) + srv := &http.Server{ - Addr: fmt.Sprintf(":%d", port), + Addr: endpoint, Handler: router, BaseContext: func(_ net.Listener) context.Context { - return cmd.Context() + return ctx }, ReadHeaderTimeout: 5 * time.Second, } - return srv.ListenAndServe() + group.Go(func() error { + fmt.Printf("listening on %s\n", endpoint) + + return srv.ListenAndServe() + }) + + group.Go(func() error { + <-ctx.Done() + + fmt.Printf("received shutdown signal\n") + + _ = srv.Shutdown(context.Background()) + return ctx.Err() + }) + + err := group.Wait() + if !errors.Is(err, context.Canceled) && !errors.Is(err, http.ErrServerClosed) { + return err + } + + return nil }, } diff --git a/script/setup-kube.sh b/script/setup-kube.sh index 296f4b68..8c0dd889 100755 --- a/script/setup-kube.sh +++ b/script/setup-kube.sh @@ -14,6 +14,8 @@ rootdir="$(dirname "$0")/.." CRD_FILE=$rootdir/pkg/apis/akash.network/crd.yaml timeout=10 +retries=10 +retrywait=1 usage() { cat <&2 "timeout option must be positive integer" + exit 1 + fi + ;; + retries) + retries=$OPTARG + + if ! isuint "$retries" ; then + echo >&2 "retries option must be positive integer" + exit 1 + fi + ;; + retry-wait) + retrywait=$OPTARG + if ! isuint "$retrywait" ; then + echo >&2 "retry-wait option must be positive integer" + exit 1 + fi + ;; esac done shift "$((OPTIND - 1))" @@ -172,7 +196,10 @@ command_ssh() { init) shift - while read -r node; do + local nodes=("$1") + + # shellcheck disable=SC2048 + for node in ${nodes[*]}; do if ! ssh -n "$node" "test -e /etc/systemd/system/user@.service.d/delegate.conf"; then ssh -n "$node" 'sudo mkdir -p /etc/systemd/system/user@.service.d' ssh -n "$node" 'cat < /dev/null 2>&1 ; do sleep 0.1; done' - local retries=0 + local r=0 while ! grpcurl -plaintext localhost:8455 akash.inventory.v1.ClusterRPC.QueryCluster | jq '(.nodes | length > 0) and (.storage | length > 0)' --exit-status > /dev/null 2>&1; do - retries=$((retries+1)) - if [ ${retries} -eq "${timeout}" ]; then - exit 1 + r=$((r+1)) + if [ ${r} -eq "${retries}" ]; then + grpcurl -plaintext localhost:8455 akash.inventory.v1.ClusterRPC.QueryCluster + exit 0 fi - sleep 1 + + # shellcheck disable=SC2086 + sleep $retrywait done } @@ -346,9 +395,9 @@ kustomize) shift command_kustomize "$@" ;; -load-images) +upload) shift - command_load_images "$@" + command_upload "$@" ;; "wait") shift