Skip to content

Commit

Permalink
chore(scripts): add retry option for setup-kube commands
Browse files Browse the repository at this point in the history
Signed-off-by: Artur Troian <[email protected]>
  • Loading branch information
troian committed Feb 2, 2024
1 parent 181c338 commit 3b47537
Show file tree
Hide file tree
Showing 10 changed files with 127 additions and 47 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on:
workflow_call:

env:
KUBE_SSH_NODE_NAME: kind
KUBE_SSH_NODES: kind

defaults:
run:
Expand Down
5 changes: 5 additions & 0 deletions _run/.envrc
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,9 @@ if ! has grpcurl ; then
echo -e "\033[31mgrpcurl is not installed"; exit 1
fi


if ! has tqdm ; then
echo -e "\033[31mtqdm is not installed. https://github.com/tqdm/tqdm"; exit 1
fi

export AKASH_KEYRING_BACKEND=test
34 changes: 18 additions & 16 deletions _run/common-kube.mk
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,17 @@ include ../common-kind.mk
include ../common-helm.mk

KUBE_UPLOAD_AKASH_IMAGE ?= false
KUBE_CLUSTER_CREATE_TARGET ?= default
KUBE_ROLLOUT_TIMEOUT ?= 180

INGRESS_CONFIG_PATH ?= ../ingress-nginx.yaml
# Default location of the Calico v3.25.0 manifest. This must be the raw file
# URL: the github.com/<org>/<repo>/blob/... form serves an HTML page, which any
# consumer fetching this URL and expecting YAML cannot parse.
CALICO_MANIFEST ?= https://raw.githubusercontent.com/projectcalico/calico/v3.25.0/manifests/calico.yaml
CRD_FILE ?= $(AP_ROOT)/pkg/apis/akash.network/crd.yaml

ifeq ($(KUBE_SSH_NODE_NAME),)
$(error "KUBE_SSH_NODE_NAME is not set")
ifeq ($(KUBE_SSH_NODES),)
$(error "KUBE_SSH_NODES is not set")
endif

ifeq ($(KUBE_CLUSTER_CREATE_TYPE),)
$(error "KUBE_CLUSTER_CREATE_TYPE is not set")
endif

# when image is built locally, for example on M1 (arm64) and kubernetes cluster is running on amd64
Expand Down Expand Up @@ -74,26 +76,26 @@ endif
endif

.PHONY: kube-upload-images
kube-upload-images: kube-upload-images-$(KUBE_CLUSTER_CREATE_TARGET)
kube-upload-images: kube-upload-images-$(KUBE_CLUSTER_CREATE_TYPE)

.PHONY: kube-upload-images-kind
kube-upload-images-kind: $(KIND)
$(AP_ROOT)/script/setup-kube.sh load-images docker2kind "$(KIND_NAME)" "$(DOCKER_LOAD_IMAGES)"
$(AP_ROOT)/script/setup-kube.sh upload images docker2kind "$(KIND_NAME)" "$(DOCKER_LOAD_IMAGES)"

.PHONY: kube-upload-images-default
kube-upload-images-default:
$(AP_ROOT)/script/setup-kube.sh load-images docker2ctr "$(KUBE_SSH_NODE_NAME)" "$(DOCKER_LOAD_IMAGES)"
.PHONY: kube-upload-images-ssh
kube-upload-images-ssh:
$(AP_ROOT)/script/setup-kube.sh upload images docker2ctr "$(KUBE_SSH_NODES)" "$(DOCKER_LOAD_IMAGES)"

.PHONY: kube-upload-crd
kube-upload-crd:
$(SETUP_KUBE) --crd=$(CRD_FILE) $(KUBE_SSH_NODE_NAME) init
$(SETUP_KUBE) --crd=$(CRD_FILE) upload crd

$(KUBE_CREATE): $(AP_RUN_DIR) kube-cluster-create-$(KUBE_CLUSTER_CREATE_TARGET)
$(SETUP_KUBE) --crd=$(CRD_FILE) $(KUBE_SSH_NODE_NAME) init
$(KUBE_CREATE): $(AP_RUN_DIR) kube-cluster-create-$(KUBE_CLUSTER_CREATE_TYPE)
$(SETUP_KUBE) --crd=$(CRD_FILE) $(KUBE_CLUSTER_CREATE_TYPE) init "$(KUBE_SSH_NODES)"
touch $@

.INTERMEDIATE: kube-cluster-create-default
kube-cluster-create-default: $(KUBE_CREATE)
.INTERMEDIATE: kube-cluster-create-ssh
kube-cluster-create-ssh:

# The phony declaration has to name the rule that follows it; otherwise a file
# called kube-cluster-check-info would make that rule look up to date.
# kube-cluster-check-alive stays listed in case it is referenced elsewhere.
.PHONY: kube-cluster-check-info kube-cluster-check-alive
kube-cluster-check-info:
Expand Down Expand Up @@ -129,7 +131,7 @@ kube-cluster-setup-e2e-ci: \
kube-install-helm-charts

.PHONY: kube-cluster-delete
kube-cluster-delete: kube-cluster-delete-$(KUBE_SSH_NODE_NAME)
# Dispatch on the cluster type, the same way kube-upload-images and
# $(KUBE_CREATE) do. KUBE_SSH_NODES is a list of node names; with more than
# one entry it would expand into extra, unrelated prerequisites.
kube-cluster-delete: kube-cluster-delete-$(KUBE_CLUSTER_CREATE_TYPE)

.PHONY: kube-setup-ingress
kube-setup-ingress: kube-setup-ingress-$(KIND_CONFIG)
Expand Down Expand Up @@ -195,4 +197,4 @@ kube-logs-operator-inventory:

.PHONY: kube-wait-inventory-available
kube-wait-inventory-available:
$(SETUP_KUBE) wait inventory-available
$(SETUP_KUBE) --retries=60 wait inventory-available
4 changes: 2 additions & 2 deletions _run/kube/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
KUBE_SETUP_PREREQUISITES ?= \

KUBE_CLUSTER_CREATE_TARGET := kind
KUBE_SSH_NODE_NAME ?= kind
KUBE_CLUSTER_CREATE_TYPE := kind
KUBE_SSH_NODES := kind

KUSTOMIZE_INSTALLS ?= \
akash-operator-hostname \
Expand Down
6 changes: 1 addition & 5 deletions _run/ssh/.envrc
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@ source_up .envrc

dotenv_if_exists dev.env

#source_env ~/projects/akash/gpu
# Developer-local overrides: loaded only when the file is present, so
# checkouts without ~/projects/akash/gpu are unaffected.
source_env_if_exists ~/projects/akash/gpu

export AKASH_HOME=$DEVCACHE_RUN/ssh/.akash
export AKASH_KUBECONFIG=$KUBECONFIG
export AP_KUBECONFIG=$KUBECONFIG

if ! has tqdm ; then
echo -e "\033[31mtqdm is not installed. https://github.com/tqdm/tqdm"; exit 1
fi
3 changes: 2 additions & 1 deletion _run/ssh/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
KUBE_SETUP_PREREQUISITES ?= \

KUBE_UPLOAD_AKASH_IMAGE ?= true
KUBE_UPLOAD_AKASH_IMAGE ?= true
KUBE_CLUSTER_CREATE_TYPE := ssh

KUBE_DOCKER_IMAGE_ARCH := amd64

Expand Down
2 changes: 1 addition & 1 deletion make/test-integration.mk
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ test-nocache:

.PHONY: test-full
test-full:
$(GO_TEST) -tags=$(BUILD_TAGS) -race $(TEST_MODULES)
$(GO_TEST) -tags=$(BUILD_TAGS) -race -count=1 $(TEST_MODULES)

.PHONY: test-coverage
test-coverage: $(AP_DEVCACHE)
Expand Down
34 changes: 30 additions & 4 deletions operator/psutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package operator
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"net/http"
Expand All @@ -15,6 +16,7 @@ import (
"github.com/jaypipes/ghw/pkg/pci"
"github.com/spf13/cobra"
"github.com/spf13/viper"
"golang.org/x/sync/errgroup"
)

const (
Expand Down Expand Up @@ -55,25 +57,49 @@ func cmdPsutilServe() *cobra.Command {
SilenceUsage: true,
RunE: func(cmd *cobra.Command, args []string) error {
router := mux.NewRouter()
router.Methods(http.MethodGet).HandlerFunc(infoHandler)

router.HandleFunc("/", infoHandler).Methods(http.MethodGet)
router.HandleFunc("/cpu", cpuInfoHandler).Methods(http.MethodGet)
router.HandleFunc("/gpu", gpuHandler).Methods(http.MethodGet)
router.HandleFunc("/memory", memoryHandler).Methods(http.MethodGet)
router.HandleFunc("/pci", pciHandler).Methods(http.MethodGet)

port := viper.GetUint16(flagAPIPort)

group, ctx := errgroup.WithContext(cmd.Context())

endpoint := fmt.Sprintf(":%d", port)

srv := &http.Server{
Addr: fmt.Sprintf(":%d", port),
Addr: endpoint,
Handler: router,
BaseContext: func(_ net.Listener) context.Context {
return cmd.Context()
return ctx
},
ReadHeaderTimeout: 5 * time.Second,
}

return srv.ListenAndServe()
group.Go(func() error {
fmt.Printf("listening on %s\n", endpoint)

return srv.ListenAndServe()
})

group.Go(func() error {
<-ctx.Done()

fmt.Printf("received shutdown signal\n")

_ = srv.Shutdown(context.Background())
return ctx.Err()
})

err := group.Wait()
if !errors.Is(err, context.Canceled) && !errors.Is(err, http.ErrServerClosed) {
return err
}

return nil
},
}

Expand Down
5 changes: 3 additions & 2 deletions operator/waiter/waiter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@ package waiter

import (
"context"
"github.com/akash-network/node/testutil"
"github.com/stretchr/testify/require"
"io"
"testing"
"time"

"github.com/akash-network/node/testutil"
"github.com/stretchr/testify/require"
)

func TestWaiterNoInput(t *testing.T) {
Expand Down
79 changes: 64 additions & 15 deletions script/setup-kube.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ rootdir="$(dirname "$0")/.."

CRD_FILE=$rootdir/pkg/apis/akash.network/crd.yaml
timeout=10
retries=10
retrywait=1

usage() {
cat <<EOF
Expand All @@ -40,8 +42,10 @@ EOF
exit 1
}

# isuint VALUE
# Returns success (0) when VALUE is a non-empty string made only of the
# decimal digits 0-9, and failure otherwise.
isuint() {
    local candidate=$1

    [[ $candidate =~ ^[0-9]+$ ]]
}

short_opts=h
long_opts=help/crd:/timeout: # those who take an arg END with :
long_opts=help/crd:/retries:/retry-wait:/timeout: # those who take an arg END with :

while getopts ":$short_opts-:" o; do
case $o in
Expand Down Expand Up @@ -99,6 +103,26 @@ while getopts ":$short_opts-:" o; do
;;
timeout)
timeout=$OPTARG
if ! isuint "$timeout" ; then
echo >&2 "timeout option must be positive integer"
exit 1
fi
;;
retries)
retries=$OPTARG

if ! isuint "$retries" ; then
echo >&2 "retries option must be positive integer"
exit 1
fi
;;
retry-wait)
retrywait=$OPTARG
if ! isuint "$retrywait" ; then
echo >&2 "retry-wait option must be positive integer"
exit 1
fi
;;
esac
done
shift "$((OPTIND - 1))"
Expand Down Expand Up @@ -172,7 +196,10 @@ command_ssh() {
init)
shift

while read -r node; do
local nodes=("$1")

# shellcheck disable=SC2048
for node in ${nodes[*]}; do
if ! ssh -n "$node" "test -e /etc/systemd/system/[email protected]/delegate.conf"; then
ssh -n "$node" 'sudo mkdir -p /etc/systemd/system/[email protected]'
ssh -n "$node" 'cat <<EOF | sudo tee /etc/systemd/system/[email protected]/delegate.conf
Expand All @@ -198,8 +225,10 @@ EOF'
ssh -n "$node" 'curl -sSL https://github.com/containerd/nerdctl/releases/download/v1.7.2/nerdctl-1.7.2-linux-$(uname -m | sed "s/x86_64/amd64/g").tar.gz | sudo tar Cxzv /usr/local/bin/'
ssh -n "$node" 'curl -sSL https://github.com/rootless-containers/rootlesskit/releases/download/v2.0.0/rootlesskit-$(uname -m).tar.gz | sudo tar Cxzv /usr/local/bin/'
ssh -n "$node" 'containerd-rootless-setuptool.sh install'
done <<< "$1"
done

install_ns
install_crd
;;
*)
echo "invalid command \"$1\""
Expand Down Expand Up @@ -238,8 +267,8 @@ command_load_images() {
docker2ctr)
shift

remotes=("$1")
images=("$2")
local remotes=("$1")
local images=("$2")

# shellcheck disable=SC2048
for remote in ${remotes[*]}; do
Expand Down Expand Up @@ -268,8 +297,8 @@ command_load_images() {
docker2kind)
shift

kind_name=$1
images=("$2")
local kind_name=$1
local images=("$2")

# shellcheck disable=SC2048
for image in ${images[*]}; do
Expand All @@ -286,12 +315,29 @@ command_load_images() {

;;
*)
echo "invalid command \"$1\""
echo "invalid load images command \"$1\""
usage "$@"
;;
esac
}

# command_upload SUBCOMMAND [ARGS...]
# Dispatches the "upload" command:
#   crd     - runs install_ns, then install_crd
#   images  - forwards the remaining arguments to command_load_images
# Any other SUBCOMMAND prints an error message and calls usage with the
# original arguments.
command_upload() {
    case "$1" in
    crd)
        shift
        install_ns
        install_crd
        ;;
    images)
        shift
        command_load_images "$@"
        ;;
    *)
        echo "invalid upload command \"$1\""
        usage "$@"
        # explicit terminator, matching the other command_* dispatchers
        ;;
    esac
}

wait_inventory_available() {
set -x

Expand All @@ -309,14 +355,17 @@ wait_inventory_available() {

timeout 10 bash -c -- 'while ! nc -vz localhost 8455 > /dev/null 2>&1 ; do sleep 0.1; done'

local retries=0
local r=0

while ! grpcurl -plaintext localhost:8455 akash.inventory.v1.ClusterRPC.QueryCluster | jq '(.nodes | length > 0) and (.storage | length > 0)' --exit-status > /dev/null 2>&1; do
retries=$((retries+1))
if [ ${retries} -eq "${timeout}" ]; then
exit 1
r=$((r+1))
if [ ${r} -eq "${retries}" ]; then
grpcurl -plaintext localhost:8455 akash.inventory.v1.ClusterRPC.QueryCluster
exit 0
fi
sleep 1

# shellcheck disable=SC2086
sleep $retrywait
done
}

Expand Down Expand Up @@ -346,9 +395,9 @@ kustomize)
shift
command_kustomize "$@"
;;
load-images)
upload)
shift
command_load_images "$@"
command_upload "$@"
;;
"wait")
shift
Expand Down

0 comments on commit 3b47537

Please sign in to comment.