Skip to content

Commit

Permalink
chore(scripts): add retry option for setup-kube commands (#186)
Browse files Browse the repository at this point in the history
Signed-off-by: Artur Troian <[email protected]>
  • Loading branch information
troian authored Feb 3, 2024
1 parent 181c338 commit 6ce7931
Show file tree
Hide file tree
Showing 17 changed files with 215 additions and 80 deletions.
13 changes: 11 additions & 2 deletions .github/workflows/integration-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on:
workflow_call:

env:
KUBE_SSH_NODE_NAME: kind
KUBE_SSH_NODES: kind

defaults:
run:
Expand Down Expand Up @@ -112,11 +112,20 @@ jobs:
working-directory: ${{ env.GOPATH }}/src/github.com/akash-network/provider
run: |
make -s -C _run/kube kube-deployment-rollout-operator-inventory
make -s -C _run/kube kube-wait-inventory-available
- name: Run E2E Tests
working-directory: ${{ env.GOPATH }}/src/github.com/akash-network/provider
run: |
make test-e2e-integration
- name: Print operator inventory logs
if: always()
working-directory: ${{ env.GOPATH }}/src/github.com/akash-network/provider
run: |
kubectl -n akash-services logs -l app.kubernetes.io/part-of=provider,app.kubernetes.io/component=operator,app.kubernetes.io/instance=inventory-service,app.kubernetes.io/name=inventory
- name: Print operator inventory discovery logs
if: always()
working-directory: ${{ env.GOPATH }}/src/github.com/akash-network/provider
run: |
kubectl -n akash-services logs -l app.kubernetes.io/part-of=provider,app.kubernetes.io/component=operator,app.kubernetes.io/instance=inventory-hardware-discovery,app.kubernetes.io/name=inventory
- name: Run K8s Tests
working-directory: ${{ env.GOPATH }}/src/github.com/akash-network/provider
run: |
Expand Down
6 changes: 3 additions & 3 deletions _docs/kustomize/akash-operator-inventory/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,14 @@ spec:
path: /metrics/health
port: api
scheme: HTTP
initialDelaySeconds: 5
periodSeconds: 5
initialDelaySeconds: 15
periodSeconds: 15
readinessProbe:
httpGet:
path: /metrics/ready
port: api
scheme: HTTP
initialDelaySeconds: 5
initialDelaySeconds: 15
periodSeconds: 5
ports:
- containerPort: 8080
Expand Down
5 changes: 5 additions & 0 deletions _run/.envrc
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,9 @@ if ! has grpcurl ; then
echo -e "\033[31mgrpcurl is not installed"; exit 1
fi


if ! has tqdm ; then
echo -e "\033[31mtqdm is not installed. https://github.com/tqdm/tqdm"; exit 1
fi

export AKASH_KEYRING_BACKEND=test
45 changes: 27 additions & 18 deletions _run/common-kube.mk
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,17 @@ include ../common-kind.mk
include ../common-helm.mk

KUBE_UPLOAD_AKASH_IMAGE ?= false
KUBE_CLUSTER_CREATE_TARGET ?= default
KUBE_ROLLOUT_TIMEOUT ?= 180

INGRESS_CONFIG_PATH ?= ../ingress-nginx.yaml
CALICO_MANIFEST ?= https://github.com/projectcalico/calico/blob/v3.25.0/manifests/calico.yaml
CRD_FILE ?= $(AP_ROOT)/pkg/apis/akash.network/crd.yaml

ifeq ($(KUBE_SSH_NODE_NAME),)
$(error "KUBE_SSH_NODE_NAME is not set")
ifeq ($(KUBE_SSH_NODES),)
$(error "KUBE_SSH_NODES is not set")
endif

ifeq ($(KUBE_CLUSTER_CREATE_TYPE),)
$(error "KUBE_CLUSTER_CREATE_TYPE is not set")
endif

# when image is built locally, for example on M1 (arm64) and kubernetes cluster is running on amd64
Expand Down Expand Up @@ -74,26 +76,26 @@ endif
endif

.PHONY: kube-upload-images
kube-upload-images: kube-upload-images-$(KUBE_CLUSTER_CREATE_TARGET)
kube-upload-images: kube-upload-images-$(KUBE_CLUSTER_CREATE_TYPE)

.PHONY: kube-upload-images-kind
kube-upload-images-kind: $(KIND)
$(AP_ROOT)/script/setup-kube.sh load-images docker2kind "$(KIND_NAME)" "$(DOCKER_LOAD_IMAGES)"
$(AP_ROOT)/script/setup-kube.sh upload images docker2kind "$(KIND_NAME)" "$(DOCKER_LOAD_IMAGES)"

.PHONY: kube-upload-images-default
kube-upload-images-default:
$(AP_ROOT)/script/setup-kube.sh load-images docker2ctr "$(KUBE_SSH_NODE_NAME)" "$(DOCKER_LOAD_IMAGES)"
.PHONY: kube-upload-images-ssh
kube-upload-images-ssh:
$(AP_ROOT)/script/setup-kube.sh upload images docker2ctr "$(KUBE_SSH_NODES)" "$(DOCKER_LOAD_IMAGES)"

.PHONY: kube-upload-crd
kube-upload-crd:
$(SETUP_KUBE) --crd=$(CRD_FILE) $(KUBE_SSH_NODE_NAME) init
$(SETUP_KUBE) --crd=$(CRD_FILE) upload crd

$(KUBE_CREATE): $(AP_RUN_DIR) kube-cluster-create-$(KUBE_CLUSTER_CREATE_TARGET)
$(SETUP_KUBE) --crd=$(CRD_FILE) $(KUBE_SSH_NODE_NAME) init
$(KUBE_CREATE): $(AP_RUN_DIR) kube-cluster-create-$(KUBE_CLUSTER_CREATE_TYPE)
$(SETUP_KUBE) --crd=$(CRD_FILE) $(KUBE_CLUSTER_CREATE_TYPE) init "$(KUBE_SSH_NODES)"
touch $@

.INTERMEDIATE: kube-cluster-create-default
kube-cluster-create-default: $(KUBE_CREATE)
.INTERMEDIATE: kube-cluster-create-ssh
kube-cluster-create-ssh:

# The rule defined directly below is named kube-cluster-check-info, so that is
# the name that must be declared phony; otherwise a file with that name in the
# working directory would make the check silently "up to date".
# kube-cluster-check-alive is kept in case a rule with that name exists
# elsewhere in the included makefiles (not visible here).
.PHONY: kube-cluster-check-alive kube-cluster-check-info
kube-cluster-check-info:
Expand Down Expand Up @@ -129,7 +131,7 @@ kube-cluster-setup-e2e-ci: \
kube-install-helm-charts

.PHONY: kube-cluster-delete
kube-cluster-delete: kube-cluster-delete-$(KUBE_SSH_NODE_NAME)
kube-cluster-delete: kube-cluster-delete-$(KUBE_SSH_NODES)

.PHONY: kube-setup-ingress
kube-setup-ingress: kube-setup-ingress-$(KIND_CONFIG)
Expand Down Expand Up @@ -161,12 +163,19 @@ kube-status-ingress-%:
.PHONY: kube-deployment-rollout-operator-inventory
kube-deployment-rollout-operator-inventory:
kubectl -n akash-services rollout status deployment operator-inventory --timeout=$(KUBE_ROLLOUT_TIMEOUT)s
kubectl -n akash-services wait pods -l app.kubernetes.io/part-of=provider -l app.kubernetes.io/component=operator -l app.kubernetes.io/instance=inventory-service --for condition=Ready --timeout=$(KUBE_ROLLOUT_TIMEOUT)s
kubectl -n akash-services wait pods \
-l app.kubernetes.io/part-of=provider,app.kubernetes.io/component=operator,app.kubernetes.io/instance=inventory-service \
--for condition=Ready \
--timeout=$(KUBE_ROLLOUT_TIMEOUT)s
kubectl -n akash-services wait pods \
-l app.kubernetes.io/part-of=provider,app.kubernetes.io/component=operator,app.kubernetes.io/instance=inventory-hardware-discovery,app.kubernetes.io/name=inventory \
--for=condition=ready \
--timeout=$(KUBE_ROLLOUT_TIMEOUT)s

.PHONY: kube-deployment-rollout-%
kube-deployment-rollout-%:
kubectl -n akash-services rollout status deployment $* --timeout=$(KUBE_ROLLOUT_TIMEOUT)s
kubectl -n akash-services wait pods -l akash.network/component=operator -l akash.network/name=$(patsubst %, operator-%,$*) --for condition=Ready --timeout=$(KUBE_ROLLOUT_TIMEOUT)s
kubectl -n akash-services wait pods -l akash.network/component=operator,akash.network/name=$(patsubst %, operator-%,$*) --for condition=Ready --timeout=$(KUBE_ROLLOUT_TIMEOUT)s

.PHONY: akash-node-ready
akash-node-ready: SHELL=$(BASH_PATH)
Expand Down Expand Up @@ -195,4 +204,4 @@ kube-logs-operator-inventory:

.PHONY: kube-wait-inventory-available
kube-wait-inventory-available:
$(SETUP_KUBE) wait inventory-available
$(SETUP_KUBE) --retries=60 wait inventory-available
4 changes: 2 additions & 2 deletions _run/kube/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
KUBE_SETUP_PREREQUISITES ?= \

KUBE_CLUSTER_CREATE_TARGET := kind
KUBE_SSH_NODE_NAME ?= kind
KUBE_CLUSTER_CREATE_TYPE := kind
KUBE_SSH_NODES := kind

KUSTOMIZE_INSTALLS ?= \
akash-operator-hostname \
Expand Down
6 changes: 1 addition & 5 deletions _run/ssh/.envrc
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@ source_up .envrc

dotenv_if_exists dev.env

#source_env ~/projects/akash/gpu
source_env ~/projects/akash/gpu

export AKASH_HOME=$DEVCACHE_RUN/ssh/.akash
export AKASH_KUBECONFIG=$KUBECONFIG
export AP_KUBECONFIG=$KUBECONFIG

if ! has tqdm ; then
echo -e "\033[31mtqdm is not installed. https://github.com/tqdm/tqdm"; exit 1
fi
3 changes: 2 additions & 1 deletion _run/ssh/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
KUBE_SETUP_PREREQUISITES ?= \

KUBE_UPLOAD_AKASH_IMAGE ?= true
KUBE_UPLOAD_AKASH_IMAGE ?= true
KUBE_CLUSTER_CREATE_TYPE := ssh

KUBE_DOCKER_IMAGE_ARCH := amd64

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ require (
github.com/spf13/viper v1.18.2
github.com/stretchr/testify v1.8.4
github.com/tendermint/tendermint v0.34.27
github.com/troian/pubsub v0.1.0
github.com/troian/pubsub v0.1.1
github.com/vektra/mockery/v2 v2.40.1
go.uber.org/zap v1.24.0
golang.org/x/net v0.19.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1881,8 +1881,8 @@ github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/troian/hid v0.13.2 h1:O7PWZQm5YGyg0nVvknFVLVrNTPillz4ZXvxJOtoyteE=
github.com/troian/hid v0.13.2/go.mod h1:n6adloQ1876oEXZr6fFsthy4FDHxwJhh7QYQspm30Ds=
github.com/troian/pubsub v0.1.0 h1:ePToDcB/zZjDMk5uuUSCV93Xl7i+1SNvc18tcWso1Q8=
github.com/troian/pubsub v0.1.0/go.mod h1:ALzDZB06e+BF8JeLnO1hbVIY9dCTu8x6mhcdvitlNRs=
github.com/troian/pubsub v0.1.1 h1:huc5qneo0rtSKKsrkroyyMu+b8bw0talql2tt7GXl98=
github.com/troian/pubsub v0.1.1/go.mod h1:fOUAEWXes/SkyWPTdBpW3L/ovyg74N+eBxRpWKik+2Q=
github.com/ttacon/chalk v0.0.0-20160626202418-22c06c80ed31/go.mod h1:onvgF043R+lC5RZ8IT9rBXDaEDnpnw/Cl+HFiw+v/7Q=
github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM=
github.com/tv42/httpunix v0.0.0-20191220191345-2ba4b9c3382c h1:u6SKchux2yDvFQnDHS3lPnIRmfVJ5Sxy3ao2SIdysLQ=
Expand Down
2 changes: 1 addition & 1 deletion make/test-integration.mk
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ test-nocache:

.PHONY: test-full
test-full:
$(GO_TEST) -tags=$(BUILD_TAGS) -race $(TEST_MODULES)
$(GO_TEST) -tags=$(BUILD_TAGS) -race -count=1 $(TEST_MODULES)

.PHONY: test-coverage
test-coverage: $(AP_DEVCACHE)
Expand Down
29 changes: 24 additions & 5 deletions operator/inventory/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ func Cmd() *cobra.Command {
}
}

fd := newClusterNodes(ctx, discoveryImage, namespace)
clNodes := newClusterNodes(ctx, discoveryImage, namespace)

storage = append(storage, st)

Expand All @@ -175,7 +175,7 @@ func Cmd() *cobra.Command {
}

fromctx.CmdSetContextValue(cmd, CtxKeyStorage, storage)
fromctx.CmdSetContextValue(cmd, CtxKeyFeatureDiscovery, fd)
fromctx.CmdSetContextValue(cmd, CtxKeyFeatureDiscovery, clNodes)
fromctx.CmdSetContextValue(cmd, CtxKeyClusterState, QuerierCluster(clState))

ctx = cmd.Context()
Expand Down Expand Up @@ -211,19 +211,20 @@ func Cmd() *cobra.Command {
gogoreflection.Register(grpcSrv)

group.Go(func() error {
return registryLoader(ctx)
return configWatcher(ctx, viper.GetString(FlagConfig))
})

group.Go(func() error {
return scWatcher(ctx)
})

group.Go(func() error {
return configWatcher(ctx, viper.GetString(FlagConfig))
return registryLoader(ctx)
})

group.Go(clState.run)
group.Go(fd.Wait)
group.Go(clNodes.Wait)

group.Go(func() error {
log.Info(fmt.Sprintf("rest listening on \"%s\"", restEndpoint))

Expand Down Expand Up @@ -355,6 +356,12 @@ func loadKubeConfig(c *cobra.Command) error {
}

func configWatcher(ctx context.Context, file string) error {
log := fromctx.LogrFromCtx(ctx).WithName("watcher.config")

defer func() {
log.Info("stopped")
}()

config, err := loadConfig(file, false)
if err != nil {
return err
Expand Down Expand Up @@ -388,6 +395,8 @@ func configWatcher(ctx context.Context, file string) error {

bus.Pub(config, []string{topicInventoryConfig}, pubsub.WithRetain())

log.Info("started")

for {
select {
case <-ctx.Done():
Expand Down Expand Up @@ -527,12 +536,22 @@ func registryLoader(ctx context.Context) error {
}

func scWatcher(ctx context.Context) error {
log := fromctx.LogrFromCtx(ctx).WithName("watcher.storageclasses")

defer func() {
log.Info("stopped")
}()

bus := fromctx.PubSubFromCtx(ctx)

scch := bus.Sub(topicKubeSC)

sc := make(storageClasses)

bus.Pub(sc.copy(), []string{topicStorageClasses}, pubsub.WithRetain())

log.Info("started")

for {
select {
case <-ctx.Done():
Expand Down
Loading

0 comments on commit 6ce7931

Please sign in to comment.