From 2feac0c537ea232c2d02aafde12fe897309d5189 Mon Sep 17 00:00:00 2001 From: Artur Troian Date: Mon, 29 Jan 2024 07:40:20 -0500 Subject: [PATCH] feat(inventory): load pcidb at startup Signed-off-by: Artur Troian --- Dockerfile | 1 + _docs/kustomize/akash-node/deployment.yaml | 3 +- _docs/kustomize/akash-node/ingress.yaml | 1 + _docs/kustomize/akash-node/kustomization.yaml | 1 + _docs/kustomize/akash-node/service.yaml | 1 + .../akash-operator-hostname/cluster_role.yaml | 1 + .../akash-operator-hostname/deployment.yaml | 87 ++++--- .../akash-operator-hostname/ingress.yaml | 15 +- .../kustomization.yaml | 1 + .../akash-operator-hostname/rbac.yaml | 1 + .../akash-operator-hostname/service.yaml | 1 + .../service_account.yaml | 1 + .../cluster-roles.yaml | 13 + .../akash-operator-inventory/daemonset.yaml | 2 + .../akash-operator-inventory/deployment.yaml | 1 + .../kustomization.yaml | 1 + .../akash-operator-inventory/service.yaml | 1 + .../akash-operator-ip/cluster_role.yaml | 47 +++- .../akash-operator-ip/deployment.yaml | 97 +++---- .../kustomize/akash-operator-ip/ingress.yaml | 15 +- .../akash-operator-ip/kustomization.yaml | 1 + _docs/kustomize/akash-operator-ip/rbac.yaml | 1 + .../kustomize/akash-operator-ip/service.yaml | 1 + .../akash-operator-ip/service_account.yaml | 1 + .../kustomize/akash-provider/deployment.yaml | 3 +- _docs/kustomize/akash-provider/ingress.yaml | 1 + .../akash-provider/kustomization.yaml | 1 + _docs/kustomize/akash-provider/rbac.yaml | 1 + _docs/kustomize/akash-provider/service.yaml | 1 + .../akash-provider/service_account.yaml | 1 + .../akash-services/kustomization.yaml | 1 + .../akash-services/network-policies.yaml | 24 +- _docs/kustomize/kind/kind-metrics-server.yaml | 1 + _docs/kustomize/networking/namespace.yaml | 2 +- .../network-policy-default-ns-deny.yaml | 6 +- .../templates/akash-node/docker-image.yaml | 1 + .../templates/akash-node/gateway-host.yaml | 1 + .../templates/akash-node/kustomization.yaml | 1 + 
.../akash-operator-hostname/docker-image.yaml | 1 + .../kustomization.yaml | 1 + .../docker-image.yaml | 1 + .../kustomization.yaml | 1 + .../akash-operator-ip/docker-image.yaml | 1 + .../akash-operator-ip/kustomization.yaml | 1 + .../akash-provider/docker-image.yaml | 1 + .../akash-provider/gateway-host.yaml | 1 + .../akash-provider/kustomization.yaml | 1 + .../templates/redis/kustomization.yaml | 1 + _run/common-kube.mk | 21 +- _run/common.mk | 2 + _run/kube/Makefile | 3 - _run/ssh/.envrc | 10 +- _run/ssh/Makefile | 5 +- _run/ssh/README.md | 4 +- _run/ssh/kind-config-80.yaml | 23 +- _run/ssh/kind-config.yaml | 21 +- _run/ssh/provider.yaml | 1 + .../cmd/flags/kube_config.go | 4 + go.mod | 2 +- go.sum | 4 +- operator/inventory/cmd.go | 115 ++++---- .../inventory/feature-discovery-client.go | 2 + operator/inventory/feature-discovery-node.go | 245 ++++++++++++++---- operator/inventory/gpu-info.json | 37 --- operator/inventory/registry.go | 16 ++ operator/inventory/types.go | 18 +- operator/psutil.go | 20 +- pkg/apis/akash.network/v2beta2/register.go | 1 + script/load_docker2ctr.sh | 25 -- script/load_docker2kind.sh | 17 -- script/setup-kube.sh | 97 +++++++ service.go | 4 + 72 files changed, 675 insertions(+), 372 deletions(-) delete mode 100644 operator/inventory/gpu-info.json create mode 100644 operator/inventory/registry.go delete mode 100755 script/load_docker2ctr.sh delete mode 100755 script/load_docker2kind.sh diff --git a/Dockerfile b/Dockerfile index 048afc1c..ffdb76ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,7 @@ RUN \ apt-get update \ && apt-get install -y --no-install-recommends \ tini \ + ca-certificates \ && rm -rf /var/lib/apt/lists/* # default port for provider API diff --git a/_docs/kustomize/akash-node/deployment.yaml b/_docs/kustomize/akash-node/deployment.yaml index 977d9157..d3ef580b 100644 --- a/_docs/kustomize/akash-node/deployment.yaml +++ b/_docs/kustomize/akash-node/deployment.yaml @@ -1,3 +1,4 @@ +--- apiVersion: apps/v1 kind: 
Deployment metadata: @@ -17,7 +18,7 @@ spec: - name: akash-node image: ghcr.io/akash-network/node:stable imagePullPolicy: IfNotPresent - command: [ "/bin/sh" , "/boot/run.sh" ] + command: ["/bin/sh", "/boot/run.sh"] env: ## diff --git a/_docs/kustomize/akash-node/ingress.yaml b/_docs/kustomize/akash-node/ingress.yaml index 34b1513d..1567a893 100644 --- a/_docs/kustomize/akash-node/ingress.yaml +++ b/_docs/kustomize/akash-node/ingress.yaml @@ -1,3 +1,4 @@ +--- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: diff --git a/_docs/kustomize/akash-node/kustomization.yaml b/_docs/kustomize/akash-node/kustomization.yaml index ea2e1830..5db73b80 100644 --- a/_docs/kustomize/akash-node/kustomization.yaml +++ b/_docs/kustomize/akash-node/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_docs/kustomize/akash-node/service.yaml b/_docs/kustomize/akash-node/service.yaml index efd9b9ff..2e14c365 100644 --- a/_docs/kustomize/akash-node/service.yaml +++ b/_docs/kustomize/akash-node/service.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: Service metadata: diff --git a/_docs/kustomize/akash-operator-hostname/cluster_role.yaml b/_docs/kustomize/akash-operator-hostname/cluster_role.yaml index 2f7a4327..ddb86372 100644 --- a/_docs/kustomize/akash-operator-hostname/cluster_role.yaml +++ b/_docs/kustomize/akash-operator-hostname/cluster_role.yaml @@ -1,3 +1,4 @@ +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: diff --git a/_docs/kustomize/akash-operator-hostname/deployment.yaml b/_docs/kustomize/akash-operator-hostname/deployment.yaml index b029d93b..3fc6470a 100644 --- a/_docs/kustomize/akash-operator-hostname/deployment.yaml +++ b/_docs/kustomize/akash-operator-hostname/deployment.yaml @@ -1,3 +1,4 @@ +--- apiVersion: apps/v1 kind: Deployment metadata: @@ -18,46 +19,46 @@ spec: spec: serviceAccountName: akash-operator containers: - - name: akash-hostname-operator 
- image: ghcr.io/akash-network/provider:stable - imagePullPolicy: IfNotPresent - args: ["provider-services", "hostname-operator"] - ports: - - name: status - containerPort: 8085 - env: - - name: AP_K8S_MANIFEST_NS - valueFrom: - configMapKeyRef: - name: akash-provider-config - key: k8s-manifest-ns - - name: AP_PRUNE_INTERVAL - valueFrom: - configMapKeyRef: - name: akash-hostname-operator-config - key: prune-interval - - name: AP_IGNORE_LIST_ENTRY_LIMIT - valueFrom: - configMapKeyRef: - name: akash-hostname-operator-config - key: ignore-list-entry-limit - - name: AP_WEB_REFRESH_INTERVAL - valueFrom: - configMapKeyRef: - name: akash-hostname-operator-config - key: web-refresh-interval - - name: AP_RETRY_DELAY - valueFrom: - configMapKeyRef: - name: akash-hostname-operator-config - key: retry-delay - - name: AP_IGNORE_LIST_AGE_LIMIT - valueFrom: - configMapKeyRef: - name: akash-hostname-operator-config - key: ignore-list-age-limit - - name: AP_EVENT_FAILURE_LIMIT - valueFrom: - configMapKeyRef: - name: akash-hostname-operator-config - key: event-failure-limit + - name: akash-hostname-operator + image: ghcr.io/akash-network/provider:stable + imagePullPolicy: IfNotPresent + args: ["provider-services", "hostname-operator"] + ports: + - name: status + containerPort: 8085 + env: + - name: AP_K8S_MANIFEST_NS + valueFrom: + configMapKeyRef: + name: akash-provider-config + key: k8s-manifest-ns + - name: AP_PRUNE_INTERVAL + valueFrom: + configMapKeyRef: + name: akash-hostname-operator-config + key: prune-interval + - name: AP_IGNORE_LIST_ENTRY_LIMIT + valueFrom: + configMapKeyRef: + name: akash-hostname-operator-config + key: ignore-list-entry-limit + - name: AP_WEB_REFRESH_INTERVAL + valueFrom: + configMapKeyRef: + name: akash-hostname-operator-config + key: web-refresh-interval + - name: AP_RETRY_DELAY + valueFrom: + configMapKeyRef: + name: akash-hostname-operator-config + key: retry-delay + - name: AP_IGNORE_LIST_AGE_LIMIT + valueFrom: + configMapKeyRef: + name: 
akash-hostname-operator-config + key: ignore-list-age-limit + - name: AP_EVENT_FAILURE_LIMIT + valueFrom: + configMapKeyRef: + name: akash-hostname-operator-config + key: event-failure-limit diff --git a/_docs/kustomize/akash-operator-hostname/ingress.yaml b/_docs/kustomize/akash-operator-hostname/ingress.yaml index a0c1a816..2e38e0dc 100644 --- a/_docs/kustomize/akash-operator-hostname/ingress.yaml +++ b/_docs/kustomize/akash-operator-hostname/ingress.yaml @@ -1,3 +1,4 @@ +--- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -12,10 +13,10 @@ spec: - host: akash-hostname-operator.localhost http: paths: - - path: / - pathType: Prefix - backend: - service: - name: akash-hostname-operator - port: - name: status + - path: / + pathType: Prefix + backend: + service: + name: akash-hostname-operator + port: + name: status diff --git a/_docs/kustomize/akash-operator-hostname/kustomization.yaml b/_docs/kustomize/akash-operator-hostname/kustomization.yaml index 14fb6533..a31c1f67 100644 --- a/_docs/kustomize/akash-operator-hostname/kustomization.yaml +++ b/_docs/kustomize/akash-operator-hostname/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_docs/kustomize/akash-operator-hostname/rbac.yaml b/_docs/kustomize/akash-operator-hostname/rbac.yaml index d99a8978..1c0bcdda 100644 --- a/_docs/kustomize/akash-operator-hostname/rbac.yaml +++ b/_docs/kustomize/akash-operator-hostname/rbac.yaml @@ -1,3 +1,4 @@ +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: diff --git a/_docs/kustomize/akash-operator-hostname/service.yaml b/_docs/kustomize/akash-operator-hostname/service.yaml index 6a538d6a..5bc0b9f8 100644 --- a/_docs/kustomize/akash-operator-hostname/service.yaml +++ b/_docs/kustomize/akash-operator-hostname/service.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: Service metadata: diff --git 
a/_docs/kustomize/akash-operator-hostname/service_account.yaml b/_docs/kustomize/akash-operator-hostname/service_account.yaml index f5fccbd2..2288c026 100644 --- a/_docs/kustomize/akash-operator-hostname/service_account.yaml +++ b/_docs/kustomize/akash-operator-hostname/service_account.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: ServiceAccount metadata: diff --git a/_docs/kustomize/akash-operator-inventory/cluster-roles.yaml b/_docs/kustomize/akash-operator-inventory/cluster-roles.yaml index 353dd02d..05c24d8d 100644 --- a/_docs/kustomize/akash-operator-inventory/cluster-roles.yaml +++ b/_docs/kustomize/akash-operator-inventory/cluster-roles.yaml @@ -1,3 +1,4 @@ +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: @@ -74,6 +75,18 @@ rules: - get - list - watch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - update + - patch + - delete + - get + - list + - watch --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/_docs/kustomize/akash-operator-inventory/daemonset.yaml b/_docs/kustomize/akash-operator-inventory/daemonset.yaml index ef247d61..b5039e38 100644 --- a/_docs/kustomize/akash-operator-inventory/daemonset.yaml +++ b/_docs/kustomize/akash-operator-inventory/daemonset.yaml @@ -65,6 +65,8 @@ spec: initialDelaySeconds: 5 periodSeconds: 5 env: + - name: PCIDB_ENABLE_NETWORK_FETCH + value: "1" - name: AP_POD_NAME valueFrom: fieldRef: diff --git a/_docs/kustomize/akash-operator-inventory/deployment.yaml b/_docs/kustomize/akash-operator-inventory/deployment.yaml index 8cf7d133..dc93d96b 100644 --- a/_docs/kustomize/akash-operator-inventory/deployment.yaml +++ b/_docs/kustomize/akash-operator-inventory/deployment.yaml @@ -1,3 +1,4 @@ +--- apiVersion: apps/v1 kind: Deployment metadata: diff --git a/_docs/kustomize/akash-operator-inventory/kustomization.yaml b/_docs/kustomize/akash-operator-inventory/kustomization.yaml index 73238b1e..05e939c0 100644 --- 
a/_docs/kustomize/akash-operator-inventory/kustomization.yaml +++ b/_docs/kustomize/akash-operator-inventory/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_docs/kustomize/akash-operator-inventory/service.yaml b/_docs/kustomize/akash-operator-inventory/service.yaml index 47044401..30f9d517 100644 --- a/_docs/kustomize/akash-operator-inventory/service.yaml +++ b/_docs/kustomize/akash-operator-inventory/service.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: Service metadata: diff --git a/_docs/kustomize/akash-operator-ip/cluster_role.yaml b/_docs/kustomize/akash-operator-ip/cluster_role.yaml index 54aaeee1..ad03c97b 100644 --- a/_docs/kustomize/akash-operator-ip/cluster_role.yaml +++ b/_docs/kustomize/akash-operator-ip/cluster_role.yaml @@ -1,35 +1,58 @@ +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: akash-ip-op-manage-service rules: - - apiGroups: [""] - resources: ["services"] - verbs: ["get", "list", "create", "update", "delete", "deletecollection", "watch"] + - apiGroups: + - "" + resources: + - services + verbs: + - get + - list + - create + - update + - delete + - deletecollection + - watch --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: akash-ip-op-watch-providerleasedip rules: - - apiGroups: ["akash.network"] - resources: ["providerleasedips"] - verbs: ["get", "list", "watch"] + - apiGroups: + - akash.network + resources: + - providerleasedips + verbs: + - get + - list + - watch --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: akash-ip-op-watch-configmaps rules: - - apiGroups: [""] - resources: ["configmaps"] - verbs: ["get", "list", "watch"] + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: akash-ip-op-get-namespaces rules: - - apiGroups: [""] - resources: 
["namespaces"] - verbs: ["get"] + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get diff --git a/_docs/kustomize/akash-operator-ip/deployment.yaml b/_docs/kustomize/akash-operator-ip/deployment.yaml index 2e0631b9..9f567fa4 100644 --- a/_docs/kustomize/akash-operator-ip/deployment.yaml +++ b/_docs/kustomize/akash-operator-ip/deployment.yaml @@ -1,3 +1,4 @@ +--- apiVersion: apps/v1 kind: Deployment metadata: @@ -19,51 +20,51 @@ spec: spec: serviceAccountName: akash-ip-operator containers: - - name: akash-ip-operator - image: ghcr.io/akash-network/provider:stable - imagePullPolicy: IfNotPresent - args: ["provider-services", "ip-operator"] - ports: - - name: api - containerPort: 8086 - env: - - name: AP_K8S_MANIFEST_NS - valueFrom: - configMapKeyRef: - name: akash-provider-config - key: k8s-manifest-ns - - name: AP_PRUNE_INTERVAL - valueFrom: - configMapKeyRef: - name: akash-ip-operator-config - key: prune-interval - - name: AP_IGNORE_LIST_ENTRY_LIMIT - valueFrom: - configMapKeyRef: - name: akash-ip-operator-config - key: ignore-list-entry-limit - - name: AP_WEB_REFRESH_INTERVAL - valueFrom: - configMapKeyRef: - name: akash-ip-operator-config - key: web-refresh-interval - - name: AP_RETRY_DELAY - valueFrom: - configMapKeyRef: - name: akash-ip-operator-config - key: retry-delay - - name: AP_IGNORE_LIST_AGE_LIMIT - valueFrom: - configMapKeyRef: - name: akash-ip-operator-config - key: ignore-list-age-limit - - name: AP_EVENT_FAILURE_LIMIT - valueFrom: - configMapKeyRef: - name: akash-ip-operator-config - key: event-failure-limit - - name: AP_PROVIDER - valueFrom: - configMapKeyRef: - name: akash-ip-operator-config - key: provider-address + - name: akash-ip-operator + image: ghcr.io/akash-network/provider:stable + imagePullPolicy: IfNotPresent + args: ["provider-services", "ip-operator"] + ports: + - name: api + containerPort: 8086 + env: + - name: AP_K8S_MANIFEST_NS + valueFrom: + configMapKeyRef: + name: akash-provider-config + key: k8s-manifest-ns + - 
name: AP_PRUNE_INTERVAL + valueFrom: + configMapKeyRef: + name: akash-ip-operator-config + key: prune-interval + - name: AP_IGNORE_LIST_ENTRY_LIMIT + valueFrom: + configMapKeyRef: + name: akash-ip-operator-config + key: ignore-list-entry-limit + - name: AP_WEB_REFRESH_INTERVAL + valueFrom: + configMapKeyRef: + name: akash-ip-operator-config + key: web-refresh-interval + - name: AP_RETRY_DELAY + valueFrom: + configMapKeyRef: + name: akash-ip-operator-config + key: retry-delay + - name: AP_IGNORE_LIST_AGE_LIMIT + valueFrom: + configMapKeyRef: + name: akash-ip-operator-config + key: ignore-list-age-limit + - name: AP_EVENT_FAILURE_LIMIT + valueFrom: + configMapKeyRef: + name: akash-ip-operator-config + key: event-failure-limit + - name: AP_PROVIDER + valueFrom: + configMapKeyRef: + name: akash-ip-operator-config + key: provider-address diff --git a/_docs/kustomize/akash-operator-ip/ingress.yaml b/_docs/kustomize/akash-operator-ip/ingress.yaml index e36613b8..3b3bdb69 100644 --- a/_docs/kustomize/akash-operator-ip/ingress.yaml +++ b/_docs/kustomize/akash-operator-ip/ingress.yaml @@ -1,3 +1,4 @@ +--- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -13,10 +14,10 @@ spec: - host: akash-ip-operator.localhost http: paths: - - path: / - pathType: Prefix - backend: - service: - name: akash-ip-operator - port: - name: status + - path: / + pathType: Prefix + backend: + service: + name: akash-ip-operator + port: + name: status diff --git a/_docs/kustomize/akash-operator-ip/kustomization.yaml b/_docs/kustomize/akash-operator-ip/kustomization.yaml index f8888a61..404425bb 100644 --- a/_docs/kustomize/akash-operator-ip/kustomization.yaml +++ b/_docs/kustomize/akash-operator-ip/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_docs/kustomize/akash-operator-ip/rbac.yaml b/_docs/kustomize/akash-operator-ip/rbac.yaml index 4d509bf8..feffde2c 100644 --- 
a/_docs/kustomize/akash-operator-ip/rbac.yaml +++ b/_docs/kustomize/akash-operator-ip/rbac.yaml @@ -1,3 +1,4 @@ +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: diff --git a/_docs/kustomize/akash-operator-ip/service.yaml b/_docs/kustomize/akash-operator-ip/service.yaml index 2ffb4f4f..8fce464f 100644 --- a/_docs/kustomize/akash-operator-ip/service.yaml +++ b/_docs/kustomize/akash-operator-ip/service.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: Service metadata: diff --git a/_docs/kustomize/akash-operator-ip/service_account.yaml b/_docs/kustomize/akash-operator-ip/service_account.yaml index 0dab5636..9340609a 100644 --- a/_docs/kustomize/akash-operator-ip/service_account.yaml +++ b/_docs/kustomize/akash-operator-ip/service_account.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: ServiceAccount metadata: diff --git a/_docs/kustomize/akash-provider/deployment.yaml b/_docs/kustomize/akash-provider/deployment.yaml index e852ef73..5f9eb6bd 100644 --- a/_docs/kustomize/akash-provider/deployment.yaml +++ b/_docs/kustomize/akash-provider/deployment.yaml @@ -1,3 +1,4 @@ +--- apiVersion: apps/v1 kind: Deployment metadata: @@ -17,7 +18,7 @@ spec: - name: import-keys image: ghcr.io/akash-network/node:stable imagePullPolicy: IfNotPresent - command: [ "/bin/sh" , "/boot/run.sh" ] + command: ["/bin/sh", "/boot/run.sh"] env: ## # boot config diff --git a/_docs/kustomize/akash-provider/ingress.yaml b/_docs/kustomize/akash-provider/ingress.yaml index 5a17e542..02120f89 100644 --- a/_docs/kustomize/akash-provider/ingress.yaml +++ b/_docs/kustomize/akash-provider/ingress.yaml @@ -1,3 +1,4 @@ +--- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: diff --git a/_docs/kustomize/akash-provider/kustomization.yaml b/_docs/kustomize/akash-provider/kustomization.yaml index 16bfed1b..4f0551ae 100644 --- a/_docs/kustomize/akash-provider/kustomization.yaml +++ b/_docs/kustomize/akash-provider/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: 
kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_docs/kustomize/akash-provider/rbac.yaml b/_docs/kustomize/akash-provider/rbac.yaml index 01db79bb..d630d5b1 100644 --- a/_docs/kustomize/akash-provider/rbac.yaml +++ b/_docs/kustomize/akash-provider/rbac.yaml @@ -1,3 +1,4 @@ +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: diff --git a/_docs/kustomize/akash-provider/service.yaml b/_docs/kustomize/akash-provider/service.yaml index 421d1e7c..4d649f48 100644 --- a/_docs/kustomize/akash-provider/service.yaml +++ b/_docs/kustomize/akash-provider/service.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: Service metadata: diff --git a/_docs/kustomize/akash-provider/service_account.yaml b/_docs/kustomize/akash-provider/service_account.yaml index f102564a..7810b824 100644 --- a/_docs/kustomize/akash-provider/service_account.yaml +++ b/_docs/kustomize/akash-provider/service_account.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: ServiceAccount metadata: diff --git a/_docs/kustomize/akash-services/kustomization.yaml b/_docs/kustomize/akash-services/kustomization.yaml index 0b51f85a..3fff0fe4 100644 --- a/_docs/kustomize/akash-services/kustomization.yaml +++ b/_docs/kustomize/akash-services/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_docs/kustomize/akash-services/network-policies.yaml b/_docs/kustomize/akash-services/network-policies.yaml index a0bbbfad..78718e1f 100644 --- a/_docs/kustomize/akash-services/network-policies.yaml +++ b/_docs/kustomize/akash-services/network-policies.yaml @@ -4,33 +4,33 @@ kind: NetworkPolicy metadata: name: akash-services-default-deny-ingress spec: - podSelector: + podSelector: matchLabels: {} policyTypes: - - Ingress + - Ingress --- apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: name: akash-services-allow-akash-services spec: - podSelector: + podSelector: 
matchLabels: {} ingress: - - from: - - namespaceSelector: - matchLabels: - akash.network/name: akash-services + - from: + - namespaceSelector: + matchLabels: + akash.network/name: akash-services --- apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: name: akash-services-allow-ingress-nginx spec: - podSelector: + podSelector: matchLabels: {} ingress: - - from: - - namespaceSelector: - matchLabels: - app.kubernetes.io/name: ingress-nginx + - from: + - namespaceSelector: + matchLabels: + app.kubernetes.io/name: ingress-nginx diff --git a/_docs/kustomize/kind/kind-metrics-server.yaml b/_docs/kustomize/kind/kind-metrics-server.yaml index 3ef71146..89672e64 100644 --- a/_docs/kustomize/kind/kind-metrics-server.yaml +++ b/_docs/kustomize/kind/kind-metrics-server.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: ServiceAccount metadata: diff --git a/_docs/kustomize/networking/namespace.yaml b/_docs/kustomize/networking/namespace.yaml index b4aa03ef..70301048 100644 --- a/_docs/kustomize/networking/namespace.yaml +++ b/_docs/kustomize/networking/namespace.yaml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: Namespace metadata: @@ -14,4 +15,3 @@ metadata: labels: name: lease akash.network: "true" - diff --git a/_docs/kustomize/networking/network-policy-default-ns-deny.yaml b/_docs/kustomize/networking/network-policy-default-ns-deny.yaml index 3dea46c0..f8184e0d 100644 --- a/_docs/kustomize/networking/network-policy-default-ns-deny.yaml +++ b/_docs/kustomize/networking/network-policy-default-ns-deny.yaml @@ -5,8 +5,8 @@ metadata: name: default-deny-ingress namespace: default spec: - podSelector: + podSelector: matchLabels: {} policyTypes: - - Ingress - - Egress + - Ingress + - Egress diff --git a/_docs/kustomize/templates/akash-node/docker-image.yaml b/_docs/kustomize/templates/akash-node/docker-image.yaml index 58e517a1..574d61bf 100644 --- a/_docs/kustomize/templates/akash-node/docker-image.yaml +++ b/_docs/kustomize/templates/akash-node/docker-image.yaml @@ -1,3 +1,4 @@ 
+--- - op: replace path: /spec/template/spec/containers/0/image value: ghcr.io/akash-network/node:stable diff --git a/_docs/kustomize/templates/akash-node/gateway-host.yaml b/_docs/kustomize/templates/akash-node/gateway-host.yaml index 5c9411e0..758fcf82 100644 --- a/_docs/kustomize/templates/akash-node/gateway-host.yaml +++ b/_docs/kustomize/templates/akash-node/gateway-host.yaml @@ -1,3 +1,4 @@ +--- - op: replace path: /spec/rules/0/host value: akash.localhost diff --git a/_docs/kustomize/templates/akash-node/kustomization.yaml b/_docs/kustomize/templates/akash-node/kustomization.yaml index 9d8335d7..a2ce3d5b 100644 --- a/_docs/kustomize/templates/akash-node/kustomization.yaml +++ b/_docs/kustomize/templates/akash-node/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_docs/kustomize/templates/akash-operator-hostname/docker-image.yaml b/_docs/kustomize/templates/akash-operator-hostname/docker-image.yaml index 08ad6369..7276c58e 100644 --- a/_docs/kustomize/templates/akash-operator-hostname/docker-image.yaml +++ b/_docs/kustomize/templates/akash-operator-hostname/docker-image.yaml @@ -1,3 +1,4 @@ +--- - op: replace path: /spec/template/spec/containers/0/image value: ghcr.io/akash-network/provider:stable diff --git a/_docs/kustomize/templates/akash-operator-hostname/kustomization.yaml b/_docs/kustomize/templates/akash-operator-hostname/kustomization.yaml index b5f4f359..aa3bf11e 100644 --- a/_docs/kustomize/templates/akash-operator-hostname/kustomization.yaml +++ b/_docs/kustomize/templates/akash-operator-hostname/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_docs/kustomize/templates/akash-operator-inventory/docker-image.yaml b/_docs/kustomize/templates/akash-operator-inventory/docker-image.yaml index 08ad6369..7276c58e 100644 --- 
a/_docs/kustomize/templates/akash-operator-inventory/docker-image.yaml +++ b/_docs/kustomize/templates/akash-operator-inventory/docker-image.yaml @@ -1,3 +1,4 @@ +--- - op: replace path: /spec/template/spec/containers/0/image value: ghcr.io/akash-network/provider:stable diff --git a/_docs/kustomize/templates/akash-operator-inventory/kustomization.yaml b/_docs/kustomize/templates/akash-operator-inventory/kustomization.yaml index c36f6a15..18cfe34e 100644 --- a/_docs/kustomize/templates/akash-operator-inventory/kustomization.yaml +++ b/_docs/kustomize/templates/akash-operator-inventory/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_docs/kustomize/templates/akash-operator-ip/docker-image.yaml b/_docs/kustomize/templates/akash-operator-ip/docker-image.yaml index 08ad6369..7276c58e 100644 --- a/_docs/kustomize/templates/akash-operator-ip/docker-image.yaml +++ b/_docs/kustomize/templates/akash-operator-ip/docker-image.yaml @@ -1,3 +1,4 @@ +--- - op: replace path: /spec/template/spec/containers/0/image value: ghcr.io/akash-network/provider:stable diff --git a/_docs/kustomize/templates/akash-operator-ip/kustomization.yaml b/_docs/kustomize/templates/akash-operator-ip/kustomization.yaml index 5c872439..2b48abc4 100644 --- a/_docs/kustomize/templates/akash-operator-ip/kustomization.yaml +++ b/_docs/kustomize/templates/akash-operator-ip/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_docs/kustomize/templates/akash-provider/docker-image.yaml b/_docs/kustomize/templates/akash-provider/docker-image.yaml index 8176a3db..15c2b2a7 100644 --- a/_docs/kustomize/templates/akash-provider/docker-image.yaml +++ b/_docs/kustomize/templates/akash-provider/docker-image.yaml @@ -1,3 +1,4 @@ +--- - op: replace path: /spec/template/spec/initContainers/0/image value: ghcr.io/akash-network/node:stable diff 
--git a/_docs/kustomize/templates/akash-provider/gateway-host.yaml b/_docs/kustomize/templates/akash-provider/gateway-host.yaml index 3580f857..765172ae 100644 --- a/_docs/kustomize/templates/akash-provider/gateway-host.yaml +++ b/_docs/kustomize/templates/akash-provider/gateway-host.yaml @@ -1,3 +1,4 @@ +--- - op: replace path: /spec/rules/0/host value: akash-provider.localhost diff --git a/_docs/kustomize/templates/akash-provider/kustomization.yaml b/_docs/kustomize/templates/akash-provider/kustomization.yaml index df21badc..b1878298 100644 --- a/_docs/kustomize/templates/akash-provider/kustomization.yaml +++ b/_docs/kustomize/templates/akash-provider/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_docs/kustomize/templates/redis/kustomization.yaml b/_docs/kustomize/templates/redis/kustomization.yaml index 65c48bdc..0a2cc813 100644 --- a/_docs/kustomize/templates/redis/kustomization.yaml +++ b/_docs/kustomize/templates/redis/kustomization.yaml @@ -1,3 +1,4 @@ +--- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: akash-services diff --git a/_run/common-kube.mk b/_run/common-kube.mk index 44351aef..cf3fa43e 100644 --- a/_run/common-kube.mk +++ b/_run/common-kube.mk @@ -5,7 +5,6 @@ include ../common-kustomize.mk include ../common-kind.mk include ../common-helm.mk -KUBE_SSH_NODE_NAME ?= akash-gpu KUBE_UPLOAD_AKASH_IMAGE ?= false KUBE_CLUSTER_CREATE_TARGET ?= default KUBE_ROLLOUT_TIMEOUT ?= 180 @@ -14,6 +13,10 @@ INGRESS_CONFIG_PATH ?= ../ingress-nginx.yaml CALICO_MANIFEST ?= https://github.com/projectcalico/calico/blob/v3.25.0/manifests/calico.yaml CRD_FILE ?= $(AP_ROOT)/pkg/apis/akash.network/crd.yaml +ifeq ($(KUBE_SSH_NODE_NAME),) +$(error "KUBE_SSH_NODE_NAME is not set") +endif + # when image is built locally, for example on M1 (arm64) and kubernetes cluster is running on amd64 # we need to specify what arch to deploy as docker manifests can't 
be transferred locally KUBE_DOCKER_IMAGE_ARCH ?= $(shell uname -m | sed "s/x86_64/amd64/g") @@ -75,11 +78,11 @@ kube-upload-images: kube-upload-images-$(KUBE_CLUSTER_CREATE_TARGET) .PHONY: kube-upload-images-kind kube-upload-images-kind: $(KIND) - $(AP_ROOT)/script/load_docker2kind.sh "$(DOCKER_LOAD_IMAGES)" $(KIND_NAME) + $(AP_ROOT)/script/setup-kube.sh load-images docker2kind "$(KIND_NAME)" "$(DOCKER_LOAD_IMAGES)" .PHONY: kube-upload-images-default kube-upload-images-default: - $(AP_ROOT)/script/load_docker2ctr.sh "$(DOCKER_LOAD_IMAGES)" $(KUBE_SSH_NODE_NAME) + $(AP_ROOT)/script/setup-kube.sh load-images docker2ctr "$(KUBE_SSH_NODE_NAME)" "$(DOCKER_LOAD_IMAGES)" .PHONY: kube-upload-crd kube-upload-crd: @@ -187,3 +190,15 @@ akash-node-ready: done; \ exit 1 \ ) + + +.PHONY: kube-operator-inventory-logs +kube-operator-inventory-logs: + kubectl -n akash-services logs -f \ + -l app.kubernetes.io/part-of=provider,app.kubernetes.io/component=operator,app.kubernetes.io/instance=inventory-service,app.kubernetes.io/name=inventory + +.PHONY: kube-operator-inventory-node-logs +kube-operator-inventory-node-logs: + kubectl -n akash-services logs -f \ + -l app.kubernetes.io/part-of=provider,app.kubernetes.io/component=operator,app.kubernetes.io/instance=inventory-node,app.kubernetes.io/name=inventory + diff --git a/_run/common.mk b/_run/common.mk index 70fa132b..54045e9d 100644 --- a/_run/common.mk +++ b/_run/common.mk @@ -14,6 +14,7 @@ export AKASH_CHAIN_ID = local export AKASH_YES = true export AKASH_GAS_PRICES = 0.025uakt export AKASH_GAS = auto +export AKASH_NODE = http://localhost:26657 export AP_HOME = $(AKASH_HOME) export AP_KEYRING_BACKEND = $(AKASH_KEYRING_BACKEND) @@ -22,6 +23,7 @@ export AP_CHAIN_ID = $(AKASH_CHAIN_ID) export AP_YES = $(AKASH_YES) export AP_GAS_PRICES = $(AKASH_GAS_PRICES) export AP_GAS = $(AKASH_GAS) +export AP_NODE = $(AKASH_NODE) AKASH_INIT := $(AP_RUN_DIR)/.akash-init diff --git a/_run/kube/Makefile b/_run/kube/Makefile index 3df01ce1..b10bbe20 
100644 --- a/_run/kube/Makefile +++ b/_run/kube/Makefile @@ -8,9 +8,6 @@ KUSTOMIZE_INSTALLS ?= \ akash-operator-inventory \ akash-operator-ip -export AKASH_NODE=http://localhost:26657 -export AP_NODE=http://localhost:26657 - include ../common.mk include ../common-commands.mk include ../common-kube.mk diff --git a/_run/ssh/.envrc b/_run/ssh/.envrc index 5136c27c..cd4d9fdd 100644 --- a/_run/ssh/.envrc +++ b/_run/ssh/.envrc @@ -1,5 +1,13 @@ source_up .envrc - source_env ~/projects/akash/gpu + +dotenv_if_exists dev.env + +#source_env ~/projects/akash/gpu export AKASH_HOME=$DEVCACHE_RUN/ssh/.akash export AKASH_KUBECONFIG=$KUBECONFIG +export AP_KUBECONFIG=$KUBECONFIG + +if ! has tqdm ; then + echo -e "\033[31mtqdm is not installed. https://github.com/tqdm/tqdm"; exit 1 +fi diff --git a/_run/ssh/Makefile b/_run/ssh/Makefile index 5b38db21..72eb97a6 100644 --- a/_run/ssh/Makefile +++ b/_run/ssh/Makefile @@ -2,14 +2,13 @@ KUBE_SETUP_PREREQUISITES ?= \ KUBE_UPLOAD_AKASH_IMAGE ?= true +KUBE_DOCKER_IMAGE_ARCH := amd64 + KUSTOMIZE_INSTALLS ?= \ akash-operator-hostname \ akash-operator-inventory \ akash-operator-ip -export AKASH_NODE=http://localhost:26657 -export AP_NODE=http://localhost:26657 - SDL_PATH ?= gpu.yaml include ../common.mk diff --git a/_run/ssh/README.md b/_run/ssh/README.md index 81caa088..4a6f6844 100644 --- a/_run/ssh/README.md +++ b/_run/ssh/README.md @@ -1,4 +1,4 @@ -# Dev Environment: "Kube" configuration +# Dev Environment: "SSH" configuration The _Kube_ dev environment builds: @@ -61,7 +61,7 @@ make init Start and initialize kind. -Kubernetes ingress objects present some difficulties for creating development +Kubernetes' ingress objects present some difficulties for creating development environments. Two options are offered below - the first (random port) is less error-prone and can have multiple instances run concurrently, while the second option arguably has a better payoff. 
diff --git a/_run/ssh/kind-config-80.yaml b/_run/ssh/kind-config-80.yaml index a85fe0d6..22e0b06e 100644 --- a/_run/ssh/kind-config-80.yaml +++ b/_run/ssh/kind-config-80.yaml @@ -1,14 +1,15 @@ +--- kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 nodes: -- role: control-plane - kubeadmConfigPatches: - - | - kind: InitConfiguration - nodeRegistration: - kubeletExtraArgs: - node-labels: "ingress-ready=true" - extraPortMappings: - - containerPort: 80 - hostPort: 80 - protocol: TCP + - role: control-plane + kubeadmConfigPatches: + - | + kind: InitConfiguration + nodeRegistration: + kubeletExtraArgs: + node-labels: "ingress-ready=true" + extraPortMappings: + - containerPort: 80 + hostPort: 80 + protocol: TCP diff --git a/_run/ssh/kind-config.yaml b/_run/ssh/kind-config.yaml index 43ad9149..a36333b7 100644 --- a/_run/ssh/kind-config.yaml +++ b/_run/ssh/kind-config.yaml @@ -1,13 +1,14 @@ +--- kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 nodes: -- role: control-plane - kubeadmConfigPatches: - - | - kind: InitConfiguration - nodeRegistration: - kubeletExtraArgs: - node-labels: "ingress-ready=true" - extraPortMappings: - - containerPort: 80 - protocol: TCP + - role: control-plane + kubeadmConfigPatches: + - | + kind: InitConfiguration + nodeRegistration: + kubeletExtraArgs: + node-labels: "ingress-ready=true" + extraPortMappings: + - containerPort: 80 + protocol: TCP diff --git a/_run/ssh/provider.yaml b/_run/ssh/provider.yaml index 57371ebf..987b9ca6 100644 --- a/_run/ssh/provider.yaml +++ b/_run/ssh/provider.yaml @@ -1,3 +1,4 @@ +--- host: https://localhost:8443 jwt-host: https://localhost:8444 attributes: diff --git a/cmd/provider-services/cmd/flags/kube_config.go b/cmd/provider-services/cmd/flags/kube_config.go index cf3e48bf..73f9e4ef 100644 --- a/cmd/provider-services/cmd/flags/kube_config.go +++ b/cmd/provider-services/cmd/flags/kube_config.go @@ -11,5 +11,9 @@ const ( func AddKubeConfigPathFlag(cmd *cobra.Command) error { 
cmd.PersistentFlags().String(FlagKubeConfig, "$HOME/.kube/config", "kubernetes configuration file path") + if err := viper.BindEnv(FlagKubeConfig, "KUBECONFIG"); err != nil { + return err + } + return viper.BindPFlag(FlagKubeConfig, cmd.PersistentFlags().Lookup(FlagKubeConfig)) } diff --git a/go.mod b/go.mod index 40188528..ce902503 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/akash-network/provider go 1.21 require ( - github.com/akash-network/akash-api v0.0.45 + github.com/akash-network/akash-api v0.0.47 github.com/akash-network/node v0.30.1-rc4 github.com/avast/retry-go/v4 v4.5.0 github.com/blang/semver/v4 v4.0.0 diff --git a/go.sum b/go.sum index 99df7370..656e7e88 100644 --- a/go.sum +++ b/go.sum @@ -197,8 +197,8 @@ github.com/afex/hystrix-go v0.0.0-20180502004556-fa1af6a1f4f5/go.mod h1:SkGFH1ia github.com/agnivade/levenshtein v1.0.1/go.mod h1:CURSv5d9Uaml+FovSIICkLbAUZ9S4RqaHDIsdSBg7lM= github.com/ajg/form v1.5.1/go.mod h1:uL1WgH+h2mgNtvBq0339dVnzXdBETtL2LeUXaIv25UY= github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= -github.com/akash-network/akash-api v0.0.45 h1:NEEP3R+Y8WL6cmHequdhprPMIt2vVhdgwQIKgeDOjgE= -github.com/akash-network/akash-api v0.0.45/go.mod h1:YZq1ukyGEknizGaE6g+A8yGupGqEN8hG8PGPPE8VAxA= +github.com/akash-network/akash-api v0.0.47 h1:1dCe7wKoqaOW9bUFxM46aY567O50DFshljk36mPMXYg= +github.com/akash-network/akash-api v0.0.47/go.mod h1:YZq1ukyGEknizGaE6g+A8yGupGqEN8hG8PGPPE8VAxA= github.com/akash-network/cometbft v0.34.27-akash h1:V1dApDOr8Ee7BJzYyQ7Z9VBtrAul4+baMeA6C49dje0= github.com/akash-network/cometbft v0.34.27-akash/go.mod h1:BcCbhKv7ieM0KEddnYXvQZR+pZykTKReJJYf7YC7qhw= github.com/akash-network/ledger-go v0.14.3 h1:LCEFkTfgGA2xFMN2CtiKvXKE7dh0QSM77PJHCpSkaAo= diff --git a/operator/inventory/cmd.go b/operator/inventory/cmd.go index 0552a259..a3ba7eb5 100644 --- a/operator/inventory/cmd.go +++ b/operator/inventory/cmd.go @@ -75,12 +75,18 @@ func Cmd() 
*cobra.Command { return err } + ac, err := akashclientset.NewForConfig(kubecfg) + if err != nil { + return err + } + startupch := make(chan struct{}, 1) pctx, pcancel := context.WithCancel(context.Background()) fromctx.CmdSetContextValue(cmd, fromctx.CtxKeyStartupCh, (chan<- struct{})(startupch)) fromctx.CmdSetContextValue(cmd, fromctx.CtxKeyKubeConfig, kubecfg) fromctx.CmdSetContextValue(cmd, fromctx.CtxKeyKubeClientSet, kc) + fromctx.CmdSetContextValue(cmd, fromctx.CtxKeyAkashClientSet, ac) fromctx.CmdSetContextValue(cmd, fromctx.CtxKeyErrGroup, group) fromctx.CmdSetContextValue(cmd, fromctx.CtxKeyPubSub, pubsub.New(pctx, 1000)) @@ -106,13 +112,7 @@ func Cmd() *cobra.Command { return err } - ac, err := akashclientset.NewForConfig(kubecfg) - if err != nil { - return err - } - fromctx.CmdSetContextValue(cmd, CtxKeyRookClientSet, rc) - fromctx.CmdSetContextValue(cmd, fromctx.CtxKeyAkashClientSet, ac) return nil }, @@ -279,6 +279,11 @@ func Cmd() *cobra.Command { panic(err) } + cmd.Flags().Duration(FlagRegistryQueryPeriod, 5*time.Minute, "query period for registry changes") + if err = viper.BindPFlag(FlagRegistryQueryPeriod, cmd.Flags().Lookup(FlagRegistryQueryPeriod)); err != nil { + panic(err) + } + cmd.AddCommand(cmdFeatureDiscoveryNode()) return cmd @@ -297,6 +302,55 @@ func loadKubeConfig(c *cobra.Command) error { return nil } +func configWatcher(ctx context.Context, file string) error { + config, err := loadConfig(file, false) + if err != nil { + return err + } + + var watcher *fsnotify.Watcher + var evtch chan fsnotify.Event + + if strings.HasSuffix(file, "yaml") { + watcher, err = fsnotify.NewWatcher() + if err != nil { + return err + } + } + + defer func() { + if watcher != nil { + _ = watcher.Close() + } + }() + + if watcher != nil { + if err = watcher.Add(file); err != nil { + return err + } + + evtch = watcher.Events + } + + bus := fromctx.PubSubFromCtx(ctx) + + bus.Pub(config, []string{"config"}, pubsub.WithRetain()) + + for { + select { + case 
<-ctx.Done(): + return ctx.Err() + case evt := <-evtch: + if evt.Has(fsnotify.Create) || evt.Has(fsnotify.Write) { + config, _ = loadConfig(evt.Name, true) + } else if evt.Has(fsnotify.Remove) { + config, _ = loadConfig("", true) + } + bus.Pub(config, []string{"config"}, pubsub.WithRetain()) + } + } +} + func newServiceRouter(apiTimeout, queryTimeout time.Duration) *serviceRouter { mRouter := mux.NewRouter() rt := &serviceRouter{ @@ -446,52 +500,3 @@ loop: } } } - -func configWatcher(ctx context.Context, file string) error { - config, err := loadConfig(file, false) - if err != nil { - return err - } - - var watcher *fsnotify.Watcher - var evtch chan fsnotify.Event - - if strings.HasSuffix(file, "yaml") { - watcher, err = fsnotify.NewWatcher() - if err != nil { - return err - } - } - - defer func() { - if watcher != nil { - _ = watcher.Close() - } - }() - - if watcher != nil { - if err = watcher.Add(file); err != nil { - return err - } - - evtch = watcher.Events - } - - bus := fromctx.PubSubFromCtx(ctx) - - bus.Pub(config, []string{"config"}, pubsub.WithRetain()) - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case evt := <-evtch: - if evt.Has(fsnotify.Create) || evt.Has(fsnotify.Write) { - config, _ = loadConfig(evt.Name, true) - } else if evt.Has(fsnotify.Remove) { - config, _ = loadConfig("", true) - } - bus.Pub(config, []string{"config"}, pubsub.WithRetain()) - } - } -} diff --git a/operator/inventory/feature-discovery-client.go b/operator/inventory/feature-discovery-client.go index e26968e5..cb4e1dee 100644 --- a/operator/inventory/feature-discovery-client.go +++ b/operator/inventory/feature-discovery-client.go @@ -179,6 +179,8 @@ func (fd *featureDiscovery) run() error { res = append(res, nd.Dup()) } + sort.Sort(res) + return res } diff --git a/operator/inventory/feature-discovery-node.go b/operator/inventory/feature-discovery-node.go index 16ce1185..901113dd 100644 --- a/operator/inventory/feature-discovery-node.go +++ 
b/operator/inventory/feature-discovery-node.go @@ -1,7 +1,9 @@ package inventory import ( - "embed" + "bytes" + "crypto/sha256" + "crypto/tls" "encoding/json" "errors" "fmt" @@ -9,6 +11,7 @@ import ( "net" "net/http" "strconv" + "strings" "time" "github.com/go-logr/logr" @@ -48,23 +51,9 @@ const ( topicStorage = "storage" topicConfig = "config" topicClusterState = "cluster-state" + topicGPUIDs = "gpu-ids" ) -type gpuDevice struct { - Name string `json:"name"` - Interface string `json:"interface"` - MemorySize string `json:"memory_size"` -} - -type gpuDevices map[string]gpuDevice - -type gpuVendor struct { - Name string `json:"name"` - Devices gpuDevices `json:"devices"` -} - -type gpuVendors map[string]gpuVendor - type dpReqType int const ( @@ -108,34 +97,6 @@ type fdNodeServer struct { nodeName string } -var ( - supportedGPUs = gpuVendors{} - - //go:embed gpu-info.json - gpuDevs embed.FS -) - -func init() { - f, err := gpuDevs.Open("gpu-info.json") - if err != nil { - panic(err) - } - // close pci.ids file when done - defer func() { - _ = f.Close() - }() - - data, err := io.ReadAll(f) - if err != nil { - panic(err) - } - - err = json.Unmarshal(data, &supportedGPUs) - if err != nil { - panic(err) - } -} - func cmdFeatureDiscoveryNode() *cobra.Command { cmd := &cobra.Command{ Use: "node", @@ -289,6 +250,10 @@ func cmdFeatureDiscoveryNode() *cobra.Command { nodeName: nodeName, } + group.Go(func() error { + return registryLoader(ctx) + }) + startch := make(chan struct{}, 1) group.Go(func() error { defer func() { @@ -359,6 +324,16 @@ func cmdFeatureDiscoveryNode() *cobra.Command { panic(err) } + cmd.Flags().String(FlagProviderConfigsURL, defaultProviderConfigsURL, "provider configs server") + if err := viper.BindPFlag(FlagProviderConfigsURL, cmd.Flags().Lookup(FlagProviderConfigsURL)); err != nil { + panic(err) + } + + cmd.Flags().String(FlagPciDbURL, "https://pci-ids.ucw.cz/v2.2/pci.ids", "URL of the pci.ids database") + if err := viper.BindPFlag(FlagPciDbURL, 
cmd.Flags().Lookup(FlagPciDbURL)); err != nil { + panic(err) + } + return cmd } @@ -437,13 +412,17 @@ func (rt *nodeRouter) readyHandler(w http.ResponseWriter, req *http.Request) { func (nd *fdNodeServer) run(startch chan<- struct{}) error { kc := fromctx.KubeClientFromCtx(nd.ctx) + bus := fromctx.PubSubFromCtx(nd.ctx) + + subEvents := bus.Sub(topicGPUIDs) + defer bus.Unsub(subEvents) nodeWatch, err := kc.CoreV1().Nodes().Watch(nd.ctx, metav1.ListOptions{ LabelSelector: builder.AkashManagedLabelName + "=true", FieldSelector: fields.OneTermEqualSelector(metav1.ObjectNameField, nd.nodeName).String(), }) if err != nil { - nd.log.Error(err, fmt.Sprintf("unable to start node watcher for \"%s\"", nd.nodeName)) + nd.log.Error(err, fmt.Sprintf("unable to watch node \"%s\"", nd.nodeName)) return err } @@ -453,13 +432,21 @@ func (nd *fdNodeServer) run(startch chan<- struct{}) error { FieldSelector: fields.OneTermEqualSelector("spec.nodeName", nd.nodeName).String(), }) if err != nil { - nd.log.Error(err, "unable to fetch pods") + nd.log.Error(err, "unable to watch start pods") return err } defer podsWatch.Stop() - node, initPods, err := initNodeInfo(nd.ctx, nd.nodeName) + gpusIDs := make(RegistryGPUVendors) + + select { + case evt := <-subEvents: + gpusIDs = evt.(RegistryGPUVendors) + default: + } + + node, initPods, err := initNodeInfo(nd.ctx, nd.nodeName, gpusIDs) if err != nil { nd.log.Error(err, "unable to init node info") return err @@ -491,6 +478,9 @@ func (nd *fdNodeServer) run(startch chan<- struct{}) error { nd.pub.Pub(node.Dup(), []string{topicNode}, pubsub.WithRetain()) case req := <-nd.reqch: req <- node.Dup() + case evt := <-subEvents: + gpusIDs = evt.(RegistryGPUVendors) + node.Resources.GPU.Info, _ = parseGPUInfo(nd.ctx, gpusIDs) case res := <-nodeWatch.ResultChan(): obj := res.Object.(*corev1.Node) switch res.Type { @@ -563,7 +553,12 @@ func subAllocatedResources(node *v1.Node, rl corev1.ResourceList) { } } -func initNodeInfo(ctx context.Context, name string) 
(v1.Node, map[string]corev1.Pod, error) { +func initNodeInfo(ctx context.Context, name string, gpusIds RegistryGPUVendors) (v1.Node, map[string]corev1.Pod, error) { + defer func() { + if r := recover(); r != nil { + fromctx.LogrFromCtx(ctx).Info(fmt.Sprintf("recovered from panic: %s", r)) + } + }() kc := fromctx.KubeClientFromCtx(ctx) cpuInfo, err := parseCPUInfo(ctx) @@ -571,11 +566,9 @@ func initNodeInfo(ctx context.Context, name string) (v1.Node, map[string]corev1. return v1.Node{}, nil, err } - gpuInfo, err := parseGPUInfo(ctx) + gpuInfo, err := parseGPUInfo(ctx, gpusIds) if err != nil { - log := fromctx.LogrFromCtx(ctx) - log.Error(err, "couldn't pull GPU info") - // return v1.Node{}, nil, err + return v1.Node{}, nil, err } knode, err := kc.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{}) @@ -624,6 +617,7 @@ func initNodeInfo(ctx context.Context, name string) (v1.Node, map[string]corev1. case corev1.ResourceEphemeralStorage: res.Resources.EphemeralStorage.Allocatable.Set(r.Value()) case builder.ResourceGPUNvidia: + fallthrough case builder.ResourceGPUAMD: res.Resources.GPU.Quantity.Allocatable.Set(r.Value()) } @@ -638,9 +632,17 @@ func initNodeInfo(ctx context.Context, name string) (v1.Node, map[string]corev1. 
return res, nil, err } + if podsList == nil { + return res, initPods, nil + } + for _, pod := range podsList.Items { for _, container := range pod.Spec.Containers { - addAllocatedResources(&res, container.Resources.Requests) + if container.Resources.Requests != nil { + addAllocatedResources(&res, container.Resources.Requests) + } else if container.Resources.Limits != nil { + addAllocatedResources(&res, container.Resources.Limits) + } } initPods[pod.Name] = pod } @@ -740,7 +742,13 @@ func parseCPUInfo(ctx context.Context) (v1.CPUInfoS, error) { return res, nil } -func parseGPUInfo(ctx context.Context) (v1.GPUInfoS, error) { +func parseGPUInfo(ctx context.Context, info RegistryGPUVendors) (v1.GPUInfoS, error) { + defer func() { + if r := recover(); r != nil { + fromctx.LogrFromCtx(ctx).Info(fmt.Sprintf("recovered from panic: %s", r)) + } + }() + res := make(v1.GPUInfoS, 0) if err := ctx.Err(); err != nil { @@ -769,7 +777,7 @@ func parseGPUInfo(ctx context.Context) (v1.GPUInfoS, error) { continue } - vendor, exists := supportedGPUs[vinfo.ID] + vendor, exists := info[vinfo.ID] if !exists { continue } @@ -780,9 +788,9 @@ func parseGPUInfo(ctx context.Context) (v1.GPUInfoS, error) { } res = append(res, v1.GPUInfo{ - Vendor: dev.DeviceInfo.Vendor.Name, + Vendor: vendor.Name, VendorID: dev.DeviceInfo.Vendor.ID, - Name: dev.DeviceInfo.Product.Name, + Name: model.Name, ModelID: dev.DeviceInfo.Product.ID, Interface: model.Interface, MemorySize: model.MemorySize, @@ -1011,6 +1019,129 @@ initloop: } } +// this function is piece of sh*t. refactor it! 
+func registryLoader(ctx context.Context) error { + log := fromctx.LogrFromCtx(ctx).WithName("registry-loader") + bus := fromctx.PubSubFromCtx(ctx) + + tlsConfig := http.DefaultTransport.(*http.Transport).TLSClientConfig + + cl := &http.Client{ + Transport: &http.Transport{ + DialTLSContext: func(ctx context.Context, network, addr string) (net.Conn, error) { + return tls.Dial(network, addr, tlsConfig) + }, + }, + } + + urlGPU := fmt.Sprintf("%s/devices/gpus", strings.TrimSuffix(viper.GetString(FlagProviderConfigsURL), "/")) + urlPcieDB := viper.GetString(FlagPciDbURL) + + var gpuCurrHash []byte + var pcidbHash []byte + + gpuIDs := make(RegistryGPUVendors) + + queryGPUs := func() bool { + res, err := cl.Get(urlGPU) + if err != nil { + log.Error(err, "couldn't query inventory registry") + return false + } + + defer func() { + _ = res.Body.Close() + }() + + if res.StatusCode != http.StatusOK { + return false + } + + gpus, err := io.ReadAll(res.Body) + if err != nil { + return false + } + + upstreamHash := sha256.New() + _, _ = upstreamHash.Write(gpus) + newHash := upstreamHash.Sum(nil) + + if bytes.Equal(gpuCurrHash, newHash) { + return false + } + + _ = json.Unmarshal(gpus, &gpuIDs) + + gpuCurrHash = newHash + + return true + } + + queryPCI := func() bool { + res, err := cl.Get(urlPcieDB) + if err != nil { + log.Error(err, "couldn't query pci.ids") + return false + } + + defer func() { + _ = res.Body.Close() + }() + + if res.StatusCode != http.StatusOK { + return false + } + + pcie, err := io.ReadAll(res.Body) + if err != nil { + return false + } + + upstreamHash := sha256.New() + _, _ = upstreamHash.Write(pcie) + newHash := upstreamHash.Sum(nil) + + if bytes.Equal(pcidbHash, newHash) { + return false + } + + pcidbHash = newHash + + return true + } + + queryGPUs() + bus.Pub(gpuIDs, []string{topicGPUIDs}) + + queryPeriod := viper.GetDuration(FlagRegistryQueryPeriod) + tmGPU := time.NewTimer(queryPeriod) + tmPCIe := time.NewTimer(24 * time.Hour) + + for { + select { + 
case <-ctx.Done(): + if !tmGPU.Stop() { + <-tmGPU.C + } + + if !tmPCIe.Stop() { + <-tmPCIe.C + } + + return ctx.Err() + case <-tmGPU.C: + if queryGPUs() { + bus.Pub(gpuIDs, []string{topicGPUIDs}) + } + tmGPU.Reset(queryPeriod) + case <-tmPCIe.C: + queryPCI() + + tmPCIe.Reset(24 * time.Hour) + } + } +} + // // ExecCmd exec command on specific pod and wait the command's output. // func ExecCmd(ctx context.Context, podName string, command string, stdin io.Reader, stdout io.Writer, stderr io.Writer) error { // kc := KubeClientFromCtx(ctx) diff --git a/operator/inventory/gpu-info.json b/operator/inventory/gpu-info.json deleted file mode 100644 index eded4523..00000000 --- a/operator/inventory/gpu-info.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "10de":{ - "name": "nvidia", - "devices": { - "20b0": { - "name": "", - "interface": "SXM4", - "memory_size": "40Gi" - }, - "20b1": { - "name": "", - "interface": "PCIe", - "memory_size": "40Gi" - }, - "20b2": { - "name": "", - "interface": "SXM4", - "memory_size": "80Gi" - }, - "20b3": { - "name": "", - "interface": "SXM", - "memory_size": "64Gi" - }, - "20b5": { - "name": "", - "interface": "PCIe", - "memory_size": "80Gi" - }, - "20f1": { - "name": "", - "interface": "PCIe", - "memory_size": "40Gi" - } - } - } -} diff --git a/operator/inventory/registry.go b/operator/inventory/registry.go new file mode 100644 index 00000000..be45231e --- /dev/null +++ b/operator/inventory/registry.go @@ -0,0 +1,16 @@ +package inventory + +type RegistryGPUDevice struct { + Name string `json:"name"` + Interface string `json:"interface"` + MemorySize string `json:"memory_size"` +} + +type RegistryGPUDevices map[string]RegistryGPUDevice + +type RegistryGPUVendor struct { + Name string `json:"name"` + Devices RegistryGPUDevices `json:"devices"` +} + +type RegistryGPUVendors map[string]RegistryGPUVendor diff --git a/operator/inventory/types.go b/operator/inventory/types.go index 860f3bc2..28365bce 100644 --- a/operator/inventory/types.go +++ 
b/operator/inventory/types.go @@ -20,13 +20,17 @@ import ( ) const ( - FlagAPITimeout = "api-timeout" - FlagQueryTimeout = "query-timeout" - FlagRESTPort = "rest-port" - FlagGRPCPort = "grpc-port" - FlagPodName = "pod-name" - FlagNodeName = "node-name" - FlagConfig = "config" + FlagAPITimeout = "api-timeout" + FlagQueryTimeout = "query-timeout" + FlagRESTPort = "rest-port" + FlagGRPCPort = "grpc-port" + FlagPodName = "pod-name" + FlagNodeName = "node-name" + FlagConfig = "config" + FlagProviderConfigsURL = "provider-configs-url" + FlagPciDbURL = "provider-pcidb-url" + FlagRegistryQueryPeriod = "registry-query-period" + defaultProviderConfigsURL = "https://provider-configs.akash.network" ) var ( diff --git a/operator/psutil.go b/operator/psutil.go index 2ccaf8da..e763c584 100644 --- a/operator/psutil.go +++ b/operator/psutil.go @@ -12,6 +12,7 @@ import ( "github.com/jaypipes/ghw/pkg/cpu" "github.com/jaypipes/ghw/pkg/gpu" "github.com/jaypipes/ghw/pkg/memory" + "github.com/jaypipes/ghw/pkg/pci" "github.com/spf13/cobra" "github.com/spf13/viper" ) @@ -49,6 +50,7 @@ func cmdPsutilServe() *cobra.Command { router.HandleFunc("/cpu", cpuInfoHandler).Methods(http.MethodGet) router.HandleFunc("/gpu", gpuHandler).Methods(http.MethodGet) router.HandleFunc("/memory", memoryHandler).Methods(http.MethodGet) + router.HandleFunc("/pci", pciHandler).Methods(http.MethodGet) port := viper.GetUint16(flagAPIPort) @@ -89,6 +91,8 @@ func cmdPsutilList() *cobra.Command { res, err = gpu.New() case "memory": res, err = memory.New() + case "pci": + res, err = pci.New() default: return fmt.Errorf("invalid command \"%s\"", args[0]) // nolint: goerr113 } @@ -111,7 +115,7 @@ func cmdPsutilList() *cobra.Command { return cmd } -func cpuInfoHandler(w http.ResponseWriter, r *http.Request) { +func cpuInfoHandler(w http.ResponseWriter, _ *http.Request) { res, err := cpu.New() if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) @@ -121,7 +125,7 @@ func cpuInfoHandler(w 
http.ResponseWriter, r *http.Request) { writeJSON(w, res) } -func gpuHandler(w http.ResponseWriter, r *http.Request) { +func gpuHandler(w http.ResponseWriter, _ *http.Request) { res, err := gpu.New() if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) @@ -131,7 +135,7 @@ func gpuHandler(w http.ResponseWriter, r *http.Request) { writeJSON(w, res) } -func memoryHandler(w http.ResponseWriter, r *http.Request) { +func memoryHandler(w http.ResponseWriter, _ *http.Request) { res, err := memory.New() if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) @@ -141,6 +145,16 @@ func memoryHandler(w http.ResponseWriter, r *http.Request) { writeJSON(w, res) } +func pciHandler(w http.ResponseWriter, _ *http.Request) { + res, err := pci.New() + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + writeJSON(w, res) +} + func writeJSON(w http.ResponseWriter, obj interface{}) { bytes, err := json.Marshal(obj) if err != nil { diff --git a/pkg/apis/akash.network/v2beta2/register.go b/pkg/apis/akash.network/v2beta2/register.go index 26cab247..7f0d0d77 100644 --- a/pkg/apis/akash.network/v2beta2/register.go +++ b/pkg/apis/akash.network/v2beta2/register.go @@ -39,6 +39,7 @@ func addKnownTypes(scheme *runtime.Scheme) error { scheme.AddKnownTypes(SchemeGroupVersion, &ProviderLeasedIP{}, &ProviderLeasedIPList{}) + metav1.AddToGroupVersion(scheme, SchemeGroupVersion) return nil } diff --git a/script/load_docker2ctr.sh b/script/load_docker2ctr.sh deleted file mode 100755 index f6d6222d..00000000 --- a/script/load_docker2ctr.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -images=("$1") - -remote=$2 - -if ! ssh "$remote" "which nerdctl" >/dev/null 2>&1; then - echo "nerdctl is not installed on remote server. https://github.com/containerd/nerdctl/blob/main/docs/rootless.md" - exit 1 -fi - -# shellcheck disable=SC2048 -for image in ${images[*]}; do - if ! 
docker image inspect "$image" >/dev/null 2>&1; then - echo "image \"$image\" is not present locally" - exit 1 - fi -done - -# need tqdm installed (https://github.com/tqdm/tqdm) for this to work. - -# shellcheck disable=SC2048 -for image in ${images[*]}; do - docker save "$image" | tqdm --bytes --total "$(docker image inspect "$image" --format='{{.Size}}')" | ssh "$remote" "nerdctl image load" -done diff --git a/script/load_docker2kind.sh b/script/load_docker2kind.sh deleted file mode 100755 index 8f29bf18..00000000 --- a/script/load_docker2kind.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -images=("$1") -kind_name=$2 - -# shellcheck disable=SC2048 -for image in ${images[*]}; do - if ! docker image inspect "$image" >/dev/null 2>&1; then - echo "image \"$image\" is not present locally" - exit 1 - fi -done - -# shellcheck disable=SC2048 -for image in ${images[*]}; do - kind --name "${kind_name}" load docker-image "${image}" -done diff --git a/script/setup-kube.sh b/script/setup-kube.sh index 76de893b..25d44a84 100755 --- a/script/setup-kube.sh +++ b/script/setup-kube.sh @@ -166,6 +166,42 @@ config_file() { command_ssh() { case "$1" in + init) + shift + + while read -r node; do + if ! ssh -n "$node" "test -e /etc/systemd/system/user@.service.d/delegate.conf"; then + ssh -n "$node" 'sudo mkdir -p /etc/systemd/system/user@.service.d' + ssh -n "$node" 'cat </dev/null 2>&1'; then + packages="$packages uidmap" + fi + + if ! 
ssh -n "$node" 'dpkg-query -W slirp4netns >/dev/null 2>&1'; then + packages="$packages slirp4netns" + fi + + if [[ $packages != "" ]]; then + ssh -n "$node" 'sudo apt-get install -y uidmap slirp4netns' + fi + ssh -n "$node" 'curl -sSL https://github.com/containerd/nerdctl/releases/download/v1.7.2/nerdctl-1.7.2-linux-$(uname -m | sed "s/x86_64/amd64/g").tar.gz | sudo tar Cxzv /usr/local/bin/' + ssh -n "$node" 'curl -sSL https://github.com/rootless-containers/rootlesskit/releases/download/v2.0.0/rootlesskit-$(uname -m).tar.gz | sudo tar Cxzv /usr/local/bin/' + ssh -n "$node" 'containerd-rootless-setuptool.sh install' + done <<< "$1" + + ;; + *) + echo "invalid command \"$1\"" + usage "$@" + ;; esac } @@ -194,6 +230,63 @@ command_kustomize() { esac } +command_load_images() { + case "$1" in + docker2ctr) + shift + + remotes=("$1") + images=("$2") + + for remote in ${remotes[*]}; do + if ! ssh "$remote" "which nerdctl" >/dev/null 2>&1; then + echo "nerdctl is not installed on \"$remote\" node. run \"setup-kube.sh ssh init\" first" + exit 1 + fi + done + + for image in ${images[*]}; do + if ! docker image inspect "$image" >/dev/null 2>&1; then + echo "image \"$image\" is not present locally" + exit 1 + fi + done + + # shellcheck disable=SC2048 + for remote in ${remotes[*]}; do + for image in ${images[*]}; do + docker save "$image" | tqdm --bytes --total "$(docker image inspect "$image" --format='{{.Size}}')" | ssh "$remote" "sudo nerdctl image load" + done + done + + ;; + docker2kind) + shift + + kind_name=$1 + images=("$2") + + # shellcheck disable=SC2048 + for image in ${images[*]}; do + if ! 
docker image inspect "$image" >/dev/null 2>&1; then + echo "image \"$image\" is not present locally" + exit 1 + fi + done + + # shellcheck disable=SC2048 + for image in ${images[*]}; do + kind --name "${kind_name}" load docker-image "${image}" + done + + ;; + *) + echo "invalid command \"$1\"" + usage "$@" + ;; + esac +} + case "${1}" in ssh) shift @@ -207,6 +300,10 @@ kustomize) shift command_kustomize "$@" ;; +load-images) + shift + command_load_images "$@" + ;; *) echo "invalid cluster type" usage "$@" diff --git a/service.go b/service.go index 4e467f09..46e45e93 100644 --- a/service.go +++ b/service.go @@ -2,6 +2,7 @@ package provider import ( "context" + "time" "github.com/boz/go-lifecycle" "github.com/pkg/errors" @@ -326,10 +327,13 @@ loop: case evt := <-events: switch obj := evt.(type) { case provider.ClusterStatus: + status.Timestamp = time.Now().UTC() status.Cluster = &obj case provider.BidEngineStatus: + status.Timestamp = time.Now().UTC() status.BidEngine = &obj case provider.ManifestStatus: + status.Timestamp = time.Now().UTC() status.Manifest = &obj default: continue loop