From 00ab1195dce5f39f0ec21424e0d57ff69b0ca464 Mon Sep 17 00:00:00 2001 From: Xiao Gui Date: Thu, 15 Feb 2024 11:26:10 +0100 Subject: [PATCH] fix metrics and worker --- .github/workflows/docker-img.yml | 55 ++++++++++--------- .helm/adhoc/certificate-rc.yaml | 20 +++++++ .helm/siibra-api/Chart.yaml | 4 +- .helm/siibra-api/templates/_helpers.tpl | 8 +++ .../templates/deployment-server.yaml | 4 +- .../templates/deployment-worker.yaml | 4 +- api/server/metrics.py | 32 ++++++----- 7 files changed, 82 insertions(+), 45 deletions(-) create mode 100644 .helm/adhoc/certificate-rc.yaml diff --git a/.github/workflows/docker-img.yml b/.github/workflows/docker-img.yml index becd190..7b34965 100644 --- a/.github/workflows/docker-img.yml +++ b/.github/workflows/docker-img.yml @@ -142,34 +142,35 @@ jobs: secrets: okd_token: ${{ secrets.OKD_PROD_SECRET }} - - data-validation-config-hash: - if: ${{ github.event_name == 'release' && contains(github.ref, 'rc') }} - runs-on: ubuntu-latest - outputs: - CONFIG_SHORT_REV: ${{ steps.parse-rev.outputs.CONFIG_SHORT_REV }} - steps: - - id: parse-rev - name: Get short rev of HEAD at master - run: | - git clone https://jugit.fz-juelich.de/t.dickscheid/brainscapes-configurations.git - CONFIG_SHORT_REV=$(git -C brainscapes-configurations rev-parse --short=6 HEAD) - echo CONFIG_SHORT_REV=$CONFIG_SHORT_REV >> $GITHUB_OUTPUT +# disable deployment on jsc + + # data-validation-config-hash: + # if: ${{ github.event_name == 'release' && contains(github.ref, 'rc') }} + # runs-on: ubuntu-latest + # outputs: + # CONFIG_SHORT_REV: ${{ steps.parse-rev.outputs.CONFIG_SHORT_REV }} + # steps: + # - id: parse-rev + # name: Get short rev of HEAD at master + # run: | + # git clone https://jugit.fz-juelich.de/t.dickscheid/brainscapes-configurations.git + # CONFIG_SHORT_REV=$(git -C brainscapes-configurations rev-parse --short=6 HEAD) + # echo CONFIG_SHORT_REV=$CONFIG_SHORT_REV >> $GITHUB_OUTPUT - deploy-rc-on-data-validation: - needs: - - setup-envvar - - 
data-validation-config-hash - if: ${{ github.event_name == 'release' && contains(github.ref, 'rc') }} - uses: ./.github/workflows/deploy-on-okd.yml - with: - okd_endpoint: https://okd.jsc.hbp.eu:443 - flavor: rc - queues: ${{ needs.setup-envvar.outputs.queues }} - version: c.${{ needs.data-validation-config-hash.outputs.CONFIG_SHORT_REV }} - workerimage: docker-registry.ebrains.eu/siibra/siibra-api:rc-worker - secrets: - okd_token: ${{ secrets.OKD_JSC_SECRET }} + # deploy-rc-on-data-validation: + # needs: + # - setup-envvar + # - data-validation-config-hash + # if: ${{ github.event_name == 'release' && contains(github.ref, 'rc') }} + # uses: ./.github/workflows/deploy-on-okd.yml + # with: + # okd_endpoint: https://okd.jsc.hbp.eu:443 + # flavor: rc + # queues: ${{ needs.setup-envvar.outputs.queues }} + # version: c.${{ needs.data-validation-config-hash.outputs.CONFIG_SHORT_REV }} + # workerimage: docker-registry.ebrains.eu/siibra/siibra-api:rc-worker + # secrets: + # okd_token: ${{ secrets.OKD_JSC_SECRET }} deploy-prod-on-okd: needs: setup-envvar diff --git a/.helm/adhoc/certificate-rc.yaml b/.helm/adhoc/certificate-rc.yaml new file mode 100644 index 0000000..69cb826 --- /dev/null +++ b/.helm/adhoc/certificate-rc.yaml @@ -0,0 +1,20 @@ +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: siibra-api-rc +spec: + commonName: siibra-api-rc.apps.tc.humanbrainproject.eu + isCA: false + dnsNames: + - siibra-api-rc.apps.tc.humanbrainproject.eu + issuerRef: + kind: ClusterIssuer + name: letsencrypt-production-issuer-1 + privateKey: + algorithm: RSA + encoding: PKCS1 + size: 2048 + renewBefore: 120h + secretName: siibra-api-rc-secret + usages: + - server auth \ No newline at end of file diff --git a/.helm/siibra-api/Chart.yaml b/.helm/siibra-api/Chart.yaml index bddd77f..e52c4ef 100644 --- a/.helm/siibra-api/Chart.yaml +++ b/.helm/siibra-api/Chart.yaml @@ -15,10 +15,10 @@ type: application # This is the chart version. 
This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 +version: 0.1.1 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "0.3.15" +appVersion: "0.3.17" diff --git a/.helm/siibra-api/templates/_helpers.tpl b/.helm/siibra-api/templates/_helpers.tpl index 7d68d33..9d97ece 100644 --- a/.helm/siibra-api/templates/_helpers.tpl +++ b/.helm/siibra-api/templates/_helpers.tpl @@ -5,6 +5,14 @@ Expand the name of the chart. {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} {{- end }} +{{- define "siibra-api.root-img" -}} +{{- if eq .Values.sapiFlavor "rc" }} +{{- "rc" }} +{{- else }} +{{- .Values.sapiVersion }} +{{- end }} +{{- end }} + {{/* Create a default fully qualified app name. We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). diff --git a/.helm/siibra-api/templates/deployment-server.yaml b/.helm/siibra-api/templates/deployment-server.yaml index bd04b7b..0e4b880 100644 --- a/.helm/siibra-api/templates/deployment-server.yaml +++ b/.helm/siibra-api/templates/deployment-server.yaml @@ -33,7 +33,7 @@ spec: - name: {{ .Chart.Name }} securityContext: {{- toYaml .Values.securityContext | nindent 12 }} - image: "{{ .Values.image.repository }}:{{ .Values.sapiVersion }}-server" + image: "{{ .Values.image.repository }}:{{ include "siibra-api.root-img" . 
}}-server" imagePullPolicy: {{ .Values.image.pullPolicy }} ports: - name: http @@ -59,6 +59,8 @@ spec: env: - name: SIIBRA_CACHEDIR value: "/siibra-api-volume/{{ .Values.sapiVersion }}" + - name: SIIBRA_API_NAMESPACE + value: {{ $.Values.sapiFlavor }} resources: {{- toYaml .Values.resourcesServerPod | nindent 12 }} {{- with .Values.volumeMounts }} diff --git a/.helm/siibra-api/templates/deployment-worker.yaml b/.helm/siibra-api/templates/deployment-worker.yaml index 3b980ce..6f32ce2 100644 --- a/.helm/siibra-api/templates/deployment-worker.yaml +++ b/.helm/siibra-api/templates/deployment-worker.yaml @@ -44,7 +44,7 @@ spec: - name: {{ $.Chart.Name }} securityContext: {{- toYaml $.Values.securityContext | nindent 12 }} - image: "{{ $.Values.image.repository }}:{{ $.Values.sapiVersion }}-worker" + image: "{{ $.Values.image.repository }}:{{ include "siibra-api.root-img" $ }}-worker" imagePullPolicy: {{ $.Values.image.pullPolicy }} command: ["celery"] args: @@ -75,6 +75,8 @@ spec: env: - name: SIIBRA_CACHEDIR value: "/siibra-api-volume/{{ $.Values.sapiVersion }}" + - name: SIIBRA_API_NAMESPACE + value: {{ $.Values.sapiFlavor }} resources: {{- toYaml $.Values.resourcesWorkerPod | nindent 12 }} {{- with $.Values.volumeMounts }} diff --git a/api/server/metrics.py b/api/server/metrics.py index 1ce8548..c5818cc 100644 --- a/api/server/metrics.py +++ b/api/server/metrics.py @@ -4,6 +4,7 @@ from subprocess import run import os from pathlib import Path +from collections import defaultdict from api.siibra_api_config import ROLE, CELERY_CONFIG, NAME_SPACE, MONITOR_FIRSTLVL_DIR, queues from api.common.timer import RepeatTimer from api.common import general_logger @@ -31,6 +32,8 @@ def timed_du(): return for dir in dirs: + if dir == "lost+found": + continue path_to_dir = Path(MONITOR_FIRSTLVL_DIR) / dir try: result = run(["du", "-s", str(path_to_dir)], capture_output=True, text=True) @@ -83,7 +86,7 @@ def refresh_prom_metrics(): **common_kwargs) num_worker_gauge = 
Gauge("num_workers", "Number of workers", - labelnames=("hostname", "q_name", "ok"), **common_kwargs) + labelnames=("version", "namespace", "queue"), **common_kwargs) scheduled_gauge = Gauge("scheduled_tasks","Number of scheduled tasks", labelnames=("hostname",), **common_kwargs) active_gauge = Gauge("active_tasks", "Number of active tasks", labelnames=("hostname",), **common_kwargs) reserved_gauge = Gauge("reserved_tasks", "Number of reserved tasks", labelnames=("hostname",), **common_kwargs) @@ -103,19 +106,20 @@ def refresh_prom_metrics(): i = app.control.inspect() # number of active workers - result = i.ping() - if result is None: - num_worker_gauge.set(0) - else: - for worker_hostname, resp in result.items(): - for queue in queues: - if queue in worker_hostname: - break - else: - queue = "celery" - num_worker_gauge.labels(hostname=worker_hostname.replace("celery@", ""), - ok=resp.get("ok"), - q_name=queue).set(len(result)) + result = app.control.inspect().active_queues() + + tally = defaultdict(int) + for hostname in (result or {}): + for queue in result[hostname]: + routing_key = queue.get("routing_key") + *_, namespace, queue = routing_key.split(".") + version = ".".join(_) + tally[(version, namespace, queue)] += 1 + + for ((version, namespace, queue), total) in tally.items(): + num_worker_gauge.labels(version=version, + namespace=namespace, + queue=queue).set(total) for workername, queue in (i.scheduled() or {}).items(): scheduled_gauge.labels(hostname=workername).set(len(queue))