Skip to content

Commit

Permalink
maint: also expose kubectl top data
Browse files Browse the repository at this point in the history
  • Loading branch information
xgui3783 committed Feb 29, 2024
1 parent 3036ab9 commit 5c37f6b
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 24 deletions.
19 changes: 11 additions & 8 deletions .github/workflows/deploy-helm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,29 +58,32 @@ jobs:
run: |
if [[ "$HELM_STATUS" == "0" ]]
then
helm --set sapiVersion=$version \
helm upgrade --set sapiVersion=$SAPI_VERSION \
--reuse-values \
--set podLabels.hash="$GIT_HASH" \
--set image.pullPolicy=Always \
upgrade rc .helm/siibra-api/
--history-max 3 \
rc .helm/siibra-api/
else
helm --set sapiVersion=$version \
helm install --set sapiVersion=$SAPI_VERSION \
--set sapiFlavor=rc --set-json ingress.hosts='${{ env.RC_INGRESS_HOST }}'\
--set-json ingress.tls='${{ env.RC_INGRESS_TLS }}' \
--set podLabels.hash="$GIT_HASH" \
--set image.pullPolicy=Always \
install rc .helm/siibra-api/
rc .helm/siibra-api/
fi
- name: 'deploy prod'
if: ${{ inputs.DEPLOYMENT_NAME == 'prod' }}
run: |
if [[ "$HELM_STATUS" == "0" ]]
then
helm --set sapiVersion=$version \
helm upgrade \
--set sapiVersion=$SAPI_VERSION \
--reuse-values \
upgrade rc .helm/siibra-api/
prod .helm/siibra-api/
else
helm --set sapiVersion=$version \
install rc .helm/siibra-api/
helm install \
--set sapiVersion=$SAPI_VERSION \
prod .helm/siibra-api/
fi
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ tmp
*.sh
docs/_*
**/*.log
secret*
37 changes: 37 additions & 0 deletions .helm/adhoc/cronjob-kubectl-top-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
apiVersion: batch/v1
kind: CronJob
metadata:
name: cronjob-pod-usage
spec:
# run every minute
schedule: "* * * * *"
jobTemplate:
spec:
template:
spec:
containers:
- name: cronjob-pod-usage
image: docker-registry.ebrains.eu/monitoring/kubectl:v1.29.2
imagePullPolicy: Always
command:
- /bin/ash
- -c
- "kubectl top pod > /siibra-api-volume/metrics.txt"
volumeMounts:
- mountPath: /siibra-api-volume
name: data-volume
- mountPath: /var/tmp/kube/
name: kubectl-config
readOnly: true
env:
- name: KUBECONFIG
value: /var/tmp/kube/config

restartPolicy: OnFailure
volumes:
- name: data-volume
persistentVolumeClaim:
claimName: data-volume-claim
- name: kubectl-config
secret:
secretName: kubectl-config
14 changes: 14 additions & 0 deletions .helm/adhoc/example-secret-kubeconfig.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
apiVersion: v1
kind: Secret
metadata:
name: kubectl-config
type: Opaque
data:
config: |
YXBpVmVyc2lvbjogdjEKa2luZDogQ29uZmlnCmNsdXN0ZXJzOgotIGNsdXN0ZXI6CiAgICBzZXJ2
ZXI6IGh0dHBzOi8vcmFuY2hlci50Yy5odW1hbmJyYWlucHJvamVjdC5ldS9rOHMvY2x1c3RlcnMv
Yy1tLWpjeDJxaHFuCiAgbmFtZTogcmtlMi0xLWpzY2Nsb3VkCmNvbnRleHRzOgotIGNvbnRleHQ6
CiAgICBjbHVzdGVyOiBya2UyLTEtanNjY2xvdWQKICAgIG5hbWVzcGFjZTogc2lpYnJhLWFwaQog
ICAgdXNlcjogcmtlMi0xLWpzY2Nsb3VkCiAgbmFtZTogZWIyLXNhcGkKY3VycmVudC1jb250ZXh0
OiBlYjItc2FwaQpwcmVmZXJlbmNlczoge30KdXNlcnM6Ci0gbmFtZTogcmtlMi0xLWpzY2Nsb3Vk
CiAgdXNlcjoKICAgIHRva2VuOiBmb28tYmFyCg==
2 changes: 1 addition & 1 deletion .helm/siibra-api/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.2
version: 0.1.3

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
21 changes: 11 additions & 10 deletions .helm/siibra-api/templates/deployment-worker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@ metadata:
queuename: {{ . }}
{{- include "siibra-api.labels" $ | nindent 4 }}
spec:
strategy:
type: RollingUpdate
# This rolling update setup allows k8s to scale down before bringing up new pods.
# This should be more resource-conservative, whilst sacrificing some availability during updates.
rollingUpdate:
maxSurge: 0
maxUnavailable: 1

{{- if not $.Values.autoscaling.enabled }}
replicas: {{ $.Values.replicaCount }}
Expand Down Expand Up @@ -56,16 +63,10 @@ spec:
- "-O"
- "fair"
livenessProbe:
initialDelaySeconds: 20
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 3
exec:
command: ["/bin/bash", "-c", "cd /worker && python worker_health.py"]
readinessProbe:
initialDelaySeconds: 10
periodSeconds: 60
timeoutSeconds: 30
# each pod has roughly 2 minutes (failureThreshold 12 x periodSeconds 10, after a 2 second initial delay) to become lively (which by definition is ready)
initialDelaySeconds: 2
periodSeconds: 10
timeoutSeconds: 4
failureThreshold: 12
exec:
command: ["/bin/bash", "-c", "cd /worker && python worker_health.py"]
Expand Down
2 changes: 1 addition & 1 deletion .helm/siibra-api/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ resourcesServerPod:
resourcesWorkerPod:
limits:
cpu: 500m
memory: 4Gi
memory: 3Gi
requests:
cpu: 100m
memory: 1Gi
Expand Down
74 changes: 70 additions & 4 deletions api/server/metrics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from fastapi import HTTPException
from fastapi.responses import PlainTextResponse
from typing import List, Dict
from typing import List, Dict, Tuple
from subprocess import run
import os
from pathlib import Path
Expand All @@ -15,14 +15,61 @@ class Singleton:
cached_du: Dict[str, str] = {}
timers: List[RepeatTimer] = []

res_mtime: float = None
cached_res_usage: Dict[str, Tuple[float, float]] = {}

@staticmethod
def populate_celery():
if ROLE == 'server':
Singleton.cached_metrics = refresh_prom_metrics()


@staticmethod
def parse_cpu(text: str) -> float:
if text.endswith("m"):
return float(text.replace("m", ""))
raise ValueError(f"Cannot parse cpu text {text}")

@staticmethod
def parse_memory(text: str) -> float:
if text.endswith("Mi"):
return float(text.replace("Mi", "")) * 1024 * 1024
raise ValueError(f"Cannot parse memory text {text}")

@staticmethod
def timed_du():
def parse_text(text: str):
    """Rebuild ``Singleton.cached_res_usage`` from tabular usage text.

    The text is expected to look like ``kubectl top pod`` output: a header
    row containing NAME, CPU and MEMORY, followed by one whitespace-separated
    ``<pod> <cpu> <memory>`` row per pod. The cache is cleared first, so it
    only ever reflects the most recently parsed text.

    Parsing is best-effort per line: a blank line is skipped, and a line that
    cannot be split into exactly three fields or whose values cannot be
    parsed is logged and skipped without affecting the remaining lines.

    Values are stored as floats, matching the declared
    ``Dict[str, Tuple[float, float]]`` type of the cache.

    Args:
        text: raw contents of the metrics file.
    """
    titles = ["NAME", "CPU", "MEMORY"]

    Singleton.cached_res_usage.clear()

    for line in text.splitlines():
        # Blank lines carry no reading; skip them silently.
        if not line.strip():
            continue
        # Skip the column header row.
        if all(t in line for t in titles):
            continue
        try:
            # The unpacking lives inside the try so that a row with an
            # unexpected number of columns only costs that one row.
            podname, cpuusage, memoryusage = line.split()
            Singleton.cached_res_usage[podname] = (
                Singleton.parse_cpu(cpuusage),
                Singleton.parse_memory(memoryusage),
            )
        except Exception as e:
            general_logger.error(f"Cannot parse line: {str(e)}")

@staticmethod
def timed_get_metrics():
if ROLE == 'server' and MONITOR_FIRSTLVL_DIR:
Singleton.res_mtime = None
try:
metrics_path = Path(MONITOR_FIRSTLVL_DIR) / "metrics.txt"
metric_text = metrics_path.read_text()
Singleton.res_mtime = metrics_path.lstat().st_mtime
Singleton.parse_text(metric_text)

except FileNotFoundError as e:
...
except Exception as e:
general_logger.error(f"Reading metrics.txt error: {str(e)}")


# n.b. cannot use shutil.disk_usage. It seems it
# queries the mount's used/free space and not the directory's
try:
Expand All @@ -46,11 +93,11 @@ def timed_du():
def on_startup():
"""On startup"""
Singleton.populate_celery()
Singleton.timed_du()
Singleton.timed_get_metrics()

Singleton.timers = [
RepeatTimer(60, Singleton.populate_celery),
RepeatTimer(600, Singleton.timed_du),
RepeatTimer(600, Singleton.timed_get_metrics),
]

for timer in Singleton.timers:
Expand All @@ -73,6 +120,25 @@ def refresh_prom_metrics():
'namespace':NAME_SPACE,
}

cpu_usage = Gauge("resource_usage_cpu",
"CPU usage by pods",
labelnames=("podname",),
**common_kwargs)

memory_usage = Gauge("resource_usage_memory",
"RAM usage by pods",
labelnames=("podname",),
**common_kwargs)

for podname, (cpu, ram) in Singleton.cached_res_usage.items():
cpu_usage.labels(podname=podname).set(cpu)
memory_usage.labels(podname=podname).set(ram)

res_timestamp = Gauge("resource_usage_timestamp",
"Timestamp", **common_kwargs)
if Singleton.res_mtime:
res_timestamp.set(Singleton.res_mtime)

du = Gauge(f"firstlvl_folder_disk_usage",
"Bytes used by first level folders",
labelnames=("folder_name",),
Expand Down

0 comments on commit 5c37f6b

Please sign in to comment.