From ef2fe6596d313e35e2ee82672f094e46bda08e70 Mon Sep 17 00:00:00 2001
From: Alexey Fomenko
Date: Wed, 21 Aug 2024 22:47:00 +0300
Subject: [PATCH] Add HPA-enabled manifests

Signed-off-by: Alexey Fomenko
---
 .../config/HPA/customMetrics.yaml             |  51 +++++
 microservices-connector/config/HPA/tei.yaml   | 205 ++++++++++++++++++
 .../config/HPA/teirerank.yaml                 | 204 +++++++++++++++++
 microservices-connector/config/HPA/tgi.yaml   | 201 +++++++++++++++++
 4 files changed, 661 insertions(+)
 create mode 100644 microservices-connector/config/HPA/customMetrics.yaml
 create mode 100644 microservices-connector/config/HPA/tei.yaml
 create mode 100644 microservices-connector/config/HPA/teirerank.yaml
 create mode 100644 microservices-connector/config/HPA/tgi.yaml

diff --git a/microservices-connector/config/HPA/customMetrics.yaml b/microservices-connector/config/HPA/customMetrics.yaml
new file mode 100644
index 000000000..c3b5de055
--- /dev/null
+++ b/microservices-connector/config/HPA/customMetrics.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+data:
+  config.yaml: |
+    rules:
+    - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
+      # Average request latency from TGI histograms, over 1 min
+      # (adding 0.001 to the divisor makes sure there's always a valid value)
+      metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^tgi_request_inference_duration_sum
+        as: "tgi_request_latency"
+      resources:
+        # HPA needs both a namespace and a suitable object resource for its query paths:
+        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
+        # (pod is not a suitable object type for matching, as each instance has a different name)
+        overrides:
+          namespace:
+            resource: namespace
+          service:
+            resource: service
+    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
+      # Average request latency from TEI histograms, over 1 min
+      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^te_request_inference_duration_sum
+        as: "reranking_request_latency"
+      resources:
+        overrides:
+          namespace:
+            resource: namespace
+          service:
+            resource: service
+    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
+      # Average request latency from TEI histograms, over 1 min
+      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^te_request_inference_duration_sum
+        as: "embedding_request_latency"
+      resources:
+        overrides:
+          namespace:
+            resource: namespace
+          service:
+            resource: service
+kind: ConfigMap
+metadata:
+  name: adapter-config
+  namespace: monitoring
diff --git a/microservices-connector/config/HPA/tei.yaml b/microservices-connector/config/HPA/tei.yaml
new file mode 100644
index 000000000..54c830e6c
--- /dev/null
+++ b/microservices-connector/config/HPA/tei.yaml
@@ -0,0 +1,205 @@
+---
+# Source: tei/templates/configmap.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: tei-config
+  labels:
+    helm.sh/chart: tei-0.8.0
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+data:
+  MODEL_ID: "BAAI/bge-base-en-v1.5"
+  PORT: "2081"
+  http_proxy: ""
+  https_proxy: ""
+  no_proxy: ""
+  NUMBA_CACHE_DIR: "/tmp"
+  TRANSFORMERS_CACHE: "/tmp/transformers_cache"
+  HF_HOME: "/tmp/.cache/huggingface"
+  MAX_WARMUP_SEQUENCE_LENGTH: "512"
+---
+# Source: tei/templates/service.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: tei
+  labels:
+    helm.sh/chart: tei-0.8.0
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      targetPort: 2081
+      protocol: TCP
+      name: tei
+  selector:
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+---
+# Source: tei/templates/deployment.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tei
+  labels:
+    helm.sh/chart: tei-0.8.0
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tei
+      app.kubernetes.io/instance: tei
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: tei
+        app.kubernetes.io/instance: tei
+    spec:
+      securityContext:
+        {}
+      containers:
+        - name: tei
+          envFrom:
+            - configMapRef:
+                name: tei-config
+            - configMapRef:
+                name: extra-env-config
+                optional: true
+          securityContext:
+            {}
+          image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
+          imagePullPolicy: IfNotPresent
+          args:
+            - "--auto-truncate"
+          volumeMounts:
+            - mountPath: /data
+              name: model-volume
+            - mountPath: /dev/shm
+              name: shm
+            - mountPath: /tmp
+              name: tmp
+          ports:
+            - name: http
+              containerPort: 2081
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 24
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          startupProbe:
+            failureThreshold: 120
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          resources:
+            {}
+      volumes:
+        - name: model-volume
+          hostPath:
+            path: /mnt/opea-models
+            type: Directory
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
+        - name: tmp
+          emptyDir: {}
+      # extra time to finish processing buffered requests before HPA forcibly terminates pod
+      terminationGracePeriodSeconds: 60
+---
+# Source: tei/templates/horizontalPodAutoscaler.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: tei
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: tei
+  minReplicas: 1
+  maxReplicas: 2
+  metrics:
+    - type: Object
+      object:
+        metric:
+          # tei-embedding time metrics are in seconds
+          name: embedding_request_latency
+        describedObject:
+          apiVersion: v1
+          # get metric for named object of given type (in same namespace)
+          kind: Service
+          name: tei
+        target:
+          # embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when
+          # TEI startup + request processing takes longer than HPA evaluation period, this uses
+          # "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
+          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+          type: Value
+          value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+        - type: Percent
+          value: 25
+          periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+        - type: Percent
+          value: 50
+          periodSeconds: 15
+        - type: Pods
+          value: 2
+          periodSeconds: 15
+---
+# Source: tei/templates/servicemonitor.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: tei
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tei
+  endpoints:
+    - interval: 4s
+      port: tei
+      scheme: http
diff --git a/microservices-connector/config/HPA/teirerank.yaml b/microservices-connector/config/HPA/teirerank.yaml
new file mode 100644
index 000000000..3cd33c140
--- /dev/null
+++ b/microservices-connector/config/HPA/teirerank.yaml
@@ -0,0 +1,204 @@
+---
+# Source: teirerank/templates/configmap.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: teirerank-config
+  labels:
+    helm.sh/chart: teirerank-0.8.0
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+data:
+  MODEL_ID: "BAAI/bge-reranker-base"
+  PORT: "2082"
+  http_proxy: ""
+  https_proxy: ""
+  no_proxy: ""
+  NUMBA_CACHE_DIR: "/tmp"
+  TRANSFORMERS_CACHE: "/tmp/transformers_cache"
+  HF_HOME: "/tmp/.cache/huggingface"
+---
+# Source: teirerank/templates/service.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: teirerank
+  labels:
+    helm.sh/chart: teirerank-0.8.0
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      targetPort: 2082
+      protocol: TCP
+      name: teirerank
+  selector:
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+---
+# Source: teirerank/templates/deployment.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: teirerank
+  labels:
+    helm.sh/chart: teirerank-0.8.0
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: teirerank
+      app.kubernetes.io/instance: teirerank
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: teirerank
+        app.kubernetes.io/instance: teirerank
+    spec:
+      securityContext:
+        {}
+      containers:
+        - name: teirerank
+          envFrom:
+            - configMapRef:
+                name: teirerank-config
+            - configMapRef:
+                name: extra-env-config
+                optional: true
+          securityContext:
+            {}
+          image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
+          imagePullPolicy: IfNotPresent
+          args:
+            - "--auto-truncate"
+          volumeMounts:
+            - mountPath: /data
+              name: model-volume
+            - mountPath: /dev/shm
+              name: shm
+            - mountPath: /tmp
+              name: tmp
+          ports:
+            - name: http
+              containerPort: 2082
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 24
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          startupProbe:
+            failureThreshold: 120
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          resources:
+            {}
+      volumes:
+        - name: model-volume
+          hostPath:
+            path: /mnt/opea-models
+            type: Directory
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
+        - name: tmp
+          emptyDir: {}
+      # extra time to finish processing buffered requests before HPA forcibly terminates pod
+      terminationGracePeriodSeconds: 60
+---
+# Source: teirerank/templates/horizontalPodAutoscaler.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: teirerank
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: teirerank
+  minReplicas: 1
+  maxReplicas: 3
+  metrics:
+    - type: Object
+      object:
+        metric:
+          # tei-reranking time metrics are in seconds
+          name: reranking_request_latency
+        describedObject:
+          apiVersion: v1
+          # get metric for named object of given type (in same namespace)
+          kind: Service
+          name: teirerank
+        target:
+          # reranking_request_latency is average for all TEI pods. To avoid replica fluctuations when
+          # TEI startup + request processing takes longer than HPA evaluation period, this uses
+          # "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
+          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+          type: Value
+          value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+        - type: Percent
+          value: 25
+          periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+        - type: Percent
+          value: 50
+          periodSeconds: 15
+        - type: Pods
+          value: 2
+          periodSeconds: 15
+---
+# Source: teirerank/templates/servicemonitor.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: teirerank
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: teirerank
+  endpoints:
+    - interval: 4s
+      port: teirerank
+      scheme: http
diff --git a/microservices-connector/config/HPA/tgi.yaml b/microservices-connector/config/HPA/tgi.yaml
new file mode 100644
index 000000000..97aedc761
--- /dev/null
+++ b/microservices-connector/config/HPA/tgi.yaml
@@ -0,0 +1,201 @@
+---
+# Source: tgi/templates/configmap.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: tgi-config
+  labels:
+    helm.sh/chart: tgi-0.8.0
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+    app.kubernetes.io/version: "2.1.0"
+    app.kubernetes.io/managed-by: Helm
+data:
+  MODEL_ID: "Intel/neural-chat-7b-v3-3"
+  PORT: "2080"
+  HF_TOKEN: "insert-your-huggingface-token-here"
+  http_proxy: ""
+  https_proxy: ""
+  no_proxy: ""
+  HABANA_LOGS: "/tmp/habana_logs"
+  NUMBA_CACHE_DIR: "/tmp"
+  TRANSFORMERS_CACHE: "/tmp/transformers_cache"
+  HF_HOME: "/tmp/.cache/huggingface"
+  CUDA_GRAPHS: "0"
+---
+# Source: tgi/templates/service.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: tgi
+  labels:
+    helm.sh/chart: tgi-0.8.0
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+    app.kubernetes.io/version: "2.1.0"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      targetPort: 2080
+      protocol: TCP
+      name: tgi
+  selector:
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+---
+# Source: tgi/templates/deployment.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tgi
+  labels:
+    helm.sh/chart: tgi-0.8.0
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+    app.kubernetes.io/version: "2.1.0"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tgi
+      app.kubernetes.io/instance: tgi
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: tgi
+        app.kubernetes.io/instance: tgi
+    spec:
+      securityContext:
+        {}
+      containers:
+        - name: tgi
+          envFrom:
+            - configMapRef:
+                name: tgi-config
+            - configMapRef:
+                name: extra-env-config
+                optional: true
+          securityContext:
+            {}
+          image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
+          imagePullPolicy: IfNotPresent
+          volumeMounts:
+            - mountPath: /data
+              name: model-volume
+            - mountPath: /tmp
+              name: tmp
+          ports:
+            - name: http
+              containerPort: 2080
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 24
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          readinessProbe:
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          startupProbe:
+            failureThreshold: 120
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          resources:
+            {}
+      volumes:
+        - name: model-volume
+          hostPath:
+            path: /mnt/opea-models
+            type: Directory
+        - name: tmp
+          emptyDir: {}
+      # extra time to finish processing buffered requests before HPA forcibly terminates pod
+      terminationGracePeriodSeconds: 120
+---
+# Source: tgi/templates/horizontalPodAutoscaler.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: tgi
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: tgi
+  minReplicas: 1
+  maxReplicas: 6
+  metrics:
+    - type: Object
+      object:
+        metric:
+          # TGI time metrics are in seconds
+          name: tgi_request_latency
+        describedObject:
+          apiVersion: v1
+          # get metric for named object of given type (in same namespace)
+          kind: Service
+          name: tgi
+        target:
+          # tgi_request_latency is average for all the TGI pods. To avoid replica fluctuations when
+          # TGI startup + request processing takes longer than HPA evaluation period, this uses
+          # "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
+          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+          type: Value
+          value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+        - type: Percent
+          value: 25
+          periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+        - type: Percent
+          value: 50
+          periodSeconds: 15
+        - type: Pods
+          value: 2
+          periodSeconds: 15
+---
+# Source: tgi/templates/servicemonitor.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Dashboard for the exposed TGI metrics:
+# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/
+# Metric descriptions:
+# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: tgi
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tgi
+  endpoints:
+    - interval: 4s
+      port: tgi
+      scheme: http
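
Note: the HPAs above act on custom metrics, so they only take effect once a Prometheus custom-metrics adapter in the monitoring namespace serves the rules from the adapter-config ConfigMap. A minimal sketch of one way this might be wired up with the prometheus-community/prometheus-adapter Helm chart is shown below; the rules.existing and prometheus.url/port values and the prometheus-k8s service name are assumptions about the target cluster, not something this patch installs:

# hypothetical prometheus-adapter Helm values (assumption, not part of this patch)
prometheus:
  # point the adapter at the Prometheus instance that scrapes the ServiceMonitors above;
  # adjust the service name and port to match the actual monitoring stack
  url: http://prometheus-k8s.monitoring.svc
  port: 9090
rules:
  # serve the rules from the adapter-config ConfigMap (customMetrics.yaml) instead of the chart defaults
  existing: adapter-config

With the adapter running, tgi_request_latency, reranking_request_latency and embedding_request_latency should be resolvable through the custom.metrics.k8s.io API at the service-scoped paths noted in the ConfigMap comments, which is what the Object metrics in the tei, teirerank and tgi HPAs query.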