Skip to content

Commit

Permalink
Make HPA global for ChatQnA, tei, teirerank, tgi
Browse files Browse the repository at this point in the history
Signed-off-by: Alexey Fomenko <[email protected]>
  • Loading branch information
byako committed Aug 22, 2024
1 parent ef2fe65 commit a5f4570
Show file tree
Hide file tree
Showing 14 changed files with 48 additions and 44 deletions.
13 changes: 7 additions & 6 deletions helm-charts/chatqna/templates/customMetrics.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: v1
data:
config.yaml: |
rules:
- seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
# Average request latency from TGI histograms, over 1 min
# (adding 0.001 to the divisor ensures there's always a valid value)
metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
Expand All @@ -22,9 +23,9 @@ data:
resource: namespace
service:
resource: service
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "reranking_request_latency"
Expand All @@ -34,9 +35,9 @@ data:
resource: namespace
service:
resource: service
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "embedding_request_latency"
Expand Down
16 changes: 8 additions & 8 deletions helm-charts/chatqna/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,6 @@

replicaCount: 1

# Enabling HPA will:
# - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries
# for embedding, reranking, tgi services
# Upstream default configMap:
# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
horizontalPodAutoscaler:
enabled: false

image:
repository: opea/chatqna
pullPolicy: IfNotPresent
Expand Down Expand Up @@ -56,3 +48,11 @@ global:
modelUseHostPath: ""
# modelUseHostPath: /mnt/opea-models
# modelUsePVC: model-volume

# Enabling HorizontalPodAutoscaler (HPA) will:
# - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries
# for embedding, reranking, tgi services
# Upstream default configMap:
# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
horizontalPodAutoscaler:
enabled: false
4 changes: 2 additions & 2 deletions helm-charts/common/tei/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
{{- include "tei.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
{{- if not .Values.global.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
Expand Down Expand Up @@ -105,7 +105,7 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 60
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
Expand Down
4 changes: 2 additions & 2 deletions helm-charts/common/tei/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ include "tei.fullname" . }}
spec:
selector:
matchLabels:
app.kubernetes.io/name: {{ include "tei.name" . }}
{{- include "tei.selectorLabels" . | nindent 6 }}
endpoints:
- interval: 4s
port: tei
Expand Down
11 changes: 6 additions & 5 deletions helm-charts/common/tei/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,7 @@

replicaCount: 1

# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
maxReplicas: 2

port: 2081
Expand Down Expand Up @@ -100,3 +95,9 @@ global:
# By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
modelUseHostPath: ""
modelUsePVC: ""
# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
4 changes: 2 additions & 2 deletions helm-charts/common/teirerank/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
{{- include "teirerank.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
{{- if not .Values.global.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
Expand Down Expand Up @@ -105,7 +105,7 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 60
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
Expand Down
4 changes: 2 additions & 2 deletions helm-charts/common/teirerank/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ include "teirerank.fullname" . }}
spec:
selector:
matchLabels:
app.kubernetes.io/name: {{ include "teirerank.name" . }}
{{- include "teirerank.selectorLabels" . | nindent 6 }}
endpoints:
- interval: 4s
port: teirerank
Expand Down
11 changes: 6 additions & 5 deletions helm-charts/common/teirerank/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
replicaCount: 1


# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
maxReplicas: 3

port: 2082
Expand Down Expand Up @@ -101,3 +96,9 @@ global:
# By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
modelUseHostPath: ""
modelUsePVC: ""
# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
4 changes: 2 additions & 2 deletions helm-charts/common/tgi/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
{{- include "tgi.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
{{- if not .Values.global.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
Expand Down Expand Up @@ -97,7 +97,7 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 120
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
Expand Down
4 changes: 2 additions & 2 deletions helm-charts/common/tgi/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
# Metric descriptions:
# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ include "tgi.fullname" . }}
spec:
selector:
matchLabels:
app.kubernetes.io/name: {{ include "tgi.name" . }}
{{- include "tgi.selectorLabels" . | nindent 6 }}
endpoints:
- interval: 4s
port: tgi
Expand Down
11 changes: 6 additions & 5 deletions helm-charts/common/tgi/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,7 @@

replicaCount: 1

# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
maxReplicas: 6

port: 2080
Expand Down Expand Up @@ -125,3 +120,9 @@ global:
# By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
modelUseHostPath: ""
modelUsePVC: ""
# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false

0 comments on commit a5f4570

Please sign in to comment.