Skip to content

Commit

Permalink
Make HPA global for ChatQnA, tei, teirerank, tgi
Browse files Browse the repository at this point in the history
Signed-off-by: Alexey Fomenko <[email protected]>
  • Loading branch information
byako committed Aug 22, 2024
1 parent ef2fe65 commit a5f4570
Show file tree
Hide file tree
Showing 14 changed files with 48 additions and 44 deletions.
13 changes: 7 additions & 6 deletions helm-charts/chatqna/templates/customMetrics.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: v1
data:
config.yaml: |
rules:
- seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
# Average request latency from TGI histograms, over 1 min
# (adding 0.001 to the divisor ensures there's always a valid value)
metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
Expand All @@ -22,9 +23,9 @@ data:
resource: namespace
service:
resource: service
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "reranking_request_latency"
Expand All @@ -34,9 +35,9 @@ data:
resource: namespace
service:
resource: service
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "embedding_request_latency"
Expand Down
16 changes: 8 additions & 8 deletions helm-charts/chatqna/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,6 @@

replicaCount: 1

# Enabling HPA will:
# - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries
# for embedding, reranking, tgi services
# Upstream default configMap:
# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
horizontalPodAutoscaler:
enabled: false

image:
repository: opea/chatqna
pullPolicy: IfNotPresent
Expand Down Expand Up @@ -56,3 +48,11 @@ global:
modelUseHostPath: ""
# modelUseHostPath: /mnt/opea-models
# modelUsePVC: model-volume

# Enabling HorizontalPodAutoscaler (HPA) will:
# - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries
# for embedding, reranking, tgi services
# Upstream default configMap:
# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
horizontalPodAutoscaler:
enabled: false
4 changes: 2 additions & 2 deletions helm-charts/common/tei/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
{{- include "tei.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
{{- if not .Values.global.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
Expand Down Expand Up @@ -105,7 +105,7 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 60
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
Expand Down
4 changes: 2 additions & 2 deletions helm-charts/common/tei/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ include "tei.fullname" . }}
spec:
selector:
matchLabels:
app.kubernetes.io/name: {{ include "tei.name" . }}
{{- include "tei.selectorLabels" . | nindent 6 }}
endpoints:
- interval: 4s
port: tei
Expand Down
11 changes: 6 additions & 5 deletions helm-charts/common/tei/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,7 @@

replicaCount: 1

# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
maxReplicas: 2

port: 2081
Expand Down Expand Up @@ -100,3 +95,9 @@ global:
# By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
modelUseHostPath: ""
modelUsePVC: ""
# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
4 changes: 2 additions & 2 deletions helm-charts/common/teirerank/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
{{- include "teirerank.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
{{- if not .Values.global.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
Expand Down Expand Up @@ -105,7 +105,7 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 60
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
Expand Down
4 changes: 2 additions & 2 deletions helm-charts/common/teirerank/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ include "teirerank.fullname" . }}
spec:
selector:
matchLabels:
app.kubernetes.io/name: {{ include "teirerank.name" . }}
{{- include "teirerank.selectorLabels" . | nindent 6 }}
endpoints:
- interval: 4s
port: teirerank
Expand Down
11 changes: 6 additions & 5 deletions helm-charts/common/teirerank/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@
replicaCount: 1


# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
maxReplicas: 3

port: 2082
Expand Down Expand Up @@ -101,3 +96,9 @@ global:
# By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
modelUseHostPath: ""
modelUsePVC: ""
# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
4 changes: 2 additions & 2 deletions helm-charts/common/tgi/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
{{- include "tgi.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
{{- if not .Values.global.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
Expand Down Expand Up @@ -97,7 +97,7 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 120
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
Expand Down
4 changes: 2 additions & 2 deletions helm-charts/common/tgi/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
# Metric descriptions:
# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527

{{- if .Values.horizontalPodAutoscaler.enabled }}
{{- if .Values.global.horizontalPodAutoscaler.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ include "tgi.fullname" . }}
spec:
selector:
matchLabels:
app.kubernetes.io/name: {{ include "tgi.name" . }}
{{- include "tgi.selectorLabels" . | nindent 6 }}
endpoints:
- interval: 4s
port: tgi
Expand Down
11 changes: 6 additions & 5 deletions helm-charts/common/tgi/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,7 @@

replicaCount: 1

# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
maxReplicas: 6

port: 2080
Expand Down Expand Up @@ -125,3 +120,9 @@ global:
# By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
modelUseHostPath: ""
modelUsePVC: ""
# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false

0 comments on commit a5f4570

Please sign in to comment.