Skip to content

Commit

Permalink
Add HPA-enabled manifests
Browse files Browse the repository at this point in the history
Signed-off-by: Alexey Fomenko <[email protected]>
  • Loading branch information
byako committed Aug 21, 2024
1 parent fc27c77 commit ef2fe65
Show file tree
Hide file tree
Showing 4 changed files with 661 additions and 0 deletions.
51 changes: 51 additions & 0 deletions microservices-connector/config/HPA/customMetrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# prometheus-adapter rule config: exposes TGI/TEI request-latency metrics
# through the custom metrics API so the HPAs below can scale on them.
# Mounted by the adapter running in the "monitoring" namespace.
apiVersion: v1
kind: ConfigMap
metadata:
  name: adapter-config
  namespace: monitoring
data:
  config.yaml: |
    rules:
    - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
      # Average request latency from TGI histograms, over 1 min
      # (0.001 divider add is to make sure there's always a valid value)
      metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^tgi_request_inference_duration_sum
        as: "tgi_request_latency"
      resources:
        # HPA needs both namespace + suitable object resource for its query paths:
        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
        # (pod is not suitable object type for matching as each instance has different name)
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
      # Average request latency from TEI histograms, over 1 min
      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^te_request_inference_duration_sum
        as: "reranking_request_latency"
      resources:
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
      # Average request latency from TEI histograms, over 1 min
      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^te_request_inference_duration_sum
        as: "embedding_request_latency"
      resources:
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
205 changes: 205 additions & 0 deletions microservices-connector/config/HPA/tei.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
---
# Source: tei/templates/configmap.yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Environment for the TEI embedding container (consumed via envFrom in the
# Deployment below). Cache/home dirs point at /tmp so the container can run
# with a read-only root filesystem.
apiVersion: v1
kind: ConfigMap
metadata:
  name: tei-config
  labels:
    helm.sh/chart: tei-0.8.0
    app.kubernetes.io/name: tei
    app.kubernetes.io/instance: tei
    app.kubernetes.io/version: "cpu-1.5"
    app.kubernetes.io/managed-by: Helm
data:
  MODEL_ID: "BAAI/bge-base-en-v1.5"
  PORT: "2081"
  http_proxy: ""
  https_proxy: ""
  no_proxy: ""
  NUMBA_CACHE_DIR: "/tmp"
  TRANSFORMERS_CACHE: "/tmp/transformers_cache"
  HF_HOME: "/tmp/.cache/huggingface"
  MAX_WARMUP_SEQUENCE_LENGTH: "512"
---
# Source: tei/templates/service.yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# ClusterIP service in front of the TEI pods; port 80 maps to the container's
# 2081 listener. The named port "tei" is scraped by the ServiceMonitor below.
apiVersion: v1
kind: Service
metadata:
  name: tei
  labels:
    helm.sh/chart: tei-0.8.0
    app.kubernetes.io/name: tei
    app.kubernetes.io/instance: tei
    app.kubernetes.io/version: "cpu-1.5"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  ports:
    - port: 80
      targetPort: 2081
      protocol: TCP
      name: tei
  selector:
    app.kubernetes.io/name: tei
    app.kubernetes.io/instance: tei
---
# Source: tei/templates/deployment.yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# TEI embedding server deployment. Replica count is intentionally omitted:
# it is managed by the HorizontalPodAutoscaler defined below.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tei
  labels:
    helm.sh/chart: tei-0.8.0
    app.kubernetes.io/name: tei
    app.kubernetes.io/instance: tei
    app.kubernetes.io/version: "cpu-1.5"
    app.kubernetes.io/managed-by: Helm
spec:
  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
  selector:
    matchLabels:
      app.kubernetes.io/name: tei
      app.kubernetes.io/instance: tei
  template:
    metadata:
      labels:
        app.kubernetes.io/name: tei
        app.kubernetes.io/instance: tei
    spec:
      securityContext: {}
      containers:
        - name: tei
          envFrom:
            - configMapRef:
                name: tei-config
            # extra-env-config is an optional per-deployment override hook
            - configMapRef:
                name: extra-env-config
                optional: true
          securityContext: {}
          image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
          imagePullPolicy: IfNotPresent
          args:
            - "--auto-truncate"
          volumeMounts:
            - mountPath: /data
              name: model-volume
            - mountPath: /dev/shm
              name: shm
            - mountPath: /tmp
              name: tmp
          ports:
            - name: http
              containerPort: 2081
              protocol: TCP
          livenessProbe:
            failureThreshold: 24
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 5
            periodSeconds: 5
          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 5
            periodSeconds: 5
          # model download + warmup can be slow: allow up to 120 * 5s = 10 min
          startupProbe:
            failureThreshold: 120
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 5
            periodSeconds: 5
          resources: {}
      volumes:
        # model cache shared from the node — NOTE(review): hostPath requires
        # /mnt/opea-models to exist on every node; confirm for this cluster
        - name: model-volume
          hostPath:
            path: /mnt/opea-models
            type: Directory
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi
        - name: tmp
          emptyDir: {}
      # extra time to finish processing buffered requests before HPA forcibly terminates pod
      terminationGracePeriodSeconds: 60
---
# Source: tei/templates/horizontalPodAutoscaler.yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Scales the TEI deployment on the custom "embedding_request_latency" metric
# exported by the prometheus-adapter rules in adapter-config.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: tei
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: tei
  minReplicas: 1
  maxReplicas: 2
  metrics:
    - type: Object
      object:
        metric:
          # tei-embedding time metrics are in seconds
          name: embedding_request_latency
        describedObject:
          apiVersion: v1
          # get metric for named object of given type (in same namespace)
          kind: Service
          name: tei
        target:
          # embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when
          # TEI startup + request processing takes longer than HPA evaluation period, this uses
          # "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
          type: Value
          value: 4
  behavior:
    scaleDown:
      # wait 3 min of sustained low latency before shrinking, max 25% at a time
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 25
          periodSeconds: 15
    scaleUp:
      # scale up immediately, by whichever policy allows more pods
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 50
          periodSeconds: 15
        - type: Pods
          value: 2
          periodSeconds: 15
---
# Source: tei/templates/servicemonitor.yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Prometheus Operator scrape config for the TEI service's "tei" port.
# Short 4s interval keeps the latency metric fresh for HPA decisions.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: tei
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: tei
  endpoints:
    - interval: 4s
      port: tei
      scheme: http
Loading

0 comments on commit ef2fe65

Please sign in to comment.