This repository has been archived by the owner on Dec 12, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 41
/
prometheus.yaml
483 lines (462 loc) · 15.7 KB
/
prometheus.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
apiVersion: v1
kind: Template
metadata:
name: prometheus
annotations:
"openshift.io/display-name": Prometheus
description: |
A monitoring solution for an OpenShift cluster - collect and gather metrics from nodes, services, and the infrastructure.
iconClass: icon-cogs
tags: "monitoring,prometheus,time-series"
parameters:
- description: The namespace to instantiate prometheus under. Defaults to 'prometheus'.
name: NAMESPACE
value: prometheus
- description: The location of the prometheus image
name: IMAGE_PROMETHEUS
value: wkulhanek/prometheus:latest
- description: The scheme to communicate with the Alertmanager. Defaults to 'http'.
name: ALERT_MANAGER_SCHEME
value: http
- description: Alertmanager Hostname and Port. Defaults to 'alertmanager:9093'.
name: ALERT_MANAGER_HOST_PORT
value: alertmanager:9093
- description: Router Password (oc set env dc router -n default --list|grep STATS_PASSWORD)
name: ROUTER_PASSWORD
required: true
objects:
- apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
- apiVersion: v1
kind: ClusterRoleBinding
metadata:
name: prometheus-cluster-reader
roleRef:
name: cluster-reader
subjects:
- kind: ServiceAccount
name: prometheus
namespace: ${NAMESPACE}
- apiVersion: v1
kind: Route
metadata:
name: prometheus
spec:
to:
name: prometheus
- apiVersion: v1
kind: Service
metadata:
annotations:
prometheus.io/scrape: "true"
prometheus.io/scheme: http
labels:
name: prometheus
name: prometheus
spec:
ports:
- name: prometheus
port: 9090
protocol: TCP
targetPort: 9090
selector:
app: prometheus
- apiVersion: v1
kind: DeploymentConfig
metadata:
name: prometheus
labels:
app: prometheus
spec:
replicas: 1
selector:
app: prometheus
deploymentconfig: prometheus
template:
metadata:
labels:
app: prometheus
deploymentconfig: prometheus
name: prometheus
spec:
serviceAccountName: prometheus
securityContext:
privileged: true
nodeSelector:
prometheus-host: "true"
containers:
- name: prometheus
args:
- --config.file=/etc/prometheus/prometheus.yml
- --web.listen-address=:9090
- --storage.tsdb.retention=6h
- --storage.tsdb.min-block-duration=15m
- --storage.tsdb.max-block-duration=60m
image: ${IMAGE_PROMETHEUS}
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 3
httpGet:
path: /status
port: 9090
scheme: HTTP
initialDelaySeconds: 2
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
readinessProbe:
failureThreshold: 3
httpGet:
path: /status
port: 9090
scheme: HTTP
initialDelaySeconds: 2
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
# Set Requests & Limits
# Prometheus uses 2Gi memory by default with 50% headroom
# required.
resources:
requests:
cpu: 500m
memory: 3Gi
limits:
cpu: 500m
memory: 3Gi
volumeMounts:
- mountPath: /etc/prometheus
name: config-volume
- mountPath: /prometheus
name: data-volume
- mountPath: /etc/prometheus-rules
name: rules-volume
restartPolicy: Always
volumes:
- name: data-volume
hostPath:
path: /var/lib/prometheus-data
type: Directory
- name: config-volume
configMap:
defaultMode: 420
name: prometheus
- name: rules-volume
configMap:
defaultMode: 420
name: prometheus-rules
- apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus
data:
prometheus.yml: |
global:
scrape_interval: 1m
scrape_timeout: 10s
evaluation_interval: 1m
alerting:
alertmanagers:
- scheme: ${ALERT_MANAGER_SCHEME}
static_configs:
- targets:
- "${ALERT_MANAGER_HOST_PORT}"
rule_files:
- /etc/prometheus-rules/*.rules
# A scrape configuration for running Prometheus on a Kubernetes cluster.
# This uses separate scrape configs for cluster components (i.e. API server, node)
# and services to allow each to use different authentication configs.
#
# Kubernetes labels will be added as Prometheus labels on metrics via the
# `labelmap` relabeling action.
scrape_configs:
# Scrape config for API servers.
#
# Kubernetes exposes API servers as endpoints to the default/kubernetes
# service so this uses `endpoints` role and uses relabelling to only keep
# the endpoints associated with the default/kubernetes service using the
# default named port `https`. This works for single API server deployments as
# well as HA API server deployments.
- job_name: kubernetes-apiservers
scrape_interval: 1m
scrape_timeout: 10s
metrics_path: /metrics
scheme: https
kubernetes_sd_configs:
- api_server: null
role: endpoints
namespaces:
names: []
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: false
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
separator: ;
regex: default;kubernetes;https
replacement: $1
action: keep
# Scrape config for controllers.
#
# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
# the controllers.
#
# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via
# endpoints.
- job_name: kubernetes-controllers
scrape_interval: 1m
scrape_timeout: 10s
metrics_path: /metrics
scheme: https
kubernetes_sd_configs:
- api_server: null
role: endpoints
namespaces:
names: []
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: false
# Keep only the default/kubernetes service endpoints for the https port, and then
# set the port to 8444. This is the default configuration for the controllers on OpenShift
# masters.
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
separator: ;
regex: default;kubernetes;https
replacement: $1
action: keep
- source_labels: [__address__]
separator: ;
regex: (.+)(?::\d+)
target_label: __address__
replacement: $1:8444
action: replace
# Scrape config for nodes.
#
# Each node exposes a /metrics endpoint that contains operational metrics for
# the Kubelet and other components.
- job_name: kubernetes-nodes
scrape_interval: 1m
scrape_timeout: 10s
metrics_path: /metrics
scheme: https
kubernetes_sd_configs:
- api_server: null
role: node
namespaces:
names: []
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: false
relabel_configs:
- separator: ;
regex: __meta_kubernetes_node_label_(.+)
replacement: $1
action: labelmap
# Scrape config for cAdvisor.
#
# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
# reports container metrics for each running pod. Scrape those by default.
- job_name: kubernetes-cadvisor
scrape_interval: 1m
scrape_timeout: 10s
metrics_path: /metrics/cadvisor
scheme: https
kubernetes_sd_configs:
- api_server: null
role: node
namespaces:
names: []
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: false
relabel_configs:
- separator: ;
regex: __meta_kubernetes_node_label_(.+)
replacement: $1
action: labelmap
# Scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following annotations:
#
# * `prometheus.io/scrape`: Only scrape services that have a value of `true`
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
# to set this to `https` & most likely set the `tls_config` of the scrape config.
# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
# * `prometheus.io/port`: If the metrics are exposed on a different port to the
# service then set this appropriately.
- job_name: kubernetes-service-endpoints
scrape_interval: 1m
scrape_timeout: 10s
metrics_path: /metrics
scheme: http
kubernetes_sd_configs:
- api_server: null
role: endpoints
namespaces:
names: []
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
separator: ;
regex: "true"
replacement: $1
action: keep
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
separator: ;
regex: (https?)
target_label: __scheme__
replacement: $1
action: replace
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
separator: ;
regex: (.+)
target_label: __metrics_path__
replacement: $1
action: replace
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
separator: ;
regex: (.+)(?::\d+);(\d+)
target_label: __address__
replacement: $1:$2
action: replace
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username]
separator: ;
regex: (.+)
target_label: __basic_auth_username__
replacement: $1
action: replace
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password]
separator: ;
regex: (.+)
target_label: __metrics_path__
replacement: $1
action: replace
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
separator: ;
regex: (.+)(?::\d+);(\d+)
target_label: __address__
replacement: $1:$2
action: replace
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username]
separator: ;
regex: (.+)
target_label: __basic_auth_username__
replacement: $1
action: replace
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password]
separator: ;
regex: (.+)
target_label: __basic_auth_password__
replacement: $1
action: replace
- separator: ;
regex: __meta_kubernetes_service_label_(.+)
replacement: $1
action: labelmap
- source_labels: [__meta_kubernetes_namespace]
separator: ;
regex: (.*)
target_label: kubernetes_namespace
replacement: $1
action: replace
- source_labels: [__meta_kubernetes_service_name]
separator: ;
regex: (.*)
target_label: kubernetes_name
replacement: $1
# Scrape config for node-exporter, which is expected to be running on port 9100.
- job_name: node-exporters
scrape_interval: 30s
scrape_timeout: 30s
metrics_path: /metrics
scheme: http
kubernetes_sd_configs:
- api_server: null
role: node
namespaces:
names: []
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
relabel_configs:
- separator: ;
regex: __meta_kubernetes_node_label_(.+)
replacement: $1
action: labelmap
- source_labels: [__meta_kubernetes_role]
separator: ;
regex: (.*)
target_label: kubernetes_role
replacement: $1
action: replace
- source_labels: [__address__]
separator: ;
regex: (.*):10250
target_label: __address__
replacement: ${1}:9100
action: replace
# Scrape config for the template service broker
- job_name: 'openshift-template-service-broker'
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt
server_name: apiserver.openshift-template-service-broker.svc
bearer_token_file: /var/run/secrets/kubernetes.io/scraper/token
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- openshift-template-service-broker
relabel_configs:
- source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: api-server;https
- job_name: openshift-routers
scrape_interval: 30s
scrape_timeout: 30s
metrics_path: /metrics
scheme: http
static_configs:
- targets:
- router.default.svc.cluster.local:1936
basic_auth:
username: admin
password: ${ROUTER_PASSWORD}
- apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-rules
data:
alerting.rules: |
groups:
- name: example-rules
interval: 30s # defaults to global interval
rules:
- alert: Node Down
expr: up{job="kubernetes-nodes"} == 0
annotations:
miqTarget: "ContainerNode"
severity: "HIGH"
message: "{{$labels.instance}} is down"
recording.rules: |
groups:
- name: aggregate_container_resources
rules:
- record: container_cpu_usage_rate
expr: sum without (cpu) (rate(container_cpu_usage_seconds_total[5m]))
- record: container_memory_rss_by_type
expr: container_memory_rss{id=~"/|/system.slice|/kubepods.slice"} > 0
- record: container_cpu_usage_percent_by_host
expr: sum by (kubernetes_io_hostname,type)(rate(container_cpu_usage_seconds_total{id="/"}[5m])) / on (kubernetes_io_hostname,type) machine_cpu_cores
- record: apiserver_request_count_rate_by_resources
expr: sum without (client,instance,contentType) (rate(apiserver_request_count[5m]))