Skip to content

Commit

Permalink
Merge pull request #182 from amisstea/RHTAPWATCH-757
Browse files Browse the repository at this point in the history
fix(RHTAPWATCH-757): Don't monitor the build-templates-e2e namespace
  • Loading branch information
hmariset committed Jan 24, 2024
2 parents 3e80744 + c0b64e0 commit 79eb23d
Show file tree
Hide file tree
Showing 7 changed files with 16 additions and 16 deletions.
6 changes: 3 additions & 3 deletions rhobs/alerting/data_plane/prometheus.pod_alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ spec:
interval: 1m
rules:
- alert: UnschedulablePods
expr: last_over_time(kube_pod_status_unschedulable{namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-definitions-e2e)"}[5m]) == 1
expr: last_over_time(kube_pod_status_unschedulable{namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-templates-e2e)"}[5m]) == 1
for: 30m
labels:
severity: warning
Expand All @@ -22,7 +22,7 @@ spec:
alert_route_namespace: '{{ $labels.namespace }}'
runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/o11y/alert-rule-unschedualablePods.md
- alert: CrashLoopBackOff
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-definitions-e2e)"}[5m]) >= 1
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-templates-e2e)"}[5m]) >= 1
for: 15m
labels:
severity: warning
Expand All @@ -36,7 +36,7 @@ spec:
runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/o11y/alert-rule-crashLoopBackOff.md
- alert: PodNotReady
expr: |
kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-definitions-e2e)"} == 1
kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-templates-e2e)"} == 1
unless ignoring (phase) (kube_pod_status_unschedulable == 1)
for: 15m
labels:
Expand Down
2 changes: 1 addition & 1 deletion rhobs/alerting/data_plane/prometheus.pv_alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ spec:
interval: 1m
rules:
- alert: PersistentVolumeIssues
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-definitions-e2e)"} == 1
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-templates-e2e)"} == 1
for: 5m
labels:
severity: warning
Expand Down
4 changes: 2 additions & 2 deletions rhobs/alerting/data_plane/prometheus.quota_alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ spec:
rules:
- alert: QuotaExceeded
expr: |
kube_resourcequota{type="used", namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-definitions-e2e)"} /
kube_resourcequota{type="used", namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-templates-e2e)"} /
ignoring (type)
(kube_resourcequota{type="hard", namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-definitions-e2e)"} > 0) > 1
(kube_resourcequota{type="hard", namespace!~"(.*-tenant|.*-env|openshift-.*|kube-.*|default|tekton-ci|build-templates-e2e)"} > 0) > 1
for: 15m
labels:
severity: warning
Expand Down
4 changes: 2 additions & 2 deletions test/promql/tests/data_plane/crashloopbackoff_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ tests:
- series: 'kube_pod_container_status_waiting_reason{namespace="tekton-ci", pod="test-pod", container="test-container", reason="CrashLoopBackOff", source_cluster="cluster01"}'
values: '1x15'

# Pod failing with CrashLoopBackOff error but in the 'build-definitions-e2e' namespace so it's ignored
- series: 'kube_pod_container_status_waiting_reason{namespace="build-definitions-e2e", pod="test-pod", container="test-container", reason="CrashLoopBackOff", source_cluster="cluster01"}'
# Pod is failing with a CrashLoopBackOff error, but it's in the 'build-templates-e2e' namespace so it's ignored
- series: 'kube_pod_container_status_waiting_reason{namespace="build-templates-e2e", pod="test-pod", container="test-container", reason="CrashLoopBackOff", source_cluster="cluster01"}'
values: '1x15'

alert_rule_test:
Expand Down
6 changes: 3 additions & 3 deletions test/promql/tests/data_plane/notready_pods_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -193,10 +193,10 @@ tests:
- series: 'kube_pod_status_unschedulable{namespace="tekton-ci", pod="test-pod", source_cluster="cluster01"}'
values: '0x14'

# Pod is in the Failed state and scheduled, but it's in the 'build-definitions-e2e' namespace so it's ignored.
- series: 'kube_pod_status_phase{pod="test-pod", namespace="build-definitions-e2e", phase="Failed"}'
# Pod is in the Failed state and scheduled, but it's in the 'build-templates-e2e' namespace so it's ignored.
- series: 'kube_pod_status_phase{pod="test-pod", namespace="build-templates-e2e", phase="Failed"}'
values: '1x14'
- series: 'kube_pod_status_unschedulable{namespace="build-definitions-e2e", pod="test-pod", source_cluster="cluster01"}'
- series: 'kube_pod_status_unschedulable{namespace="build-templates-e2e", pod="test-pod", source_cluster="cluster01"}'
values: '0x14'

alert_rule_test:
Expand Down
6 changes: 3 additions & 3 deletions test/promql/tests/data_plane/quota_exceeded_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ tests:
- series: 'kube_resourcequota{type="used", namespace="tekton-ci", resourcequota="test-quota", resource="example-resource", source_cluster="cluster01"}'
values: '5x15'

# Namespace exceeds its quota limits but since it's named 'build-definitions-e2e' it's ignored.
- series: 'kube_resourcequota{type="hard", namespace="build-definitions-e2e", resourcequota="test-quota", resource="example-resource", source_cluster="cluster01"}'
# Namespace exceeds its quota limits but since it's named 'build-templates-e2e' it's ignored.
- series: 'kube_resourcequota{type="hard", namespace="build-templates-e2e", resourcequota="test-quota", resource="example-resource", source_cluster="cluster01"}'
values: '2x15'
- series: 'kube_resourcequota{type="used", namespace="build-definitions-e2e", resourcequota="test-quota", resource="example-resource", source_cluster="cluster01"}'
- series: 'kube_resourcequota{type="used", namespace="build-templates-e2e", resourcequota="test-quota", resource="example-resource", source_cluster="cluster01"}'
values: '5x15'

alert_rule_test:
Expand Down
4 changes: 2 additions & 2 deletions test/promql/tests/data_plane/unschedulable_pods_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ tests:
- series: 'kube_pod_status_unschedulable{namespace="tekton-ci", pod="test-pod", source_cluster="cluster01"}'
values: '1x60'

# The pod is in unscheduled status whole time but it's in the 'build-definitions-e2e' namespace so it's ignored
- series: 'kube_pod_status_unschedulable{namespace="build-definitions-e2e", pod="test-pod", source_cluster="cluster01"}'
# The pod is unschedulable the whole time, but it's in the 'build-templates-e2e' namespace so it's ignored
- series: 'kube_pod_status_unschedulable{namespace="build-templates-e2e", pod="test-pod", source_cluster="cluster01"}'
values: '1x60'

alert_rule_test:
Expand Down

0 comments on commit 79eb23d

Please sign in to comment.