diff --git a/rhobs/alerting/data_plane/prometheus.application_alerts.yaml b/rhobs/alerting/data_plane/prometheus.application_alerts.yaml index 5d72c5e4..393b7c7a 100644 --- a/rhobs/alerting/data_plane/prometheus.application_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.application_alerts.yaml @@ -25,6 +25,7 @@ spec: Application controller in Pod {{ $labels.pod }} for namespace {{ $labels.namespace }} on cluster {{ $labels.source_cluster }} is failing to successfully delete at least 95% of applications over the past hour + alert_route_namespace: 'application-service' runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/application-delete-failed.md - alert: ApplicationCreationErrors expr: | @@ -42,4 +43,5 @@ spec: Application controller in Pod {{ $labels.pod }} for namespace {{ $labels.namespace }} on cluster {{ $labels.source_cluster }} is failing to successfully create at least 95% of applications over the past hour + alert_route_namespace: 'application-service' runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/application-create-failed.md diff --git a/rhobs/alerting/data_plane/prometheus.component_alerts.yaml b/rhobs/alerting/data_plane/prometheus.component_alerts.yaml index 02ccbefb..3f1578db 100644 --- a/rhobs/alerting/data_plane/prometheus.component_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.component_alerts.yaml @@ -25,6 +25,7 @@ spec: Component controller in Pod {{ $labels.pod }} for namespace {{ $labels.namespace }} on cluster {{ $labels.source_cluster }} is failing to successfully delete at least 95% of components over the past hour + alert_route_namespace: '{{ $labels.namespace }}' runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/component-delete-failed.md - alert: ComponentCreationErrors expr: | @@ -42,4 +43,5 @@ spec: Component controller in Pod {{ $labels.pod }} for namespace {{ $labels.namespace }} on cluster {{ $labels.source_cluster }} is failing to successfully create at least 95% of components over the past hour + alert_route_namespace: '{{ $labels.namespace }}' runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/component-create-failed.md diff --git a/rhobs/alerting/data_plane/prometheus.gitops_cluster_agent_op_alerts.yaml b/rhobs/alerting/data_plane/prometheus.gitops_cluster_agent_op_alerts.yaml index 1f1497a2..64b55758 100644 --- a/rhobs/alerting/data_plane/prometheus.gitops_cluster_agent_op_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.gitops_cluster_agent_op_alerts.yaml @@ -21,4 +21,5 @@ spec: Cluster-Agent Operations in non-complete state are too high. Got: {{ $value }} description: >- The sum of cluster-agent operations in non-complete state is greater than 10 on cluster {{ $labels.source_cluster }} + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/cluster-agent-operations.md diff --git a/rhobs/alerting/data_plane/prometheus.gitops_deployment_alerts.yaml b/rhobs/alerting/data_plane/prometheus.gitops_deployment_alerts.yaml index db76fada..0990a44c 100644 --- a/rhobs/alerting/data_plane/prometheus.gitops_deployment_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.gitops_deployment_alerts.yaml @@ -29,4 +29,5 @@ spec: Less than 95% of GitOps Deployments are in Healthy state: {{ $value | humanizePercentage }} in cluster: {{ $labels.source_cluster }} description: >- The percentage total of Argo CD Deployments that are in Healthy state is less than 95% in cluster: {{ $labels.source_cluster }}. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deployments.md diff --git a/rhobs/alerting/data_plane/prometheus.gitops_routes_alerts.yaml b/rhobs/alerting/data_plane/prometheus.gitops_routes_alerts.yaml index 9fa0d27c..08201b73 100644 --- a/rhobs/alerting/data_plane/prometheus.gitops_routes_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.gitops_routes_alerts.yaml @@ -29,4 +29,5 @@ spec: Less than 95% of GitOps Routes are in Healthy state: {{ $value | humanizePercentage }} in cluster: {{ $labels.source_cluster }} description: >- The percentage total of Argo CD Routes that are in Healthy state is less than 95% in cluster: {{ $labels.source_cluster }}. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/routes.md diff --git a/rhobs/alerting/data_plane/prometheus.gitops_statefulsets_alerts.yaml b/rhobs/alerting/data_plane/prometheus.gitops_statefulsets_alerts.yaml index c0849a81..41d77744 100644 --- a/rhobs/alerting/data_plane/prometheus.gitops_statefulsets_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.gitops_statefulsets_alerts.yaml @@ -29,4 +29,5 @@ spec: Less than 95% of GitOps StatefulSets are in Healthy state: {{ $value | humanizePercentage }} in cluster: {{ $labels.source_cluster }} description: >- The percentage total of Argo CD StatefulSets that are in Healthy state is less than 95% in cluster: {{ $labels.source_cluster }}. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/statefulsets.md diff --git a/rhobs/alerting/data_plane/prometheus.gitops_sync_alerts.yaml b/rhobs/alerting/data_plane/prometheus.gitops_sync_alerts.yaml index 5db854e5..e04a253c 100644 --- a/rhobs/alerting/data_plane/prometheus.gitops_sync_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.gitops_sync_alerts.yaml @@ -31,4 +31,5 @@ spec: Less than 95% of GitOps applications are in synced state: {{ $value | humanizePercentage }} description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md diff --git a/rhobs/alerting/data_plane/prometheus.latency_component_onboarding_alerts.yaml b/rhobs/alerting/data_plane/prometheus.latency_component_onboarding_alerts.yaml index d57a6c58..ae10daa9 100644 --- a/rhobs/alerting/data_plane/prometheus.latency_component_onboarding_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.latency_component_onboarding_alerts.yaml @@ -31,4 +31,5 @@ spec: if PaC provision is requested upon the Component creation, then till the provision finishes has been over 60s for more than 10% of requests during the last 5 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_component_onboarding.md diff --git a/rhobs/alerting/data_plane/prometheus.latency_image_repository_provision_alerts.yaml b/rhobs/alerting/data_plane/prometheus.latency_image_repository_provision_alerts.yaml index 831f45c0..662f4e80 100644 --- a/rhobs/alerting/data_plane/prometheus.latency_image_repository_provision_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.latency_image_repository_provision_alerts.yaml @@ -30,4 +30,5 @@ spec: Time taken to provision image repository has been over 30s for more than 5% of requests during the last 5 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/latency_image_repository_provision.md diff --git a/rhobs/alerting/data_plane/prometheus.latency_pac_provision_alerts.yaml b/rhobs/alerting/data_plane/prometheus.latency_pac_provision_alerts.yaml index 03bedb30..024df914 100644 --- a/rhobs/alerting/data_plane/prometheus.latency_pac_provision_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.latency_pac_provision_alerts.yaml @@ -30,4 +30,5 @@ spec: Time taken from PaC provision request till Component is provisioned for PaC builds has been over 20s for more than 5% of requests during the last 5 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_pac_provision.md diff --git a/rhobs/alerting/data_plane/prometheus.latency_pac_unprovision_alerts.yaml b/rhobs/alerting/data_plane/prometheus.latency_pac_unprovision_alerts.yaml index 9d719024..8ebaadad 100644 --- a/rhobs/alerting/data_plane/prometheus.latency_pac_unprovision_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.latency_pac_unprovision_alerts.yaml @@ -30,4 +30,5 @@ spec: Time taken from PaC unprovision request till Component is unprovisioned for PaC builds has been over 20s for more than 5% of requests during the last 5 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_pac_unprovision.md diff --git a/rhobs/alerting/data_plane/prometheus.latency_release_creation_alerts.yaml b/rhobs/alerting/data_plane/prometheus.latency_release_creation_alerts.yaml index 2a2daadf..a8881332 100644 --- a/rhobs/alerting/data_plane/prometheus.latency_release_creation_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.latency_release_creation_alerts.yaml @@ -30,4 +30,5 @@ spec: Time from Snapshot marked as passed to release created has been over 10s for more than 10% of requests during the last 5 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: integration-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/latency_release_creation.md diff --git a/rhobs/alerting/data_plane/prometheus.latency_simple_build_alerts.yaml b/rhobs/alerting/data_plane/prometheus.latency_simple_build_alerts.yaml index b79cd4b7..a8faa573 100644 --- a/rhobs/alerting/data_plane/prometheus.latency_simple_build_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.latency_simple_build_alerts.yaml @@ -30,4 +30,5 @@ spec: Time taken from simple build request till the build pipeline is submitted has been over 15s for more than 5% of requests during the last 5 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_simple_build.md diff --git a/rhobs/alerting/data_plane/prometheus.latency_snapshot_to_static_integration_plr_creation_alerts.yaml b/rhobs/alerting/data_plane/prometheus.latency_snapshot_to_static_integration_plr_creation_alerts.yaml index 77780c87..9553d309 100644 --- a/rhobs/alerting/data_plane/prometheus.latency_snapshot_to_static_integration_plr_creation_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.latency_snapshot_to_static_integration_plr_creation_alerts.yaml @@ -30,4 +30,5 @@ spec: Time from Snapshot created to integration PLRs in static envs created has been over 5s for {{ $value | humanizePercentage }} of requests (tolerance 10%) during the last 5 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: integration-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/latency_snapshot_to_integration_test_static.md diff --git a/rhobs/alerting/data_plane/prometheus.oauth_alerts.yaml b/rhobs/alerting/data_plane/prometheus.oauth_alerts.yaml index a6fb63b1..3a654ed9 100644 --- a/rhobs/alerting/data_plane/prometheus.oauth_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.oauth_alerts.yaml @@ -24,4 +24,5 @@ spec: description: >- The average OAuth login time on cluster {{ $labels.source_cluster }} has {{ $value }} sec for the last 5 minutes + alert_route_namespace: spi-system runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/spi/oauth_login.md diff --git a/rhobs/alerting/data_plane/prometheus.pipeline_alerts.yaml b/rhobs/alerting/data_plane/prometheus.pipeline_alerts.yaml index b5c319b0..ed0f8fa5 100644 --- a/rhobs/alerting/data_plane/prometheus.pipeline_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.pipeline_alerts.yaml @@ -25,6 +25,7 @@ spec: description: >- Tekton controller on cluster {{ $labels.source_cluster }} the percentage of time needed to receive PipelineRun creation events vs. overall PipelineRun execution time is at {{ $value | humanizePercentage }} instead of less than 5%. + alert_route_namespace: plnsvc-tests runbook_url: TBD - alert: HighExecutionOverhead expr: | @@ -42,4 +43,5 @@ spec: description: >- Tekton controller on cluster {{ $labels.source_cluster }} the percentage of the time needed to create underlying TaskRuns vs. overall PipelineRun execution time is at {{ $value | humanizePercentage }} instead of less than 5%. + alert_route_namespace: plnsvc-tests runbook_url: TBD diff --git a/rhobs/alerting/data_plane/prometheus.pipeline_to_snapshot_alerts.yaml b/rhobs/alerting/data_plane/prometheus.pipeline_to_snapshot_alerts.yaml index 4b3ce04a..65caaa36 100644 --- a/rhobs/alerting/data_plane/prometheus.pipeline_to_snapshot_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.pipeline_to_snapshot_alerts.yaml @@ -30,4 +30,5 @@ spec: Time from pipeline run finished to snapshot marked in progress has been over 30s for more than 10% of requests during the last 5 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: perf-team-prometheus-reader runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/pipeline_to_snapshot_exceeded.md diff --git a/rhobs/alerting/data_plane/prometheus.pv_alerts.yaml b/rhobs/alerting/data_plane/prometheus.pv_alerts.yaml index 432dc2cd..51b3e05d 100644 --- a/rhobs/alerting/data_plane/prometheus.pv_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.pv_alerts.yaml @@ -20,4 +20,5 @@ spec: description: >- Persistent Volume {{ $labels.persistentvolume }} in namespace {{ $labels.namespace }} on cluster {{ $labels.source_cluster }} is in {{ $labels.phase }} phase for more than 5 minutes. + alert_route_namespace: '{{ $labels.namespace }}' runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/o11y/alert-rule-pesistentVolumeIssues.md diff --git a/rhobs/alerting/data_plane/prometheus.quota_alerts.yaml b/rhobs/alerting/data_plane/prometheus.quota_alerts.yaml index 50b3ffe4..dea9af27 100644 --- a/rhobs/alerting/data_plane/prometheus.quota_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.quota_alerts.yaml @@ -25,4 +25,5 @@ spec: Resource {{ $labels.resource }} in namespace {{ $labels.namespace }} on cluster {{ $labels.source_cluster }} exceeded quota {{ $labels.resourcequota }}. + alert_route_namespace: '{{ $labels.namespace }}' runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/o11y/alert-rule-QuotaExceeded.md diff --git a/rhobs/alerting/data_plane/prometheus.release_service_alerts.yaml b/rhobs/alerting/data_plane/prometheus.release_service_alerts.yaml index caffb8b6..0eb3cd27 100644 --- a/rhobs/alerting/data_plane/prometheus.release_service_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.release_service_alerts.yaml @@ -45,6 +45,7 @@ spec: 90% of Releases must be processed under one hour description: >- Release service is failing to successfully process within the period of one hour for 90% of releases + alert_route_namespace: release-service - alert: ReleaseServicePreProcessingDurationSeconds expr: | @@ -60,6 +61,7 @@ spec: 90% of Releases must start processing under 10 seconds description: >- Release service is failing to start processing under 10 seconds for 90% of releases + alert_route_namespace: release-service - alert: ReleaseServiceValidationDurationSeconds expr: | @@ -75,3 +77,4 @@ spec: 90% of Releases must be validated under 5 seconds description: >- Release service is failing to run the validations under 5 seconds for 90% of releases + alert_route_namespace: release-service diff --git a/rhobs/alerting/data_plane/prometheus.seb_created_to_ready_alerts.yaml b/rhobs/alerting/data_plane/prometheus.seb_created_to_ready_alerts.yaml index a700c97c..49dcd6d2 100644 --- a/rhobs/alerting/data_plane/prometheus.seb_created_to_ready_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.seb_created_to_ready_alerts.yaml @@ -30,4 +30,5 @@ spec: Time from Snapshot Environment Binding created to marked as ready has been over 120s for more than 10% of requests during the last 5 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: integration-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/seb_created_to_ready.md diff --git a/rhobs/alerting/data_plane/prometheus.serviceprovider_alerts.yaml b/rhobs/alerting/data_plane/prometheus.serviceprovider_alerts.yaml index 61fc1b3f..37865d73 100644 --- a/rhobs/alerting/data_plane/prometheus.serviceprovider_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.serviceprovider_alerts.yaml @@ -28,4 +28,5 @@ spec: Application controller in Pod {{ $labels.pod }} for namespace {{ $labels.namespace }} on instance {{ $labels.source_cluster }} having a {{ $value | humanizePercentage }} of 5xx errors from service provider {{ $labels.sp }} for latest 60 minutes + alert_route_namespace: spi-system runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/spi/alert-rule-serviceprovider5xxErrorsRate.md diff --git a/rhobs/alerting/data_plane/prometheus.stability_image_repository_provision_alerts.yaml b/rhobs/alerting/data_plane/prometheus.stability_image_repository_provision_alerts.yaml index 0ee59500..e3303ada 100644 --- a/rhobs/alerting/data_plane/prometheus.stability_image_repository_provision_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.stability_image_repository_provision_alerts.yaml @@ -30,4 +30,5 @@ spec: Time taken to provision image repository has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/stability_image_repository_provision.md diff --git a/rhobs/alerting/data_plane/prometheus.stability_image_repository_provision_failures_alerts.yaml b/rhobs/alerting/data_plane/prometheus.stability_image_repository_provision_failures_alerts.yaml index 46fc3031..3ef7db67 100644 --- a/rhobs/alerting/data_plane/prometheus.stability_image_repository_provision_failures_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.stability_image_repository_provision_failures_alerts.yaml @@ -22,4 +22,5 @@ spec: description: > Provision image repository failures occured for more than 5 requests during the last 30 minutes {{ $labels.source_cluster }} + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/stability_image_repository_provision_failures.md diff --git a/rhobs/alerting/data_plane/prometheus.stability_pac_provision_alerts.yaml b/rhobs/alerting/data_plane/prometheus.stability_pac_provision_alerts.yaml index 9d90677a..dbf8ec55 100644 --- a/rhobs/alerting/data_plane/prometheus.stability_pac_provision_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.stability_pac_provision_alerts.yaml @@ -30,4 +30,5 @@ spec: Time taken from PaC provision request till Component is provisioned for PaC builds has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_pac_provision.md diff --git a/rhobs/alerting/data_plane/prometheus.stability_pac_unprovision_alerts.yaml b/rhobs/alerting/data_plane/prometheus.stability_pac_unprovision_alerts.yaml index 216276c2..3fc2d3eb 100644 --- a/rhobs/alerting/data_plane/prometheus.stability_pac_unprovision_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.stability_pac_unprovision_alerts.yaml @@ -30,4 +30,5 @@ spec: Time taken from PaC unprovision request till Component is unprovisioned for PaC builds has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_pac_unprovision.md diff --git a/rhobs/alerting/data_plane/prometheus.stability_simple_build_alerts.yaml b/rhobs/alerting/data_plane/prometheus.stability_simple_build_alerts.yaml index 5966e753..283b7233 100644 --- a/rhobs/alerting/data_plane/prometheus.stability_simple_build_alerts.yaml +++ b/rhobs/alerting/data_plane/prometheus.stability_simple_build_alerts.yaml @@ -30,4 +30,5 @@ spec: Time taken from simple build request till the build pipeline is submitted has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster {{ $labels.source_cluster }} + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_simple_build.md diff --git a/test/promql/tests/data_plane/application_errors_test.yaml b/test/promql/tests/data_plane/application_errors_test.yaml index 5fb6ac10..7388cebf 100644 --- a/test/promql/tests/data_plane/application_errors_test.yaml +++ b/test/promql/tests/data_plane/application_errors_test.yaml @@ -31,6 +31,7 @@ tests: Application controller in Pod has for namespace application-service on cluster cluster01 is failing to successfully delete at least 95% of applications over the past hour + alert_route_namespace: 'application-service' runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/application-delete-failed.md - interval: 1m @@ -59,6 +60,7 @@ tests: Application controller in Pod has for namespace application-service on cluster cluster01 is failing to successfully delete at least 95% of applications over the past hour + alert_route_namespace: application-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/application-delete-failed.md - interval: 1m @@ -115,6 +117,7 @@ tests: Application controller in Pod has for namespace application-service on cluster cluster01 is failing to successfully create at least 95% of applications over the past hour + alert_route_namespace: application-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/application-create-failed.md - interval: 1m @@ -143,6 +146,7 @@ tests: Application controller in Pod has for namespace application-service on cluster cluster01 is failing to successfully create at least 95% of applications over the past hour + alert_route_namespace: application-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/application-create-failed.md - interval: 1m diff --git a/test/promql/tests/data_plane/component_errors_test.yaml b/test/promql/tests/data_plane/component_errors_test.yaml index 529da1c7..7c143ad2 100644 --- a/test/promql/tests/data_plane/component_errors_test.yaml +++ b/test/promql/tests/data_plane/component_errors_test.yaml @@ -41,6 +41,7 @@ tests: Component controller in Pod has for namespace application-service on cluster cluster01 is failing to successfully delete at least 95% of components over the past hour + alert_route_namespace: application-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/component-delete-failed.md - interval: 1m @@ -69,6 +70,7 @@ tests: Component controller in Pod has for namespace application-service on cluster cluster01 is failing to successfully delete at least 95% of components over the past hour + alert_route_namespace: application-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/component-delete-failed.md - interval: 1m @@ -98,6 +100,7 @@ tests: Component controller in Pod has for namespace application-service on cluster cluster01 is failing to successfully delete at least 95% of components over the past hour + alert_route_namespace: application-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/component-delete-failed.md - interval: 1m @@ -164,6 +167,7 @@ tests: Component controller in Pod has for namespace application-service on cluster cluster01 is failing to successfully create at least 95% of components over the past hour + alert_route_namespace: application-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/component-create-failed.md - interval: 1m @@ -192,6 +196,7 @@ tests: Component controller in Pod has for namespace application-service on cluster cluster01 is failing to successfully create at least 95% of components over the past hour + alert_route_namespace: application-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/component-create-failed.md - interval: 1m @@ -221,6 +226,7 @@ tests: Component controller in Pod has for namespace application-service on cluster cluster01 is failing to successfully create at least 95% of components over the past hour + alert_route_namespace: application-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/has/component-create-failed.md - interval: 1m diff --git a/test/promql/tests/data_plane/gitops_cluster_agent_operation_test.yaml b/test/promql/tests/data_plane/gitops_cluster_agent_operation_test.yaml index 5af446df..6671c2f8 100644 --- a/test/promql/tests/data_plane/gitops_cluster_agent_operation_test.yaml +++ b/test/promql/tests/data_plane/gitops_cluster_agent_operation_test.yaml @@ -24,6 +24,7 @@ tests: Cluster-Agent Operations in non-complete state are too high. Got: 11 description: >- The sum of cluster-agent operations in non-complete state is greater than 10 on cluster cluster01 + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/cluster-agent-operations.md - interval: 1m @@ -45,6 +46,7 @@ tests: Cluster-Agent Operations in non-complete state are too high. Got: 10 description: >- The sum of cluster-agent operations in non-complete state is greater than 10 on cluster cluster01 + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/cluster-agent-operations.md - interval: 1m diff --git a/test/promql/tests/data_plane/gitops_deployment_test.yaml b/test/promql/tests/data_plane/gitops_deployment_test.yaml index ef66e5ba..5fff8963 100644 --- a/test/promql/tests/data_plane/gitops_deployment_test.yaml +++ b/test/promql/tests/data_plane/gitops_deployment_test.yaml @@ -29,6 +29,7 @@ tests: Less than 95% of GitOps Deployments are in Healthy state: 0% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD Deployments that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deployments.md - interval: 1m @@ -74,6 +75,7 @@ tests: Less than 95% of GitOps Deployments are in Healthy state: 50% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD Deployments that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deployments.md - interval: 1m @@ -119,6 +121,7 @@ tests: Less than 95% of GitOps Deployments are in Healthy state: 50% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD Deployments that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deployments.md - interval: 1m @@ -179,4 +182,5 @@ tests: Less than 95% of GitOps Deployments are in Healthy state: 80% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD Deployments that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deployments.md diff --git a/test/promql/tests/data_plane/gitops_routes_test.yaml b/test/promql/tests/data_plane/gitops_routes_test.yaml index 03691ed5..2f4110a8 100644 --- a/test/promql/tests/data_plane/gitops_routes_test.yaml +++ b/test/promql/tests/data_plane/gitops_routes_test.yaml @@ -39,6 +39,7 @@ tests: Less than 95% of GitOps Routes are in Healthy state: 50% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD Routes that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/routes.md - interval: 1m @@ -67,6 +68,7 @@ tests: Less than 95% of GitOps Routes are in Healthy state: 50% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD Routes that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/routes.md - interval: 1m @@ -146,6 +148,7 @@ tests: Less than 95% of GitOps Routes are in Healthy state: 33.33% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD Routes that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/routes.md - interval: 1m @@ -172,6 +175,7 @@ tests: Less than 95% of GitOps Routes are in Healthy state: 66.67% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD Routes that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/routes.md - interval: 1m @@ -234,6 +238,7 @@ tests: Less than 95% of GitOps Routes are in Healthy state: 66.67% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD Routes that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/routes.md - interval: 1m @@ -275,6 +280,7 @@ tests: Less than 95% of GitOps Routes are in Healthy state: 90% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD Routes that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/routes.md - interval: 1m diff --git a/test/promql/tests/data_plane/gitops_statefulsets_test.yaml b/test/promql/tests/data_plane/gitops_statefulsets_test.yaml index 9ef3f277..167f96e1 100644 --- a/test/promql/tests/data_plane/gitops_statefulsets_test.yaml +++ b/test/promql/tests/data_plane/gitops_statefulsets_test.yaml @@ -25,6 +25,7 @@ tests: Less than 95% of GitOps StatefulSets are in Healthy state: 0% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD StatefulSets that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/statefulsets.md - interval: 1m @@ -62,6 +63,7 @@ tests: Less than 95% of GitOps StatefulSets are in Healthy state: 30% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD StatefulSets that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/statefulsets.md - interval: 1m @@ -153,6 +155,7 @@ tests: Less than 95% of GitOps StatefulSets are in Healthy state: 0% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD StatefulSets that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/statefulsets.md - interval: 1m @@ -176,4 +179,5 @@ tests: Less than 95% of GitOps StatefulSets are in Healthy state: 20% in cluster: stonesoupp01ue1 description: >- The percentage total of Argo CD StatefulSets that are in Healthy state is less than 95% in cluster: stonesoupp01ue1. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/statefulsets.md diff --git a/test/promql/tests/data_plane/gitops_sync_test.yaml b/test/promql/tests/data_plane/gitops_sync_test.yaml index 06fff68b..e1cff0b3 100644 --- a/test/promql/tests/data_plane/gitops_sync_test.yaml +++ b/test/promql/tests/data_plane/gitops_sync_test.yaml @@ -29,6 +29,7 @@ tests: Less than 95% of GitOps applications are in synced state: 0% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md - interval: 1m @@ -55,6 +56,7 @@ tests: Less than 95% of GitOps applications are in synced state: 0% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md - interval: 1m @@ -101,6 +103,7 @@ tests: Less than 95% of GitOps applications are in synced state: 33.33% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md @@ -128,6 +131,7 @@ tests: Less than 95% of GitOps applications are in synced state: 33.33% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md - interval: 1m @@ -153,6 +157,7 @@ tests: Less than 95% of GitOps applications are in synced state: 50% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md - interval: 1m @@ -259,6 +264,7 @@ tests: Less than 95% of GitOps applications are in synced state: 50% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md - interval: 1m @@ -287,6 +293,7 @@ tests: Less than 95% of GitOps applications are in synced state: 66.67% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md - interval: 1m @@ -358,6 +365,7 @@ tests: Less than 95% of GitOps applications are in synced state: 66.67% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md - interval: 1m @@ -398,6 +406,7 @@ tests: Less than 95% of GitOps applications are in synced state: 90% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md - interval: 1m @@ -475,6 +484,7 @@ tests: Less than 95% of GitOps applications are in synced state: 50% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md - interval: 1m @@ -564,6 +574,7 @@ tests: Less than 95% of GitOps applications are in synced state: 50% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md - interval: 1m @@ -613,6 +624,7 @@ tests: Less than 95% of GitOps applications are in synced state: 0% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md - interval: 1m @@ -662,6 +674,7 @@ tests: Less than 95% of GitOps applications are in synced state: 0% description: >- The percentage total of all Argo CD applications that are in Synced state is less than 95%. + alert_route_namespace: "gitops-service-argocd" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/gitops/deploy-from-git-to-k8s.md # ------------------------------------------------------------------ diff --git a/test/promql/tests/data_plane/latency_component_onboarding_test.yaml b/test/promql/tests/data_plane/latency_component_onboarding_test.yaml index 1f2f3abd..412f828f 100644 --- a/test/promql/tests/data_plane/latency_component_onboarding_test.yaml +++ b/test/promql/tests/data_plane/latency_component_onboarding_test.yaml @@ -35,6 +35,7 @@ tests: if PaC provision is requested upon the Component creation, then till the provision finishes has been over 60s for more than 10% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_component_onboarding.md # Scenario where both clusters cross the 10% threshold @@ -68,6 +69,7 @@ tests: if PaC provision is requested upon the Component creation, then till the provision finishes has been over 60s for more than 10% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_component_onboarding.md - exp_labels: severity: warning @@ -80,6 +82,7 @@ tests: if PaC provision is requested upon the Component creation, then till the provision finishes has been over 60s for more than 10% of requests during the last 5 minutes on cluster cluster02 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_component_onboarding.md # Scenario where neither cluster crosses the 10% threshold diff --git a/test/promql/tests/data_plane/latency_image_repository_provision_test.yaml b/test/promql/tests/data_plane/latency_image_repository_provision_test.yaml index 809776c5..bc23bcab 100644 --- a/test/promql/tests/data_plane/latency_image_repository_provision_test.yaml +++ b/test/promql/tests/data_plane/latency_image_repository_provision_test.yaml @@ -34,6 +34,7 @@ tests: Time taken to provision image repository has been over 30s for more than 5% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/latency_image_repository_provision.md # Scenario where both clusters cross the 5% threshold @@ -66,6 +67,7 @@ tests: Time taken to provision image repository has been over 30s for more than 5% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/latency_image_repository_provision.md - exp_labels: severity: warning @@ -77,6 +79,7 @@ tests: Time taken to provision image repository has been over 30s for more than 5% of requests during the last 5 minutes on cluster cluster02 + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/latency_image_repository_provision.md # Scenario where neither cluster crosses the 5% threshold diff --git a/test/promql/tests/data_plane/latency_pac_provision_test.yaml b/test/promql/tests/data_plane/latency_pac_provision_test.yaml index a53849b5..54750588 100644 --- a/test/promql/tests/data_plane/latency_pac_provision_test.yaml +++ b/test/promql/tests/data_plane/latency_pac_provision_test.yaml @@ -34,6 +34,7 @@ tests: Time taken from PaC provision request till Component is provisioned for PaC builds has been over 20s for more than 5% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_pac_provision.md # Scenario where both clusters cross the 5% threshold @@ -66,6 +67,7 @@ tests: Time taken from PaC provision request till Component is provisioned for PaC builds has been over 20s for more than 5% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_pac_provision.md - exp_labels: severity: warning @@ -77,6 +79,7 @@ tests: Time taken from PaC provision request till Component is provisioned for PaC builds has been over 20s for more than 5% of requests during the last 5 minutes on cluster cluster02 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_pac_provision.md # Scenario where neither cluster crosses the 5% threshold diff --git a/test/promql/tests/data_plane/latency_pac_unprovision_test.yaml b/test/promql/tests/data_plane/latency_pac_unprovision_test.yaml index ad23647b..ae5f4c7f 100644 --- a/test/promql/tests/data_plane/latency_pac_unprovision_test.yaml +++ b/test/promql/tests/data_plane/latency_pac_unprovision_test.yaml @@ -34,6 +34,7 @@ tests: Time taken from PaC unprovision request till Component is unprovisioned for PaC builds has been over 20s for more than 5% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_pac_unprovision.md # Scenario where both clusters cross the 5% threshold @@ -66,6 +67,7 @@ tests: Time taken from PaC unprovision request till Component is unprovisioned for PaC builds has been over 20s for more than 5% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_pac_unprovision.md - exp_labels: severity: warning @@ -77,6 +79,7 @@ tests: Time taken from PaC unprovision request till Component is unprovisioned for PaC builds has been over 20s for more than 5% of requests during the last 5 minutes on cluster cluster02 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_pac_unprovision.md # Scenario where neither cluster crosses the 5% threshold diff --git a/test/promql/tests/data_plane/latency_release_creation_test.yaml b/test/promql/tests/data_plane/latency_release_creation_test.yaml index fe243ae2..111378b0 100644 --- a/test/promql/tests/data_plane/latency_release_creation_test.yaml +++ b/test/promql/tests/data_plane/latency_release_creation_test.yaml @@ -32,6 +32,7 @@ tests: Time from Snapshot marked as passed to release created has been over 10s for more than 10% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: integration-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/latency_release_creation.md # Scenario where both clusters cross the 10% threshold @@ -64,6 +65,7 @@ tests: Time from Snapshot marked as passed to release created has been over 10s for more than 10% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: integration-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/latency_release_creation.md - exp_labels: severity: warning @@ -75,6 +77,7 @@ tests: Time from Snapshot marked as passed to release created has been over 10s for more than 10% of requests during the last 5 minutes on cluster cluster02 + alert_route_namespace: integration-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/latency_release_creation.md # Scenario where neither cluster crosses the 10% threshold diff --git a/test/promql/tests/data_plane/latency_simple_build_test.yaml b/test/promql/tests/data_plane/latency_simple_build_test.yaml index 4c5cd040..0ce45031 100644 --- a/test/promql/tests/data_plane/latency_simple_build_test.yaml +++ b/test/promql/tests/data_plane/latency_simple_build_test.yaml @@ -34,6 +34,7 @@ tests: Time taken from simple build request till the build pipeline is submitted has been over 15s for more than 5% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_simple_build.md # Scenario where both clusters cross the 5% threshold @@ -66,6 +67,7 @@ tests: Time taken from simple build request till the build pipeline is submitted has been over 15s for more than 5% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_simple_build.md - exp_labels: severity: warning @@ -77,6 +79,7 @@ tests: Time taken from simple build request till the build pipeline is submitted has been over 15s for more than 5% of requests during the last 5 minutes on cluster cluster02 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/latency_simple_build.md # Scenario where neither cluster crosses the 5% threshold diff --git a/test/promql/tests/data_plane/latency_snapshot_to_static_integration_plr_creation_test.yaml b/test/promql/tests/data_plane/latency_snapshot_to_static_integration_plr_creation_test.yaml index 3faf8514..d50e32b7 100644 --- a/test/promql/tests/data_plane/latency_snapshot_to_static_integration_plr_creation_test.yaml +++ b/test/promql/tests/data_plane/latency_snapshot_to_static_integration_plr_creation_test.yaml @@ -32,6 +32,7 @@ tests: Time from Snapshot created to integration PLRs in static envs created has been over 5s for 90% of requests (tolerance 10%) during the last 5 minutes on cluster cluster01 + alert_route_namespace: integration-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/latency_snapshot_to_integration_test_static.md # Scenario where both clusters cross the 10% threshold @@ -64,6 +65,7 @@ tests: Time from Snapshot created to integration PLRs in static envs created has been over 5s for 95% of requests (tolerance 10%) during the last 5 minutes on cluster cluster01 + alert_route_namespace: integration-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/latency_snapshot_to_integration_test_static.md - exp_labels: severity: warning @@ -75,6 +77,7 @@ tests: Time from Snapshot created to integration PLRs in static envs created has been over 5s for 95% of requests (tolerance 10%) during the last 5 minutes on cluster cluster02 + alert_route_namespace: integration-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/latency_snapshot_to_integration_test_static.md # Scenario where neither cluster crosses the 10% threshold diff --git a/test/promql/tests/data_plane/oauth_time_test.yaml b/test/promql/tests/data_plane/oauth_time_test.yaml index 94fc3a8c..6e08bbd6 100644 --- a/test/promql/tests/data_plane/oauth_time_test.yaml +++ b/test/promql/tests/data_plane/oauth_time_test.yaml @@ -29,6 +29,7 @@ tests: OAuth login average time is more than 30 sec on cluster01 description: >- The average OAuth login time on cluster cluster01 has 50 sec for the last 5 minutes + alert_route_namespace: spi-system runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/spi/oauth_login.md - interval: 1m @@ -60,6 +61,7 @@ tests: OAuth login average time is more than 30 sec on cluster01 description: >- The average OAuth login time on cluster cluster01 has 50 sec for the last 5 minutes + alert_route_namespace: spi-system runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/spi/oauth_login.md - exp_labels: severity: warning @@ -73,6 +75,7 @@ tests: OAuth login average time is more than 30 sec on cluster02 description: >- The average OAuth login time on cluster cluster02 has 60 sec for the last 5 minutes + alert_route_namespace: spi-system runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/spi/oauth_login.md diff --git a/test/promql/tests/data_plane/persistentvolumeissues_test.yaml b/test/promql/tests/data_plane/persistentvolumeissues_test.yaml index 328a3925..0788520d 100644 --- a/test/promql/tests/data_plane/persistentvolumeissues_test.yaml +++ b/test/promql/tests/data_plane/persistentvolumeissues_test.yaml @@ -36,6 +36,7 @@ tests: description: >- Persistent Volume pv-1 in namespace ns-1 on cluster cluster01 is in Pending phase for more than 5 minutes. + alert_route_namespace: ns-1 runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/o11y/alert-rule-pesistentVolumeIssues.md - interval: 1m @@ -68,6 +69,7 @@ tests: description: >- Persistent Volume pv-2 in namespace ns-2 on cluster cluster02 is in Failed phase for more than 5 minutes. + alert_route_namespace: ns-2 runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/o11y/alert-rule-pesistentVolumeIssues.md # Not Alerted cases: diff --git a/test/promql/tests/data_plane/pipeline_overhead_test.yaml b/test/promql/tests/data_plane/pipeline_overhead_test.yaml index c44b8d4f..e8536f94 100644 --- a/test/promql/tests/data_plane/pipeline_overhead_test.yaml +++ b/test/promql/tests/data_plane/pipeline_overhead_test.yaml @@ -37,6 +37,7 @@ tests: description: >- Tekton controller on cluster cluster01 the percentage of time needed to receive PipelineRun creation events vs. overall PipelineRun execution time is at 100% instead of less than 5%. + alert_route_namespace: plnsvc-tests runbook_url: TBD - interval: 1m @@ -71,6 +72,7 @@ tests: description: >- Tekton controller on cluster cluster01 the percentage of time needed to receive PipelineRun creation events vs. overall PipelineRun execution time is at 50% instead of less than 5%. + alert_route_namespace: plnsvc-tests runbook_url: TBD - interval: 1m @@ -128,6 +130,7 @@ tests: description: >- Tekton controller on cluster cluster01 the percentage of the time needed to create underlying TaskRuns vs. overall PipelineRun execution time is at 100% instead of less than 5%. + alert_route_namespace: plnsvc-tests runbook_url: TBD - interval: 1m @@ -166,6 +169,7 @@ tests: description: >- Tekton controller on cluster cluster01 the percentage of the time needed to create underlying TaskRuns vs. overall PipelineRun execution time is at 5.263% instead of less than 5%. + alert_route_namespace: plnsvc-tests runbook_url: TBD - interval: 1m diff --git a/test/promql/tests/data_plane/pipeline_to_snapshot_test.yaml b/test/promql/tests/data_plane/pipeline_to_snapshot_test.yaml index 1ff95fb2..af4f4cca 100644 --- a/test/promql/tests/data_plane/pipeline_to_snapshot_test.yaml +++ b/test/promql/tests/data_plane/pipeline_to_snapshot_test.yaml @@ -33,6 +33,7 @@ tests: Time from pipeline run finished to snapshot marked in progress has been over 30s for more than 10% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: perf-team-prometheus-reader runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/pipeline_to_snapshot_exceeded.md # Scenario where both clusters cross the 10% threshold, alert should trigger for both @@ -63,6 +64,7 @@ tests: Time from pipeline run finished to snapshot marked in progress has been over 30s for more than 10% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: perf-team-prometheus-reader runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/pipeline_to_snapshot_exceeded.md - exp_labels: severity: warning @@ -75,6 +77,7 @@ tests: Time from pipeline run finished to snapshot marked in progress has been over 30s for more than 10% of requests during the last 5 minutes on cluster cluster02 + alert_route_namespace: perf-team-prometheus-reader runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/pipeline_to_snapshot_exceeded.md # Scenario where no alert is triggered as both clusters stay below the 10% threshold diff --git a/test/promql/tests/data_plane/quota_exceeded_test.yaml b/test/promql/tests/data_plane/quota_exceeded_test.yaml index e7ee8536..14f1a6c8 100644 --- a/test/promql/tests/data_plane/quota_exceeded_test.yaml +++ b/test/promql/tests/data_plane/quota_exceeded_test.yaml @@ -41,6 +41,7 @@ tests: description: >- Resource example-resource in namespace exceeding-limits on cluster cluster01 exceeded quota test-quota. + alert_route_namespace: exceeding-limits runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/o11y/alert-rule-QuotaExceeded.md - interval: 1m diff --git a/test/promql/tests/data_plane/release_service_test.yaml b/test/promql/tests/data_plane/release_service_test.yaml index b60bb43d..71c6bf3a 100644 --- a/test/promql/tests/data_plane/release_service_test.yaml +++ b/test/promql/tests/data_plane/release_service_test.yaml @@ -6,13 +6,13 @@ evaluation_interval: 1m tests: - interval: 1m input_series: - - series: 'release_processing_duration_seconds_bucket{le="3600", job="release", source_cluster="cluster01"}' + - series: 'release_processing_duration_seconds_bucket{le="3600", job="release", namespace="foo", source_cluster="cluster01"}' values: '1+8x59' - - series: 'release_processing_duration_seconds_count{job="release", source_cluster="cluster01"}' + - series: 'release_processing_duration_seconds_count{job="release", namespace="foo", source_cluster="cluster01"}' values: '1+9x59' - - series: 'release_processing_duration_seconds_bucket{le="3600", job="release", source_cluster="cluster02"}' + - series: 'release_processing_duration_seconds_bucket{le="3600", job="release", namespace="foo", source_cluster="cluster02"}' values: '1+8x59' - - series: 'release_processing_duration_seconds_count{job="release", source_cluster="cluster02"}' + - series: 'release_processing_duration_seconds_count{job="release", namespace="foo", source_cluster="cluster02"}' values: '1+9x59' alert_rule_test: - eval_time: 1h @@ -22,21 +22,23 @@ tests: severity: critical slo: "true" job: "release" + namespace: "foo" exp_annotations: summary: >- 90% of Releases must be processed under one hour description: >- Release service is failing to successfully process within the period of one hour for 90% of releases + alert_route_namespace: release-service - interval: 1m input_series: - - series: 'release_pre_processing_duration_seconds_bucket{le="10", job="release", source_cluster="cluster01"}' + - series: 'release_pre_processing_duration_seconds_bucket{le="10", job="release", namespace="foo", source_cluster="cluster01"}' values: '1+8x59' - - series: 'release_pre_processing_duration_seconds_count{job="release", source_cluster="cluster01"}' + - series: 'release_pre_processing_duration_seconds_count{job="release", namespace="foo", source_cluster="cluster01"}' values: '1+9x59' - - series: 'release_pre_processing_duration_seconds_bucket{le="10", job="release", source_cluster="cluster02"}' + - series: 'release_pre_processing_duration_seconds_bucket{le="10", job="release", namespace="foo", source_cluster="cluster02"}' values: '1+8x59' - - series: 'release_pre_processing_duration_seconds_count{job="release", source_cluster="cluster02"}' + - series: 'release_pre_processing_duration_seconds_count{job="release", namespace="foo", source_cluster="cluster02"}' values: '1+9x59' alert_rule_test: - eval_time: 1h @@ -46,21 +48,23 @@ tests: severity: critical slo: "true" job: "release" + namespace: "foo" exp_annotations: summary: >- 90% of Releases must start processing under 10 seconds description: >- Release service is failing to start processing under 10 seconds for 90% of releases + alert_route_namespace: release-service - interval: 1m input_series: - - series: 'release_validation_duration_seconds_bucket{le="5", job="release", source_cluster="cluster01"}' + - series: 'release_validation_duration_seconds_bucket{le="5", job="release", namespace="foo", source_cluster="cluster01"}' values: '1+8x59' - - series: 'release_validation_duration_seconds_count{job="release", source_cluster="cluster01"}' + - series: 'release_validation_duration_seconds_count{job="release", namespace="foo", source_cluster="cluster01"}' values: '1+9x59' - - series: 'release_validation_duration_seconds_bucket{le="5", job="release", source_cluster="cluster02"}' + - series: 'release_validation_duration_seconds_bucket{le="5", job="release", namespace="foo", source_cluster="cluster02"}' values: '1+8x59' - - series: 'release_validation_duration_seconds_count{job="release", source_cluster="cluster02"}' + - series: 'release_validation_duration_seconds_count{job="release", namespace="foo", source_cluster="cluster02"}' values: '1+9x59' alert_rule_test: - eval_time: 1h @@ -70,8 +74,10 @@ tests: severity: critical slo: "true" job: "release" + namespace: "foo" exp_annotations: summary: >- 90% of Releases must be validated under 5 seconds description: >- Release service is failing to run the validations under 5 seconds for 90% of releases + alert_route_namespace: release-service diff --git a/test/promql/tests/data_plane/seb_created_to_ready_test.yaml b/test/promql/tests/data_plane/seb_created_to_ready_test.yaml index 2b3797c1..c5ddec63 100644 --- a/test/promql/tests/data_plane/seb_created_to_ready_test.yaml +++ b/test/promql/tests/data_plane/seb_created_to_ready_test.yaml @@ -27,6 +27,7 @@ tests: Time from Snapshot Environment Binding created to marked as ready has been over 120s for more than 10% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: "integration-service" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/seb_created_to_ready.md # Scenario where both clusters exceed the 10% threshold @@ -57,6 +58,7 @@ tests: Time from Snapshot Environment Binding created to marked as ready has been over 120s for more than 10% of requests during the last 5 minutes on cluster cluster01 + alert_route_namespace: "integration-service" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/seb_created_to_ready.md - exp_labels: @@ -69,6 +71,7 @@ tests: Time from Snapshot Environment Binding created to marked as ready has been over 120s for more than 10% of requests during the last 5 minutes on cluster cluster02 + alert_route_namespace: "integration-service" runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/integration-service/seb_created_to_ready.md # Scenario where neither cluster crosses the 10% threshold diff --git a/test/promql/tests/data_plane/serviceprovider_errors_test.yaml b/test/promql/tests/data_plane/serviceprovider_errors_test.yaml index 2b7b4ca2..343ebd42 100644 --- a/test/promql/tests/data_plane/serviceprovider_errors_test.yaml +++ b/test/promql/tests/data_plane/serviceprovider_errors_test.yaml @@ -32,6 +32,7 @@ tests: Application controller in Pod spi-controller-manager for namespace spi-system on instance cluster01 having a 50% of 5xx errors from service provider GitHub for latest 60 minutes + alert_route_namespace: spi-system runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/spi/alert-rule-serviceprovider5xxErrorsRate.md - interval: 1m @@ -64,6 +65,7 @@ tests: Application controller in Pod spi-controller-manager for namespace spi-system on instance cluster01 having a 33.33% of 5xx errors from service provider GitHub for latest 60 minutes + alert_route_namespace: spi-system runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/spi/alert-rule-serviceprovider5xxErrorsRate.md diff --git a/test/promql/tests/data_plane/stability_image_repository_provision_failures_test.yaml b/test/promql/tests/data_plane/stability_image_repository_provision_failures_test.yaml index 0b0a82a4..22262ef1 100644 --- a/test/promql/tests/data_plane/stability_image_repository_provision_failures_test.yaml +++ b/test/promql/tests/data_plane/stability_image_repository_provision_failures_test.yaml @@ -29,6 +29,7 @@ tests: description: > Provision image repository failures occured for more than 5 requests during the last 30 minutes cluster01 + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/stability_image_repository_provision_failures.md # Scenario where both clusters cross the 5 threshold @@ -56,6 +57,7 @@ tests: description: > Provision image repository failures occured for more than 5 requests during the last 30 minutes cluster01 + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/stability_image_repository_provision_failures.md - exp_labels: severity: warning @@ -66,6 +68,7 @@ tests: description: > Provision image repository failures occured for more than 5 requests during the last 30 minutes cluster02 + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/stability_image_repository_provision_failures.md # Scenario where neither cluster crosses the 5 threshold diff --git a/test/promql/tests/data_plane/stability_image_repository_provision_test.yaml b/test/promql/tests/data_plane/stability_image_repository_provision_test.yaml index e0de7319..ca900b5b 100644 --- a/test/promql/tests/data_plane/stability_image_repository_provision_test.yaml +++ b/test/promql/tests/data_plane/stability_image_repository_provision_test.yaml @@ -34,6 +34,7 @@ tests: Time taken to provision image repository has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster01 + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/stability_image_repository_provision.md # Scenario where both clusters cross the 1% threshold @@ -66,6 +67,7 @@ tests: Time taken to provision image repository has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster01 + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/stability_image_repository_provision.md - exp_labels: severity: warning @@ -77,6 +79,7 @@ tests: Time taken to provision image repository has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster02 + alert_route_namespace: image-controller runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/image-controller/stability_image_repository_provision.md # Scenario where neither cluster crosses the 1% threshold diff --git a/test/promql/tests/data_plane/stability_pac_provision_test.yaml b/test/promql/tests/data_plane/stability_pac_provision_test.yaml index 500432e1..ae91b11b 100644 --- a/test/promql/tests/data_plane/stability_pac_provision_test.yaml +++ b/test/promql/tests/data_plane/stability_pac_provision_test.yaml @@ -28,12 +28,14 @@ tests: severity: warning slo: "true" source_cluster: cluster01 + namespace: "foo" exp_annotations: summary: Stability of component PaC provision time exceeded description: > Time taken from PaC provision request till Component is provisioned for PaC builds has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_pac_provision.md # Scenario where both clusters cross the 1% threshold @@ -66,6 +68,7 @@ tests: Time taken from PaC provision request till Component is provisioned for PaC builds has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_pac_provision.md - exp_labels: severity: warning @@ -77,6 +80,7 @@ tests: Time taken from PaC provision request till Component is provisioned for PaC builds has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster02 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_pac_provision.md # Scenario where neither cluster crosses the 1% threshold diff --git a/test/promql/tests/data_plane/stability_pac_unprovision_test.yaml b/test/promql/tests/data_plane/stability_pac_unprovision_test.yaml index 17b4f1d9..05d90a0e 100644 --- a/test/promql/tests/data_plane/stability_pac_unprovision_test.yaml +++ b/test/promql/tests/data_plane/stability_pac_unprovision_test.yaml @@ -34,6 +34,7 @@ tests: Time taken from PaC unprovision request till Component is unprovisioned for PaC builds has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_pac_unprovision.md # Scenario where both clusters cross the 1% threshold @@ -66,6 +67,7 @@ tests: Time taken from PaC unprovision request till Component is unprovisioned for PaC builds has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_pac_unprovision.md - exp_labels: severity: warning @@ -77,6 +79,7 @@ tests: Time taken from PaC unprovision request till Component is unprovisioned for PaC builds has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster02 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_pac_unprovision.md # Scenario where neither cluster crosses the 1% threshold diff --git a/test/promql/tests/data_plane/stability_simple_build_test.yaml b/test/promql/tests/data_plane/stability_simple_build_test.yaml index a2e37cc0..8d31a83c 100644 --- a/test/promql/tests/data_plane/stability_simple_build_test.yaml +++ b/test/promql/tests/data_plane/stability_simple_build_test.yaml @@ -34,6 +34,7 @@ tests: Time taken from simple build request till the build pipeline is submitted has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_simple_build.md # Scenario where both clusters cross the 1% threshold @@ -66,6 +67,7 @@ tests: Time taken from simple build request till the build pipeline is submitted has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster01 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_simple_build.md - exp_labels: severity: warning @@ -77,6 +79,7 @@ tests: Time taken from simple build request till the build pipeline is submitted has been over 5 minutes for more than 1% of requests during the last 10 minutes on cluster cluster02 + alert_route_namespace: build-service runbook_url: https://gitlab.cee.redhat.com/rhtap/docs/sop/-/blob/main/build-service/stability_simple_build.md # Scenario where neither cluster crosses the 1% threshold