diff --git a/.github/workflows/prometheusrules-publish.yaml b/.github/workflows/prometheusrules-publish.yaml
new file mode 100644
index 0000000..d9ccb12
--- /dev/null
+++ b/.github/workflows/prometheusrules-publish.yaml
@@ -0,0 +1,46 @@
+# Packages the prometheus-rules Helm chart and pushes it as an OCI artifact to
+# Google Artifact Registry. Runs on manual dispatch and on pushes to main that
+# touch the chart directory.
+name: Publish prometheus-rules
+
+env:
+  CHART_NAME: prometheus-rules
+  GAR_LOCATION: us-central1
+  PROJECT_NAME: cloudkite-public
+  # Quoted so the all-digit value is always handled as a string.
+  PROJECT_ID: "297731695546"
+  REPOSITORY: public-helm-charts
+
+on:
+  workflow_dispatch:
+  push:
+    branches: [main]
+    paths: ["prometheus-rules/**"]
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      # Lets the job request the OIDC token used by the auth step below.
+      id-token: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Authenticate to Google Cloud
+        id: auth
+        uses: google-github-actions/auth@v0
+        with:
+          workload_identity_provider: projects/${{ env.PROJECT_ID }}/locations/global/workloadIdentityPools/github-actions/providers/github-oidc
+          service_account: github-actions@${{ env.PROJECT_NAME }}.iam.gserviceaccount.com
+      - name: Set up Cloud SDK
+        uses: google-github-actions/setup-gcloud@v0
+      - name: Docker configuration
+        run: gcloud auth configure-docker "$GAR_LOCATION-docker.pkg.dev" --quiet
+      - name: Update dependencies
+        working-directory: ${{ env.CHART_NAME }}
+        run: helm dependency update
+      - name: Package chart
+        run: helm package "$CHART_NAME"
+      - name: Push chart
+        run: helm push "$CHART_NAME"-*.tgz "oci://$GAR_LOCATION-docker.pkg.dev/$PROJECT_NAME/$REPOSITORY"
diff --git a/prometheus-rules/Chart.yaml b/prometheus-rules/Chart.yaml
new file mode 100644
index 0000000..2e8cadf
--- /dev/null
+++ b/prometheus-rules/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: prometheus-rules
+description: Helm chart for prometheus rules
+type: application
+version: 0.1.0
+appVersion: "v0.1.0"
diff --git a/prometheus-rules/README.md b/prometheus-rules/README.md
new file mode 100644
index 0000000..4e01b86
--- /dev/null
+++ b/prometheus-rules/README.md
@@ -0,0 +1,72 @@
+# Prometheus Rule Helm Chart
+
+This Helm chart installs Prometheus rules, which integrate with either `kube-prometheus-stack` or a standalone Prometheus installation.
+
+## Table of Contents
+
+- [Prerequisites](#prerequisites)
+- [Installation](#installation)
+
+## Prerequisites
+
+Before you begin, ensure you have the following prerequisites in place:
+
+- **Helm**: Make sure you have Helm installed on your local machine and it's configured to work with your Kubernetes cluster. If not, you can follow the [official Helm installation guide](https://helm.sh/docs/intro/install/).
+- **PrometheusRule CRD**: The chart renders `PrometheusRule` resources (`monitoring.coreos.com/v1`), so that custom resource definition must already exist in the cluster (it is provided by the Prometheus Operator, which `kube-prometheus-stack` installs).
+
+## Installation
+
+### Local
+
+To install this Helm chart, follow these steps:
+
+1. Clone the repository to your local machine:
+
+   ```sh
+   git clone https://github.com/cloudkite-io/helm-charts
+   ```
+
+2. Change into the chart directory:
+
+   ```sh
+   cd helm-charts/prometheus-rules
+   ```
+
+   Modify the `values.yaml` to enable the list of rules you want.
+
+3. Install the Helm chart with a release name (e.g., `cloudkitepromrules`):
+
+   ```sh
+   helm install cloudkitepromrules .
+   ```
+
+### Helm Templating
+
+1. Create a `values.yaml` file to customise your values. Because the rules chart is pulled in as a dependency, its values must be nested under the dependency name (`prometheus-rules`):
+
+   ```yaml
+   prometheus-rules:
+     nginx_ingress:
+       enabled: false
+     velero:
+       enabled: true
+     solr:
+       enabled: false
+     kubernetes:
+       enabled: true
+   ```
+
+   `Chart.yaml` file:
+
+   ```yaml
+   apiVersion: v2
+   description: Cloudkite prometheus-rules chart
+   name: prometheus-rules
+   version: 0.1.0
+   dependencies:
+     - name: prometheus-rules
+       version: 0.1.0
+       repository: oci://us-central1-docker.pkg.dev/cloudkite-public/public-helm-charts
+   ```
+
+2. Install or upgrade the Helm chart.
diff --git a/prometheus-rules/templates/prometheusrules.k8s.yaml b/prometheus-rules/templates/prometheusrules.k8s.yaml
new file mode 100644
index 0000000..070c25c
--- /dev/null
+++ b/prometheus-rules/templates/prometheusrules.k8s.yaml
@@ -0,0 +1,29 @@
+{{- if .Values.kubernetes.enabled }}
+# Generic Kubernetes workload alerts; rendered only when kubernetes.enabled is true.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: kubernetes-rules
+spec:
+  groups:
+    - name: kubernetes_apps
+      rules:
+        - alert: KubePodCrashLooping
+          annotations:
+            description: 'Pod {{ "{{" }} $labels.namespace }}/{{ "{{" }} $labels.pod }} ({{ "{{" }} $labels.container }}) is restarting {{ "{{" }} printf "%.2f" $value }} times / 5 minutes.'
+          # Per-second restart rate over 15m, scaled by 60 * 5 to restarts per 5 minutes.
+          expr: 'rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 5'
+          for: 30m
+          labels:
+            severity: warning
+    - name: kubernetes_jobs
+      rules:
+        - alert: KubernetesJobFailed
+          expr: 'kube_job_status_failed > 2'
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Job {{ "{{" }} $labels.namespace }}/{{ "{{" }} $labels.job_name }} failed to complete.'
+            description: 'Job {{ "{{" }} $labels.namespace }}/{{ "{{" }} $labels.job_name }} has {{ "{{" }} $value }} failed pods.'
+{{- end }}
diff --git a/prometheus-rules/templates/prometheusrules.nginxingress.yaml b/prometheus-rules/templates/prometheusrules.nginxingress.yaml
new file mode 100644
index 0000000..fb1c53b
--- /dev/null
+++ b/prometheus-rules/templates/prometheusrules.nginxingress.yaml
@@ -0,0 +1,22 @@
+{{- if .Values.nginx_ingress.enabled }}
+# NGINX ingress controller alerts; rendered only when nginx_ingress.enabled is true.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: nginx-ingress-rules
+spec:
+  groups:
+    - name: nginx_ingress
+      rules:
+        - alert: NGINXTooMany500s
+          # rate() turns the cumulative request counters into a recent
+          # per-second rate, so the ratio reflects the last 5 minutes rather
+          # than every request served since the counters were last reset.
+          expr: '100 * ( sum( rate(nginx_ingress_controller_requests{status=~"5.+"}[5m]) ) / sum( rate(nginx_ingress_controller_requests[5m]) ) ) > 5'
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            description: Too many 5XXs
+            summary: More than 5% of all requests returned 5XX, this requires your attention
+{{- end }}
diff --git a/prometheus-rules/templates/prometheusrules.solr.yaml b/prometheus-rules/templates/prometheusrules.solr.yaml
new file mode 100644
index 0000000..b2ad241
--- /dev/null
+++ b/prometheus-rules/templates/prometheusrules.solr.yaml
@@ -0,0 +1,20 @@
+{{- if .Values.solr.enabled }}
+# SolrCloud alerts; rendered only when solr.enabled is true.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: solr-cloud-rules
+spec:
+  groups:
+    - name: solr_cloud
+      rules:
+        - alert: SolrCloudLowLiveNodeCount
+          annotations:
+            summary: Solr low live node count
+            description: 'Solr collection {{ "{{" }} $labels.collection }} has less than two live nodes for replica {{ "{{" }} $labels.replica }} on {{ "{{" }} $labels.base_url }}.'
+          expr: 'solr_collections_live_nodes < 2'
+          for: 20m
+          labels:
+            severity: warning
+            instance: solr
+{{- end }}
diff --git a/prometheus-rules/templates/prometheusrules.velero.yaml b/prometheus-rules/templates/prometheusrules.velero.yaml
new file mode 100644
index 0000000..a749aef
--- /dev/null
+++ b/prometheus-rules/templates/prometheusrules.velero.yaml
@@ -0,0 +1,21 @@
+{{- if .Values.velero.enabled }}
+# Velero backup alerts; rendered only when velero.enabled is true.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: velero-rules
+spec:
+  groups:
+    - name: velero
+      rules:
+        - alert: VeleroBackupFailures
+          # NOTE(review): this divides two *_total series directly, so it is
+          # presumably a lifetime failure ratio per schedule - confirm intended.
+          expr: 'velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""} > 0.25'
+          for: 30m
+          labels:
+            severity: warning
+          annotations:
+            description: 'Velero backup {{ "{{" }} $labels.schedule }} has {{ "{{" }} $value | humanizePercentage }} failed backups.'
+            summary: Velero backup has failed backups.
+{{- end }}
diff --git a/prometheus-rules/values.yaml b/prometheus-rules/values.yaml
new file mode 100644
index 0000000..3de421a
--- /dev/null
+++ b/prometheus-rules/values.yaml
@@ -0,0 +1,10 @@
+# Each top-level key gates one PrometheusRule template under templates/.
+# Set enabled to true to render that rule set, false to skip it.
+nginx_ingress:
+  enabled: false
+velero:
+  enabled: true
+solr:
+  enabled: false
+kubernetes:
+  enabled: true