From 195f331f6c8033a7d7eeaee31828ec367d45abde Mon Sep 17 00:00:00 2001 From: Matej Feder Date: Fri, 14 Jun 2024 15:36:01 +0200 Subject: [PATCH] Update Loki Signed-off-by: Matej Feder --- .github/kind_cluster_config.yaml | 1 + .github/workflows/ci-scheduled.yaml | 123 ++--- .github/workflows/ci.yaml | 22 +- .github/workflows/release.yaml | 8 +- README.md | 56 +- chart/Chart.yaml | 8 +- chart/templates/NOTES.txt | 2 +- chart/templates/minio/create-bucket-job.yaml | 105 ++++ chart/values-loki-distributed.yaml | 6 + chart/values.yaml | 537 +++++++++++++------ helpers/README.md | 4 +- helpers/kind_cluster_config.yaml | 1 + 12 files changed, 599 insertions(+), 274 deletions(-) create mode 100644 chart/templates/minio/create-bucket-job.yaml diff --git a/.github/kind_cluster_config.yaml b/.github/kind_cluster_config.yaml index 4bcb748..e600bcd 100644 --- a/.github/kind_cluster_config.yaml +++ b/.github/kind_cluster_config.yaml @@ -18,4 +18,5 @@ apiVersion: kind.x-k8s.io/v1alpha4 nodes: - role: worker - role: worker +- role: worker - role: control-plane diff --git a/.github/workflows/ci-scheduled.yaml b/.github/workflows/ci-scheduled.yaml index c4afa24..6b9ab98 100644 --- a/.github/workflows/ci-scheduled.yaml +++ b/.github/workflows/ci-scheduled.yaml @@ -13,64 +13,65 @@ # limitations under the License. 
# -name: Test Chart with k8s versions - -on: - schedule: - # Run every 2 weeks - - cron: "0 0 1,15 * *" - -jobs: - test-chart: - name: test-chart - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - include: - - k8s-version: v1.19.7 - stack-version: v1.3 - - k8s-version: v1.20.2 - stack-version: v1.3 - - k8s-version: v1.20.7 - stack-version: v1.4 - - k8s-version: v1.21.2 - stack-version: v1.4 - - k8s-version: v1.22.1 - stack-version: v1.4 - - k8s-version: v1.23.13 - stack-version: v2.4 - - k8s-version: v1.25.9 - stack-version: v2.5 - - steps: - - name: Checkout - uses: actions/checkout@v2 - - - name: Fetch history - run: git fetch --prune --unshallow - - - name: Complete semVer with highest patch version - id: highest-patch - run: | - tag=$(git tag -l --sort=-v:refname | grep -F ${{ matrix.stack-version }}. | head -1) - [[ -z $tag ]] && (echo "Patch version for ${{ matrix.stack-version }} not found!" && exit 1) - echo "::set-output name=version::${tag//v}" - - - name: Download correct chart - run: | - echo ${{ steps.semver.outputs.version }} - wget https://dnationcloud.github.io/helm-hub/dnation-kubernetes-monitoring-stack-${{ steps.highest-patch.outputs.version }}.tgz - tar -xvf dnation-kubernetes-monitoring-stack-${{ steps.highest-patch.outputs.version }}.tgz - - - name: Create kind ${{ matrix.k8s-version }} cluster - uses: helm/kind-action@v1.2.0 - with: - node_image: kindest/node:${{ matrix.k8s-version }} - - - name: Set up chart-testing - uses: helm/chart-testing-action@v2.1.0 - - - name: Run chart-testing (install) - run: | - ct install --config .github/ct-scheduled.yaml +# TODO @mfeder: Enable and update below outdated pipeline +#name: Test Chart with k8s versions +# +#on: +# schedule: +# # Run every 2 weeks +# - cron: "0 0 1,15 * *" +# +#jobs: +# test-chart: +# name: test-chart +# runs-on: ubuntu-20.04 +# strategy: +# fail-fast: false +# matrix: +# include: +# - k8s-version: v1.19.7 +# stack-version: v1.3 +# - k8s-version: v1.20.2 +# 
stack-version: v1.3 +# - k8s-version: v1.20.7 +# stack-version: v1.4 +# - k8s-version: v1.21.2 +# stack-version: v1.4 +# - k8s-version: v1.22.1 +# stack-version: v1.4 +# - k8s-version: v1.23.13 +# stack-version: v2.4 +# - k8s-version: v1.25.9 +# stack-version: v2.5 +# +# steps: +# - name: Checkout +# uses: actions/checkout@v2 +# +# - name: Fetch history +# run: git fetch --prune --unshallow +# +# - name: Complete semVer with highest patch version +# id: highest-patch +# run: | +# tag=$(git tag -l --sort=-v:refname | grep -F ${{ matrix.stack-version }}. | head -1) +# [[ -z $tag ]] && (echo "Patch version for ${{ matrix.stack-version }} not found!" && exit 1) +# echo "::set-output name=version::${tag//v}" +# +# - name: Download correct chart +# run: | +# echo ${{ steps.semver.outputs.version }} +# wget https://dnationcloud.github.io/helm-hub/dnation-kubernetes-monitoring-stack-${{ steps.highest-patch.outputs.version }}.tgz +# tar -xvf dnation-kubernetes-monitoring-stack-${{ steps.highest-patch.outputs.version }}.tgz +# +# - name: Create kind ${{ matrix.k8s-version }} cluster +# uses: helm/kind-action@v1.2.0 +# with: +# node_image: kindest/node:${{ matrix.k8s-version }} +# +# - name: Set up chart-testing +# uses: helm/chart-testing-action@v2.1.0 +# +# - name: Run chart-testing (install) +# run: | +# ct install --config .github/ct-scheduled.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0541c73..52081f0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -23,16 +23,16 @@ on: jobs: lint-chart: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Fetch history run: git fetch --prune --unshallow - name: Set up chart-testing - uses: helm/chart-testing-action@v2.3.0 + uses: helm/chart-testing-action@v2.6.1 - name: Run chart-testing (install) run: | @@ -40,31 +40,31 @@ jobs: install-chart: name: install-chart - runs-on: ubuntu-20.04 + 
runs-on: ubuntu-22.04 needs: - lint-chart strategy: matrix: k8s: - - v1.23.13 - - v1.24.13 - - v1.25.9 + - v1.28.9 + - v1.29.4 + - v1.30.0 steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Fetch history run: git fetch --prune --unshallow - name: Create kind ${{ matrix.k8s }} cluster - uses: helm/kind-action@v1.7.0 + uses: helm/kind-action@v1.10.0 with: node_image: kindest/node:${{ matrix.k8s }} config: .github/kind_cluster_config.yaml - name: Set up chart-testing - uses: helm/chart-testing-action@v2.3.0 + uses: helm/chart-testing-action@v2.6.1 - name: Run chart-testing (install) run: | - ct install --config .github/ct.yaml + ct install --helm-extra-set-args "--set loki.ci=true" --config .github/ct.yaml diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index e6fc4e2..d6d967d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -25,10 +25,10 @@ on: jobs: release: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Fetch history run: git fetch --prune --unshallow @@ -45,7 +45,7 @@ jobs: git config --global user.email "$GITHUB_ACTOR@users.noreply.github.com" - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: 3.6 @@ -60,7 +60,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.CICD_TOKEN }} - name: Tag branch - uses: anothrNick/github-tag-action@1.39.0 + uses: anothrNick/github-tag-action@1.67.0 env: GITHUB_TOKEN: ${{ secrets.CICD_TOKEN }} CUSTOM_TAG: 'v${{ steps.version.outputs.chart }}' diff --git a/README.md b/README.md index 139ac00..2f40e5f 100644 --- a/README.md +++ b/README.md @@ -11,52 +11,47 @@ An umbrella helm chart for [dNation Kubernetes Monitoring](https://github.com/dN * [dnation-kubernetes-monitoring](https://github.com/dNationCloud/kubernetes-monitoring) * 
[kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) -* [thanos](https://github.com/bitnami/charts/tree/master/bitnami/thanos) (to support multicluster monitoring) -* [loki](https://github.com/grafana/helm-charts/tree/main/charts/loki) -* [loki-distributed](https://github.com/grafana/helm-charts/tree/main/charts/loki-distributed) +* [thanos](https://github.com/bitnami/charts/tree/master/bitnami/thanos) +* [loki](https://github.com/grafana/loki/tree/main/production/helm/loki) * [promtail](https://github.com/grafana/helm-charts/tree/main/charts/promtail) +* [ssl-exporter](https://github.com/dNationCloud/ssl-exporter) # optional +* [prometheus-blackbox-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-blackbox-exporter) # optional +* [loki-distributed](https://github.com/grafana/helm-charts/tree/main/charts/loki-distributed) # deprecated, see [loki](#loki) + # Installation Prerequisites * [Helm3](https://helm.sh/) -* For production environment we recommend (based on our experience) a kubernetes cluster with at least 2 worker nodes and 4 GiB RAM per node or more. +* For production environment we recommend (based on our experience) a kubernetes cluster with at least 3 worker nodes and 4 GiB RAM per node or more. +dNation Kubernetes Monitoring Stack umbrella chart is hosted in the [dNation helm repository](https://artifacthub.io/packages/search?repo=dnationcloud). +By default, dNation Kubernetes Monitoring Stack installs Grafana with dNation dashboards, Prometheus with Thanos and Loki in simple scalable mode. -dNation Kubernetes Monitoring Stack umbrella chart is hosted in the [dNation helm repository](https://artifacthub.io/packages/search?repo=dnationcloud). By default, dNation Kubernetes Monitoring Stack installs Prometheus with Thanos sidecar and Thanos Query. 
-Thanos components are tuned according to [SCS Thanos tuning](https://github.com/SovereignCloudStack/k8s-observability/blob/main/docs/thanos-tuning.md). -For more details check [Multicluster monitoring support](#multicluster-monitoring-support) section. ```bash # Add dNation helm repository helm repo add dnationcloud https://dnationcloud.github.io/helm-hub/ helm repo update -# Install dNation Kubernetes Monitoring Stack (Loki in monolithic mode) +# Install dNation Kubernetes Monitoring Stack helm install monitoring dnationcloud/dnation-kubernetes-monitoring-stack - -# Install dNation Kubernetes Monitoring Stack (Loki in distributed mode with s3-compatible storage) -helm install monitoring dnationcloud/dnation-kubernetes-monitoring-stack \ - -f https://raw.githubusercontent.com/dNationCloud/kubernetes-monitoring-stack/main/chart/values-loki-distributed.yaml \ - --set loki-distributed.loki.storageConfig.aws.s3="" ``` -Search for `Monitoring` dashboard in the `dNation` directory. The fun starts here :). -If you want to set the `Monitoring` dashboard as a home dashboard follow [here](https://grafana.com/docs/grafana/latest/administration/change-home-dashboard/#set-the-default-dashboard-through-preferences). +Installation notes: + - Thanos components are tuned according to [SCS Thanos tuning](https://github.com/SovereignCloudStack/k8s-observability/blob/main/docs/thanos-tuning.md) + - If you're experiencing issues please read the [documentation](https://dnationcloud.github.io/kubernetes-monitoring/docs/documentation) and [FAQ](https://dnationcloud.github.io/kubernetes-monitoring/helpers/FAQ/) -For `multi-cluster centralized logging` install monitoring on your workload cluster without Loki, set `loki.enabled: false` in [values.yaml](chart/values.yaml) and also configure `promtail.config.lokiAddress` to send logs to your Loki instance. On your central cluster install it in classic way with `loki.enable: true`. 
+Search for `Infrastructure services monitoring` dashboard in the `dNation` directory. The fun starts here :). +If you want to set the `Infrastructure services monitoring` dashboard as a home dashboard follow [here](https://grafana.com/docs/grafana/latest/administration/change-home-dashboard/#set-the-default-dashboard-through-preferences). -If you're experiencing issues please read the [documentation](https://dnationcloud.github.io/kubernetes-monitoring/docs/documentation) and [FAQ](https://dnationcloud.github.io/kubernetes-monitoring/helpers/FAQ/). +# Loki +## loki-distributed -# Kubernetes support (tested) +This chart is deprecated and replaced by [loki](https://github.com/grafana/loki/tree/main/production/helm/loki) helm chart. +Loki helm chart is the only helm chart you should use for loki helm deployment. It supports loki deployment in monolithic, scalable +and even [distributed mode](https://grafana.com/docs/loki/next/setup/install/helm/install-microservices/). -||dNation monitoring v1.3|dNation monitoring v1.4|dNation monitoring v2.0|dNation monitoring v2.3|dNation monitoring v2.5| -|-|-|-|-|-|-| -|Kubernetes v1.19|✓||||| -|Kubernetes v1.20|✓||||| -|Kubernetes v1.21||✓|✓||| -|Kubernetes v1.22||✓|✓|✓|| -|Kubernetes v1.23||||✓|✓| -|Kubernetes v1.24|||||✓| -|Kubernetes v1.25|||||✓| +We recommend use the loki helm chart for all fresh installations. If you already use loki-distributed helm chart, check +the migration [guide](https://grafana.com/docs/loki/latest/setup/migrate/migrate-from-distributed/). # Multicluster monitoring support This chart supports also setup of multicluster monitoring using Thanos. The deployment architecture follows "observer cluster/workload clusters" pattern, where there is one observer k8s cluster which provides centralized monitoring overview of multiple workload k8s clusters. Helm values files enabling the multicluster monitoring are located inside `multicluster-config/` directory. 
There are 2 files in total: @@ -64,6 +59,8 @@ This chart supports also setup of multicluster monitoring using Thanos. The depl - `multicluster-config/observer-values.yaml` - contains config for installation of observer cluster - `multicluster-config/workload-values.yaml` - contains config for installation of workload cluster(s) +For `multi-cluster centralized logging` install monitoring on your workload cluster without Loki, set `loki.enabled: false` in [values.yaml](chart/values.yaml) and also configure `promtail.config.lokiAddress` to send logs to your Loki instance. On your central cluster install it in classic way with `loki.enable: true`. + ## Architecture As mentioned earlier, we are using "observer cluster/workload clusters" pattern to implement multicluster monitoring. The full architecture can be seen on following diagram: @@ -220,11 +217,6 @@ dnationcloud/dnation-kubernetes-monitoring-stack \ -f ``` # Openshift support -## Tested versions - -||dNation monitoring v1.3|dNation monitoring v1.4|dNation monitoring v2.0|dNation monitoring v2.3| -|-|-|-|-|-| -|Openshift v4.7||||✓| ## Installation To install the chart on an openshift cluster, use additional [values for openshift](/chart/values-openshift.yaml) ```shell diff --git a/chart/Chart.yaml b/chart/Chart.yaml index e396e08..a84990f 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -15,9 +15,9 @@ apiVersion: v2 name: dnation-kubernetes-monitoring-stack -version: 3.5.0 +version: 3.6.0 appVersion: 2.7.0 # dnation-kubernetes-monitoring -description: An umbrella helm chart for Kubernetes monitoring based on kube-prometheus-stack, thanos, loki, loki-distributed, promtail and dnation-kubernetes-monitoring. +description: An umbrella helm chart for Kubernetes monitoring based on kube-prometheus-stack, thanos, loki, promtail and dnation-kubernetes-monitoring. 
keywords: - dnation - kubernetes @@ -42,7 +42,7 @@ dependencies: repository: https://charts.bitnami.com/bitnami condition: thanos.enabled - name: loki - version: "5.5.*" + version: "6.6.*" repository: https://grafana.github.io/helm-charts condition: loki.enabled - name: loki-distributed @@ -50,7 +50,7 @@ dependencies: repository: https://grafana.github.io/helm-charts condition: loki-distributed.enabled - name: promtail - version: "6.0.*" + version: "6.16.*" repository: https://grafana.github.io/helm-charts condition: promtail.enabled - name: dnation-kubernetes-monitoring diff --git a/chart/templates/NOTES.txt b/chart/templates/NOTES.txt index 0653756..56465d1 100644 --- a/chart/templates/NOTES.txt +++ b/chart/templates/NOTES.txt @@ -28,4 +28,4 @@ If you're experiencing issues please read the project documentation and FAQ. 4. Login with the password from step 2 and the username: '{{ index .Values "kube-prometheus-stack" "grafana" "adminUser" }}' -5. Search for `Monitoring` dashboard in the `dNation` directory. The fun starts here :) +5. Search for `Infrastructure services monitoring` dashboard in the `dNation` directory. The fun starts here :) diff --git a/chart/templates/minio/create-bucket-job.yaml b/chart/templates/minio/create-bucket-job.yaml new file mode 100644 index 0000000..12cef48 --- /dev/null +++ b/chart/templates/minio/create-bucket-job.yaml @@ -0,0 +1,105 @@ +# For CI testing only! +# +# Minio provides post-install hook to create bucket +# however the hook won't be executed if helm install is run +# with --wait flag. +# This breaks GitHub CI as chart-testing tool applies --wait flag, see: +# https://github.com/helm/chart-testing/issues/202 +# Hence, this job is a workaround for that, and +# it is applied only when `loki.ci=true` variable is passed to the chart. 
+# Related issues: +# - https://github.com/grafana/mimir/issues/2464 +# - https://github.com/grafana/loki/issues/9634 +# The job definition is taken from: +# - https://github.com/minio/minio/blob/master/helm/minio/templates/post-job.yaml +{{- if .Values.loki.enabled }} +{{- if (eq .Values.loki.ci true) }} +{{- if .Values.loki.minio.enabled }} +{{- if .Values.loki.minio.buckets }} +apiVersion: batch/v1 +kind: Job +metadata: + name: minio-post-job + namespace: {{ .Release.Namespace | quote }} + labels: + app: {{ template "minio.name" . }}-make-bucket-job + chart: {{ template "minio.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +{{- with .Values.loki.minio.makeBucketJob.annotations }} + annotations: +{{ toYaml . | indent 4 }} +{{- end }} +spec: + template: + metadata: + labels: + app: {{ template "minio.name" . }}-job + release: {{ .Release.Name }} +{{- if .Values.loki.minio.podLabels }} +{{ toYaml .Values.loki.minio.podLabels | indent 8 }} +{{- end }} +{{- if .Values.loki.minio.makeBucketJob.podAnnotations }} + annotations: +{{ toYaml .Values.loki.minio.makeBucketJob.podAnnotations | indent 8 }} +{{- end }} + spec: + restartPolicy: OnFailure +{{- include "minio.imagePullSecrets" . | indent 6 }} +{{- if .Values.loki.minio.nodeSelector }} + nodeSelector: +{{ toYaml .Values.loki.minio.makeBucketJob.nodeSelector | indent 8 }} +{{- end }} +{{- with .Values.loki.minio.makeBucketJob.affinity }} + affinity: +{{ toYaml . | indent 8 }} +{{- end }} +{{- with .Values.loki.minio.makeBucketJob.tolerations }} + tolerations: +{{ toYaml . 
| indent 8 }} +{{- end }} +{{- if .Values.loki.minio.makeBucketJob.securityContext.enabled }} + securityContext: + runAsUser: {{ .Values.loki.minio.makeBucketJob.securityContext.runAsUser }} + runAsGroup: {{ .Values.loki.minio.makeBucketJob.securityContext.runAsGroup }} + fsGroup: {{ .Values.loki.minio.makeBucketJob.securityContext.fsGroup }} +{{- end }} + volumes: + - name: minio-configuration + projected: + sources: + - configMap: + name: {{ .Release.Name }}-minio + - secret: + name: {{ .Release.Name }}-minio + {{- if .Values.loki.minio.tls.enabled }} + - name: cert-secret-volume-mc + secret: + secretName: {{ .Values.loki.minio.tls.certSecret }} + items: + - key: {{ .Values.loki.minio.tls.publicCrt }} + path: CAs/public.crt + {{ end }} + containers: + - name: minio-mc + image: "{{ .Values.loki.minio.mcImage.repository }}:{{ .Values.loki.minio.mcImage.tag }}" + imagePullPolicy: {{ .Values.loki.minio.mcImage.pullPolicy }} + command: ["/bin/sh", "/config/initialize"] + env: + - name: MINIO_ENDPOINT + value: {{ .Release.Name }}-minio + - name: MINIO_PORT + value: {{ .Values.loki.minio.service.port | quote }} + volumeMounts: + - name: minio-configuration + mountPath: /config + {{- if .Values.loki.minio.tls.enabled }} + - name: cert-secret-volume-mc + mountPath: {{ .Values.loki.minio.configPathmc }}certs + {{ end }} + resources: +{{ toYaml .Values.loki.minio.makeBucketJob.resources | indent 10 }} +{{- end }} +{{- end }} +{{- end }} +{{- end }} diff --git a/chart/values-loki-distributed.yaml b/chart/values-loki-distributed.yaml index feb2b3a..e907f93 100644 --- a/chart/values-loki-distributed.yaml +++ b/chart/values-loki-distributed.yaml @@ -18,6 +18,12 @@ # with loki-distributed. # Declare variables to be passed into helm chart dependencies. 
+############################################## WARNING ############################################################### +## +## DEPRECATED VALUES +## +## The following values are deprecated and will be removed in a future version of the helm chart! +############################################## WARNING ############################################################## ## Configure additional grafana datasources grafanaDatasourcesAsConfigMap: cluster-logs: diff --git a/chart/values.yaml b/chart/values.yaml index 8f1a2b7..b9775b4 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -28,107 +28,12 @@ namespaceOverride: "" ## fullnameOverride: "" -ssl-exporter: - enabled: false - serviceMonitor: - # We can enable the service monitor, because we have prometheus in our monitoring stack - enabled: true - - # # SSL Exporter example config - # # Configure external URLs to scrape - # externalTargets: - # - example.com:443 - # # Configure kubernetes secrets to scrape - # secretTargets: - # # e.g. 
all secrets across all namespaces - # - "*/*" - # # Certificate files on control plane nodes - # fileTargets: - # # Included in default values of ssl-exporter helm chart - # - "/etc/kubernetes/pki/**/*.crt" - # # Certificates within kubeconfig files - # kubeconfigTargets: - # # Included in default values of ssl-exporter helm chart - # - /etc/kubernetes/admin.conf - -## Deploy a prometheus-blackbox-exporter -## ref: https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-blackbox-exporter +###################################################################################################################### ## -prometheus-blackbox-exporter: - enabled: false - releaseLabel: true - prometheusRule: - enabled: true - additionalLabels: - prometheus_rule: '1' - rules: - - alert: BlackboxProbeFailed - expr: 'probe_success == 0' - for: 5m - labels: - severity: critical - annotations: - message: 'Blackbox probe on target: {{ $labels.target }} failed' - - alert: BlackboxSlowProbe - expr: 'avg_over_time(probe_duration_seconds[1m]) > 5' - for: 5m - labels: - severity: warning - annotations: - message: 'Blackbox probe on target: {{ $labels.target }} took more than 5s to complete, probe time = {{ $value }}' - - alert: BlackboxSslCertificateWillExpireSoon - expr: 'round((probe_ssl_earliest_cert_expiry - time()) / 86400, 0.1) < 30' - for: 5m - labels: - severity: warning - annotations: - message: 'SSL certificate expires in {{ $value }} days' - serviceMonitor: - enabled: true -# targets: -# - name: dnation-cloud -# url: https://dnation.cloud/ - -thanos: - enabled: true - queryFrontend: - enabled: false - extraFlags: - - --query-range.split-interval=12h - - --query-frontend.log-queries-longer-than=10s - - --query-frontend.compress-responses - - |- - --query-range.response-cache-config="config": - "max_size": "500MB" - "max_size_items": 0 - "validity": 0s - "type": "in-memory" - query: - extraFlags: - - --query.auto-downsampling - dnsDiscovery: - 
sidecarsService: kube-prometheus-thanos-discovery - sidecarsNamespace: "{{ .Release.Namespace }}" - bucketweb: - enabled: false - compactor: - enabled: false - retentionResolutionRaw: 2d - retentionResolution5m: 10d - retentionResolution1h: 15d - extraFlags: - - --compact.concurrency=3 - - --downsample.concurrency=3 - storegateway: - enabled: false - ruler: - enabled: false - -thanosQueryEnvoySidecar: - enabled: false - ## Configure additional grafana datasources that will be provisioned as ConfigMaps +## ## ref: http://docs.grafana.org/administration/provisioning/#datasources +###################################################################################################################### grafanaDatasourcesAsConfigMap: cluster-metrics: - name: thanos @@ -140,15 +45,18 @@ grafanaDatasourcesAsConfigMap: - name: cluster-logs isDefault: false type: loki - url: http://loki-gateway + url: http://{{ .Release.Name }}-loki-gateway jsonData: httpHeaderName1: 'X-Scope-OrgID' secureJsonData: httpHeaderValue1: '1' -## Deploy a dnation-kubernetes-monitoring -## ref: https://github.com/dNationCloud/kubernetes-monitoring +###################################################################################################################### +## +## dNation-kubernetes-monitoring sub-chart configuration ## +## ref: https://github.com/dNationCloud/kubernetes-monitoring +###################################################################################################################### dnation-kubernetes-monitoring: enabled: true @@ -169,63 +77,12 @@ dnation-kubernetes-monitoring: labelPrometheus: prometheus_rule: '1' -## Deploy a loki -## ref: https://github.com/grafana/helm-charts/tree/main/charts/loki -## -loki: - enabled: true - auth_enabled: false - backend: - replicas: 2 - write: - # -- Number of replicas for the write - replicas: 2 - read: - # -- Number of replicas for the read - replicas: 2 - test: - enabled: false - -## Deploy a loki-distributed -## with s3-compatible 
object storage -## ref: https://github.com/grafana/helm-charts/tree/main/charts/loki-distributed +###################################################################################################################### ## -loki-distributed: - enabled: false - loki: - schemaConfig: - configs: - - from: 2020-09-07 - store: boltdb-shipper - object_store: aws - schema: v11 - index: - prefix: loki_index_ - period: 24h - storageConfig: - boltdb_shipper: - shared_store: aws - aws: - s3: "s3://access_key:secret_access_key@custom_endpoint/bucket_name" - s3forcepathstyle: true # set to 'false' to enable virtual-hosted-style URLs - -# Deploy a promtail -## ref: https://github.com/grafana/helm-charts/tree/main/charts/promtail +## Kube-prometheus-stack sub-chart configuration ## -promtail: - enabled: true - config: - clients: - - url: http://loki-gateway/loki/api/v1/push - tenant_id: 1 - # !IMPORTANT: each cluster must have promtail external label 'cluster' with unique value. - # This will be used as cluster discriminator when logs are aggregated on observer cluster. - extraArgs: - - -client.external-labels=cluster=observer-cluster - -## Deploy a kube-prometheus-stack ## ref: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack -## +###################################################################################################################### kube-prometheus-stack: enabled: true @@ -257,8 +114,37 @@ kube-prometheus-stack: ## prometheus: prometheusSpec: - # !IMPORTANT: each cluster must have prometheus external label 'cluster' with unique value. - # This will be used as cluster discriminator when metrics are aggregated on observer cluster. + serviceMonitorSelector: + ## Match all dNation monitoring ServiceMonitors using the default match label setting `release=.Release.Name` OR + ## match all ServiceMonitors with the static label `release=kubernetes-monitoring-servicemonitor`. 
+ ## The static label can be used for third-party ServiceMonitors deployed outside of this chart. + ## If multiple instances of this monitoring are deployed within one Kubernetes environment, + ## we recommend using different static labels for each instance to ensure separation of discovered ServiceMonitors. + ## + matchExpressions: + - key: release + operator: In + values: + - "{{ .Release.Name }}" + - kubernetes-monitoring-servicemonitor + + podMonitorSelector: + ## Match all dNation monitoring PodMonitors using the default match label setting `release=.Release.Name` OR + ## match all PodMonitors with the static label `release=kubernetes-monitoring-podmonitor`. + ## The static label can be used for third-party PodMonitors deployed outside of this chart. + ## If multiple instances of this monitoring are deployed within one Kubernetes environment, + ## we recommend using different static labels for each instance to ensure separation of discovered PodMonitors. + ## + matchExpressions: + - key: release + operator: In + values: + - "{{ .Release.Name }}" + - kubernetes-monitoring-podmonitor + + ## !IMPORTANT: each cluster must have prometheus external label 'cluster' with unique value. + ## This will be used as cluster discriminator when metrics are aggregated on observer cluster. 
+ ## externalLabels: cluster: observer-cluster @@ -315,7 +201,7 @@ kube-prometheus-stack: port: 2381 targetPort: 2381 - ## Configuration for prometheus-node-exporter subchart + ## Configuration for prometheus-node-exporter sub-chart ## prometheus-node-exporter: ## Extra args extended with `rootfs`file system @@ -402,7 +288,296 @@ kube-prometheus-stack: severity: 'warning' equal: ['alertname'] -# Values for Openshift clusters +###################################################################################################################### +## +## Thanos sub-chart configuration +## +## ref: https://github.com/bitnami/charts/tree/main/bitnami/thanos +###################################################################################################################### +thanos: + enabled: true + queryFrontend: + enabled: false + extraFlags: + - --query-range.split-interval=12h + - --query-frontend.log-queries-longer-than=10s + - --query-frontend.compress-responses + - |- + --query-range.response-cache-config="config": + "max_size": "500MB" + "max_size_items": 0 + "validity": 0s + "type": "in-memory" + query: + extraFlags: + - --query.auto-downsampling + dnsDiscovery: + sidecarsService: kube-prometheus-thanos-discovery + sidecarsNamespace: "{{ .Release.Namespace }}" + bucketweb: + enabled: false + compactor: + enabled: false + retentionResolutionRaw: 2d + retentionResolution5m: 10d + retentionResolution1h: 15d + extraFlags: + - --compact.concurrency=3 + - --downsample.concurrency=3 + storegateway: + enabled: false + ruler: + enabled: false + +###################################################################################################################### +## +## Thanos query sidecar configuration +## +###################################################################################################################### +thanosQueryEnvoySidecar: + enabled: false + 
+###################################################################################################################### +## +## Loki sub-chart configuration +## +## Loki defaults to Simple Scalable Deployment (SSD) Mode. +## Fits well for small to medium size Loki deployments up to around few TB of logs per day. +## ref: https://github.com/grafana/loki/tree/main/production/helm/loki +## Config developed based on Loki [best-practices](https://grafana.com/docs/loki/latest/configure/bp-configure/) and +## dNation productive setup of Loki. +###################################################################################################################### +loki: + enabled: true + ## The SimpleScalable deployment mode can scale up to a few TBs of logs per day. + ## If you go much beyond this, the Distributed (microservices) mode will be a better choice. + ## + deploymentMode: SimpleScalable + auth_enabled: false + loki: + resultsCache: + enabled: true # Memcached based results-cache is enabled + chunksCache: + enabled: true # Memcached based chunks-cache is enabled + schemaConfig: + configs: + - from: 2024-06-01 + store: tsdb + object_store: s3 + schema: v13 + index: + prefix: loki_index_ + period: 24h + limits_config: + ## !Adjust for productive usage or remove if you want infinite retention! + ## Validate also `compactor` settings + ## + retention_period: 48h # 2days + + commonConfig: + ## Use the memberlist key-value store instead of consul, read the related docs below for details: + ## https://grafana.com/blog/2022/09/28/inside-the-migration-from-consul-to-memberlist-at-grafana-labs/ + ## https://grafana.com/docs/loki/latest/get-started/hash-rings/ + ## + ring: + kvstore: + store: memberlist + + ingester: + ## Use snappy compression instead of default gzip. + ## It is focused on high speeds with ‘good enough’ compression ratio. + ## + chunk_encoding: snappy + ## This section configures the ingester for deployments with infrequent log shipments to Loki. 
+      ## For medium to high-frequency log shipments, you can remove this section to use the default,
+      ## more "real-world" ingester settings.
+      ##
+      ## How long chunks should sit in-memory with no updates before being flushed if
+      ## they don't hit the max block size.
+      ##
+      chunk_idle_period: 6h
+      ## The maximum duration of a timeseries chunk in memory. If a timeseries runs for
+      ## longer than this, the current chunk will be flushed to the store and a new
+      ## chunk created.
+      ##
+      max_chunk_age: 12h
+      ## The targeted _uncompressed_ size in bytes of a chunk block. When this threshold
+      ## is exceeded, the head block will be cut and compressed inside the chunk.
+      ##
+      chunk_block_size: 262144 # ~0.26MB
+      ## A target _compressed_ size in bytes for chunks. This is a desired size not an
+      ## exact size.
+      ##
+      chunk_target_size: 1572864 # ~1.5MB
+      ## How long chunks should be retained in-memory after they've been flushed.
+      ##
+      chunk_retain_period: 1m
+
+    querier:
+      max_concurrent: 4 # Default is 4, if you have enough memory and CPU you can increase, reduce if OOMing
+    compactor:
+      retention_enabled: true
+      delete_request_store: s3
+      delete_request_cancel_period: 10m # Don't wait 24h before processing the delete_request
+  gateway:
+    service:
+      labels:
+        ## TODO: Remove label once the issue below is resolved.
+        ## Exclude scraping loki-gateway's non-existent /metrics endpoint,
+        ## see https://github.com/grafana/loki/issues/13201
+        ##
+        prometheus.io/service-monitor: "false"
+  backend:
+    replicas: 3
+  read:
+    replicas: 3
+    extraArgs:
+      - -config.expand-env=true
+  write:
+    replicas: 3
+    extraArgs:
+      - -config.expand-env=true
+  ## Enable minio for loki storage.
+  ## This spawns a simple minio instance without any production-ready configuration.
+  ## For productive usage go through the minio configuration or set up external storage for loki.
+  ##
+  minio:
+    enabled: true
+  ## Monitoring section has been deprecated by the loki project.
+  ## ref: https://github.com/grafana/loki/blob/main/production/helm/loki/values.yaml#L3209
+  ## As a replacement Loki suggests https://github.com/grafana/meta-monitoring-chart helm chart, but this chart cannot
+  ## currently be used as a monitoring-only deployment chart, see https://github.com/grafana/meta-monitoring-chart/issues/130.
+  ## Hence, we should stick with these deprecated values for now.
+  monitoring:
+    dashboards:
+      enabled: true
+      labels:
+        grafana_dashboard: "1"
+    rules:
+      enabled: true
+      labels:
+        prometheus_rule: "1"
+    serviceMonitor:
+      enabled: true
+      labels:
+        release: kubernetes-monitoring-servicemonitor
+  ## Safety values to zero out replica counts of other Loki deployment modes
+  ##
+  singleBinary:
+    replicas: 0
+  ingester:
+    replicas: 0
+  querier:
+    replicas: 0
+  queryFrontend:
+    replicas: 0
+  queryScheduler:
+    replicas: 0
+  distributor:
+    replicas: 0
+  compactor:
+    replicas: 0
+  indexGateway:
+    replicas: 0
+  bloomCompactor:
+    replicas: 0
+  bloomGateway:
+    replicas: 0
+######################################################################################################################
+##
+## Promtail sub-chart configuration
+##
+## ref: https://github.com/grafana/helm-charts/tree/main/charts/promtail
+## Note: if you observe a failing promtail container due to a "too many open files" error, check
+## the following issue https://github.com/grafana/loki/issues/1153 and increase the number of inotify
+## resources, e.g. see https://www.suse.com/support/kb/doc/?id=000020048
+######################################################################################################################
+promtail:
+  enabled: true
+  config:
+    clients:
+      - url: http://{{ .Release.Name }}-loki-gateway/loki/api/v1/push
+        tenant_id: 1
+  # !IMPORTANT: each cluster must have promtail external label 'cluster' with a unique value.
+  # This will be used as a cluster discriminator when logs are aggregated on the observer cluster.
+ extraArgs: + - -client.external-labels=cluster=observer-cluster + +###################################################################################################################### +## +## SSL-exporter sub-chart configuration +## +## ref: https://github.com/dNationCloud/ssl-exporter +###################################################################################################################### +ssl-exporter: + enabled: false + serviceMonitor: + # We can enable the service monitor, because we have prometheus in our monitoring stack + enabled: true + + # # SSL Exporter example config + # # Configure external URLs to scrape + # externalTargets: + # - example.com:443 + # # Configure kubernetes secrets to scrape + # secretTargets: + # # e.g. all secrets across all namespaces + # - "*/*" + # # Certificate files on control plane nodes + # fileTargets: + # # Included in default values of ssl-exporter helm chart + # - "/etc/kubernetes/pki/**/*.crt" + # # Certificates within kubeconfig files + # kubeconfigTargets: + # # Included in default values of ssl-exporter helm chart + # - /etc/kubernetes/admin.conf + +###################################################################################################################### +## +## Prometheus-blackbox-exporter sub-chart configuration +## +## ref: https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-blackbox-exporter +###################################################################################################################### +prometheus-blackbox-exporter: + enabled: false + releaseLabel: true + prometheusRule: + enabled: true + additionalLabels: + prometheus_rule: '1' + rules: + - alert: BlackboxProbeFailed + expr: 'probe_success == 0' + for: 5m + labels: + severity: critical + annotations: + message: 'Blackbox probe on target: {{ $labels.target }} failed' + - alert: BlackboxSlowProbe + expr: 'avg_over_time(probe_duration_seconds[1m]) > 5' + for: 5m + labels: + severity: 
warning + annotations: + message: 'Blackbox probe on target: {{ $labels.target }} took more than 5s to complete, probe time = {{ $value }}' + - alert: BlackboxSslCertificateWillExpireSoon + expr: 'round((probe_ssl_earliest_cert_expiry - time()) / 86400, 0.1) < 30' + for: 5m + labels: + severity: warning + annotations: + message: 'SSL certificate expires in {{ $value }} days' + serviceMonitor: + enabled: true +# targets: +# - name: dnation-cloud +# url: https://dnation.cloud/ + +###################################################################################################################### +## +## Openshift configuration +## +###################################################################################################################### openshift: # Set to true if you use openshift cluster enabled: false @@ -410,6 +585,12 @@ openshift: existingSecurityContextConstraints: null # Service accounts that kubernetes monitoring stack uses serviceAccounts: [] + +###################################################################################################################### +## +## Extras +## +###################################################################################################################### extraServices: [] # - name: addservice # namespace: targetnamespace @@ -443,3 +624,41 @@ extraConfigmaps: [] # { # ... # } + +############################################## WARNING ############################################################### +## +## DEPRECATED VALUES +## +## The following values are deprecated and will be removed in a future version of the helm chart! 
+############################################## WARNING ############################################################## + +###################################################################################################################### +## +## Loki-distributed sub-chart configuration +## +## ref: https://github.com/grafana/helm-charts/tree/main/charts/loki-distributed +## This chart is deprecated and replaced by [loki](https://github.com/grafana/loki/tree/main/production/helm/loki) helm chart. +## Loki helm chart is the only helm chart you should use for loki helm deployment. It supports loki deployment in monolithic, scalable +## and even [distributed mode](https://grafana.com/docs/loki/next/setup/install/helm/install-microservices/). +## +## We recommend use the loki helm chart for all fresh installations. If you already use loki-distributed helm chart, check +## the migration [guide](https://grafana.com/docs/loki/latest/setup/migrate/migrate-from-distributed/). +###################################################################################################################### +loki-distributed: + enabled: false + loki: + schemaConfig: + configs: + - from: 2020-09-07 + store: boltdb-shipper + object_store: aws + schema: v11 + index: + prefix: loki_index_ + period: 24h + storageConfig: + boltdb_shipper: + shared_store: aws + aws: + s3: "s3://access_key:secret_access_key@custom_endpoint/bucket_name" + s3forcepathstyle: true # set to 'false' to enable virtual-hosted-style URLs diff --git a/helpers/README.md b/helpers/README.md index e7725a2..9a63a03 100644 --- a/helpers/README.md +++ b/helpers/README.md @@ -13,12 +13,12 @@ Prerequisites Create KinD cluster ```bash -kind create cluster --config helpers/kind_cluster_config.yaml --image kindest/node:v1.25.11 +kind create cluster --config helpers/kind_cluster_config.yaml --image kindest/node:v1.30.0 ``` Install Kubernetes Monitoring Stack ```bash -helm install monitoring chart --dependency-update -f 
helpers/values-kind.yaml +helm upgrade --install monitoring chart --dependency-update -f helpers/values-kind.yaml ``` Follow installation notes and use Port Forwarding if you want to access the Grafana server from outside your KinD cluster diff --git a/helpers/kind_cluster_config.yaml b/helpers/kind_cluster_config.yaml index 4807fc3..8855013 100644 --- a/helpers/kind_cluster_config.yaml +++ b/helpers/kind_cluster_config.yaml @@ -18,6 +18,7 @@ apiVersion: kind.x-k8s.io/v1alpha4 nodes: - role: worker - role: worker +- role: worker - role: control-plane kubeadmConfigPatches: - |