Skip to content

Commit

Permalink
Add Kube_Prometheus_Stack to components
Browse files Browse the repository at this point in the history
  • Loading branch information
angelofenoglio committed Nov 11, 2023
1 parent 9eee7fd commit 5a5cdc9
Show file tree
Hide file tree
Showing 5 changed files with 234 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# ---------------------------------------------------------------------------------------------
# Default rules and control-plane monitors that are switched off.
#   - KubeControllerManagerDown / KubeSchedulerDown: not necessary on EKS, so the matching
#     rule groups and the kubeControllerManager / kubeScheduler components are disabled.
#   - Watchdog: disabled until we decide what to do with it.
#   - InfoInhibitor: disabled as well.
# ---------------------------------------------------------------------------------------------
defaultRules:
  # Whole rule groups that are not rendered at all.
  rules:
    kubeSchedulerAlerting: false
    kubeControllerManager: false

  # Individual alerts removed from the groups that remain enabled.
  disabled:
    KubeControllerManagerDown: true
    KubeSchedulerDown: true
    InfoInhibitor: true
    Watchdog: true

# Component-level switch for the controller-manager.
kubeControllerManager:
  enabled: false

# Component-level switch for the scheduler.
kubeScheduler:
  enabled: false
# ---------------------------------------------------------------------------------------------

# Alertmanager: routes every alert to a single Slack receiver and exposes the UI through an
# ingress. Placeholders of the form ${...} are filled in by Terraform's templatefile() before
# Helm sees this file; the {{ ... }} expressions are Alertmanager (Go) templates and are left
# untouched by Terraform.
alertmanager:
  enabled: true

  config:
    global:
      slack_api_url: ${alertmanagerSlackWebhook}
    route:
      group_by:
        - namespace
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 1h
      receiver: 'slack-notifications'
      routes:
        - receiver: 'slack-notifications'
          continue: true
    receivers:
      - name: 'slack-notifications'
        slack_configs:
          - channel: '#${alertmanagerSlackChannel}'
            send_resolved: true
            # The trailing "." hands the notification data to the named template; without it
            # the template runs with nil data and `.Alerts` cannot be resolved.
            text: '{{ template "slack.alert.text" . }}'

  # NOTE(review): the template reads `.Annotations.runbook`; the chart's bundled rules
  # appear to publish `runbook_url` instead - confirm which annotation the rules carry.
  templateFiles:
    alert.tmpl: |-
      {{ define "slack.alert.text" }}
      {{ range .Alerts }}
      *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` \n
      *Environment:* {{ .Labels.cluster }} \n
      *Description:* {{ .Annotations.description }} \n
      *Graph:* <{{ .GeneratorURL }}|:chart_with_upwards_trend:> - *Runbook:* <{{ .Annotations.runbook }}|:documentation:> \n
      *Details:* \n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` \n
      {{ end }}
      {{ end }}
      {{ end }}

  ingress:
    enabled: true
    annotations:
      kubernetes.io/tls-acme: 'true'
      kubernetes.io/ingress.class: ${ingressClass}
      cert-manager.io/cluster-issuer: clusterissuer-arta-cert-manager-clusterissuer
    hosts:
      - ${alertmanagerHost}
    # The chart's Alertmanager ingress takes a list under `paths`; a scalar `path` key is
    # not read by the chart.
    paths:
      - /
    tls:
      - secretName: alertmanager-tls
        hosts:
          - ${alertmanagerHost}

  alertmanagerSpec:
    # Both values are JSON rendered by Terraform (valid YAML flow collections).
    nodeSelector: ${nodeSelector}
    tolerations: ${tolerations}

    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: gp2
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 2Gi

# Grafana subchart settings. ${...} placeholders are substituted by Terraform's
# templatefile() before the values reach Helm.
grafana:
  enabled: true

  # Admin credentials, injected by Terraform.
  adminUser: ${grafanaUser}
  adminPassword: ${grafanaPassword}

  # Service account annotated with the IAM role Grafana runs as.
  serviceAccount:
    name: grafana
    annotations:
      eks.amazonaws.com/role-arn: ${grafanaRoleArn}

  # Scheduling constraints (JSON rendered by Terraform).
  nodeSelector: ${nodeSelector}
  tolerations: ${tolerations}

  # Persistent volume for Grafana's data.
  persistence:
    enabled: true
    storageClassName: gp2
    size: 5Gi

  # Ingress with a cert-manager issued certificate.
  ingress:
    enabled: true
    annotations:
      cert-manager.io/cluster-issuer: clusterissuer-arta-cert-manager-clusterissuer
      kubernetes.io/ingress.class: ${ingressClass}
      kubernetes.io/tls-acme: "true"
    hosts:
      - ${grafanaHost}
    path: /
    tls:
      - hosts:
          - ${grafanaHost}
        secretName: grafana-tls

  # Example of an extra datasource, currently not enabled:
  # additionalDataSources:
  #   - name: CloudWatch
  #     type: cloudwatch
  #     jsonData:
  #       authType: default
  #       defaultRegion: us-east-1

# kube-state-metrics subchart: only scheduling constraints are overridden.
# Both values are JSON rendered by Terraform's templatefile().
kube-state-metrics:
  tolerations: ${tolerations}
  nodeSelector: ${nodeSelector}

# Prometheus Operator scheduling. The same node selector and tolerations (JSON rendered by
# Terraform's templatefile()) are applied to the operator and to its admission-webhook patch job.
prometheusOperator:
  # The chart key is `admissionWebhooks` (lower-case "h"); Helm silently ignores unknown
  # keys, so a mis-cased key would leave the patch job without these constraints.
  admissionWebhooks:
    patch:
      nodeSelector: ${nodeSelector}
      tolerations: ${tolerations}

  nodeSelector: ${nodeSelector}
  tolerations: ${tolerations}

# Prometheus server: ingress exposure, scheduling and persistent storage.
# ${...} placeholders are substituted by Terraform's templatefile().
prometheus:
  ingress:
    enabled: true
    annotations:
      kubernetes.io/tls-acme: 'true'
      kubernetes.io/ingress.class: ${ingressClass}
      cert-manager.io/cluster-issuer: clusterissuer-arta-cert-manager-clusterissuer
    hosts:
      - ${prometheusHost}
    # The chart's Prometheus ingress takes a list under `paths`; a scalar `path` key is
    # not read by the chart.
    paths:
      - /
    tls:
      - secretName: prometheus-tls
        hosts:
          - ${prometheusHost}

  prometheusSpec:
    # Both values are JSON rendered by Terraform (valid YAML flow collections).
    nodeSelector: ${nodeSelector}
    tolerations: ${tolerations}

    # Persistent volume claim template for the Prometheus data directory.
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: gp2
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 20Gi
59 changes: 59 additions & 0 deletions apps-devstg/us-east-1/k8s-eks/k8s-components/monitoring-metrics.tf
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,62 @@ resource "helm_release" "metrics_server" {
version = "5.8.4"
values = [file("chart-values/metrics-server.yaml")]
}

#--------------------------------------------------------------------------------
# Kube Prometheus Stack: Full Prometheus + Alertmanager + Grafana implementation.
#--------------------------------------------------------------------------------

#
# Slack webhook
#
data "aws_secretsmanager_secret_version" "alertmanager_slack_webhook" {
  # Looked up only when the Prometheus stack is enabled.
  count = var.metrics.prometheus_stack.enabled ? 1 : 0

  # Read through the aliased "shared" AWS provider. The helm release below decodes the
  # secret string as JSON and reads its "webhook" and "channel" keys.
  secret_id = "/notifications/alertmanager"
  provider  = aws.shared
}

#
# Grafana's credentials
#
data "aws_secretsmanager_secret_version" "grafana" {
  # Looked up only when the Prometheus stack is enabled.
  count = var.metrics.prometheus_stack.enabled ? 1 : 0

  # Read through the aliased "shared" AWS provider. The helm release below decodes the
  # secret string as JSON and reads its "username" and "password" keys.
  secret_id = "/grafana/administrator"
  provider  = aws.shared
}

# Installs the kube-prometheus-stack chart into the monitoring namespace when
# var.metrics.prometheus_stack.enabled is true. The chart values come from
# chart-values/kube-prometheus-stack.yaml, rendered with templatefile() using the
# variables below.
resource "helm_release" "kube_prometheus_stack" {
  count      = var.metrics.prometheus_stack.enabled ? 1 : 0
  name       = "kube-prometheus-stack"
  namespace  = kubernetes_namespace.monitoring_metrics[0].id
  repository = "https://prometheus-community.github.io/helm-charts/"
  chart      = "kube-prometheus-stack"
  version    = "45.9.1"
  values = [
    templatefile("chart-values/kube-prometheus-stack.yaml", {
      ingressClass = local.private_ingress_class,

      # Slack settings, decoded from the JSON secret string.
      alertmanagerSlackWebhook = jsondecode(data.aws_secretsmanager_secret_version.alertmanager_slack_webhook[0].secret_string)["webhook"],
      alertmanagerSlackChannel = jsondecode(data.aws_secretsmanager_secret_version.alertmanager_slack_webhook[0].secret_string)["channel"],
      alertmanagerHost         = "alertmanager.${local.environment}.${local.private_base_domain}",

      # Credentials are JSON-encoded so they land in the values file as quoted strings:
      # a raw value that is all digits, or that contains YAML-significant characters
      # (": ", " #", a leading "*", ...), would otherwise be mis-parsed or break the file.
      grafanaUser     = jsonencode(jsondecode(data.aws_secretsmanager_secret_version.grafana[0].secret_string)["username"]),
      grafanaPassword = jsonencode(jsondecode(data.aws_secretsmanager_secret_version.grafana[0].secret_string)["password"]),
      grafanaHost     = "grafana.${local.environment}.${local.private_base_domain}",
      grafanaRoleArn  = data.terraform_remote_state.eks-identities.outputs.grafana_role_arn,

      prometheusHost = "prometheus.${local.environment}.${local.private_base_domain}",

      # Scheduling constraints, passed as JSON (which is also valid YAML flow syntax).
      nodeSelector = jsonencode({ stack = "monitoring" }),
      tolerations = jsonencode([
        {
          key      = "stack",
          operator = "Equal",
          value    = "monitoring",
          effect   = "NoSchedule"
        },
        {
          key      = "stack",
          operator = "Equal",
          value    = "argocd",
          effect   = "NoSchedule"
        }
      ])
    })
  ]
}
2 changes: 1 addition & 1 deletion apps-devstg/us-east-1/k8s-eks/k8s-components/namespaces.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
resource "kubernetes_namespace" "monitoring_metrics" {
count = var.enable_prometheus_dependencies || var.enable_prometheus_dependencies || var.enable_cluster_autoscaling || var.enable_hpa_scaling || var.enable_vpa_scaling ? 1 : 0
count = var.metrics.prometheus_stack.enabled || var.enable_prometheus_dependencies || var.enable_prometheus_dependencies || var.enable_cluster_autoscaling || var.enable_hpa_scaling || var.enable_vpa_scaling ? 1 : 0

metadata {
labels = local.labels
Expand Down
5 changes: 5 additions & 0 deletions apps-devstg/us-east-1/k8s-eks/k8s-components/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ logging = {
]
}
# metrics
# Turns on the kube-prometheus-stack release (Prometheus + Alertmanager + Grafana).
metrics = {
  prometheus_stack = {
    enabled = true
  }
}
enable_prometheus_dependencies = false
enable_grafana_dependencies = false
# tools
Expand Down
5 changes: 5 additions & 0 deletions apps-devstg/us-east-1/k8s-eks/k8s-components/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ variable "logging" {
default = {}
}

# Metrics tooling toggles. Consumers read var.metrics.prometheus_stack.enabled, so the
# default carries that attribute (disabled); with an empty-object default, evaluating
# that reference fails with "Unsupported attribute" whenever the variable is left unset.
variable "metrics" {
  type = any
  default = {
    prometheus_stack = {
      enabled = false
    }
  }
}

variable "enable_ingressmonitorcontroller" {
type = bool
default = false
Expand Down

0 comments on commit 5a5cdc9

Please sign in to comment.