Skip to content

Commit

Permalink
feat (deploy): [cluster] enable networking observability (basic) (#101)
Browse files Browse the repository at this point in the history
* allow fw for ama-metrics to connect to azure monitor

* add managed prometheus

* enable azure monitor agent metrics

* add prometheus custom config for networking

* allow resource types for networking observability
  • Loading branch information
ferantivero authored Nov 8, 2024
1 parent 6599b9e commit ab4912c
Show file tree
Hide file tree
Showing 7 changed files with 276 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Custom Prometheus scrape jobs for the Azure Monitor metrics addon replica.
# The addon runs a singleton replica, so any cluster level service can be discovered and scraped
# by the scrape jobs provided in this config map.
# Upstream template: https://raw.githubusercontent.com/Azure/prometheus-collector/refs/heads/main/otelcollector/configmaps/ama-metrics-prometheus-config-configmap.yaml
# Azure NPM scrape jobs: https://learn.microsoft.com/azure/virtual-network/kubernetes-network-policies#set-up-for-prometheus-server
apiVersion: v1
kind: ConfigMap
metadata:
  # ama-metrics-prometheus-config (Recommended) - When a configmap with this name is created, scrape jobs defined in it are run from the Azure monitor metrics replica pod running in the cluster.
  name: ama-metrics-prometheus-config
  namespace: kube-system
data:
  prometheus-config: |-
    global:
      scrape_interval: 30s
    scrape_configs:
    - job_name: "azure-npm-node-metrics"
      metrics_path: /node-metrics
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        action: replace
        regex: ([^:]+)(?::\d+)?
        replacement: "$1:10091"
        target_label: __address__
    - job_name: "azure-npm-cluster-metrics"
      metrics_path: /cluster-metrics
      kubernetes_sd_configs:
      - role: service
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace]
        regex: kube-system
        action: keep
      - source_labels: [__meta_kubernetes_service_name]
        regex: npm-metrics-cluster-service
        action: keep
      # Comment from here to the end to collect advanced metrics: number of entries for each IPSet
      metric_relabel_configs:
      - source_labels: [__name__]
        regex: npm_ipset_counts
        action: drop
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# https://raw.githubusercontent.com/Azure/prometheus-collector/refs/heads/main/otelcollector/configmaps/ama-metrics-prometheus-config-node-configmap.yaml
# This config map can be used to provide Prometheus scrape config for addon DaemonSet that runs on every Linux node in the cluster, and any node level targets on each node can be scraped by providing scrape jobs in this configmap.
# https://learn.microsoft.com/azure/virtual-network/kubernetes-network-policies#set-up-for-prometheus-server
#
# NOTE(review): the job below uses pod discovery without a node filter, and this config is run by
# every DaemonSet pod - confirm whether targets should be restricted to the local node.
kind: ConfigMap
apiVersion: v1
data:
  prometheus-config: |-
    global:
      scrape_interval: 30s
    scrape_configs:
    - job_name: "azure-npm-node-metrics-from-pod-config"
      metrics_path: /node-metrics
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      # Only consider pods in kube-system.
      - source_labels: [__meta_kubernetes_namespace]
        regex: kube-system
        action: keep
      # Only keep pods annotated as scrapeable by Azure NPM. The previous rule matched on __name__,
      # which is not set during target relabeling, so no target could ever be kept.
      - source_labels: [__meta_kubernetes_pod_annotationpresent_azure_npm_scrapeable]
        regex: "true"
        action: keep
      # Point the scrape at the NPM metrics port regardless of the discovered container port.
      - source_labels: [__address__]
        action: replace
        regex: ([^:]+)(?::\d+)?
        replacement: "$1:10091"
        target_label: __address__
metadata:
  # ama-metrics-prometheus-config-node - When a configmap with this name is created, scrape jobs defined in it are run from each Linux DaemonSet pod running in the cluster.
  name: ama-metrics-prometheus-config-node
  namespace: kube-system
89 changes: 89 additions & 0 deletions cluster-manifests/kube-system/ama-metrics-settings-configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Settings for the Azure Monitor metrics addon: default scrape targets, keep-lists, scrape intervals and debug mode.
# Upstream template: https://raw.githubusercontent.com/Azure/prometheus-collector/refs/heads/main/otelcollector/configmaps/ama-metrics-settings-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: ama-metrics-settings-configmap
  namespace: kube-system
data:
  # string. Used by the agent to parse this config. Supported versions are {v1}. Configs with other schema versions will be rejected by the agent.
  schema-version: v1
  # string. Used by the customer to keep track of this config file's version in their source control/repository (max allowed 10 chars, other chars will be truncated).
  config-version: ver1
  # Cluster alias (to change the value of cluster label in every time-series/metric that's ingested from a cluster)
  # https://learn.microsoft.com/azure/azure-monitor/containers/prometheus-metrics-scrape-configuration?tabs=CRDConfig%2CCRDScrapeConfig%2CConfigFileScrapeConfigBasicAuth%2CConfigFileScrapeConfigTLSAuth#cluster-alias
  prometheus-collector-settings: |-
    cluster_alias = ""
  # Enable/disable default scrape targets - turn ON/OFF default scraping per target. The scrape configuration for these default targets is pre-defined/built-in.
  # https://learn.microsoft.com/azure/azure-monitor/containers/prometheus-metrics-scrape-configuration?tabs=CRDConfig%2CCRDScrapeConfig%2CConfigFileScrapeConfigBasicAuth%2CConfigFileScrapeConfigTLSAuth#metrics-add-on-settings-configmap
  default-scrape-settings-enabled: |-
    kubelet = true
    coredns = false
    cadvisor = true
    kubeproxy = false
    apiserver = false
    kubestate = true
    nodeexporter = true
    windowsexporter = false
    windowskubeproxy = false
    kappiebasic = true
    networkobservabilityRetina = true
    networkobservabilityHubble = false
    networkobservabilityCilium = false
    prometheuscollectorhealth = false
    controlplane-apiserver = true
    controlplane-cluster-autoscaler = false
    controlplane-kube-scheduler = false
    controlplane-kube-controller-manager = false
    controlplane-etcd = true
  # Regex for which namespaces to scrape through pod annotation based scraping.
  # This is none by default. Use '.*' to scrape all namespaces of annotated pods.
  # https://learn.microsoft.com/azure/azure-monitor/containers/prometheus-metrics-scrape-configuration?tabs=CRDConfig%2CCRDScrapeConfig%2CConfigFileScrapeConfigBasicAuth%2CConfigFileScrapeConfigTLSAuth#enable-pod-annotation-based-scraping
  pod-annotation-based-scraping: |-
    podannotationnamespaceregex = "kube-system|a0008"
  # Metric keep-lists - control which metrics are allowed from each default target, changing the default behavior.
  # https://learn.microsoft.com/azure/azure-monitor/containers/prometheus-metrics-scrape-configuration?tabs=CRDConfig%2CCRDScrapeConfig%2CConfigFileScrapeConfigBasicAuth%2CConfigFileScrapeConfigTLSAuth#customize-metrics-collected-by-default-targets
  default-targets-metrics-keep-list: |-
    kubelet = ""
    coredns = ""
    cadvisor = ""
    kubeproxy = ""
    apiserver = ""
    kubestate = ""
    nodeexporter = ""
    windowsexporter = ""
    windowskubeproxy = ""
    podannotations = ""
    kappiebasic = ""
    networkobservabilityRetina = "" # networkobservability_forward_count|networkobservability_forward_bytes|networkobservability_drop_count|networkobservability_drop_bytes|networkobservability_tcp_state|networkobservability_tcp_connection_remote|networkobservability_tcp_connection_stats|networkobservability_ip_connection_stats|networkobservability_udp_connection_stats|networkobservability_udp_active_sockets|networkobservability_interface_stats
    networkobservabilityHubble = ""
    networkobservabilityCilium = ""
    controlplane-apiserver = ""
    controlplane-cluster-autoscaler = ""
    controlplane-kube-scheduler = ""
    controlplane-kube-controller-manager = ""
    controlplane-etcd = ""
    minimalingestionprofile = true
  # The scrape intervals for default/pre-defined targets. 30 secs is the default scrape frequency and it can be changed per default target using this configmap.
  # https://learn.microsoft.com/azure/azure-monitor/containers/prometheus-metrics-scrape-configuration?tabs=CRDConfig%2CCRDScrapeConfig%2CConfigFileScrapeConfigBasicAuth%2CConfigFileScrapeConfigTLSAuth#scrape-interval-settings
  default-targets-scrape-interval-settings: |-
    kubelet = "30s"
    coredns = "30s"
    cadvisor = "30s"
    kubeproxy = "30s"
    apiserver = "30s"
    kubestate = "30s"
    nodeexporter = "30s"
    windowsexporter = "30s"
    windowskubeproxy = "30s"
    kappiebasic = "30s"
    networkobservabilityRetina = "30s"
    networkobservabilityHubble = "30s"
    networkobservabilityCilium = "30s"
    prometheuscollectorhealth = "30s"
    podannotations = "30s"
  # debug-mode - turning this ON helps to debug missing metric/ingestion issues
  # https://learn.microsoft.com/azure/azure-monitor/containers/prometheus-metrics-scrape-configuration?tabs=CRDConfig%2CCRDScrapeConfig%2CConfigFileScrapeConfigBasicAuth%2CConfigFileScrapeConfigTLSAuth#debug-mode
  debug-mode: |-
    enabled = false
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ data:
job_completion_threshold_time_minutes = 360
integrations: |-
[integrations.azure_network_policy_manager]
collect_basic_metrics = true
collect_basic_metrics = false
collect_advanced_metrics = false
[integrations.azure_subnet_ip_usage]
enabled = true
Expand Down
110 changes: 110 additions & 0 deletions cluster-stamp.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,90 @@ resource containerRegistryPullRole 'Microsoft.Authorization/roleDefinitions@2022

/*** RESOURCES ***/

// Azure Monitor workspace; the Prometheus data collection rule in this file uses it as its metrics destination.
resource amw 'Microsoft.Monitor/accounts@2023-04-03' = {
  location: location
  name: 'amw-${clusterName}'
  properties: {
    // Public network access is left enabled for this workspace.
    publicNetworkAccess: 'Enabled'
  }
}

// Data collection endpoint that processes the scraped Prometheus metrics so Azure Monitor can ingest them.
resource dce 'Microsoft.Insights/dataCollectionEndpoints@2023-03-11' = {
  kind: 'Linux'
  location: location
  name: 'MSProm-${location}-${clusterName}'
  properties: {
    networkAcls: {
      // Public network access is left enabled for this endpoint.
      publicNetworkAccess: 'Enabled'
    }
  }
}

// Data collection rule for Prometheus metrics: receives the 'Microsoft-PrometheusMetrics' stream through the
// data collection endpoint above and routes it to the Azure Monitor workspace.
resource dcr 'Microsoft.Insights/dataCollectionRules@2023-03-11' = {
  kind: 'Linux'
  location: location
  name: 'MSProm-${location}-${clusterName}'
  properties: {
    dataCollectionEndpointId: dce.id

    // Source: the Prometheus forwarder, with an empty label include filter.
    dataSources: {
      prometheusForwarder: [
        {
          name: 'PrometheusDataSource'
          streams: [
            'Microsoft-PrometheusMetrics'
          ]
          labelIncludeFilter: {}
        }
      ]
    }

    // Destination: the Azure Monitor workspace, registered under its own resource name.
    destinations: {
      monitoringAccounts: [
        {
          name: amw.name
          accountResourceId: amw.id
        }
      ]
    }

    // Flow: send the Prometheus metrics stream to the destination named above.
    dataFlows: [
      {
        streams: [
          'Microsoft-PrometheusMetrics'
        ]
        destinations: [
          amw.name
        ]
      }
    ]
  }
}

// Diagnostic setting on the Prometheus data collection rule: every log category ('allLogs') is sent to Log Analytics.
resource dcr_diagnosticSettings 'Microsoft.Insights/diagnosticSettings@2021-05-01-preview' = {
  name: 'default'
  scope: dcr
  properties: {
    workspaceId: la.id
    logs: [
      {
        enabled: true
        categoryGroup: 'allLogs'
      }
    ]
  }
}

// Associates the Prometheus data collection rule with the AKS cluster (the association is scoped to the cluster resource).
resource dcrAssociation 'Microsoft.Insights/dataCollectionRuleAssociations@2023-03-11' = {
  scope: mc
  name: 'MSProm-${location}-${clusterName}'
  properties: {
    dataCollectionRuleId: dcr.id
  }
}


@description('The control plane identity used by the cluster. Used for networking access (VNET joining and DNS updating)')
resource miClusterControlPlane 'Microsoft.ManagedIdentity/userAssignedIdentities@2022-01-31-preview' = {
name: 'mi-${clusterName}-controlplane'
Expand Down Expand Up @@ -1194,6 +1278,30 @@ resource mc 'Microsoft.ContainerService/managedClusters@2022-10-02-preview' = {
autoUpgradeProfile: {
upgradeChannel: 'stable'
}
azureMonitorProfile: {
metrics: {
enabled: true
kubeStateMetrics: {
// https://learn.microsoft.com/azure/azure-monitor/containers/kubernetes-monitoring-enable
// https://github.com/kubernetes/kube-state-metrics

// Comma-separated list of Kubernetes annotations keys used in the resource's kube_resource_annotations metric.
// For example, kube_pod_annotations is the annotations metric for the pods resource.
// By default, this metric contains only name and namespace labels. To include more annotations,
// provide a list of resource names in their plural form and Kubernetes annotation keys that you want to allow for them.
// A single * can be provided for each resource to allow any annotations, but this has severe performance implications
// https://github.com/prometheus-community/helm-charts/blob/e68c764aa6c764ec5934c6812ff0eaa0877ba275/charts/kube-state-metrics/values.yaml#L342
metricAnnotationsAllowList: ''

// Comma-separated list of additional Kubernetes label keys that are used in the resource's kube_resource_labels metric.
// For example, kube_pod_labels is the labels metric for the pods resource. By default this metric contains only name and namespace labels.
// To include more labels, provide a list of resource names in their plural form and Kubernetes label keys that you want to allow for them.
// A single * can be provided for each resource to allow any labels, but this has severe performance implications.
// https://github.com/prometheus-community/helm-charts/blob/e68c764aa6c764ec5934c6812ff0eaa0877ba275/charts/kube-state-metrics/values.yaml#L326
metricLabelsAllowlist: ''
}
}
}
disableLocalAccounts: true
securityProfile: {
workloadIdentity: {
Expand Down Expand Up @@ -1222,6 +1330,8 @@ resource mc 'Microsoft.ContainerService/managedClusters@2022-10-02-preview' = {
omsContainerInsights
ensureClusterIdentityHasRbacToSelfManagedResources

dcr

// You want policies created before cluster because they take some time to be made available and we want them
// to apply to your cluster as soon as possible. Nothing in this cluster "technically" depends on these existing,
// just trying to get coverage as soon as possible.
Expand Down
3 changes: 3 additions & 0 deletions modules/workloadPoliciesDeployment.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ module allowedResourcespolicyAssignment 'resourceGroupPolicyAssignment.bicep' =
'Microsoft.OperationsManagement/solutions'
'Microsoft.VirtualMachineImages/imageTemplates'
'Microsoft.Insights/dataCollectionRules'
'Microsoft.Insights/dataCollectionEndpoints'
'Microsoft.Monitor/accounts'
'Microsoft.Insights/dataCollectionRuleAssociations'
]
}
}
Expand Down
3 changes: 2 additions & 1 deletion networking/hub-region.v2.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,7 @@ resource hubFirewall 'Microsoft.Network/azureFirewalls@2021-05-01' = {
}
{
name: 'azure-monitor-addon'
description: 'All required for Azure Monitor for containers per https://learn.microsoft.com/azure/aks/limit-egress-traffic#azure-monitor-for-containers - Optionally you can restrict the ods and oms wildcards to JUST your cluster\'s log analytics instances.'
description: 'All required for Azure Monitor for containers per https://learn.microsoft.com/azure/aks/limit-egress-traffic#azure-monitor-for-containers and Managed Prometheus (data collection endpoint) - Optionally you can restrict the ods and oms wildcards to JUST your cluster\'s log analytics instances.'
sourceIpGroups: [
aks_ipgroup.id
]
Expand All @@ -634,6 +634,7 @@ resource hubFirewall 'Microsoft.Network/azureFirewalls@2021-05-01' = {
'*.oms.opinsights.azure.com'
'${location}.monitoring.azure.com'
'*.handler.control.monitor.azure.com'
'*.metrics.ingest.monitor.azure.com'
]
}
{
Expand Down

0 comments on commit ab4912c

Please sign in to comment.