HPCC-30295 Add ALA ContainerLogV2 Support

- Adds ContainerLogV2 based KQL query logic - Provides new sample helm values to enable v2 support - Ensures V1 backward compatability - Updates README documentation - Provides config files for v2 enablement - Provides logic to enable v2 on AKS deployment Signed-off-by: Rodrigo Pastrana <[email protected]>
hpcc-systems · Sep 28, 2023 · bc375f5 · bc375f5
1 parent 65b9811
commit bc375f5
Show file tree

Hide file tree

Showing 8 changed files with 381 additions and 11 deletions.
diff --git a/helm/examples/azure/log-analytics/README.md b/helm/examples/azure/log-analytics/README.md
@@ -22,6 +22,16 @@ The user should populate the following values in order to create a new Azure Log
      For example: "admin=MyName email=[email protected] environment=myenv justification=testing"
 - AZURE_SUBSCRIPTION (Optional - Ensures this subscription is set before creating the new workspace)
 
+- AKS_RESOURCE_LOCATION (Optional e.g. eastus)
+
+- ENABLE_CONTAINER_LOG_V2 (true|false) Enables the ContainerLog V2 schema.
+   If set to true, the stdout/stderr Logs are forwarded to ContainerLogV2 table, otherwise the container logs continue to be forwarded to ContainerLog table.
+   Utilizes ./helm/examples/azure/log-analytics/dataCollectionSettings.json, and 
+   ./helm/examples/azure/log-analytics/container-azm-ms-agentconfig.yaml which creates and applies a new configmap 'kube-system/container-azm-ms-agentconfig'.
+
+   Details on benefits of V2 schema: https://learn.microsoft.com/en-us/azure/azure-monitor/containers/container-insights-logging-v2?tabs=configure-portal
+
+
 #### b - Execute enable-loganalytics.sh
 
 This helper script attempts to create new Azure LogAnalytics workspace (user can provide pre-existing), associates the workspace with the target AKS cluster, and enables the Azure Log Analytics feature. This script is dependant on the values provided in the previous step.
@@ -62,9 +72,9 @@ Example manual secret creation command (assuming ./secrets-templates contains a
 ```
 
 #### c - Configure HPCC logAccess
-The target HPCC deployment should be directed to use the above Azure Log Analytics workspace, and the newly created secret by providing appropriate logAccess values (such as ./loganalytics-hpcc-logaccess.yaml). 
+The target HPCC deployment should be directed to use the above Azure Log Analytics workspace, and the newly created secret by providing appropriate logAccess values (such as ./loganalytics-hpcc-logaccess.yaml or ./loganalytics-hpcc-logaccessV2.yaml if targeting Azure Log Analytics ContainerLogV2 (recommended)). 
 
 Example use:
 ```console
-  helm install myhpcc hpcc/hpcc -f HPCC-Platform/helm/examples/azure/log-analytics/loganalytics-hpcc-logaccess.yaml
+  helm install myhpcc hpcc/hpcc -f HPCC-Platform/helm/examples/azure/log-analytics/loganalytics-hpcc-logaccessV2.yaml
 ```
diff --git a/helm/examples/azure/log-analytics/container-azm-ms-agentconfig.yaml b/helm/examples/azure/log-analytics/container-azm-ms-agentconfig.yaml
@@ -0,0 +1,209 @@
+kind: ConfigMap
+apiVersion: v1
+data:
+  schema-version:
+    #string.used by agent to parse config. supported versions are {v1}. Configs with other schema versions will be rejected by the agent.
+    v1
+  config-version:
+    #string.used by customer to keep track of this config file's version in their source control/repository (max allowed 10 chars, other chars will be truncated)
+    ver1
+  log-data-collection-settings: |-
+    # Log data collection settings
+    # Any errors related to config map settings can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to.
+
+    [log_collection_settings]
+       [log_collection_settings.stdout]
+          # In the absense of this configmap, default value for enabled is true
+          enabled = true
+          # exclude_namespaces setting holds good only if enabled is set to true
+          # kube-system,gatekeeper-system log collection are disabled by default in the absence of 'log_collection_settings.stdout' setting. If you want to enable kube-system,gatekeeper-system, remove them from the following setting.
+          # If you want to continue to disable kube-system,gatekeeper-system log collection keep the namespaces in the following setting and add any other namespace you want to disable log collection to the array.
+          # In the absense of this configmap, default value for exclude_namespaces = ["kube-system","gatekeeper-system"]
+          exclude_namespaces = ["kube-system","gatekeeper-system"]
+
+       [log_collection_settings.stderr]
+          # Default value for enabled is true
+          enabled = true
+          # exclude_namespaces setting holds good only if enabled is set to true
+          # kube-system,gatekeeper-system log collection are disabled by default in the absence of 'log_collection_settings.stderr' setting. If you want to enable kube-system,gatekeeper-system, remove them from the following setting.
+          # If you want to continue to disable kube-system,gatekeeper-system log collection keep the namespaces in the following setting and add any other namespace you want to disable log collection to the array.
+          # In the absense of this configmap, default value for exclude_namespaces = ["kube-system","gatekeeper-system"]
+          exclude_namespaces = ["kube-system","gatekeeper-system"]
+
+       [log_collection_settings.env_var]
+          # In the absense of this configmap, default value for enabled is true
+          enabled = true
+       [log_collection_settings.enrich_container_logs]
+          # In the absense of this configmap, default value for enrich_container_logs is false
+          enabled = false
+          # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image
+       [log_collection_settings.collect_all_kube_events]
+          # In the absense of this configmap, default value for collect_all_kube_events is false
+          # When the setting is set to false, only the kube events with !normal event type will be collected
+          enabled = false
+          # When this is enabled (enabled = true), all kube events including normal events will be collected
+       [log_collection_settings.schema]
+          # In the absence of this configmap, default value for containerlog_schema_version is "v1"
+          # Supported values for this setting are "v1","v2"
+          # See documentation at https://aka.ms/ContainerLogv2 for benefits of v2 schema over v1 schema before opting for "v2" schema
+          containerlog_schema_version = "v2"
+       #[log_collection_settings.enable_multiline_logs]
+          # fluent-bit based multiline log collection for go (stacktrace), dotnet (stacktrace)
+          # if enabled will also stitch together container logs split by docker/cri due to size limits(16KB per log line)
+          # enabled = "false"
+
+
+  prometheus-data-collection-settings: |-
+    # Custom Prometheus metrics data collection settings
+    [prometheus_data_collection_settings.cluster]
+        # Cluster level scrape endpoint(s). These metrics will be scraped from agent's Replicaset (singleton)
+        # Any errors related to prometheus scraping can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to.
+
+        #Interval specifying how often to scrape for metrics. This is duration of time and can be specified for supporting settings by combining an integer value and time unit as a string value. Valid time units are ns, us (or µs), ms, s, m, h.
+        interval = "1m"
+
+        ## Uncomment the following settings with valid string arrays for prometheus scraping
+        #fieldpass = ["metric_to_pass1", "metric_to_pass12"]
+
+        #fielddrop = ["metric_to_drop"]
+
+        # An array of urls to scrape metrics from.
+        # urls = ["http://myurl:9101/metrics"]
+
+        # An array of Kubernetes services to scrape metrics from.
+        # kubernetes_services = ["http://my-service-dns.my-namespace:9102/metrics"]
+
+        # When monitor_kubernetes_pods = true, replicaset will scrape Kubernetes pods for the following prometheus annotations:
+        # - prometheus.io/scrape: Enable scraping for this pod
+        # - prometheus.io/scheme: Default is http
+        # - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation.
+        # - prometheus.io/port: If port is not 9102 use this annotation
+        monitor_kubernetes_pods = false
+
+        ## Restricts Kubernetes monitoring to namespaces for pods that have annotations set and are scraped using the monitor_kubernetes_pods setting.
+        ## This will take effect when monitor_kubernetes_pods is set to true
+        ##   ex: monitor_kubernetes_pods_namespaces = ["default1", "default2", "default3"]
+        # monitor_kubernetes_pods_namespaces = ["default1"]
+
+        ## Label selector to target pods which have the specified label
+        ## This will take effect when monitor_kubernetes_pods is set to true
+        ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors
+        # kubernetes_label_selector = "env=dev,app=nginx"
+
+        ## Field selector to target pods which have the specified field
+        ## This will take effect when monitor_kubernetes_pods is set to true
+        ## Reference the docs at https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/
+        ## eg. To scrape pods on a specific node
+        # kubernetes_field_selector = "spec.nodeName=$HOSTNAME"
+
+    [prometheus_data_collection_settings.node]
+        # Node level scrape endpoint(s). These metrics will be scraped from agent's DaemonSet running in every node in the cluster
+        # Any errors related to prometheus scraping can be found in the KubeMonAgentEvents table in the Log Analytics workspace that the cluster is sending data to.
+
+        #Interval specifying how often to scrape for metrics. This is duration of time and can be specified for supporting settings by combining an integer value and time unit as a string value. Valid time units are ns, us (or µs), ms, s, m, h.
+        interval = "1m"
+
+        ## Uncomment the following settings with valid string arrays for prometheus scraping
+
+        # An array of urls to scrape metrics from. $NODE_IP (all upper case) will substitute of running Node's IP address
+        # urls = ["http://$NODE_IP:9103/metrics"]
+
+        #fieldpass = ["metric_to_pass1", "metric_to_pass12"]
+
+        #fielddrop = ["metric_to_drop"]
+
+  metric_collection_settings: |-
+    # Metrics collection settings for metrics sent to Log Analytics and MDM
+    [metric_collection_settings.collect_kube_system_pv_metrics]
+      # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false
+      # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected
+      enabled = false
+      # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected
+
+  alertable-metrics-configuration-settings: |-
+    # Alertable metrics configuration settings for container resource utilization
+    [alertable_metrics_configuration_settings.container_resource_utilization_thresholds]
+        # The threshold(Type Float) will be rounded off to 2 decimal points
+        # Threshold for container cpu, metric will be sent only when cpu utilization exceeds or becomes equal to the following percentage
+        container_cpu_threshold_percentage = 95.0
+        # Threshold for container memoryRss, metric will be sent only when memory rss exceeds or becomes equal to the following percentage
+        container_memory_rss_threshold_percentage = 95.0
+        # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage
+        container_memory_working_set_threshold_percentage = 95.0
+
+    # Alertable metrics configuration settings for persistent volume utilization
+    [alertable_metrics_configuration_settings.pv_utilization_thresholds]
+        # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage
+        pv_usage_threshold_percentage = 60.0
+
+    # Alertable metrics configuration settings for completed jobs count
+    [alertable_metrics_configuration_settings.job_completion_threshold]
+        # Threshold for completed job count , metric will be sent only for those jobs which were completed earlier than the following threshold
+        job_completion_threshold_time_minutes = 360
+  integrations: |-
+    [integrations.azure_network_policy_manager]
+        collect_basic_metrics = false
+        collect_advanced_metrics = false
+    [integrations.azure_subnet_ip_usage]
+        enabled = false
+
+# Doc - https://github.com/microsoft/Docker-Provider/blob/ci_prod/Documentation/AgentSettings/ReadMe.md
+  agent-settings: |-
+    # prometheus scrape fluent bit settings for high scale
+    # buffer size should be greater than or equal to chunk size else we set it to chunk size.
+    # settings scoped to prometheus sidecar container. all values in mb
+    [agent_settings.prometheus_fbit_settings]
+      tcp_listener_chunk_size = 10
+      tcp_listener_buffer_size = 10
+      tcp_listener_mem_buf_limit = 200
+
+    # prometheus scrape fluent bit settings for high scale
+    # buffer size should be greater than or equal to chunk size else we set it to chunk size.
+    # settings scoped to daemonset container. all values in mb
+    # [agent_settings.node_prometheus_fbit_settings]
+      # tcp_listener_chunk_size = 1
+      # tcp_listener_buffer_size = 1
+      # tcp_listener_mem_buf_limit = 10
+
+    # prometheus scrape fluent bit settings for high scale
+    # buffer size should be greater than or equal to chunk size else we set it to chunk size.
+    # settings scoped to replicaset container. all values in mb
+    # [agent_settings.cluster_prometheus_fbit_settings]
+      # tcp_listener_chunk_size = 1
+      # tcp_listener_buffer_size = 1
+      # tcp_listener_mem_buf_limit = 10
+
+    # The following settings are "undocumented", we don't recommend uncommenting them unless directed by Microsoft.
+    # They increase the maximum stdout/stderr log collection rate but will also cause higher cpu/memory usage.
+    ## Ref for more details about Ignore_Older -  https://docs.fluentbit.io/manual/v/1.7/pipeline/inputs/tail
+    # [agent_settings.fbit_config]
+    #   log_flush_interval_secs = "1"                 # default value is 15
+    #   tail_mem_buf_limit_megabytes = "10"           # default value is 10
+    #   tail_buf_chunksize_megabytes = "1"            # default value is 32kb (comment out this line for default)
+    #   tail_buf_maxsize_megabytes = "1"              # default value is 32kb (comment out this line for default)
+    #   tail_ignore_older = "5m"                      # default value same as fluent-bit default i.e.0m
+
+    # On both AKS & Arc K8s enviornments, if Cluster has configured with Forward Proxy then Proxy settings automatically applied and used for the agent
+    # Certain configurations, proxy config should be ignored for example Cluster with AMPLS + Proxy
+    # in such scenarios, use the following config to ignore proxy settings
+    # [agent_settings.proxy_config]
+    #    ignore_proxy_settings = "true"  # if this is not applied, default value is false
+
+    # The following settings are "undocumented", we don't recommend uncommenting them unless directed by Microsoft.
+    # Configuration settings for the waittime for the network listeners to be available
+    # [agent_settings.network_listener_waittime]
+    #   tcp_port_25226 = 45                           # Port 25226 is used for telegraf to fluent-bit data in ReplicaSet
+    #   tcp_port_25228 = 60                           # Port 25228 is used for telegraf to fluentd data
+    #   tcp_port_25229 = 45                           # Port 25229 is used for telegraf to fluent-bit data in DaemonSet
+
+    # The following settings are "undocumented", we don't recommend uncommenting them unless directed by Microsoft.
+    # [agent_settings.mdsd_config]
+    #   monitoring_max_event_rate = "50000" # default 20K eps
+    #   backpressure_memory_threshold_in_mb = "1500" # default 3500MB
+    #   upload_max_size_in_mb = "20"  # default 2MB
+    #   upload_frequency_seconds = "1" # default 60 upload_frequency_seconds
+    #   compression_level = "0"  # supported levels 0 to 9 and 0 means no compression
+
+metadata:
+  name: container-azm-ms-agentconfig
+  namespace: kube-system
diff --git a/helm/examples/azure/log-analytics/dataCollectionSettings.json b/helm/examples/azure/log-analytics/dataCollectionSettings.json
@@ -0,0 +1,7 @@
+{
+    "interval": "1m",
+    "namespaceFilteringMode": "Include",
+    "namespaces": ["kube-system"],
+    "enableContainerLogV2": true, 
+    "streams": ["Microsoft-Perf", "Microsoft-ContainerLogV1"]
+  }
diff --git a/helm/examples/azure/log-analytics/enable-loganalytics.sh b/helm/examples/azure/log-analytics/enable-loganalytics.sh
@@ -83,6 +83,10 @@ else
 fi
 
 echo "Enabling workspace on target AKS cluster '$AKS_CLUSTER_NAME'..."
+
+if $ENABLE_CONTAINER_LOG_V2 ; then DATA_COLLECTION_SETTINGS="--workspace-resource-id $wsid --data-collection-settings dataCollectionSettings.json"; fi
+
+echo "aks enable-addons -g $AKS_RESOURCE_GROUP -n $AKS_CLUSTER_NAME -a monitoring $DATA_COLLECTION_SETTINGS"
 az aks enable-addons -g $AKS_RESOURCE_GROUP -n $AKS_CLUSTER_NAME -a monitoring --workspace-resource-id $wsid
 if [[ $? -ne 0 ]]
 then
@@ -91,3 +95,8 @@ then
 else
   echo "Success, workspace id: '$wsid' enabled on AKS $AKS_CLUSTER_NAME"
 fi
+
+if $ENABLE_CONTAINER_LOG_V2 ; then
+ echo "Setting ContainerLogV2 schema via container-azm-ms-agentconfig" 
+ kubectl apply -f ${WORK_DIR}/container-azm-ms-agentconfig.yaml;
+fi
diff --git a/helm/examples/azure/log-analytics/env-loganalytics b/helm/examples/azure/log-analytics/env-loganalytics
@@ -21,4 +21,7 @@ AKS_RESOURCE_GROUP=
 AZURE_SUBSCRIPTION=
 
 # Azure resource location
-AKS_RESOURCE_LOCATION=eastus
+AKS_RESOURCE_LOCATION=eastus
+
+# Enable enableContainerLogV2
+ENABLE_CONTAINER_LOG_V2=true