From 10fe30c6bbfae1e170278492a84e6752359c3eb9 Mon Sep 17 00:00:00 2001 From: Andreas Sommer Date: Wed, 11 Dec 2024 13:51:21 +0100 Subject: [PATCH] Make ASG lifecycle hook heartbeat timeout configurable (#960) --- helm/cluster-aws/README.md | 2 ++ ...ifecycle-hook-heartbeattimeout-values.yaml | 24 +++++++++++++++++++ helm/cluster-aws/templates/_machine_pools.tpl | 14 +++++++---- helm/cluster-aws/values.schema.json | 14 +++++++++++ 4 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 helm/cluster-aws/ci/test-lifecycle-hook-heartbeattimeout-values.yaml diff --git a/helm/cluster-aws/README.md b/helm/cluster-aws/README.md index 8e5f97a1..4fe64666 100644 --- a/helm/cluster-aws/README.md +++ b/helm/cluster-aws/README.md @@ -392,6 +392,8 @@ Node pools of the cluster. If not specified, this defaults to the value of `clus | `global.nodePools.PATTERN.additionalSecurityGroups[*].id` | **Id of the security group** - ID of the security group that will be added to the machine pool nodes. The security group must exist.|**Type:** `string`
**Key pattern:**
`PATTERN`=`^[a-z0-9][-a-z0-9]{3,18}[a-z0-9]$`
| | `global.nodePools.PATTERN.availabilityZones` | **Availability zones**|**Type:** `array`
**Key pattern:**
`PATTERN`=`^[a-z0-9][-a-z0-9]{3,18}[a-z0-9]$`
| | `global.nodePools.PATTERN.availabilityZones[*]` | **Availability zone**|**Type:** `string`
**Key pattern:**
`PATTERN`=`^[a-z0-9][-a-z0-9]{3,18}[a-z0-9]$`
| +| `global.nodePools.PATTERN.awsNodeTerminationHandler` | **aws-node-termination-handler related settings** - Configuration for the ASG lifecycle hook used by aws-node-termination-handler|**Type:** `object`
**Key pattern:**
`PATTERN`=`^[a-z0-9][-a-z0-9]{3,18}[a-z0-9]$`
| +| `global.nodePools.PATTERN.awsNodeTerminationHandler.heartbeatTimeoutSeconds` | **Heartbeat timeout for ASG lifecycle hook**|**Type:** `integer`
**Key pattern:**
`PATTERN`=`^[a-z0-9][-a-z0-9]{3,18}[a-z0-9]$`
**Default:** `1800`| | `global.nodePools.PATTERN.customNodeLabels` | **Custom node labels**|**Type:** `array`
**Key pattern:**
`PATTERN`=`^[a-z0-9][-a-z0-9]{3,18}[a-z0-9]$`
| | `global.nodePools.PATTERN.customNodeLabels[*]` | **Label**|**Type:** `string`
**Key pattern:**
`PATTERN`=`^[a-z0-9][-a-z0-9]{3,18}[a-z0-9]$`
| | `global.nodePools.PATTERN.customNodeTaints` | **Custom node taints**|**Type:** `array`
**Key pattern:**
`PATTERN`=`^[a-z0-9][-a-z0-9]{3,18}[a-z0-9]$`
| diff --git a/helm/cluster-aws/ci/test-lifecycle-hook-heartbeattimeout-values.yaml b/helm/cluster-aws/ci/test-lifecycle-hook-heartbeattimeout-values.yaml new file mode 100644 index 00000000..a76b5d49 --- /dev/null +++ b/helm/cluster-aws/ci/test-lifecycle-hook-heartbeattimeout-values.yaml @@ -0,0 +1,24 @@ +global: + release: + version: v27.0.0-alpha.1 + metadata: + name: test-wc-minimal + organization: test + servicePriority: lowest + connectivity: + baseDomain: example.com + nodePools: + pool0: + maxSize: 2 + minSize: 2 + awsNodeTerminationHandler: + heartbeatTimeoutSeconds: 60 + providerSpecific: + region: "eu-west-1" + managementCluster: test + +cluster: + internal: + ephemeralConfiguration: + offlineTesting: + renderWithoutReleaseResource: true diff --git a/helm/cluster-aws/templates/_machine_pools.tpl b/helm/cluster-aws/templates/_machine_pools.tpl index cfbdcc2e..ad5f2c37 100644 --- a/helm/cluster-aws/templates/_machine_pools.tpl +++ b/helm/cluster-aws/templates/_machine_pools.tpl @@ -84,11 +84,15 @@ spec: version: "3.4" lifecycleHooks: - defaultResult: CONTINUE - # High enough heartbeat timeout because aws-node-termination-handler (shortened to "NTH" here) - # doesn't send heartbeats (https://github.com/aws/aws-node-termination-handler/issues/493), - # but low enough so that if the controller is down, instances can still terminate within - # a reasonable time. - heartbeatTimeout: 30m + + {{/* + The default is a high enough heartbeat timeout because aws-node-termination-handler (shortened to "NTH" here) + doesn't send heartbeats (https://github.com/aws/aws-node-termination-handler/issues/493), + but low enough so that if the controller is down, instances can still terminate within + a reasonable time. 
+ */}} + heartbeatTimeout: "{{ ($value.awsNodeTerminationHandler).heartbeatTimeoutSeconds | default 1800 }}s" + lifecycleTransition: autoscaling:EC2_INSTANCE_TERMINATING name: aws-node-termination-handler notificationTargetARN: arn:{{ include "aws-partition" $}}:sqs:{{ include "aws-region" $ }}:{{ include "aws-account-id" $}}:{{ include "resource.default.name" $ }}-nth diff --git a/helm/cluster-aws/values.schema.json b/helm/cluster-aws/values.schema.json index 3297046c..89e22e54 100644 --- a/helm/cluster-aws/values.schema.json +++ b/helm/cluster-aws/values.schema.json @@ -127,6 +127,20 @@ "title": "Availability zone" } }, + "awsNodeTerminationHandler": { + "type": "object", + "title": "aws-node-termination-handler related settings", + "description": "Configuration for the ASG lifecycle hook used by aws-node-termination-handler", + "properties": { + "heartbeatTimeoutSeconds": { + "type": "integer", + "title": "Heartbeat timeout for ASG lifecycle hook", + "default": 1800, + "maximum": 7200, + "minimum": 30 + } + } + }, "customNodeLabels": { "type": "array", "title": "Custom node labels",