From 6c0036b8049b5255a296bbb9d5db05d2ae11ea74 Mon Sep 17 00:00:00 2001 From: Maciej Kwidzinski Date: Wed, 3 Jan 2024 17:15:52 +0100 Subject: [PATCH 1/6] JPERF-1332: Reduce pressure on CloudFormation When there's a lot of expired stacks (e.g. due to housekeeping outage), then listing them all requires a lot of Cloudformation requests. Then we get throttled, so AWS SDK makes a couple retries underneath, but we finally time out. Then we don't reach the stack deletion part. So the list remains long and it's a self-perpetuating problem. Instead, start deleting stacks as soon as we get a batch listed. --- CHANGELOG.md | 2 ++ .../performance/tools/aws/Cloudformation.kt | 16 +++++++-------- .../housekeeping/ConcurrentHousekeeping.kt | 6 ++++-- .../tools/aws/CloudformationTest.kt | 20 ++++++++++--------- 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2032f7f..76ea205 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,8 +28,10 @@ Dropping a requirement of a major version of a dependency is a new contract. ### Fixed - Add missing `iam:GetRole` permission. You have to update the policy manually. Fix [JPERF-1407]. +- Reduce pressure on CloudFormation when cleaning long lists of expired stacks. Help [JPERF-1332]. [JPERF-1407]: https://ecosystem.atlassian.net/browse/JPERF-1407 +[JPERF-1332]: https://ecosystem.atlassian.net/browse/JPERF-1332 ## [1.13.0] - 2023-08-14 [1.13.0]: https://github.com/atlassian-labs/aws-resources/compare/release-1.12.2...release-1.13.0 diff --git a/src/main/kotlin/com/atlassian/performance/tools/aws/Cloudformation.kt b/src/main/kotlin/com/atlassian/performance/tools/aws/Cloudformation.kt index 860c3fc..4ff5c15 100644 --- a/src/main/kotlin/com/atlassian/performance/tools/aws/Cloudformation.kt +++ b/src/main/kotlin/com/atlassian/performance/tools/aws/Cloudformation.kt @@ -7,25 +7,23 @@ import com.atlassian.performance.tools.aws.api.Aws import com.atlassian.performance.tools.aws.api.Investment import com.atlassian.performance.tools.aws.api.ProvisionedStack import com.atlassian.performance.tools.aws.api.ScrollingCloudformation +import java.util.function.Consumer internal class Cloudformation( private val aws: Aws, private val cloudformation: AmazonCloudFormation ) { - fun listExpiredStacks(): List { + fun consumeExpiredStacks(call: Consumer>) { val cleanStackStatuses = listOf( StackStatus.DELETE_COMPLETE, StackStatus.DELETE_IN_PROGRESS ) - val scrollingCloudformation = ScrollingCloudformation(cloudformation) - val stacks = mutableListOf() - scrollingCloudformation.scrollThroughStacks { stackBatch -> - stackBatch + ScrollingCloudformation(cloudformation).scrollThroughStacks { stackBatch -> + val expiredStacks = stackBatch .filter { StackStatus.fromValue(it.stackStatus) !in cleanStackStatuses } - .forEach { stacks += ProvisionedStack(it, aws) } - } - return stacks.filter { - it.isExpired() + .map { ProvisionedStack(it, aws) } + .filter { it.isExpired() } + call.accept(expiredStacks) } } diff --git a/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt b/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt index f4730c0..7d114c6 100644 --- a/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt +++ b/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt @@ -8,6 +8,7 @@ import org.apache.logging.log4j.LogManager import java.time.Duration import java.time.Instant.now import java.util.concurrent.CompletableFuture +import java.util.function.Consumer class ConcurrentHousekeeping( private val stackTimeout: Duration, @@ -17,8 +18,9 @@ class ConcurrentHousekeeping( private val logger = LogManager.getLogger(this::class.java) override fun cleanLeftovers(aws: Aws) { - val stacks = Cloudformation(aws, aws.cloudformation).listExpiredStacks() - waitUntilReleased(stacks, stackTimeout) + Cloudformation(aws, aws.cloudformation).consumeExpiredStacks(Consumer { stacks -> + waitUntilReleased(stacks, stackTimeout) + }) val instances = Ec2(aws.ec2).listExpiredInstances() waitUntilReleased(instances, instanceTimeout) diff --git a/src/test/kotlin/com/atlassian/performance/tools/aws/CloudformationTest.kt b/src/test/kotlin/com/atlassian/performance/tools/aws/CloudformationTest.kt index fdcbb9c..783ad4f 100644 --- a/src/test/kotlin/com/atlassian/performance/tools/aws/CloudformationTest.kt +++ b/src/test/kotlin/com/atlassian/performance/tools/aws/CloudformationTest.kt @@ -6,10 +6,12 @@ import com.amazonaws.services.cloudformation.model.DescribeStacksResult import com.amazonaws.services.cloudformation.model.Stack import com.atlassian.performance.tools.aws.api.Tag import org.assertj.core.api.Assertions.assertThat +import org.assertj.core.api.Assertions.fail import org.junit.jupiter.api.Test import java.time.Duration.ofMinutes import java.time.Instant.now import java.util.* +import java.util.function.Consumer class CloudformationTest { private val awsMock = FakeAws.awsForUnitTests() @@ -21,9 +23,9 @@ class CloudformationTest { ) val cloudformation = Cloudformation(awsMock, CloudformationMock(stacks)) - val expiredStacks = cloudformation.listExpiredStacks() - - assertThat(expiredStacks).isEmpty() + cloudformation.consumeExpiredStacks(Consumer { expiredStacks -> + assertThat(expiredStacks).isEmpty() + }) } @Test @@ -40,9 +42,9 @@ class CloudformationTest { ) val cloudformation = Cloudformation(awsMock, CloudformationMock(stacks)) - val expiredStacks = cloudformation.listExpiredStacks() - - assertThat(expiredStacks).hasSize(1) + cloudformation.consumeExpiredStacks(Consumer { expiredStacks -> + assertThat(expiredStacks).hasSize(1) + }) } @Test @@ -58,9 +60,9 @@ class CloudformationTest { ) val cloudformation = Cloudformation(awsMock, CloudformationMock(stacks)) - val expiredStacks = cloudformation.listExpiredStacks() - - assertThat(expiredStacks).isEmpty() + cloudformation.consumeExpiredStacks(Consumer { expiredStacks -> + assertThat(expiredStacks).isEmpty() + }) } private class CloudformationMock( From 8291fc86abcedc25aa1bf63ab468fe376260028b Mon Sep 17 00:00:00 2001 From: Maciej Kwidzinski Date: Thu, 4 Jan 2024 11:04:31 +0100 Subject: [PATCH 2/6] Scroll-delete EC2 instances too We haven't been throttled by EC2 like for CloudFormation, but do it for consistency with CloudFormation housekeeping. --- .../com/atlassian/performance/tools/aws/Ec2.kt | 12 ++++++------ .../aws/api/housekeeping/ConcurrentHousekeeping.kt | 5 +++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/main/kotlin/com/atlassian/performance/tools/aws/Ec2.kt b/src/main/kotlin/com/atlassian/performance/tools/aws/Ec2.kt index 1f8e1f4..ec2761b 100644 --- a/src/main/kotlin/com/atlassian/performance/tools/aws/Ec2.kt +++ b/src/main/kotlin/com/atlassian/performance/tools/aws/Ec2.kt @@ -7,26 +7,27 @@ import com.atlassian.performance.tools.aws.ami.AmiImage import com.atlassian.performance.tools.aws.api.Resource import com.atlassian.performance.tools.aws.api.TerminationBatchingEc2 import com.atlassian.performance.tools.aws.api.TerminationPollingEc2 +import java.util.function.Consumer internal class Ec2( private val ec2: AmazonEC2 ) { - fun listExpiredInstances(): List { + fun consumeExpiredInstances(call: Consumer>) { val scrollingEc2 = TokenScrollingEc2(ec2) val terminationPollingEc2 = TerminationPollingEc2(scrollingEc2) val terminationBatchingEc2 = TerminationBatchingEc2(ec2, terminationPollingEc2) - val instances = mutableListOf() val cleanInstanceStatuses = listOf( InstanceStateName.ShuttingDown, InstanceStateName.Terminated ) scrollingEc2.scrollThroughInstances { instanceBatch -> - instanceBatch + val expiredInstances = instanceBatch .filter { InstanceStateName.fromValue(it.state.name) !in cleanInstanceStatuses } - .forEach { instances += Ec2Instance(it, terminationBatchingEc2) } + .map { Ec2Instance(it, terminationBatchingEc2) } + .filter { it.isExpired() } + call.accept(expiredInstances) } - return instances.filter { it.isExpired() } } fun listExpiredAmis(): List { @@ -36,5 +37,4 @@ internal class Ec2( .map { AmiImage(image = it, ec2 = ec2) } .filter { it.isExpired() } } - } diff --git a/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt b/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt index 7d114c6..dde506f 100644 --- a/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt +++ b/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt @@ -22,8 +22,9 @@ class ConcurrentHousekeeping( waitUntilReleased(stacks, stackTimeout) }) - val instances = Ec2(aws.ec2).listExpiredInstances() - waitUntilReleased(instances, instanceTimeout) + Ec2(aws.ec2).consumeExpiredInstances(Consumer { instances -> + waitUntilReleased(instances, instanceTimeout) + }) val amis = Ec2(aws.ec2).listExpiredAmis() waitUntilReleased(amis, amiTimeout) From f2c6336e8632e3bc6c4ffa8083f4529dd157c8df Mon Sep 17 00:00:00 2001 From: Maciej Kwidzinski Date: Wed, 3 Jan 2024 17:24:33 +0100 Subject: [PATCH 3/6] JPERF-1208: Clean up security groups before stacks Network stacks contain VPCs, and VPCs depend on security groups. Some security groups are provisioned outside of the stack, so deleting such a stack will fail due to the dependency. Delete stacks at the end, so that all external dependencies are already cleaned up. --- CHANGELOG.md | 2 ++ .../tools/aws/api/housekeeping/ConcurrentHousekeeping.kt | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76ea205..4c97020 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,9 +29,11 @@ Dropping a requirement of a major version of a dependency is a new contract. ### Fixed - Add missing `iam:GetRole` permission. You have to update the policy manually. Fix [JPERF-1407]. - Reduce pressure on CloudFormation when cleaning long lists of expired stacks. Help [JPERF-1332]. +- Clean up EC2 security groups before CloudFormation stacks. Fix [JPERF-1208]. [JPERF-1407]: https://ecosystem.atlassian.net/browse/JPERF-1407 [JPERF-1332]: https://ecosystem.atlassian.net/browse/JPERF-1332 +[JPERF-1208]: https://ecosystem.atlassian.net/browse/JPERF-1208 ## [1.13.0] - 2023-08-14 [1.13.0]: https://github.com/atlassian-labs/aws-resources/compare/release-1.12.2...release-1.13.0 diff --git a/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt b/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt index dde506f..403666d 100644 --- a/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt +++ b/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt @@ -18,10 +18,6 @@ class ConcurrentHousekeeping( private val logger = LogManager.getLogger(this::class.java) override fun cleanLeftovers(aws: Aws) { - Cloudformation(aws, aws.cloudformation).consumeExpiredStacks(Consumer { stacks -> - waitUntilReleased(stacks, stackTimeout) - }) - Ec2(aws.ec2).consumeExpiredInstances(Consumer { instances -> waitUntilReleased(instances, instanceTimeout) }) @@ -39,6 +35,10 @@ class ConcurrentHousekeeping( waitUntilReleased(keys) waitUntilReleased(securityGroups) + + Cloudformation(aws, aws.cloudformation).consumeExpiredStacks(Consumer { stacks -> + waitUntilReleased(stacks, stackTimeout) + }) } private fun waitUntilReleased( From 689262019f5ee5e0e89e646509d82fc9b4b27247 Mon Sep 17 00:00:00 2001 From: Maciej Kwidzinski Date: Wed, 3 Jan 2024 17:49:06 +0100 Subject: [PATCH 4/6] Add jenv --- .java-version | 1 + 1 file changed, 1 insertion(+) create mode 100644 .java-version diff --git a/.java-version b/.java-version new file mode 100644 index 0000000..6259340 --- /dev/null +++ b/.java-version @@ -0,0 +1 @@ +1.8 From b258ad710ed8646d08fa622dea285a260a1da191 Mon Sep 17 00:00:00 2001 From: Maciej Kwidzinski Date: Wed, 3 Jan 2024 17:59:15 +0100 Subject: [PATCH 5/6] Fix always reporting fail even on success Avoid false-positives like (notice lack of stacktrace logging too): ``` 16:36:00,592 ERROR {} Ec2Instance(instanceId = i-080edd5933a74418f) failed to release itself 16:36:30,577 ERROR {} Ec2Instance(instanceId = i-061f9c1f94990848e) failed to release itself ``` --- CHANGELOG.md | 1 + .../tools/aws/api/housekeeping/ConcurrentHousekeeping.kt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c97020..cd1d4c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ Dropping a requirement of a major version of a dependency is a new contract. - Add missing `iam:GetRole` permission. You have to update the policy manually. Fix [JPERF-1407]. - Reduce pressure on CloudFormation when cleaning long lists of expired stacks. Help [JPERF-1332]. - Clean up EC2 security groups before CloudFormation stacks. Fix [JPERF-1208]. +- Fix housekeeping fail logging. [JPERF-1407]: https://ecosystem.atlassian.net/browse/JPERF-1407 [JPERF-1332]: https://ecosystem.atlassian.net/browse/JPERF-1332 diff --git a/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt b/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt index 403666d..32fbc9d 100644 --- a/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt +++ b/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt @@ -57,7 +57,7 @@ class ConcurrentHousekeeping( if (!resource.isExpired()) { throw Exception("You can't release $resource. It hasn't expired.") } - return resource.release().handle { throwable, _ -> + return resource.release().handle { _, throwable: Throwable? -> if (throwable != null) { logger.error("$resource failed to release itself", throwable) } From b0952f37cb8823b5a277b66ee8c92e89830c74ae Mon Sep 17 00:00:00 2001 From: Maciej Kwidzinski Date: Thu, 4 Jan 2024 10:56:03 +0100 Subject: [PATCH 6/6] Release SSH keys sequentially When you hit the 5k key limit, starting 5k threads is ok for Java, but not ok for AWS. We get immediately throttled. It's actually faster to do it sequentially. And AWS is quick to delete. --- .../tools/aws/api/housekeeping/ConcurrentHousekeeping.kt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt b/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt index 32fbc9d..c61e156 100644 --- a/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt +++ b/src/main/kotlin/com/atlassian/performance/tools/aws/api/housekeeping/ConcurrentHousekeeping.kt @@ -25,15 +25,14 @@ class ConcurrentHousekeeping( val amis = Ec2(aws.ec2).listExpiredAmis() waitUntilReleased(amis, amiTimeout) - val keys = aws.ec2.describeKeyPairs().keyPairs.map { key -> - RemoteSshKey(SshKeyName(key.keyName), aws.ec2) - }.filter { it.isExpired() } + aws.ec2.describeKeyPairs().keyPairs + .map { key -> RemoteSshKey(SshKeyName(key.keyName), aws.ec2) } + .filter { it.isExpired() } + .forEach { it.release().get() } val securityGroups = aws.ec2.describeSecurityGroups().securityGroups.map { securityGroup -> Ec2SecurityGroup(securityGroup, aws.ec2) }.filter { it.isExpired() } - - waitUntilReleased(keys) waitUntilReleased(securityGroups) Cloudformation(aws, aws.cloudformation).consumeExpiredStacks(Consumer { stacks ->