-
Notifications
You must be signed in to change notification settings - Fork 20
/
aws_clean.py
executable file
·497 lines (417 loc) · 20.6 KB
/
aws_clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
import logging
import sys
from argparse import ArgumentParser
import time
from time import sleep
import boto3
from boto3.exceptions import Boto3Error
from botocore.exceptions import ClientError
def wait_for_node_group_delete(eks_client, cluster_name, node_group):
    """Poll EKS until a node group disappears, leaves DELETING, or time runs out.

    ``describe_nodegroup`` is called every 10 seconds for at most 15 minutes.
    A ``ResourceNotFoundException`` is logged as a successful deletion. Any
    status other than ``DELETING`` is logged as an error (together with the
    reported health) and polling stops. The function always returns ``None``.

    Args:
        eks_client: EKS client exposing ``describe_nodegroup`` and
            ``exceptions.ResourceNotFoundException``.
        cluster_name: Name of the cluster that owns the node group.
        node_group: Name of the node group being deleted.
    """
    timeout = 900  # 15 min
    sleep_time = 10
    attempts = timeout // sleep_time

    for attempt in range(attempts):
        try:
            status_info = eks_client.describe_nodegroup(
                clusterName=cluster_name, nodegroupName=node_group)['nodegroup']
        except eks_client.exceptions.ResourceNotFoundException:
            logging.info(f"Node group {node_group} for cluster {cluster_name} was successfully deleted.")
            return

        if status_info['status'] != "DELETING":
            # Anything but DELETING means the deletion is not progressing.
            logging.error(f"Node group {node_group} for cluster {cluster_name} has "
                          f"unexpected status: {status_info['status']}.")
            logging.error(f"Health status: {status_info['health']}")
            return

        logging.info(f"Node group {node_group} for cluster {cluster_name} status is {status_info['status']}. "
                     f"Attempt {attempt}/{attempts}. Sleeping {sleep_time} seconds.")
        sleep(sleep_time)

    # Only reached when every attempt still reported DELETING.
    logging.error(f"Node group {node_group} for cluster {cluster_name} was not deleted in {timeout} seconds.")
def wait_for_cluster_delete(eks_client, cluster_name):
    """Poll EKS until the cluster can no longer be described or time runs out.

    ``describe_cluster`` is called every 10 seconds for at most 10 minutes.
    A ``ResourceNotFoundException`` is logged as a successful deletion; any
    returned status is logged and polling continues. If the cluster is still
    present after the last attempt an error is logged. Always returns ``None``.

    Args:
        eks_client: EKS client exposing ``describe_cluster`` and
            ``exceptions.ResourceNotFoundException``.
        cluster_name: Name of the cluster being deleted.
    """
    timeout = 600  # 10 min
    sleep_time = 10
    attempts = timeout // sleep_time

    for attempt in range(attempts):
        try:
            status = eks_client.describe_cluster(name=cluster_name)['cluster']['status']
        except eks_client.exceptions.ResourceNotFoundException:
            logging.info(f"Cluster {cluster_name} was successfully deleted.")
            return

        logging.info(f"Cluster {cluster_name} status is {status}. "
                     f"Attempt {attempt}/{attempts}. Sleeping {sleep_time} seconds.")
        sleep(sleep_time)

    # Only reached when the cluster was still describable on every attempt.
    logging.error(f"Cluster {cluster_name} was not deleted in {timeout} seconds.")
def wait_for_rds_delete(rds_client, db_name):
    """Poll RDS until the DB instance can no longer be described or time runs out.

    ``describe_db_instances`` is called every 10 seconds for at most
    10 minutes. A ``DBInstanceNotFoundFault`` is logged as a successful
    deletion; any returned status is logged and polling continues. If the
    instance is still present after the last attempt an error is logged.
    Always returns ``None``.

    Args:
        rds_client: RDS client exposing ``describe_db_instances`` and
            ``exceptions.DBInstanceNotFoundFault``.
        db_name: Identifier of the DB instance being deleted.
    """
    timeout = 600  # 10 min
    sleep_time = 10
    attempts = timeout // sleep_time

    for attempt in range(attempts):
        try:
            response = rds_client.describe_db_instances(DBInstanceIdentifier=db_name)
            status = response['DBInstances'][0]['DBInstanceStatus']
        except rds_client.exceptions.DBInstanceNotFoundFault:
            logging.info(f"RDS {db_name} was successfully deleted.")
            return

        logging.info(f"RDS {db_name} status is {status}. "
                     f"Attempt {attempt}/{attempts}. Sleeping {sleep_time} seconds.")
        sleep(sleep_time)

    # Only reached when the instance was still describable on every attempt.
    logging.error(f"RDS {db_name} was not deleted in {timeout} seconds.")
def delete_nodegroup(aws_region, cluster_name):
    """Delete every node group of an EKS cluster and wait for each to go away.

    For each node group the first autoscaling group listed in its resources
    is force-deleted, then the node group itself is deleted and polled with
    ``wait_for_node_group_delete``. Failures are logged and the remaining
    node groups are still processed.

    Args:
        aws_region: AWS region of the cluster.
        cluster_name: Name of the EKS cluster whose node groups are removed.
    """
    eks_client = boto3.client('eks', region_name=aws_region)
    autoscaling_client = boto3.client('autoscaling', region_name=aws_region)
    node_groups = eks_client.list_nodegroups(clusterName=cluster_name)['nodegroups']

    if not node_groups:
        logging.info(f"Cluster {cluster_name} does not have nodegroups.")
        return

    for node_group in node_groups:
        autoscaling_group_name = None
        try:
            node_group_info = eks_client.describe_nodegroup(
                clusterName=cluster_name,
                nodegroupName=node_group)['nodegroup']
            # A node group may report no autoscaling group; skip the ASG step
            # instead of failing on a missing key or an empty list.
            autoscaling_groups = (node_group_info.get('resources') or {}).get('autoScalingGroups') or []
            if autoscaling_groups:
                autoscaling_group_name = autoscaling_groups[0]['name']
                autoscaling_client.delete_auto_scaling_group(AutoScalingGroupName=autoscaling_group_name,
                                                             ForceDelete=True)
        # Client API failures are botocore ClientError, which is not a Boto3Error.
        except (Boto3Error, ClientError) as e:
            logging.error(f"Deleting autoscaling group {autoscaling_group_name} failed with error: {e}")

        try:
            eks_client.delete_nodegroup(clusterName=cluster_name, nodegroupName=node_group)
            wait_for_node_group_delete(eks_client, cluster_name, node_group)
        except (Boto3Error, ClientError) as e:
            logging.error(f"Deleting node group {node_group} failed with error: {e}")
def delete_cluster(aws_region, cluster_name):
    """Request deletion of an EKS cluster and wait until it is gone.

    Args:
        aws_region: AWS region of the cluster.
        cluster_name: Name of the EKS cluster to delete.
    """
    client = boto3.client('eks', region_name=aws_region)
    client.delete_cluster(name=cluster_name)
    # Block until the cluster can no longer be described (or the wait times out).
    wait_for_cluster_delete(eks_client=client, cluster_name=cluster_name)
def delete_lb(aws_region, vpc_id):
    """Delete the load balancers (``elb`` API) that belong to a VPC.

    All pages of ``describe_load_balancers`` are scanned; descriptions
    without a ``VPCId`` are ignored rather than raising ``KeyError``.
    A failure to delete one load balancer is logged and the rest are
    still processed.

    Args:
        aws_region: AWS region to search.
        vpc_id: ID of the VPC whose load balancers are deleted.
    """
    elb_client = boto3.client('elb', region_name=aws_region)
    paginator = elb_client.get_paginator('describe_load_balancers')
    lb_names = [lb['LoadBalancerName']
                for page in paginator.paginate()
                for lb in page['LoadBalancerDescriptions']
                if lb.get('VPCId') == vpc_id]

    for lb_name in lb_names:
        try:
            logging.info(f"Deleting load balancer: {lb_name} for vpc id: {vpc_id}")
            elb_client.delete_load_balancer(LoadBalancerName=lb_name)
        # Client API failures are botocore ClientError, which is not a Boto3Error.
        except (Boto3Error, ClientError) as e:
            logging.error(f"Deleting load balancer {lb_name} failed with error: {e}")
def wait_for_nat_gateway_delete(ec2, nat_gateway_id):
    """Poll EC2 until a NAT gateway reports ``deleted``, vanishes, or time runs out.

    ``describe_nat_gateways`` is called every 10 seconds for at most
    10 minutes. The wait ends when the gateway state is ``deleted``, when
    the gateway is no longer returned, or when EC2 answers with a
    "not found" error code. Any other ``ClientError`` is re-raised.
    Always returns ``None``.

    Args:
        ec2: EC2 client exposing ``describe_nat_gateways``.
        nat_gateway_id: ID of the NAT gateway being deleted.

    Raises:
        ClientError: For API errors other than "not found".
    """
    timeout = 600  # 10 min
    sleep_time = 10
    attempts = timeout // sleep_time
    not_found_codes = ('NatGatewayNotFound', 'InvalidNatGatewayID.NotFound')

    for attempt in range(attempts):
        try:
            nat_gateways = ec2.describe_nat_gateways(NatGatewayIds=[nat_gateway_id])['NatGateways']
        # The EC2 client models no ResourceNotFoundException class, so a
        # missing gateway has to be recognised by its error code.
        except ClientError as e:
            if e.response.get('Error', {}).get('Code') not in not_found_codes:
                raise
            logging.info(f"NAT gateway with id {nat_gateway_id} was not found.")
            return

        if not nat_gateways:
            # An empty result also means the gateway is gone.
            logging.info(f"NAT gateway with id {nat_gateway_id} was not found.")
            return

        status = nat_gateways[0]['State']
        if status == 'deleted':
            logging.info(f"NAT gateway with id {nat_gateway_id} was successfully deleted.")
            return

        logging.info(f"NAT gateway with id {nat_gateway_id} status is {status}. "
                     f"Attempt {attempt}/{attempts}. Sleeping {sleep_time} seconds.")
        sleep(sleep_time)

    # Only reached when the gateway never reported 'deleted'.
    logging.error(f"NAT gateway with id {nat_gateway_id} was not deleted in {timeout} seconds.")
def delete_nat_gateway(aws_region, vpc_id):
    """Delete every NAT gateway attached to a VPC, waiting for each deletion.

    A failure on one gateway is logged and the remaining gateways are
    still processed.

    Args:
        aws_region: AWS region of the VPC.
        vpc_id: ID of the VPC whose NAT gateways are deleted.
    """
    ec2_client = boto3.client('ec2', region_name=aws_region)
    filters = [{'Name': 'vpc-id', 'Values': [f'{vpc_id}']}]
    response = ec2_client.describe_nat_gateways(Filters=filters)
    nat_gateway_ids = [nat['NatGatewayId'] for nat in response['NatGateways']]

    for nat_gateway_id in nat_gateway_ids:
        logging.info(f"Deleting NAT gateway with id: {nat_gateway_id}")
        try:
            ec2_client.delete_nat_gateway(NatGatewayId=nat_gateway_id)
            wait_for_nat_gateway_delete(ec2_client, nat_gateway_id)
        # Client API failures are botocore ClientError, which is not a Boto3Error.
        except (Boto3Error, ClientError) as e:
            logging.error(f"Deleting NAT gateway with id {nat_gateway_id} failed with error: {e}")
def delete_igw(ec2_resource, vpc_id):
    """Detach and delete every internet gateway attached to a VPC.

    A failure on one gateway is logged and the remaining gateways are
    still processed.

    Args:
        ec2_resource: EC2 service resource used to look up the VPC.
        vpc_id: ID of the VPC whose internet gateways are removed.
    """
    vpc_resource = ec2_resource.Vpc(vpc_id)
    for igw in vpc_resource.internet_gateways.all():
        try:
            logging.info(f"Detaching and Removing igw id: {igw.id}")
            igw.detach_from_vpc(VpcId=vpc_id)
            igw.delete()
        # Resource actions raise botocore ClientError, which is not a Boto3Error.
        except (Boto3Error, ClientError) as e:
            logging.error(f"Deleting igw failed with error: {e}")
def delete_subnets(ec2_resource, vpc_id):
    """Delete every subnet of a VPC, retrying each one up to 10 times.

    Args:
        ec2_resource: EC2 service resource used to look up the VPC and subnets.
        vpc_id: ID of the VPC whose subnets are deleted.
    """
    vpc = ec2_resource.Vpc(vpc_id)
    targets = [ec2_resource.Subnet(found.id) for found in vpc.subnets.all()]

    for subnet in targets:
        # Subnets can keep dependencies for about a minute after other
        # resources are removed, so a failed delete is retried after 30 seconds.
        for attempt in range(10):
            logging.info(f"Removing subnet with id: {subnet.id}. Attempt {attempt}/10")
            try:
                subnet.delete()
            except ClientError as e:
                logging.error(f"Failed to delete subnet, will try again. The error was: {e}. Sleeping 30 seconds")
                sleep(30)
            else:
                break
def delete_route_tables(ec2_resource, vpc_id):
    """Delete every non-main route table of a VPC.

    A table is treated as the main one when any of its associations has
    ``Main`` set, not only the first. A failure on one table is logged and
    the remaining tables are still processed.

    Args:
        ec2_resource: EC2 service resource used to look up the VPC.
        vpc_id: ID of the VPC whose route tables are deleted.
    """
    vpc_resource = ec2_resource.Vpc(vpc_id)
    for rtb in vpc_resource.route_tables.all():
        associations = rtb.associations_attribute or []
        if any(association.get('Main') for association in associations):
            logging.info(f"{rtb.id} is the main route table, skipping...")
            continue
        try:
            logging.info(f"Removing rtb-id: {rtb.id}")
            rtb.delete()
        # Resource actions raise botocore ClientError, which is not a Boto3Error.
        except (Boto3Error, ClientError) as e:
            logging.error(f"Delete of route table failed with error: {e}")
def delete_security_groups(ec2_resource, vpc_id):
    """Delete every security group of a VPC except the one named ``default``.

    Runs in two passes: first all ingress and egress rules of every
    non-default group are revoked, then the groups themselves are deleted.
    A failure on one group is logged and the remaining groups are still
    processed.

    Args:
        ec2_resource: EC2 service resource used to look up the VPC.
        vpc_id: ID of the VPC whose security groups are deleted.
    """
    vpc_resource = ec2_resource.Vpc(vpc_id)

    removable = []
    for sg in vpc_resource.security_groups.all():
        if sg.group_name == 'default':
            logging.info(f"{sg.id} is the default security group, skipping...")
            continue
        removable.append(sg)

    # Pass 1: strip the rules of every group before any group is deleted.
    for sg in removable:
        try:
            if sg.ip_permissions:
                logging.info(f"Removing ingress rules for security group with id: {sg.id}")
                sg.revoke_ingress(IpPermissions=sg.ip_permissions)
            if sg.ip_permissions_egress:
                logging.info(f"Removing egress rules for security group with id: {sg.id}")
                sg.revoke_egress(IpPermissions=sg.ip_permissions_egress)
        # Resource actions raise botocore ClientError, which is not a Boto3Error.
        except (Boto3Error, ClientError) as e:
            logging.error(f"Delete of security group failed with error: {e}")

    # Pass 2: delete the groups themselves.
    for sg in removable:
        try:
            logging.info(f"Removing security group with id: {sg.id}")
            sg.delete()
        except (Boto3Error, ClientError) as e:
            logging.error(f"Delete of security group failed with error: {e}")
def delete_rds(aws_region, vpc_name):
    """Delete the RDS instances whose identifier matches the VPC's name prefix.

    The match pattern is ``vpc_name`` with every ``-vpc`` replaced by ``-``;
    an instance is selected when its identifier contains that pattern.
    Instances are deleted without a final snapshot and with automated
    backups removed, and each deletion is awaited. A failure on one
    instance is logged and the remaining ones are still processed.

    Args:
        aws_region: AWS region to search.
        vpc_name: Name tag of the VPC the environment was created with.
    """
    rds_client = boto3.client('rds', region_name=aws_region)
    rds_name_pattern = vpc_name.replace("-vpc", "-")

    # Read every page so instances beyond the first page are not missed.
    paginator = rds_client.get_paginator('describe_db_instances')
    db_names = [db_instance['DBInstanceIdentifier']
                for page in paginator.paginate()
                for db_instance in page['DBInstances']
                if rds_name_pattern in db_instance['DBInstanceIdentifier']]

    for db in db_names:
        try:
            logging.info(f"Deleting RDS {db}.")
            rds_client.delete_db_instance(DBInstanceIdentifier=db, SkipFinalSnapshot=True,
                                          DeleteAutomatedBackups=True)
            wait_for_rds_delete(rds_client, db)
        # Client API failures are botocore ClientError, which is not a Boto3Error.
        except (Boto3Error, ClientError) as e:
            logging.error(f"Delete RDS {db} failed with error: {e}")
def terminate_vpc(vpc_name, aws_region):
    """Delete a VPC, found by its ``Name`` tag, together with its dependents.

    Dependents are removed in this order: RDS instances, load balancers,
    NAT gateways, internet gateways, subnets, route tables, security
    groups, and finally the VPC itself. If no VPC carries the given name
    the function logs that and returns without deleting anything.

    Args:
        vpc_name: Value of the VPC's ``Name`` tag.
        aws_region: AWS region to search.
    """
    ec2_resource = boto3.resource('ec2', region_name=aws_region)
    filters = [{'Name': 'tag:Name', 'Values': [vpc_name]}]
    vpc = list(ec2_resource.vpcs.filter(Filters=filters))

    # we assume that if vpc is gone, all resources attached to it are gone too
    if not vpc:
        logging.info(f"VPC {vpc_name} not found in {aws_region}. Assuming all related aws resources are deleted")
        return

    # Only the first VPC with this name is processed.
    vpc_id = vpc[0].id

    logging.info(f"Checking RDS for VPC {vpc_name}.")
    delete_rds(aws_region, vpc_name)

    logging.info(f"Checking load balancers for VPC {vpc_name}.")
    delete_lb(aws_region, vpc_id)

    logging.info(f"Checking NAT gateway for VPC {vpc_name}.")
    delete_nat_gateway(aws_region, vpc_id)

    logging.info(f"Checking internet gateway for VPC {vpc_name}.")
    delete_igw(ec2_resource, vpc_id)

    logging.info(f"Checking subnets for VPC {vpc_name}.")
    delete_subnets(ec2_resource, vpc_id)

    logging.info(f"Checking route tables for VPC {vpc_name}.")
    delete_route_tables(ec2_resource, vpc_id)

    logging.info(f"Checking security groups for VPC {vpc_name}.")
    delete_security_groups(ec2_resource, vpc_id)

    logging.info(f"Deleting VPC {vpc_name}.")
    try:
        ec2_resource.Vpc(vpc_id).delete()
    # Resource actions raise botocore ClientError, which is not a Boto3Error.
    except (Boto3Error, ClientError) as e:
        logging.error(f"Deleting VPC {vpc_name} failed with error: {e}.")
def terminate_cluster(cluster_name, aws_region):
    """Remove an EKS cluster: its node groups first, then the cluster itself.

    Args:
        cluster_name: Name of the EKS cluster to remove.
        aws_region: AWS region of the cluster.
    """
    delete_nodegroup(aws_region=aws_region, cluster_name=cluster_name)
    delete_cluster(aws_region=aws_region, cluster_name=cluster_name)
def get_clusters_to_terminate(service_name, aws_region):
    """Return the name of the first EKS cluster tagged with the given service.

    Every page of ``list_clusters`` is scanned, and a cluster without tags
    is skipped instead of raising.

    Args:
        service_name: Expected value of the cluster's ``service_name`` tag.
        aws_region: AWS region to search.

    Returns:
        The name of the first matching cluster, or ``None`` when no cluster
        in the region carries the tag value.
    """
    eks_client = boto3.client('eks', region_name=aws_region)
    paginator = eks_client.get_paginator('list_clusters')
    for page in paginator.paginate():
        for cluster in page['clusters']:
            cluster_info = eks_client.describe_cluster(name=cluster)['cluster']
            tags = cluster_info.get('tags') or {}
            if tags.get('service_name') == service_name:
                return cluster
    return None
def release_unused_eips(aws_region):
    """Release every Elastic IP in the region that has no network interface.

    NOTE: the selection is not limited to one service or environment; any
    address lacking a ``NetworkInterfaceId`` is released.

    Args:
        aws_region: AWS region whose unassociated addresses are released.
    """
    ec2_client = boto3.client('ec2', region_name=aws_region)
    addresses_dict = ec2_client.describe_addresses()
    for eip_dict in addresses_dict['Addresses']:
        if "NetworkInterfaceId" in eip_dict:
            continue
        # Untagged addresses may carry no "Tags" entry; the name is only used for logging.
        tags = eip_dict.get("Tags") or []
        name = next((tag["Value"] for tag in tags if tag["Key"] == "Name"), None)
        logging.info(f"Releasing EIP {eip_dict['PublicIp']} with name: {name}")
        ec2_client.release_address(AllocationId=eip_dict['AllocationId'])
def terminate_open_id_providers(service_name):
    """Delete the IAM OpenID Connect providers tagged with the given service.

    Providers whose ``service_name`` tag is missing or empty are logged and
    skipped; providers tagged with a different value are left untouched.

    Args:
        service_name: Value of the ``service_name`` tag to match.
    """
    iam_client = boto3.client('iam')
    listing = iam_client.list_open_id_connect_providers()

    for provider in listing['OpenIDConnectProviderList']:
        arn = provider['Arn']
        tags = iam_client.list_open_id_connect_provider_tags(OpenIDConnectProviderArn=provider['Arn'])['Tags']

        # Take the value of the first tag whose key is "service_name", if any.
        service_name_tag_val = None
        for tag in tags:
            if tag["Key"] == "service_name":
                service_name_tag_val = tag["Value"]
                break

        if not service_name_tag_val:
            logging.info(f'No service_name tag found in {arn}. Skipping')
            continue

        if service_name_tag_val == service_name:
            logging.info(f"Deleting Open ID provider {arn} with tag service_name={service_name}.")
            iam_client.delete_open_id_connect_provider(OpenIDConnectProviderArn=provider['Arn'])
def delete_volumes(service_name, aws_region):
    """Delete unattached EBS volumes that belong to the given service.

    Only volumes in ``available`` status are considered. A volume is
    deleted when one of its tag keys contains ``service_name``, or when its
    ``service_name`` tag value contains ``service_name``. Untagged volumes
    are skipped, and each volume is deleted at most once.

    Args:
        service_name: Service identifier searched for in the volume tags.
        aws_region: AWS region to search.
    """
    ec2_resource = boto3.resource('ec2', region_name=aws_region)
    volumes = ec2_resource.volumes.filter(Filters=[{'Name': 'status', 'Values': ['available']}])
    for vol in volumes:
        # vol.tags is None for a volume without tags.
        for tag in vol.tags or []:
            if service_name in tag["Key"]:
                logging.info(f"Volume {vol} with tag {tag} is unused. Deleting it")
                vol.delete()
                # Stop here: later matching tags must not delete the volume again.
                break
            if tag["Key"] == 'service_name' and service_name in tag["Value"]:
                service_name_tag = tag["Value"]
                logging.info(f"Volume {vol} with tag {service_name_tag} is unused. Deleting it")
                vol.delete()
                break
def delete_certificates(service_name, aws_region):
    """Delete issued ACM certificates that are unused and match the service.

    A certificate is deleted when it is not in use and its domain name
    contains ``service_name``. Every page of ``list_certificates`` is read,
    and the listing is completed before anything is deleted.

    Args:
        service_name: Substring looked for in the certificate domain name.
        aws_region: AWS region to search.
    """
    logging.info('Deleting unused certificates')
    client = boto3.client('acm', region_name=aws_region)
    paginator = client.get_paginator('list_certificates')
    unused_arns = [crt['CertificateArn']
                   for page in paginator.paginate(CertificateStatuses=['ISSUED'])
                   for crt in page['CertificateSummaryList']
                   if not crt['InUse'] and service_name in crt['DomainName']]

    for certificate_arn in unused_arns:
        logging.info('Deleting unused certificate ' + certificate_arn)
        client.delete_certificate(CertificateArn=certificate_arn)
def delete_hosted_zones(service_name):
    """Delete the Route 53 hosted zones whose name contains the service name.

    For each matching zone all ``CNAME`` and ``A`` record sets are deleted
    first, then the zone itself. Zones and record sets are read across all
    result pages, and each listing is completed before deletion starts.

    Args:
        service_name: Substring looked for in the hosted zone name.
    """
    logging.info('Deleting unused hosted zones')
    client = boto3.client('route53')

    zones_paginator = client.get_paginator('list_hosted_zones')
    matching_zones = [hz
                      for page in zones_paginator.paginate()
                      for hz in page['HostedZones']
                      if service_name in hz['Name']]

    records_paginator = client.get_paginator('list_resource_record_sets')
    for hz in matching_zones:
        removable_records = [hzr
                             for page in records_paginator.paginate(HostedZoneId=hz['Id'])
                             for hzr in page['ResourceRecordSets']
                             if hzr['Type'] in ('CNAME', 'A')]
        for hzr in removable_records:
            logging.info('Deleting record: ' + hzr['Name'])
            client.change_resource_record_sets(HostedZoneId=hz['Id'], ChangeBatch={"Changes": [
                {"Action": "DELETE", "ResourceRecordSet": hzr}]})

        logging.info('Deleting hosted zone: ' + hz['Name'])
        client.delete_hosted_zone(Id=hz['Id'])
def delete_iam_roles(service_name):
    """Delete the IAM roles whose name starts with ``atlas-<service_name>``.

    Before a role is deleted its managed policies are detached, its inline
    policies are deleted, and it is removed from any instance profile.
    All listings are paginated and completed before anything is changed.

    Args:
        service_name: Service identifier appended to the ``atlas-`` prefix.
    """
    logging.info('Deleting IAM roles')
    client = boto3.client('iam')
    role_prefix = 'atlas-' + service_name

    matching_roles = [role['RoleName']
                      for page in client.get_paginator('list_roles').paginate()
                      for role in page['Roles']
                      if role['RoleName'].startswith(role_prefix)]

    attached_paginator = client.get_paginator('list_attached_role_policies')
    inline_paginator = client.get_paginator('list_role_policies')
    profiles_paginator = client.get_paginator('list_instance_profiles_for_role')

    for role_name in matching_roles:
        # Detach policies from the role
        attached_arns = [policy['PolicyArn']
                         for page in attached_paginator.paginate(RoleName=role_name)
                         for policy in page['AttachedPolicies']]
        for policy_arn in attached_arns:
            logging.info(f'Detaching IAM policy {policy_arn} from role {role_name}')
            client.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn)

        # Inline policies also have to be removed before the role can be deleted.
        inline_names = [policy_name
                        for page in inline_paginator.paginate(RoleName=role_name)
                        for policy_name in page['PolicyNames']]
        for policy_name in inline_names:
            logging.info(f'Deleting inline IAM policy {policy_name} from role {role_name}')
            client.delete_role_policy(RoleName=role_name, PolicyName=policy_name)

        # So does membership in instance profiles.
        profile_names = [profile['InstanceProfileName']
                         for page in profiles_paginator.paginate(RoleName=role_name)
                         for profile in page['InstanceProfiles']]
        for profile_name in profile_names:
            logging.info(f'Removing role {role_name} from instance profile {profile_name}')
            client.remove_role_from_instance_profile(InstanceProfileName=profile_name, RoleName=role_name)

        logging.info('Deleting IAM role: ' + role_name)
        client.delete_role(RoleName=role_name)
def delete_iam_policies(service_name):
    """Delete the customer-managed IAM policies named ``atlas-<service_name>*``.

    Only customer-managed policies (``Scope='Local'``) are listed, across
    all result pages. Each matching policy is detached from its roles and
    its non-default versions are deleted before the policy is deleted.

    NOTE(review): attachments to users or groups are not detached here;
    ``delete_policy`` will fail for a policy that has any — confirm none exist.

    Args:
        service_name: Service identifier appended to the ``atlas-`` prefix.
    """
    logging.info('Deleting IAM policies')
    client = boto3.client('iam')
    policy_prefix = 'atlas-' + service_name

    matching_policies = [{'PolicyName': policy['PolicyName'], 'PolicyArn': policy['Arn']}
                         for page in client.get_paginator('list_policies').paginate(Scope='Local')
                         for policy in page['Policies']
                         if policy['PolicyName'].startswith(policy_prefix)]

    entities_paginator = client.get_paginator('list_entities_for_policy')
    versions_paginator = client.get_paginator('list_policy_versions')

    for policy in matching_policies:
        policy_name = policy['PolicyName']
        policy_arn = policy['PolicyArn']

        # Detach the policy from its roles
        role_names = [role['RoleName']
                      for page in entities_paginator.paginate(PolicyArn=policy_arn)
                      for role in page['PolicyRoles']]
        for role_name in role_names:
            logging.info(f'Detaching IAM policy {policy_name} from role {role_name}')
            client.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn)

        # Non-default versions have to be deleted before the policy itself.
        stale_versions = [version['VersionId']
                          for page in versions_paginator.paginate(PolicyArn=policy_arn)
                          for version in page['Versions']
                          if not version['IsDefaultVersion']]
        for version_id in stale_versions:
            logging.info(f'Deleting version {version_id} of IAM policy {policy_name}')
            client.delete_policy_version(PolicyArn=policy_arn, VersionId=version_id)

        logging.info(f'Deleting IAM policy {policy_name}')
        client.delete_policy(PolicyArn=policy_arn)
def delete_launch_templates(service_name, aws_region):
    """Delete the EC2 launch templates tagged ``service_name=<service_name>``.

    Every page of ``describe_launch_templates`` is read, and the listing is
    completed before anything is deleted. Progress is reported through
    ``logging`` like the rest of this script.

    Args:
        service_name: Value of the ``service_name`` tag to match.
        aws_region: AWS region to search.
    """
    logging.info('Deleting launch templates')
    client = boto3.client('ec2', region_name=aws_region)
    paginator = client.get_paginator('describe_launch_templates')
    tag_filter = [{'Name': 'tag:' + 'service_name', 'Values': [service_name]}]
    template_ids = [launch_template['LaunchTemplateId']
                    for page in paginator.paginate(Filters=tag_filter)
                    for launch_template in page['LaunchTemplates']]

    for template_id in template_ids:
        logging.info(f"Deleting Launch Template: {template_id}")
        client.delete_launch_template(LaunchTemplateId=template_id)
def delete_s3_buckets(service_name):
    """Empty and delete the S3 buckets named ``atlas-<service_name>*``.

    Every object key is listed across all result pages and deleted before
    the bucket is deleted. Progress is reported through ``logging`` like
    the rest of this script.

    NOTE(review): only current objects are removed; a bucket that still
    holds object versions would presumably fail to delete — confirm
    versioning is not enabled on these buckets.

    Args:
        service_name: Service identifier appended to the ``atlas-`` prefix.
    """
    client = boto3.client('s3')
    bucket_prefix = "atlas-" + service_name
    matching_buckets = [bucket['Name']
                        for bucket in client.list_buckets()['Buckets']
                        if bucket['Name'].startswith(bucket_prefix)]

    paginator = client.get_paginator('list_objects_v2')
    for bucket in matching_buckets:
        # Delete each object inside the bucket otherwise the bucket cannot be deleted
        keys = [obj['Key']
                for page in paginator.paginate(Bucket=bucket)
                for obj in page.get('Contents', [])]
        for key in keys:
            logging.info(f"Deleting object: s3://{bucket}/{key}")
            client.delete_object(Bucket=bucket, Key=key)

        logging.info(f"Deleting bucket: {bucket}")
        client.delete_bucket(Bucket=bucket)
def main():
    """Parse the command line and remove the AWS resources of one environment.

    Both ``--service_name`` and ``--region`` are mandatory; the script exits
    with an error message when either one is missing. Cleanup order: EKS
    cluster, VPC and its dependents, unused EIPs, OpenID providers, EBS
    volumes, launch templates, certificates, hosted zones, IAM policies,
    IAM roles, S3 buckets.
    """
    parser = ArgumentParser()
    parser.add_argument("--service_name")
    parser.add_argument("--region")
    args = parser.parse_args()

    # Both values are needed below, so a single missing one is also an error.
    if not (args.service_name and args.region):
        sys.exit('One or more mandatory arguments not provided')

    service_name = args.service_name
    aws_region = args.region

    logging.info(f"Searching for resources to remove in {aws_region}.")
    cluster = get_clusters_to_terminate(service_name, aws_region)
    if cluster:
        logging.info(f"Terminating {cluster}")
        terminate_cluster(cluster_name=cluster, aws_region=aws_region)
    else:
        logging.info(f"No eks clusters found in {aws_region} with tag service_name={service_name}")
        # we need cluster name to get the vpc name
        cluster = "atlas-" + args.service_name + "-cluster"

    logging.info(f"Delete all resources and VPC for environment with tag service_name={service_name}.")
    vpc_name = cluster.replace("-cluster", "-vpc")
    terminate_vpc(vpc_name, aws_region)

    logging.info("Release unused EIPs")
    release_unused_eips(aws_region)

    logging.info("Terminate open ID providers")
    terminate_open_id_providers(service_name)

    logging.info("Delete unused EBS volumes")
    delete_volumes(service_name, aws_region)

    delete_launch_templates(service_name, aws_region)
    delete_certificates(service_name, aws_region)
    delete_hosted_zones(service_name)
    delete_iam_policies(service_name)
    delete_iam_roles(service_name)
    delete_s3_buckets(service_name)
if __name__ == "__main__":
    # Show INFO-level progress messages when run as a script.
    logging.basicConfig(level=logging.INFO)
    main()