From ec607e2347e06624e54889769ae4031ea976ca6a Mon Sep 17 00:00:00 2001 From: Ashok Srirama Date: Thu, 19 Oct 2023 13:04:26 -0400 Subject: [PATCH 1/3] Add Cell based EKS Cluster Pattern --- patterns/cell-based-eks/0.vpc/main.tf | 47 +++ patterns/cell-based-eks/0.vpc/outputs.tf | 14 + patterns/cell-based-eks/0.vpc/variables.tf | 0 patterns/cell-based-eks/0.vpc/versions.tf | 17 + patterns/cell-based-eks/1.cell1/README.md | 180 +++++++++ patterns/cell-based-eks/1.cell1/main.tf | 349 ++++++++++++++++++ patterns/cell-based-eks/1.cell1/outputs.tf | 24 ++ patterns/cell-based-eks/1.cell1/variables.tf | 12 + patterns/cell-based-eks/1.cell1/versions.tf | 29 ++ patterns/cell-based-eks/2.cell2/README.md | 180 +++++++++ patterns/cell-based-eks/2.cell2/main.tf | 345 +++++++++++++++++ patterns/cell-based-eks/2.cell2/outputs.tf | 24 ++ patterns/cell-based-eks/2.cell2/variables.tf | 12 + patterns/cell-based-eks/2.cell2/versions.tf | 29 ++ .../cell-based-eks/3.test-setup/test_setup.sh | 51 +++ patterns/cell-based-eks/README.md | 92 +++++ 16 files changed, 1405 insertions(+) create mode 100644 patterns/cell-based-eks/0.vpc/main.tf create mode 100644 patterns/cell-based-eks/0.vpc/outputs.tf create mode 100644 patterns/cell-based-eks/0.vpc/variables.tf create mode 100644 patterns/cell-based-eks/0.vpc/versions.tf create mode 100644 patterns/cell-based-eks/1.cell1/README.md create mode 100644 patterns/cell-based-eks/1.cell1/main.tf create mode 100644 patterns/cell-based-eks/1.cell1/outputs.tf create mode 100644 patterns/cell-based-eks/1.cell1/variables.tf create mode 100644 patterns/cell-based-eks/1.cell1/versions.tf create mode 100644 patterns/cell-based-eks/2.cell2/README.md create mode 100644 patterns/cell-based-eks/2.cell2/main.tf create mode 100644 patterns/cell-based-eks/2.cell2/outputs.tf create mode 100644 patterns/cell-based-eks/2.cell2/variables.tf create mode 100644 patterns/cell-based-eks/2.cell2/versions.tf create mode 100755 patterns/cell-based-eks/3.test-setup/test_setup.sh create mode 100644 patterns/cell-based-eks/README.md diff --git a/patterns/cell-based-eks/0.vpc/main.tf b/patterns/cell-based-eks/0.vpc/main.tf new file mode 100644 index 0000000000..5d92a05334 --- /dev/null +++ b/patterns/cell-based-eks/0.vpc/main.tf @@ -0,0 +1,47 @@ +provider "aws" { + region = local.region +} + +data "aws_availability_zones" "available" {} + +locals { + cluster_name = format("%s-%s", basename(path.cwd), "shared") + region = "us-west-2" + + vpc_cidr = "10.0.0.0/16" + azs = slice(data.aws_availability_zones.available.names, 0, 3) + + tags = { + Blueprint = local.cluster_name + GithubRepo = "github.com/aws-ia/terraform-aws-eks-blueprints" + } +} + +################################################################################ +# VPC +################################################################################ + +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = local.cluster_name + cidr = local.vpc_cidr + + azs = local.azs + private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 4, k)] + public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 48)] + + enable_nat_gateway = true + single_nat_gateway = true + + public_subnet_tags = { + "kubernetes.io/role/elb" = 1 + } + + private_subnet_tags = { + "kubernetes.io/role/internal-elb" = 1 + } + + tags = local.tags +} diff --git a/patterns/cell-based-eks/0.vpc/outputs.tf b/patterns/cell-based-eks/0.vpc/outputs.tf new file mode 100644 index 0000000000..af6cc3a872 --- /dev/null +++ 
b/patterns/cell-based-eks/0.vpc/outputs.tf @@ -0,0 +1,14 @@ +output "vpc_id" { + description = "Amazon EKS VPC ID" + value = module.vpc.vpc_id +} + +output "subnet_ids" { + description = "Amazon EKS Subnet IDs" + value = module.vpc.private_subnets +} + +output "vpc_cidr" { + description = "Amazon EKS VPC CIDR Block." + value = local.vpc_cidr +} diff --git a/patterns/cell-based-eks/0.vpc/variables.tf b/patterns/cell-based-eks/0.vpc/variables.tf new file mode 100644 index 0000000000..e69de29bb2 diff --git a/patterns/cell-based-eks/0.vpc/versions.tf b/patterns/cell-based-eks/0.vpc/versions.tf new file mode 100644 index 0000000000..6cbf2a99e8 --- /dev/null +++ b/patterns/cell-based-eks/0.vpc/versions.tf @@ -0,0 +1,17 @@ +terraform { + required_version = ">= 1.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 4.47" + } + } + + # ## Used for end-to-end testing on project; update to suit your needs + # backend "s3" { + # bucket = "" + # region = "" + # key = "e2e/istio-multi-cluster-vpc/terraform.tfstate" + # } +} diff --git a/patterns/cell-based-eks/1.cell1/README.md b/patterns/cell-based-eks/1.cell1/README.md new file mode 100644 index 0000000000..f71cb32dec --- /dev/null +++ b/patterns/cell-based-eks/1.cell1/README.md @@ -0,0 +1,180 @@ +# Cell-Based Architecture for Amazon EKS + +This example shows how to provision a cell based Amazon EKS cluster. + +* Deploy EKS Cluster with one managed node group in a VPC and AZ +* Deploy Fargate profiles to run `coredns`, `aws-load-balancer-controller`, and `karpenter` addons +* Deploy Karpenter `Provisioner` and `AWSNodeTemplate` resources and configure them to run in AZ1 +* Deploy sample deployment `inflate` with 0 replicas + +Refer to the [AWS Solution Guidance](https://aws.amazon.com/solutions/guidance/cell-based-architecture-for-amazon-eks/) for more details. + +## Prerequisites: + +Ensure that you have the following tools installed locally: + +1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) +2. [kubectl](https://Kubernetes.io/docs/tasks/tools/) +3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) + +## Deploy + +To provision this example: + +```sh +terraform init +terraform apply +``` + +Enter `yes` at command prompt to apply + +## Validate + +The following command will update the `kubeconfig` on your local machine and allow you to interact with your EKS Cluster using `kubectl` to validate the deployment. + +1. Run `update-kubeconfig` command: + +```sh +aws eks --region update-kubeconfig --name +``` + +2. 
List the nodes running currently + +```sh +kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' +``` + +``` +# Output should look like below +NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP +fargate-ip-10-0-13-93.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-14-95.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-15-86.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-8-178.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-8-254.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-8-73.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +ip-10-0-12-14.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.12.14 +ip-10-0-14-197.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.14.197 +``` + +3. List out the pods running currently: + +```sh +kubectl get pods,svc -n kube-system +``` + +``` +# Output should look like below +NAME READY STATUS RESTARTS AGE +pod/aws-load-balancer-controller-776868b4fb-2j9t6 1/1 Running 0 13h +pod/aws-load-balancer-controller-776868b4fb-bzkrr 1/1 Running 0 13h +pod/aws-node-2zhpc 2/2 Running 0 16h +pod/aws-node-w897r 2/2 Running 0 16h +pod/coredns-5c9679c87-bp6ws 1/1 Running 0 16h +pod/coredns-5c9679c87-lw468 1/1 Running 0 16h +pod/kube-proxy-6wp2k 1/1 Running 0 16h +pod/kube-proxy-n8qtq 1/1 Running 0 16h + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +service/aws-load-balancer-webhook-service ClusterIP 172.20.44.77 443/TCP 14h +service/kube-dns ClusterIP 172.20.0.10 53/UDP,53/TCP 17h +``` + +4. Verify all the helm releases installed: + +```sh +helm list -A +``` + +``` +# Output should look like below +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +aws-load-balancer-controller kube-system 2 2023-10-18 23:07:36.089372 -0400 EDT deployed aws-load-balancer-controller-1.6.1 v2.6.1 +karpenter karpenter 14 2023-10-19 08:25:12.313094 -0400 EDT deployed karpenter-v0.30.0 0.30.0 +``` + +## Test + +1. 
Verify both Fargate nodes and EKS Managed Nodegroup worker nodes are deployed to single AZ + +```sh +kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' +``` + +``` +NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP +fargate-ip-10-0-13-93.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-14-95.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-15-86.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-8-178.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-8-254.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-8-73.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +ip-10-0-12-14.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.12.14 +ip-10-0-14-197.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.14.197 +``` + +2. Scale the `inflate` deployment to 20 replicas and watch for Karpenter to launch EKS worker nodes in correct AZ. + +```sh +kubectl scale deployment inflate --replicas 20 +``` + +``` +deployment.apps/inflate scaled +``` + +3. Wait for the pods become ready + +```sh +kubectl wait --for=condition=ready pods --all --timeout 2m +``` + +``` +pod/inflate-75d744d4c6-5r5cv condition met +pod/inflate-75d744d4c6-775wm condition met +pod/inflate-75d744d4c6-7t225 condition met +pod/inflate-75d744d4c6-945p4 condition met +pod/inflate-75d744d4c6-b52gp condition met +pod/inflate-75d744d4c6-d99fn condition met +pod/inflate-75d744d4c6-dmnwm condition met +pod/inflate-75d744d4c6-hrvvr condition met +pod/inflate-75d744d4c6-j4hkl condition met +pod/inflate-75d744d4c6-jwknj condition met +pod/inflate-75d744d4c6-ldwts condition met +pod/inflate-75d744d4c6-lqnr5 condition met +pod/inflate-75d744d4c6-pctjh condition met +pod/inflate-75d744d4c6-qdlkc condition met +pod/inflate-75d744d4c6-qnzc5 condition met +pod/inflate-75d744d4c6-r2cwj condition met +pod/inflate-75d744d4c6-srmkb condition met +pod/inflate-75d744d4c6-wf45j condition met +pod/inflate-75d744d4c6-x9mwl condition met +pod/inflate-75d744d4c6-xlbhl condition met +``` + +4. 
Check all the nodes are in the correct AZ + +```sh +kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' +``` +``` +NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP +fargate-ip-10-0-13-93.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-14-95.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-15-86.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-8-178.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-8-254.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-8-73.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 +ip-10-0-12-14.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.12.14 +ip-10-0-14-197.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.14.197 +ip-10-0-3-161.us-west-2.compute.internal True c6gn.8xlarge us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.3.161 +``` + +## Destroy + +To teardown and remove the resources created in this example: + +```sh +terraform destroy -target="module.eks_blueprints_addons" -auto-approve +terraform destroy -auto-approve +``` diff --git a/patterns/cell-based-eks/1.cell1/main.tf b/patterns/cell-based-eks/1.cell1/main.tf new file mode 100644 index 0000000000..d554be5269 --- /dev/null +++ b/patterns/cell-based-eks/1.cell1/main.tf @@ -0,0 +1,349 @@ +provider "aws" { + region = local.region +} + +# Required for public ECR where Karpenter artifacts are hosted +provider "aws" { + region = "us-east-1" + alias = "virginia" +} + +data "terraform_remote_state" "vpc" { + backend = "local" + + config = { + path = "${path.module}/../0.vpc/terraform.tfstate" + } +} + +data "aws_ecrpublic_authorization_token" "token" { + provider = aws.virginia +} + +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } +} + +provider "kubectl" { + apply_retry_count = 5 + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + load_config_file = false + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } +} + +data "aws_availability_zones" 
"available" {} + +locals { + cluster_name = var.name + region = var.region + azs = slice(data.aws_availability_zones.available.names, 0, 3) + + tags = { + Blueprint = local.cluster_name + GithubRepo = "github.com/aws-ia/terraform-aws-eks-blueprints" + } +} + +################################################################################ +# Cluster +################################################################################ + +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 19.16" + + cluster_name = local.cluster_name + cluster_version = "1.28" + cluster_endpoint_public_access = true + + vpc_id = data.terraform_remote_state.vpc.outputs.vpc_id + subnet_ids = data.terraform_remote_state.vpc.outputs.subnet_ids + + manage_aws_auth_configmap = true + aws_auth_roles = [ + # We need to add in the Karpenter node IAM role for nodes launched by Karpenter + { + rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn + username = "system:node:{{EC2PrivateDNSName}}" + groups = [ + "system:bootstrappers", + "system:nodes", + ] + }, + ] + + fargate_profiles = { + karpenter = { + selectors = [ + { namespace = "karpenter" } + ] + subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[0]] + } + kube_system = { + name = "kube-system" + selectors = [ + { namespace = "kube-system" } + ] + subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[0]] + } + } + + eks_managed_node_groups = { + cell1 = { + instance_types = ["m5.large"] + + min_size = 1 + max_size = 5 + desired_size = 2 + + subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[0]] + } + } + + tags = local.tags +} + +################################################################################ +# EKS Blueprints Addons +################################################################################ + +module "eks_blueprints_addons" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.0" + + cluster_name = module.eks.cluster_name + cluster_endpoint = module.eks.cluster_endpoint + cluster_version = module.eks.cluster_version + oidc_provider_arn = module.eks.oidc_provider_arn + + # We want to wait for the Fargate profiles to be deployed first + create_delay_dependencies = [for prof in module.eks.fargate_profiles : prof.fargate_profile_arn] + + eks_addons = { + coredns = { + configuration_values = jsonencode({ + computeType = "Fargate" + # Ensure that the we fully utilize the minimum amount of resources that are supplied by + # Fargate https://docs.aws.amazon.com/eks/latest/userguide/fargate-pod-configuration.html + # Fargate adds 256 MB to each pod's memory reservation for the required Kubernetes + # components (kubelet, kube-proxy, and containerd). Fargate rounds up to the following + # compute configuration that most closely matches the sum of vCPU and memory requests in + # order to ensure pods always have the resources that they need to run. 
+ resources = { + limits = { + cpu = "0.25" + # We are targetting the smallest Task size of 512Mb, so we subtract 256Mb from the + # request/limit to ensure we can fit within that task + memory = "256M" + } + requests = { + cpu = "0.25" + # We are targetting the smallest Task size of 512Mb, so we subtract 256Mb from the + # request/limit to ensure we can fit within that task + memory = "256M" + } + } + }) + } + vpc-cni = {} + kube-proxy = {} + } + + enable_karpenter = true + karpenter = { + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } + + enable_aws_load_balancer_controller = true + aws_load_balancer_controller = { + chart_version = "1.6.1" # min version required to use SG for NLB feature + set = [ + { + name = "vpcId" + value = data.terraform_remote_state.vpc.outputs.vpc_id + }, + { + name = "podDisruptionBudget.maxUnavailable" + value = 1 + }, + ] + } + + tags = local.tags +} + +################################################################################ +# Karpenter +################################################################################ + +resource "aws_security_group" "karpenter_sg" { + name = "${local.cluster_name}_karpenter_sg" + description = "${local.cluster_name} Karpenter SG" + vpc_id = data.terraform_remote_state.vpc.outputs.vpc_id + tags = { + "Name" = "${local.cluster_name}_karpenter_sg" + "karpenter.sh/discovery" = local.cluster_name + } +} + +resource "aws_vpc_security_group_egress_rule" "karpenter_sg_allow_all_4" { + security_group_id = aws_security_group.karpenter_sg.id + + ip_protocol = "-1" + cidr_ipv4 = "0.0.0.0/0" +} + +resource "aws_vpc_security_group_egress_rule" "karpenter_sg_allow_all_6" { + security_group_id = aws_security_group.karpenter_sg.id + + ip_protocol = "-1" + cidr_ipv6 = "::/0" +} + +resource "aws_vpc_security_group_ingress_rule" "karpenter_sg_allow_cluster_ing" { + security_group_id = aws_security_group.karpenter_sg.id + + ip_protocol = "-1" + referenced_security_group_id = module.eks.cluster_security_group_id +} + +resource "aws_vpc_security_group_ingress_rule" "karpenter_sg_allow_mng_ing" { + security_group_id = aws_security_group.karpenter_sg.id + + ip_protocol = "-1" + referenced_security_group_id = module.eks.node_security_group_id +} + +resource "aws_vpc_security_group_ingress_rule" "cluster_sg_allow_karpenter_ing" { + security_group_id = module.eks.cluster_security_group_id + + ip_protocol = "-1" + referenced_security_group_id = aws_security_group.karpenter_sg.id +} + +resource "kubectl_manifest" "karpenter_provisioner" { + yaml_body = <<-YAML + apiVersion: karpenter.sh/v1alpha5 + kind: Provisioner + metadata: + name: default + spec: + requirements: + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c", "m"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "topology.kubernetes.io/zone" + operator: In + values: [${jsonencode(local.azs[0])}] + - key: "kubernetes.io/arch" + operator: In + values: ["arm64", "amd64"] + - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand + operator: In + values: ["spot", "on-demand"] + kubeletConfiguration: + containerRuntime: containerd + maxPods: 110 + limits: + resources: + cpu: 1000 + consolidation: + enabled: true + providerRef: + name: default + ttlSecondsUntilExpired: 
604800 # 7 Days = 7 * 24 * 60 * 60 Seconds + YAML + + depends_on = [ + module.eks_blueprints_addons + ] +} + +resource "kubectl_manifest" "karpenter_node_template" { + yaml_body = <<-YAML + apiVersion: karpenter.k8s.aws/v1alpha1 + kind: AWSNodeTemplate + metadata: + name: default + spec: + subnetSelector: + aws-ids: ${data.terraform_remote_state.vpc.outputs.subnet_ids[0]} + securityGroupSelector: + karpenter.sh/discovery: ${module.eks.cluster_name} + instanceProfile: ${module.eks_blueprints_addons.karpenter.node_instance_profile_name} + tags: + karpenter.sh/discovery: ${module.eks.cluster_name} + YAML + + depends_on = [ + module.eks_blueprints_addons + ] +} + +# Example deployment using the [pause image](https://www.ianlewis.org/en/almighty-pause-container) +# and starts with zero replicas +resource "kubectl_manifest" "karpenter_example_deployment" { + yaml_body = <<-YAML + apiVersion: apps/v1 + kind: Deployment + metadata: + name: inflate + spec: + replicas: 0 + selector: + matchLabels: + app: inflate + template: + metadata: + labels: + app: inflate + spec: + terminationGracePeriodSeconds: 0 + containers: + - name: inflate + image: public.ecr.aws/eks-distro/kubernetes/pause:3.7 + resources: + requests: + cpu: 1 + YAML + + depends_on = [ + kubectl_manifest.karpenter_node_template + ] +} diff --git a/patterns/cell-based-eks/1.cell1/outputs.tf b/patterns/cell-based-eks/1.cell1/outputs.tf new file mode 100644 index 0000000000..d3a1ef5eb0 --- /dev/null +++ b/patterns/cell-based-eks/1.cell1/outputs.tf @@ -0,0 +1,24 @@ +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" +} + +output "cluster_endpoint" { + description = "Cluster endpoint" + value = module.eks.cluster_endpoint +} + +output "cluster_certificate_authority_data" { + description = "Cluster ca certificate" + value = module.eks.cluster_certificate_authority_data +} + +output "cluster_name" { + description = "Cluster name" + value = module.eks.cluster_name +} + +output "cluster_region" { + description = "Cluster region" + value = local.region +} diff --git a/patterns/cell-based-eks/1.cell1/variables.tf b/patterns/cell-based-eks/1.cell1/variables.tf new file mode 100644 index 0000000000..d71fa165b9 --- /dev/null +++ b/patterns/cell-based-eks/1.cell1/variables.tf @@ -0,0 +1,12 @@ + +variable "name" { + description = "cluster name" + type = string + default = "cell-1" +} + +variable "region" { + description = "cluster name" + type = string + default = "us-west-2" +} diff --git a/patterns/cell-based-eks/1.cell1/versions.tf b/patterns/cell-based-eks/1.cell1/versions.tf new file mode 100644 index 0000000000..9577780c9b --- /dev/null +++ b/patterns/cell-based-eks/1.cell1/versions.tf @@ -0,0 +1,29 @@ +terraform { + required_version = ">= 1.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 4.47" + } + helm = { + source = "hashicorp/helm" + version = ">= 2.9" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.20" + } + kubectl = { + source = "gavinbunney/kubectl" + version = ">= 1.14" + } + } + + # ## Used for end-to-end testing on project; update to suit your needs + # backend "s3" { + # bucket = "" + # region = "" + # key = "e2e/istio-multi-cluster-vpc/terraform.tfstate" + # } +} diff --git a/patterns/cell-based-eks/2.cell2/README.md 
b/patterns/cell-based-eks/2.cell2/README.md new file mode 100644 index 0000000000..6a435a65b4 --- /dev/null +++ b/patterns/cell-based-eks/2.cell2/README.md @@ -0,0 +1,180 @@ +# Cell-Based Architecture for Amazon EKS + +This example shows how to provision a cell based Amazon EKS cluster. + +* Deploy EKS Cluster with one managed node group in a VPC and AZ +* Deploy Fargate profiles to run `coredns`, `aws-load-balancer-controller`, and `karpenter` addons +* Deploy Karpenter `Provisioner` and `AWSNodeTemplate` resources and configure them to run in AZ2 +* Deploy sample deployment `inflate` with 0 replicas + +Refer to the [AWS Solution Guidance](https://aws.amazon.com/solutions/guidance/cell-based-architecture-for-amazon-eks/) for more details. + +## Prerequisites: + +Ensure that you have the following tools installed locally: + +1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) +2. [kubectl](https://Kubernetes.io/docs/tasks/tools/) +3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) + +## Deploy + +To provision this example: + +```sh +terraform init +terraform apply +``` + +Enter `yes` at command prompt to apply + +## Validate + +The following command will update the `kubeconfig` on your local machine and allow you to interact with your EKS Cluster using `kubectl` to validate the deployment. + +1. Run `update-kubeconfig` command: + +```sh +aws eks --region update-kubeconfig --name +``` + +2. List the nodes running currently + +```sh +kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' +``` + +``` +# Output should look like below +NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP +fargate-ip-10-0-22-6.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-23-139.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-23-59.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-24-236.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-25-116.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-31-31.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +ip-10-0-30-113.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.30.113 +ip-10-0-31-158.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.31.158 +``` + +3. 
List out the pods running currently: + +```sh +kubectl get pods,svc -n kube-system +``` + +``` +# Output should look like below +NAME READY STATUS RESTARTS AGE +pod/aws-load-balancer-controller-8758bf745-grj9s 1/1 Running 0 3h42m +pod/aws-load-balancer-controller-8758bf745-j5m5j 1/1 Running 0 3h42m +pod/aws-node-crst2 2/2 Running 0 3h42m +pod/aws-node-dbs2f 2/2 Running 0 3h42m +pod/coredns-5c9679c87-fsxtt 1/1 Running 0 3h42m +pod/coredns-5c9679c87-fttcc 1/1 Running 0 3h42m +pod/kube-proxy-lrsd9 1/1 Running 0 3h42m +pod/kube-proxy-rc49k 1/1 Running 0 3h42m + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +service/aws-load-balancer-webhook-service ClusterIP 172.20.134.154 443/TCP 3h42m +service/kube-dns ClusterIP 172.20.0.10 53/UDP,53/TCP 3h52m +``` + +4. Verify all the helm releases installed: + +```sh +helm list -A +``` + +``` +# Output should look like below +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +aws-load-balancer-controller kube-system 1 2023-10-19 09:01:45.053426 -0400 EDT deployed aws-load-balancer-controller-1.6.1 v2.6.1 +karpenter karpenter 4 2023-10-19 09:56:07.225133 -0400 EDT deployed karpenter-v0.30.0 0.30.0 +``` + +## Test + +1. Verify both Fargate nodes and EKS Managed Nodegroup worker nodes are deployed to single AZ + +```sh +kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' +``` + +``` +NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP +fargate-ip-10-0-22-6.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-23-139.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-23-59.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-24-236.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-25-116.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-31-31.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +ip-10-0-30-113.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.30.113 +ip-10-0-31-158.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.31.158 +``` + +2. Scale the `inflate` deployment to 20 replicas and watch for Karpenter to launch EKS worker nodes in correct AZ. + +```sh +kubectl scale deployment inflate --replicas 20 +``` + +``` +deployment.apps/inflate scaled +``` + +3. 
Wait for the pods become ready + +```sh +kubectl wait --for=condition=ready pods --all --timeout 2m +``` + +``` +pod/inflate-75d744d4c6-26nfh condition met +pod/inflate-75d744d4c6-4hfxf condition met +pod/inflate-75d744d4c6-4tvzr condition met +pod/inflate-75d744d4c6-5jkdp condition met +pod/inflate-75d744d4c6-5lpkg condition met +pod/inflate-75d744d4c6-6kv28 condition met +pod/inflate-75d744d4c6-7k5k5 condition met +pod/inflate-75d744d4c6-b7mm4 condition met +pod/inflate-75d744d4c6-kq9z7 condition met +pod/inflate-75d744d4c6-kslkq condition met +pod/inflate-75d744d4c6-mfps6 condition met +pod/inflate-75d744d4c6-s6h2j condition met +pod/inflate-75d744d4c6-s9db9 condition met +pod/inflate-75d744d4c6-sbmlz condition met +pod/inflate-75d744d4c6-slqhw condition met +pod/inflate-75d744d4c6-t9z27 condition met +pod/inflate-75d744d4c6-tqrjd condition met +pod/inflate-75d744d4c6-w9w8b condition met +pod/inflate-75d744d4c6-wk2jb condition met +pod/inflate-75d744d4c6-z54wg condition met +``` + +4. Check all the nodes are in the correct AZ + +```sh +kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' +``` +``` +NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP +fargate-ip-10-0-22-6.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-23-139.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-23-59.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-24-236.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-25-116.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +fargate-ip-10-0-31-31.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 +ip-10-0-27-134.us-west-2.compute.internal True c6g.8xlarge us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.27.134 +ip-10-0-30-113.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.30.113 +ip-10-0-31-158.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.31.158 +``` + +## Destroy + +To teardown and remove the resources created in this example: + +```sh +terraform destroy -target="module.eks_blueprints_addons" -auto-approve +terraform destroy -auto-approve +``` diff --git a/patterns/cell-based-eks/2.cell2/main.tf b/patterns/cell-based-eks/2.cell2/main.tf new file mode 100644 index 0000000000..8f3c8b8d00 --- /dev/null +++ b/patterns/cell-based-eks/2.cell2/main.tf @@ -0,0 +1,345 @@ +provider "aws" { + region = local.region +} + +# Required for public ECR where Karpenter artifacts are hosted +provider "aws" { + region = "us-east-1" + alias = "virginia" +} + +data "terraform_remote_state" "vpc" { + backend = "local" + + config = { + path = "${path.module}/../0.vpc/terraform.tfstate" + } +} + +data "aws_ecrpublic_authorization_token" "token" { + provider = aws.virginia +} + +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be 
installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } +} + +provider "kubectl" { + apply_retry_count = 5 + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + load_config_file = false + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } +} + +data "aws_availability_zones" "available" {} + +locals { + cluster_name = var.name + region = var.region + azs = slice(data.aws_availability_zones.available.names, 0, 3) + + tags = { + Blueprint = local.cluster_name + GithubRepo = "github.com/aws-ia/terraform-aws-eks-blueprints" + } +} + +################################################################################ +# Cluster +################################################################################ + +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 19.16" + + cluster_name = local.cluster_name + cluster_version = "1.28" + cluster_endpoint_public_access = true + + vpc_id = data.terraform_remote_state.vpc.outputs.vpc_id + subnet_ids = data.terraform_remote_state.vpc.outputs.subnet_ids + + manage_aws_auth_configmap = true + aws_auth_roles = [ + # We need to add in the Karpenter node IAM role for nodes launched by Karpenter + { + rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn + username = "system:node:{{EC2PrivateDNSName}}" + groups = [ + "system:bootstrappers", + "system:nodes", + ] + }, + ] + + fargate_profiles = { + karpenter = { + selectors = [ + { namespace = "karpenter" } + ] + subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[1]] + } + kube_system = { + name = "kube-system" + selectors = [ + { namespace = "kube-system" } + ] + subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[1]] + } + } + + eks_managed_node_groups = { + cell2 = { + instance_types = ["m5.large"] + + min_size = 1 + max_size = 5 + desired_size = 2 + + subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[1]] + } + } + + tags = local.tags +} + +################################################################################ +# EKS Blueprints Addons +################################################################################ + +module "eks_blueprints_addons" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.0" + + cluster_name = module.eks.cluster_name + cluster_endpoint = module.eks.cluster_endpoint + cluster_version = module.eks.cluster_version + oidc_provider_arn = module.eks.oidc_provider_arn + + # We want to wait for the Fargate profiles to be deployed first + create_delay_dependencies = [for prof in module.eks.fargate_profiles : prof.fargate_profile_arn] + + eks_addons = { + coredns = { + configuration_values = jsonencode({ + computeType = "Fargate" + # Ensure that the we fully utilize the minimum amount of resources that are supplied by + # Fargate 
https://docs.aws.amazon.com/eks/latest/userguide/fargate-pod-configuration.html + # Fargate adds 256 MB to each pod's memory reservation for the required Kubernetes + # components (kubelet, kube-proxy, and containerd). Fargate rounds up to the following + # compute configuration that most closely matches the sum of vCPU and memory requests in + # order to ensure pods always have the resources that they need to run. + resources = { + limits = { + cpu = "0.25" + # We are targetting the smallest Task size of 512Mb, so we subtract 256Mb from the + # request/limit to ensure we can fit within that task + memory = "256M" + } + requests = { + cpu = "0.25" + # We are targetting the smallest Task size of 512Mb, so we subtract 256Mb from the + # request/limit to ensure we can fit within that task + memory = "256M" + } + } + }) + } + vpc-cni = {} + kube-proxy = {} + } + + enable_karpenter = true + karpenter = { + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } + + enable_aws_load_balancer_controller = true + aws_load_balancer_controller = { + chart_version = "1.6.1" # min version required to use SG for NLB feature + set = [ + { + name = "vpcId" + value = data.terraform_remote_state.vpc.outputs.vpc_id + }, + { + name = "podDisruptionBudget.maxUnavailable" + value = 1 + }, + ] + } + + tags = local.tags +} + +################################################################################ +# Karpenter +################################################################################ + +resource "aws_security_group" "karpenter_sg" { + name = "${local.cluster_name}_karpenter_sg" + description = "${local.cluster_name} Karpenter SG" + vpc_id = data.terraform_remote_state.vpc.outputs.vpc_id + tags = { + "Name" = "${local.cluster_name}_karpenter_sg" + "karpenter.sh/discovery" = local.cluster_name + } +} + +resource "aws_vpc_security_group_egress_rule" "karpenter_sg_allow_all_4" { + security_group_id = aws_security_group.karpenter_sg.id + + ip_protocol = "-1" + cidr_ipv4 = "0.0.0.0/0" +} + +resource "aws_vpc_security_group_egress_rule" "karpenter_sg_allow_all_6" { + security_group_id = aws_security_group.karpenter_sg.id + + ip_protocol = "-1" + cidr_ipv6 = "::/0" +} + +resource "aws_vpc_security_group_ingress_rule" "karpenter_sg_allow_cluster_ing" { + security_group_id = aws_security_group.karpenter_sg.id + + ip_protocol = "-1" + referenced_security_group_id = module.eks.cluster_security_group_id +} + +resource "aws_vpc_security_group_ingress_rule" "karpenter_sg_allow_mng_ing" { + security_group_id = aws_security_group.karpenter_sg.id + + ip_protocol = "-1" + referenced_security_group_id = module.eks.node_security_group_id +} + +resource "aws_vpc_security_group_ingress_rule" "cluster_sg_allow_karpenter_ing" { + security_group_id = module.eks.cluster_security_group_id + + ip_protocol = "-1" + referenced_security_group_id = aws_security_group.karpenter_sg.id +} + +resource "kubectl_manifest" "karpenter_provisioner" { + yaml_body = <<-YAML + apiVersion: karpenter.sh/v1alpha5 + kind: Provisioner + metadata: + name: default + spec: + requirements: + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c", "m"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "topology.kubernetes.io/zone" + operator: In + values: [${jsonencode(local.azs[1])}] + - key: 
"kubernetes.io/arch" + operator: In + values: ["arm64", "amd64"] + - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand + operator: In + values: ["spot", "on-demand"] + kubeletConfiguration: + containerRuntime: containerd + maxPods: 110 + limits: + resources: + cpu: 1000 + consolidation: + enabled: true + providerRef: + name: default + ttlSecondsUntilExpired: 604800 # 7 Days = 7 * 24 * 60 * 60 Seconds + YAML + + depends_on = [ + module.eks_blueprints_addons + ] +} + +resource "kubectl_manifest" "karpenter_node_template" { + yaml_body = <<-YAML + apiVersion: karpenter.k8s.aws/v1alpha1 + kind: AWSNodeTemplate + metadata: + name: default + spec: + subnetSelector: + aws-ids: ${data.terraform_remote_state.vpc.outputs.subnet_ids[1]} + securityGroupSelector: + karpenter.sh/discovery: ${module.eks.cluster_name} + instanceProfile: ${module.eks_blueprints_addons.karpenter.node_instance_profile_name} + tags: + karpenter.sh/discovery: ${module.eks.cluster_name} + YAML +} + +# Example deployment using the [pause image](https://www.ianlewis.org/en/almighty-pause-container) +# and starts with zero replicas +resource "kubectl_manifest" "karpenter_example_deployment" { + yaml_body = <<-YAML + apiVersion: apps/v1 + kind: Deployment + metadata: + name: inflate + spec: + replicas: 0 + selector: + matchLabels: + app: inflate + template: + metadata: + labels: + app: inflate + spec: + terminationGracePeriodSeconds: 0 + containers: + - name: inflate + image: public.ecr.aws/eks-distro/kubernetes/pause:3.7 + resources: + requests: + cpu: 1 + YAML + + depends_on = [ + kubectl_manifest.karpenter_node_template + ] +} diff --git a/patterns/cell-based-eks/2.cell2/outputs.tf b/patterns/cell-based-eks/2.cell2/outputs.tf new file mode 100644 index 0000000000..d3a1ef5eb0 --- /dev/null +++ b/patterns/cell-based-eks/2.cell2/outputs.tf @@ -0,0 +1,24 @@ +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" +} + +output "cluster_endpoint" { + description = "Cluster endpoint" + value = module.eks.cluster_endpoint +} + +output "cluster_certificate_authority_data" { + description = "Cluster ca certificate" + value = module.eks.cluster_certificate_authority_data +} + +output "cluster_name" { + description = "Cluster name" + value = module.eks.cluster_name +} + +output "cluster_region" { + description = "Cluster region" + value = local.region +} diff --git a/patterns/cell-based-eks/2.cell2/variables.tf b/patterns/cell-based-eks/2.cell2/variables.tf new file mode 100644 index 0000000000..9969dd7b67 --- /dev/null +++ b/patterns/cell-based-eks/2.cell2/variables.tf @@ -0,0 +1,12 @@ + +variable "name" { + description = "cluster name" + type = string + default = "cell-2" +} + +variable "region" { + description = "cluster name" + type = string + default = "us-west-2" +} diff --git a/patterns/cell-based-eks/2.cell2/versions.tf b/patterns/cell-based-eks/2.cell2/versions.tf new file mode 100644 index 0000000000..9577780c9b --- /dev/null +++ b/patterns/cell-based-eks/2.cell2/versions.tf @@ -0,0 +1,29 @@ +terraform { + required_version = ">= 1.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 4.47" + } + helm = { + source = "hashicorp/helm" + version = ">= 2.9" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 
2.20" + } + kubectl = { + source = "gavinbunney/kubectl" + version = ">= 1.14" + } + } + + # ## Used for end-to-end testing on project; update to suit your needs + # backend "s3" { + # bucket = "" + # region = "" + # key = "e2e/istio-multi-cluster-vpc/terraform.tfstate" + # } +} diff --git a/patterns/cell-based-eks/3.test-setup/test_setup.sh b/patterns/cell-based-eks/3.test-setup/test_setup.sh new file mode 100755 index 0000000000..48cb1036ed --- /dev/null +++ b/patterns/cell-based-eks/3.test-setup/test_setup.sh @@ -0,0 +1,51 @@ +export CELL_1=cell-1 +export CELL_2=cell-2 +export AWS_DEFAULT_REGION=$(aws configure get region) +export AWS_ACCOUNT_NUMBER=$(aws sts get-caller-identity --query "Account" --output text) + +aws eks update-kubeconfig --name $CELL_1 --region $AWS_DEFAULT_REGION +aws eks update-kubeconfig --name $CELL_2 --region $AWS_DEFAULT_REGION + +export CTX_CELL_1=arn:aws:eks:$AWS_DEFAULT_REGION:${AWS_ACCOUNT_NUMBER}:cluster/$CELL_1 +export CTX_CELL_2=arn:aws:eks:$AWS_DEFAULT_REGION:${AWS_ACCOUNT_NUMBER}:cluster/$CELL_2 + +bold=$(tput bold) +normal=$(tput sgr0) + +alias kgn="kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type==\"Ready\")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,CAPACITY-TYPE:.metadata.labels.karpenter\.sh/capacity-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip'" + +echo "------------${bold}Test the Cell-1 Setup${normal}-------------" + +echo "${bold}Cell-1: Nodes before the scaling event${normal}" + +kgn --context="${CTX_CELL_1}" + +echo "${bold}Cell-1: Scaling the inflate deployment to 50 replicas${normal}" + +kubectl scale deployment inflate --replicas 20 --context="${CTX_CELL_1}" + +echo "${bold}Cell-1: Wait for karpenter to launch the worker nodes and pods become ready......${normal}" + +kubectl wait --for=condition=ready pods --all --timeout 2m --context="${CTX_CELL_1}" + +echo "${bold}Cell-1: Nodes after the scaling event${normal}" + +kgn --context="${CTX_CELL_1}" + +echo "------------${bold}Test the Cell-2 Setup${normal}-------------" + +echo "${bold}Cell-2: Nodes before the scaling event${normal}" + +kgn --context="${CTX_CELL_2}" + +echo "${bold}Cell-2: Scaling the inflate deployment to 50 replicas${normal}" + +kubectl scale deployment inflate --replicas 20 --context="${CTX_CELL_2}" + +echo "${bold}Cell-2: Wait for karpenter to launch the worker nodes and pods become ready......${normal}" + +kubectl wait --for=condition=ready pods --all --timeout 2m --context="${CTX_CELL_2}" + +echo "${bold}Cell-2: Nodes after the scaling event${normal}" + +kgn --context="${CTX_CELL_2}" diff --git a/patterns/cell-based-eks/README.md b/patterns/cell-based-eks/README.md new file mode 100644 index 0000000000..70cc23dc8b --- /dev/null +++ b/patterns/cell-based-eks/README.md @@ -0,0 +1,92 @@ +# Cell-Based Architecture for Amazon EKS + +This pattern how to configure a cell-based architecture for Amazon Elastic Kubernetes Service (Amazon EKS). It moves away from typical multiple Availability Zone (AZ) clusters to a single Availability Zone cluster. These single AZ clusters are called cells, and the aggregation of these cells in each Region is called a supercell. 
These cells help to ensure that a failure in one cell doesn't affect the cells in another, reducing data transfer costs and improving both the availability and resiliency against AZ failures for Amazon EKS workloads.
+
+Refer to the [AWS Solution Guidance](https://aws.amazon.com/solutions/guidance/cell-based-architecture-for-amazon-eks/) for more details.
+
+## Notable configuration
+
+* This sample relies on reading data from Terraform Remote State in the different folders. In a production setup, Terraform Remote State is stored in a persistent backend such as Terraform Cloud or S3. For more information, please refer to the Terraform [Backends](https://developer.hashicorp.com/terraform/language/settings/backends/configuration) documentation.
+
+## Folder structure
+
+### [`0.vpc`](0.vpc/)
+
+This folder creates the VPC for all clusters. In this demonstration we are creating 2 cells. The VPC creation is not part of the cluster provisionig and therefore lives in a seperate folder.
+
+### [`1.cell1`](1.cell1/)
+
+This folder creates an Amazon EKS Cluster, named by default `cell-1` (see [`variables.tf`](1.cell1/variables.tf)), with AWS Load Balancer Controller, and Karpenter installation.
+Configurations in this folder to be aware of:
+
+* The cluster is configured to use the subnet-1 (AZ-1) created in the `0.vpc` folder.
+* Karpenter `Provisioner` and `AWSNodeTemplate` resources are pointing to AZ-1 subnet.
+* Essential operational addons like `coredns`, `aws-load-balancer-controller`, and `karpenter` are deployed to Fargate configured with AZ-1 subnet.
+
+### [`2.cell2`](2.cell2/)
+
+Same configuration as in `1.cell1` except the name of the cluster which is `cell-2` and deployed in `az-2`
+
+### [`3.test-setup`](4.test-setup/)
+
+This folder tests the installation setup. It does so by scaling the sample `inflate` application replicas and watching for Karpenter to launch EKS worker nodes in respective AZs.
+
+## Prerequisites
+
+Ensure that you have the following tools installed locally:
+
+1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html)
+2. [kubectl](https://Kubernetes.io/docs/tasks/tools/)
+3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli)
+
+## Deploy
+
+### Step 0 - Create the VPC
+
+```shell
+cd 0.vpc
+terraform init
+terraform apply -auto-approve
+cd ..
+```
+
+### Step 1 - Deploy cell-1
+
+```shell
+cd 1.cell1
+terraform init
+terraform apply -auto-approve
+cd ..
+```
+
+### Step 2 - Deploy cell-2
+
+```shell
+cd 2.cell2
+terraform init
+terraform apply -auto-approve
+cd ..
+```
+
+### Step 3 - Test installation
+
+```shell
+cd 3.test-setup
+./test_setup.sh
+cd ..
+```
+
+This script scale the sample application `inflate` to 25 replicas in both cells. As replica pods go into pending state due to insufficient compute capacity, Karpenter will kick-in and bring up the EC2 worker nodes in respective AZs.
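+
+As a quick manual check once the script completes (a sketch that assumes the kubeconfig contexts registered by `test_setup.sh`, which are named after the cluster ARNs, and the example AZ names used in this pattern), you can list each cell's nodes together with their zone label and confirm they all land in that cell's AZ, for example `us-west-2a` for `cell-1` and `us-west-2b` for `cell-2`:
+
+```shell
+# Context names are the cluster ARNs registered by `aws eks update-kubeconfig` in test_setup.sh
+CTX_CELL_1=arn:aws:eks:$(aws configure get region):$(aws sts get-caller-identity --query Account --output text):cluster/cell-1
+CTX_CELL_2=arn:aws:eks:$(aws configure get region):$(aws sts get-caller-identity --query Account --output text):cluster/cell-2
+
+# Every node in each cell (including the ones Karpenter just launched) should report that cell's AZ
+kubectl get nodes --context "$CTX_CELL_1" -L topology.kubernetes.io/zone -L karpenter.sh/capacity-type
+kubectl get nodes --context "$CTX_CELL_2" -L topology.kubernetes.io/zone -L karpenter.sh/capacity-type
+```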
+ +## Destroy + +To teardown and remove the resources created in this example: + +```shell +cd ../2.cell2 +terraform apply -destroy -autoapprove +cd ../1.cell1 +terraform apply -destroy -autoapprove +cd ../0.vpc +terraform apply -destroy -autoapprove +``` From d8c42033f23e2fcd0df314250ec454956d714739 Mon Sep 17 00:00:00 2001 From: Ashok Srirama Date: Mon, 23 Oct 2023 10:43:41 -0400 Subject: [PATCH 2/3] Updates to ReadMe --- patterns/cell-based-eks/1.cell1/README.md | 1 + patterns/cell-based-eks/2.cell2/README.md | 1 + patterns/cell-based-eks/README.md | 18 +++++++++--------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/patterns/cell-based-eks/1.cell1/README.md b/patterns/cell-based-eks/1.cell1/README.md index f71cb32dec..c93f008bd7 100644 --- a/patterns/cell-based-eks/1.cell1/README.md +++ b/patterns/cell-based-eks/1.cell1/README.md @@ -16,6 +16,7 @@ Ensure that you have the following tools installed locally: 1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) 2. [kubectl](https://Kubernetes.io/docs/tasks/tools/) 3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) +4. [helm](https://helm.sh/docs/helm/helm_install/) ## Deploy diff --git a/patterns/cell-based-eks/2.cell2/README.md b/patterns/cell-based-eks/2.cell2/README.md index 6a435a65b4..3d0a1a36d5 100644 --- a/patterns/cell-based-eks/2.cell2/README.md +++ b/patterns/cell-based-eks/2.cell2/README.md @@ -16,6 +16,7 @@ Ensure that you have the following tools installed locally: 1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) 2. [kubectl](https://Kubernetes.io/docs/tasks/tools/) 3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) +4. [helm](https://helm.sh/docs/helm/helm_install/) ## Deploy diff --git a/patterns/cell-based-eks/README.md b/patterns/cell-based-eks/README.md index 70cc23dc8b..c3306d5090 100644 --- a/patterns/cell-based-eks/README.md +++ b/patterns/cell-based-eks/README.md @@ -1,6 +1,6 @@ # Cell-Based Architecture for Amazon EKS -This pattern how to configure a cell-based architecture for Amazon Elastic Kubernetes Service (Amazon EKS). It moves away from typical multiple Availability Zone (AZ) clusters to a single Availability Zone cluster. These single AZ clusters are called cells, and the aggregation of these cells in each Region is called a supercell. These cells help to ensure that a failure in one cell doesn't affect the cells in another, reducing data transfer costs and improving both the availability and resiliency against AZ failures for Amazon EKS workloads. +This pattern demonstrates how to configure a cell-based architecture for Amazon Elastic Kubernetes Service (Amazon EKS). It moves away from typical multiple Availability Zone (AZ) clusters to a single Availability Zone cluster. These single AZ clusters are called cells, and the aggregation of these cells in each Region is called a supercell. These cells help to ensure that a failure in one cell doesn't affect the cells in another, reducing data transfer costs and improving both the availability and resiliency against AZ wide failures for Amazon EKS workloads. Refer to the [AWS Solution Guidance](https://aws.amazon.com/solutions/guidance/cell-based-architecture-for-amazon-eks/) for more details. @@ -12,7 +12,7 @@ Refer to the [AWS Solution Guidance](https://aws.amazon.com/solutions/guidance/c ### [`0.vpc`](0.vpc/) -This folder creates the VPC for all clusters. In this demonstration we are creating 2 cells. 
The VPC creation is not part of the cluster provisionig and therefore lives in a seperate folder. +This folder creates the VPC for all clusters. In this demonstration we are creating 2 cells sharing the same VPC. So, the VPC creation is not part of the cluster provisionig and therefore lives in a seperate folder. You could also explore a VPC per cluster depending on your needs. ### [`1.cell1`](1.cell1/) @@ -25,9 +25,9 @@ Configurations in this folder to be aware of: ### [`2.cell2`](2.cell2/) -Same configuration as in `1.cell1` except the name of the cluster which is `cell-2` and deployed in `az-2` +Same configuration as in `1.cell1` except the name of the cluster is `cell-2` and deployed in `az-2` -### [`3.test-setup`](4.test-setup/) +### [`3.test-setup`](3.test-setup/) This folder test the installation setup. It does by scaling the sample `inflate` application replicas and watch for Karpenter to launch EKS worker nodes in respective AZs. @@ -76,17 +76,17 @@ cd 3.test-setup cd.. ``` -This script scale the sample application `inflate` to 25 replicas in both cells. As replica pods go into pending state due to insufficient compute capacity, Karpenter will kick-in and bring up the EC2 worker nodes in respective AZs. +This script scale the sample application `inflate` to 20 replicas in both cells. As replica pods go into pending state due to insufficient compute capacity, Karpenter will kick-in and bring up the EC2 worker nodes in respective AZs. ## Destroy To teardown and remove the resources created in this example: ```shell -cd ../2.cell2 -terraform apply -destroy -autoapprove +cd 2.cell2 +terraform apply -destroy -auto-approve cd ../1.cell1 -terraform apply -destroy -autoapprove +terraform apply -destroy -auto-approve cd ../0.vpc -terraform apply -destroy -autoapprove +terraform apply -destroy -auto-approve ``` From 03e0ed66ec2b6392c2fdadcfc2c972a910d5d02c Mon Sep 17 00:00:00 2001 From: Ashok Srirama Date: Wed, 27 Dec 2023 10:30:53 -0500 Subject: [PATCH 3/3] Updates based on the PR feedback --- docs/patterns/cell-based-eks.md | 7 + .../{0.vpc/main.tf => 0.vpc.tf} | 8 +- patterns/cell-based-eks/0.vpc/outputs.tf | 14 - patterns/cell-based-eks/0.vpc/versions.tf | 17 - patterns/cell-based-eks/1.az1.tf | 133 +++++++ patterns/cell-based-eks/1.cell1/README.md | 181 --------- patterns/cell-based-eks/1.cell1/main.tf | 349 ------------------ patterns/cell-based-eks/1.cell1/outputs.tf | 24 -- patterns/cell-based-eks/1.cell1/variables.tf | 12 - patterns/cell-based-eks/1.cell1/versions.tf | 29 -- patterns/cell-based-eks/2.az2.tf | 123 ++++++ patterns/cell-based-eks/2.cell2/README.md | 181 --------- patterns/cell-based-eks/2.cell2/main.tf | 345 ----------------- patterns/cell-based-eks/2.cell2/outputs.tf | 24 -- patterns/cell-based-eks/2.cell2/variables.tf | 12 - patterns/cell-based-eks/3.az3.tf | 123 ++++++ patterns/cell-based-eks/README.md | 127 +++---- patterns/cell-based-eks/az1.yaml | 43 +++ patterns/cell-based-eks/az2.yaml | 43 +++ patterns/cell-based-eks/az3.yaml | 43 +++ patterns/cell-based-eks/inflate.yaml | 21 ++ patterns/cell-based-eks/outputs.tf | 34 ++ .../{3.test-setup => }/test_setup.sh | 42 ++- .../cell-based-eks/{0.vpc => }/variables.tf | 0 .../cell-based-eks/{2.cell2 => }/versions.tf | 6 +- 25 files changed, 679 insertions(+), 1262 deletions(-) create mode 100644 docs/patterns/cell-based-eks.md rename patterns/cell-based-eks/{0.vpc/main.tf => 0.vpc.tf} (86%) delete mode 100644 patterns/cell-based-eks/0.vpc/outputs.tf delete mode 100644 patterns/cell-based-eks/0.vpc/versions.tf create 
mode 100644 patterns/cell-based-eks/1.az1.tf delete mode 100644 patterns/cell-based-eks/1.cell1/README.md delete mode 100644 patterns/cell-based-eks/1.cell1/main.tf delete mode 100644 patterns/cell-based-eks/1.cell1/outputs.tf delete mode 100644 patterns/cell-based-eks/1.cell1/variables.tf delete mode 100644 patterns/cell-based-eks/1.cell1/versions.tf create mode 100644 patterns/cell-based-eks/2.az2.tf delete mode 100644 patterns/cell-based-eks/2.cell2/README.md delete mode 100644 patterns/cell-based-eks/2.cell2/main.tf delete mode 100644 patterns/cell-based-eks/2.cell2/outputs.tf delete mode 100644 patterns/cell-based-eks/2.cell2/variables.tf create mode 100644 patterns/cell-based-eks/3.az3.tf create mode 100644 patterns/cell-based-eks/az1.yaml create mode 100644 patterns/cell-based-eks/az2.yaml create mode 100644 patterns/cell-based-eks/az3.yaml create mode 100644 patterns/cell-based-eks/inflate.yaml create mode 100644 patterns/cell-based-eks/outputs.tf rename patterns/cell-based-eks/{3.test-setup => }/test_setup.sh (59%) rename patterns/cell-based-eks/{0.vpc => }/variables.tf (100%) rename patterns/cell-based-eks/{2.cell2 => }/versions.tf (76%) diff --git a/docs/patterns/cell-based-eks.md b/docs/patterns/cell-based-eks.md new file mode 100644 index 0000000000..2a9bc2b46b --- /dev/null +++ b/docs/patterns/cell-based-eks.md @@ -0,0 +1,7 @@ +--- +title: Cell-Based Architecture for Amazon EKS +--- + +{% + include-markdown "../../patterns/cell-based-eks/README.md" +%} diff --git a/patterns/cell-based-eks/0.vpc/main.tf b/patterns/cell-based-eks/0.vpc.tf similarity index 86% rename from patterns/cell-based-eks/0.vpc/main.tf rename to patterns/cell-based-eks/0.vpc.tf index 5d92a05334..81528caba1 100644 --- a/patterns/cell-based-eks/0.vpc/main.tf +++ b/patterns/cell-based-eks/0.vpc.tf @@ -5,14 +5,14 @@ provider "aws" { data "aws_availability_zones" "available" {} locals { - cluster_name = format("%s-%s", basename(path.cwd), "shared") - region = "us-west-2" + name = basename(path.cwd) + region = "us-west-2" vpc_cidr = "10.0.0.0/16" azs = slice(data.aws_availability_zones.available.names, 0, 3) tags = { - Blueprint = local.cluster_name + Blueprint = local.name GithubRepo = "github.com/aws-ia/terraform-aws-eks-blueprints" } } @@ -25,7 +25,7 @@ module "vpc" { source = "terraform-aws-modules/vpc/aws" version = "~> 5.0" - name = local.cluster_name + name = local.name cidr = local.vpc_cidr azs = local.azs diff --git a/patterns/cell-based-eks/0.vpc/outputs.tf b/patterns/cell-based-eks/0.vpc/outputs.tf deleted file mode 100644 index af6cc3a872..0000000000 --- a/patterns/cell-based-eks/0.vpc/outputs.tf +++ /dev/null @@ -1,14 +0,0 @@ -output "vpc_id" { - description = "Amazon EKS VPC ID" - value = module.vpc.vpc_id -} - -output "subnet_ids" { - description = "Amazon EKS Subnet IDs" - value = module.vpc.private_subnets -} - -output "vpc_cidr" { - description = "Amazon EKS VPC CIDR Block." 
- value = local.vpc_cidr -} diff --git a/patterns/cell-based-eks/0.vpc/versions.tf b/patterns/cell-based-eks/0.vpc/versions.tf deleted file mode 100644 index 6cbf2a99e8..0000000000 --- a/patterns/cell-based-eks/0.vpc/versions.tf +++ /dev/null @@ -1,17 +0,0 @@ -terraform { - required_version = ">= 1.0" - - required_providers { - aws = { - source = "hashicorp/aws" - version = ">= 4.47" - } - } - - # ## Used for end-to-end testing on project; update to suit your needs - # backend "s3" { - # bucket = "" - # region = "" - # key = "e2e/istio-multi-cluster-vpc/terraform.tfstate" - # } -} diff --git a/patterns/cell-based-eks/1.az1.tf b/patterns/cell-based-eks/1.az1.tf new file mode 100644 index 0000000000..f95c1a3bdf --- /dev/null +++ b/patterns/cell-based-eks/1.az1.tf @@ -0,0 +1,133 @@ +# Required for public ECR where Karpenter artifacts are hosted +provider "aws" { + region = "us-east-1" + alias = "virginia" +} + +data "aws_ecrpublic_authorization_token" "token" { + provider = aws.virginia +} + +provider "kubernetes" { + alias = "k8s-az1" + host = module.eks_az1.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks_az1.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks_az1.cluster_name] + } +} + +provider "helm" { + alias = "helm-az1" + kubernetes { + host = module.eks_az1.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks_az1.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks_az1.cluster_name] + } + } +} + +locals { + cell1_name = format("%s-%s", local.name, "az1") +} + +################################################################################ +# Cluster +################################################################################ + +module "eks_az1" { + source = "terraform-aws-modules/eks/aws" + version = "~> 19.18" + + providers = { + kubernetes = kubernetes.k8s-az1 + } + + cluster_name = local.cell1_name + cluster_version = "1.28" + cluster_endpoint_public_access = true + + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + + manage_aws_auth_configmap = true + aws_auth_roles = [ + # We need to add in the Karpenter node IAM role for nodes launched by Karpenter + { + rolearn = module.eks_blueprints_addons_az1.karpenter.node_iam_role_arn + username = "system:node:{{EC2PrivateDNSName}}" + groups = [ + "system:bootstrappers", + "system:nodes", + ] + }, + ] + + eks_managed_node_groups = { + cell1 = { + instance_types = ["m5.large"] + + min_size = 1 + max_size = 5 + desired_size = 2 + + subnet_ids = [module.vpc.private_subnets[0]] + } + } + + tags = merge(local.tags, { + # NOTE - if creating multiple security groups with this module, only tag the + # security group that Karpenter should utilize with the following tag + # (i.e. 
- at most, only one security group should have this tag in your account) + "karpenter.sh/discovery" = local.cell1_name + }) +} + +################################################################################ +# EKS Blueprints Addons +################################################################################ + +module "eks_blueprints_addons_az1" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.11" + + providers = { + helm = helm.helm-az1 + kubernetes = kubernetes.k8s-az1 + } + + cluster_name = module.eks_az1.cluster_name + cluster_endpoint = module.eks_az1.cluster_endpoint + cluster_version = module.eks_az1.cluster_version + oidc_provider_arn = module.eks_az1.oidc_provider_arn + + # We want to wait for the EKS Managed Nodegroups to be deployed first + create_delay_dependencies = [for group in module.eks_az1.eks_managed_node_groups : group.node_group_arn] + + eks_addons = { + coredns = {} + vpc-cni = {} + kube-proxy = {} + } + + enable_karpenter = true + karpenter = { + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } + karpenter_node = { + # Use static name so that it matches what is defined in `az1.yaml` example manifest + iam_role_use_name_prefix = false + } + + tags = local.tags +} diff --git a/patterns/cell-based-eks/1.cell1/README.md b/patterns/cell-based-eks/1.cell1/README.md deleted file mode 100644 index c93f008bd7..0000000000 --- a/patterns/cell-based-eks/1.cell1/README.md +++ /dev/null @@ -1,181 +0,0 @@ -# Cell-Based Architecture for Amazon EKS - -This example shows how to provision a cell based Amazon EKS cluster. - -* Deploy EKS Cluster with one managed node group in a VPC and AZ -* Deploy Fargate profiles to run `coredns`, `aws-load-balancer-controller`, and `karpenter` addons -* Deploy Karpenter `Provisioner` and `AWSNodeTemplate` resources and configure them to run in AZ1 -* Deploy sample deployment `inflate` with 0 replicas - -Refer to the [AWS Solution Guidance](https://aws.amazon.com/solutions/guidance/cell-based-architecture-for-amazon-eks/) for more details. - -## Prerequisites: - -Ensure that you have the following tools installed locally: - -1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) -2. [kubectl](https://Kubernetes.io/docs/tasks/tools/) -3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) -4. [helm](https://helm.sh/docs/helm/helm_install/) - -## Deploy - -To provision this example: - -```sh -terraform init -terraform apply -``` - -Enter `yes` at command prompt to apply - -## Validate - -The following command will update the `kubeconfig` on your local machine and allow you to interact with your EKS Cluster using `kubectl` to validate the deployment. - -1. Run `update-kubeconfig` command: - -```sh -aws eks --region update-kubeconfig --name -``` - -2. 
List the nodes running currently - -```sh -kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' -``` - -``` -# Output should look like below -NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP -fargate-ip-10-0-13-93.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-14-95.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-15-86.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-8-178.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-8-254.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-8-73.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -ip-10-0-12-14.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.12.14 -ip-10-0-14-197.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.14.197 -``` - -3. List out the pods running currently: - -```sh -kubectl get pods,svc -n kube-system -``` - -``` -# Output should look like below -NAME READY STATUS RESTARTS AGE -pod/aws-load-balancer-controller-776868b4fb-2j9t6 1/1 Running 0 13h -pod/aws-load-balancer-controller-776868b4fb-bzkrr 1/1 Running 0 13h -pod/aws-node-2zhpc 2/2 Running 0 16h -pod/aws-node-w897r 2/2 Running 0 16h -pod/coredns-5c9679c87-bp6ws 1/1 Running 0 16h -pod/coredns-5c9679c87-lw468 1/1 Running 0 16h -pod/kube-proxy-6wp2k 1/1 Running 0 16h -pod/kube-proxy-n8qtq 1/1 Running 0 16h - -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -service/aws-load-balancer-webhook-service ClusterIP 172.20.44.77 443/TCP 14h -service/kube-dns ClusterIP 172.20.0.10 53/UDP,53/TCP 17h -``` - -4. Verify all the helm releases installed: - -```sh -helm list -A -``` - -``` -# Output should look like below -NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION -aws-load-balancer-controller kube-system 2 2023-10-18 23:07:36.089372 -0400 EDT deployed aws-load-balancer-controller-1.6.1 v2.6.1 -karpenter karpenter 14 2023-10-19 08:25:12.313094 -0400 EDT deployed karpenter-v0.30.0 0.30.0 -``` - -## Test - -1. 
Verify both Fargate nodes and EKS Managed Nodegroup worker nodes are deployed to single AZ - -```sh -kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' -``` - -``` -NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP -fargate-ip-10-0-13-93.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-14-95.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-15-86.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-8-178.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-8-254.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-8-73.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -ip-10-0-12-14.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.12.14 -ip-10-0-14-197.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.14.197 -``` - -2. Scale the `inflate` deployment to 20 replicas and watch for Karpenter to launch EKS worker nodes in correct AZ. - -```sh -kubectl scale deployment inflate --replicas 20 -``` - -``` -deployment.apps/inflate scaled -``` - -3. Wait for the pods become ready - -```sh -kubectl wait --for=condition=ready pods --all --timeout 2m -``` - -``` -pod/inflate-75d744d4c6-5r5cv condition met -pod/inflate-75d744d4c6-775wm condition met -pod/inflate-75d744d4c6-7t225 condition met -pod/inflate-75d744d4c6-945p4 condition met -pod/inflate-75d744d4c6-b52gp condition met -pod/inflate-75d744d4c6-d99fn condition met -pod/inflate-75d744d4c6-dmnwm condition met -pod/inflate-75d744d4c6-hrvvr condition met -pod/inflate-75d744d4c6-j4hkl condition met -pod/inflate-75d744d4c6-jwknj condition met -pod/inflate-75d744d4c6-ldwts condition met -pod/inflate-75d744d4c6-lqnr5 condition met -pod/inflate-75d744d4c6-pctjh condition met -pod/inflate-75d744d4c6-qdlkc condition met -pod/inflate-75d744d4c6-qnzc5 condition met -pod/inflate-75d744d4c6-r2cwj condition met -pod/inflate-75d744d4c6-srmkb condition met -pod/inflate-75d744d4c6-wf45j condition met -pod/inflate-75d744d4c6-x9mwl condition met -pod/inflate-75d744d4c6-xlbhl condition met -``` - -4. 
Check all the nodes are in the correct AZ - -```sh -kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' -``` -``` -NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP -fargate-ip-10-0-13-93.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-14-95.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-15-86.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-8-178.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-8-254.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-8-73.us-west-2.compute.internal True us-west-2a v1.28.2-eks-f8587cb Amazon Linux 2 -ip-10-0-12-14.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.12.14 -ip-10-0-14-197.us-west-2.compute.internal True m5.large us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.14.197 -ip-10-0-3-161.us-west-2.compute.internal True c6gn.8xlarge us-west-2a v1.28.1-eks-43840fb Amazon Linux 2 10.0.3.161 -``` - -## Destroy - -To teardown and remove the resources created in this example: - -```sh -terraform destroy -target="module.eks_blueprints_addons" -auto-approve -terraform destroy -auto-approve -``` diff --git a/patterns/cell-based-eks/1.cell1/main.tf b/patterns/cell-based-eks/1.cell1/main.tf deleted file mode 100644 index d554be5269..0000000000 --- a/patterns/cell-based-eks/1.cell1/main.tf +++ /dev/null @@ -1,349 +0,0 @@ -provider "aws" { - region = local.region -} - -# Required for public ECR where Karpenter artifacts are hosted -provider "aws" { - region = "us-east-1" - alias = "virginia" -} - -data "terraform_remote_state" "vpc" { - backend = "local" - - config = { - path = "${path.module}/../0.vpc/terraform.tfstate" - } -} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.virginia -} - -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } - } -} - -provider "kubectl" { - apply_retry_count = 5 - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - load_config_file = false - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } -} - -data 
"aws_availability_zones" "available" {} - -locals { - cluster_name = var.name - region = var.region - azs = slice(data.aws_availability_zones.available.names, 0, 3) - - tags = { - Blueprint = local.cluster_name - GithubRepo = "github.com/aws-ia/terraform-aws-eks-blueprints" - } -} - -################################################################################ -# Cluster -################################################################################ - -module "eks" { - source = "terraform-aws-modules/eks/aws" - version = "~> 19.16" - - cluster_name = local.cluster_name - cluster_version = "1.28" - cluster_endpoint_public_access = true - - vpc_id = data.terraform_remote_state.vpc.outputs.vpc_id - subnet_ids = data.terraform_remote_state.vpc.outputs.subnet_ids - - manage_aws_auth_configmap = true - aws_auth_roles = [ - # We need to add in the Karpenter node IAM role for nodes launched by Karpenter - { - rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn - username = "system:node:{{EC2PrivateDNSName}}" - groups = [ - "system:bootstrappers", - "system:nodes", - ] - }, - ] - - fargate_profiles = { - karpenter = { - selectors = [ - { namespace = "karpenter" } - ] - subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[0]] - } - kube_system = { - name = "kube-system" - selectors = [ - { namespace = "kube-system" } - ] - subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[0]] - } - } - - eks_managed_node_groups = { - cell1 = { - instance_types = ["m5.large"] - - min_size = 1 - max_size = 5 - desired_size = 2 - - subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[0]] - } - } - - tags = local.tags -} - -################################################################################ -# EKS Blueprints Addons -################################################################################ - -module "eks_blueprints_addons" { - source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.0" - - cluster_name = module.eks.cluster_name - cluster_endpoint = module.eks.cluster_endpoint - cluster_version = module.eks.cluster_version - oidc_provider_arn = module.eks.oidc_provider_arn - - # We want to wait for the Fargate profiles to be deployed first - create_delay_dependencies = [for prof in module.eks.fargate_profiles : prof.fargate_profile_arn] - - eks_addons = { - coredns = { - configuration_values = jsonencode({ - computeType = "Fargate" - # Ensure that the we fully utilize the minimum amount of resources that are supplied by - # Fargate https://docs.aws.amazon.com/eks/latest/userguide/fargate-pod-configuration.html - # Fargate adds 256 MB to each pod's memory reservation for the required Kubernetes - # components (kubelet, kube-proxy, and containerd). Fargate rounds up to the following - # compute configuration that most closely matches the sum of vCPU and memory requests in - # order to ensure pods always have the resources that they need to run. 
- resources = { - limits = { - cpu = "0.25" - # We are targetting the smallest Task size of 512Mb, so we subtract 256Mb from the - # request/limit to ensure we can fit within that task - memory = "256M" - } - requests = { - cpu = "0.25" - # We are targetting the smallest Task size of 512Mb, so we subtract 256Mb from the - # request/limit to ensure we can fit within that task - memory = "256M" - } - } - }) - } - vpc-cni = {} - kube-proxy = {} - } - - enable_karpenter = true - karpenter = { - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - } - - enable_aws_load_balancer_controller = true - aws_load_balancer_controller = { - chart_version = "1.6.1" # min version required to use SG for NLB feature - set = [ - { - name = "vpcId" - value = data.terraform_remote_state.vpc.outputs.vpc_id - }, - { - name = "podDisruptionBudget.maxUnavailable" - value = 1 - }, - ] - } - - tags = local.tags -} - -################################################################################ -# Karpenter -################################################################################ - -resource "aws_security_group" "karpenter_sg" { - name = "${local.cluster_name}_karpenter_sg" - description = "${local.cluster_name} Karpenter SG" - vpc_id = data.terraform_remote_state.vpc.outputs.vpc_id - tags = { - "Name" = "${local.cluster_name}_karpenter_sg" - "karpenter.sh/discovery" = local.cluster_name - } -} - -resource "aws_vpc_security_group_egress_rule" "karpenter_sg_allow_all_4" { - security_group_id = aws_security_group.karpenter_sg.id - - ip_protocol = "-1" - cidr_ipv4 = "0.0.0.0/0" -} - -resource "aws_vpc_security_group_egress_rule" "karpenter_sg_allow_all_6" { - security_group_id = aws_security_group.karpenter_sg.id - - ip_protocol = "-1" - cidr_ipv6 = "::/0" -} - -resource "aws_vpc_security_group_ingress_rule" "karpenter_sg_allow_cluster_ing" { - security_group_id = aws_security_group.karpenter_sg.id - - ip_protocol = "-1" - referenced_security_group_id = module.eks.cluster_security_group_id -} - -resource "aws_vpc_security_group_ingress_rule" "karpenter_sg_allow_mng_ing" { - security_group_id = aws_security_group.karpenter_sg.id - - ip_protocol = "-1" - referenced_security_group_id = module.eks.node_security_group_id -} - -resource "aws_vpc_security_group_ingress_rule" "cluster_sg_allow_karpenter_ing" { - security_group_id = module.eks.cluster_security_group_id - - ip_protocol = "-1" - referenced_security_group_id = aws_security_group.karpenter_sg.id -} - -resource "kubectl_manifest" "karpenter_provisioner" { - yaml_body = <<-YAML - apiVersion: karpenter.sh/v1alpha5 - kind: Provisioner - metadata: - name: default - spec: - requirements: - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["c", "m"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: In - values: ["8", "16", "32"] - - key: "karpenter.k8s.aws/instance-hypervisor" - operator: In - values: ["nitro"] - - key: "topology.kubernetes.io/zone" - operator: In - values: [${jsonencode(local.azs[0])}] - - key: "kubernetes.io/arch" - operator: In - values: ["arm64", "amd64"] - - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand - operator: In - values: ["spot", "on-demand"] - kubeletConfiguration: - containerRuntime: containerd - maxPods: 110 - limits: - resources: - cpu: 1000 - consolidation: - enabled: true - providerRef: - name: default - ttlSecondsUntilExpired: 
604800 # 7 Days = 7 * 24 * 60 * 60 Seconds - YAML - - depends_on = [ - module.eks_blueprints_addons - ] -} - -resource "kubectl_manifest" "karpenter_node_template" { - yaml_body = <<-YAML - apiVersion: karpenter.k8s.aws/v1alpha1 - kind: AWSNodeTemplate - metadata: - name: default - spec: - subnetSelector: - aws-ids: ${data.terraform_remote_state.vpc.outputs.subnet_ids[0]} - securityGroupSelector: - karpenter.sh/discovery: ${module.eks.cluster_name} - instanceProfile: ${module.eks_blueprints_addons.karpenter.node_instance_profile_name} - tags: - karpenter.sh/discovery: ${module.eks.cluster_name} - YAML - - depends_on = [ - module.eks_blueprints_addons - ] -} - -# Example deployment using the [pause image](https://www.ianlewis.org/en/almighty-pause-container) -# and starts with zero replicas -resource "kubectl_manifest" "karpenter_example_deployment" { - yaml_body = <<-YAML - apiVersion: apps/v1 - kind: Deployment - metadata: - name: inflate - spec: - replicas: 0 - selector: - matchLabels: - app: inflate - template: - metadata: - labels: - app: inflate - spec: - terminationGracePeriodSeconds: 0 - containers: - - name: inflate - image: public.ecr.aws/eks-distro/kubernetes/pause:3.7 - resources: - requests: - cpu: 1 - YAML - - depends_on = [ - kubectl_manifest.karpenter_node_template - ] -} diff --git a/patterns/cell-based-eks/1.cell1/outputs.tf b/patterns/cell-based-eks/1.cell1/outputs.tf deleted file mode 100644 index d3a1ef5eb0..0000000000 --- a/patterns/cell-based-eks/1.cell1/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "configure_kubectl" { - description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" -} - -output "cluster_endpoint" { - description = "Cluster endpoint" - value = module.eks.cluster_endpoint -} - -output "cluster_certificate_authority_data" { - description = "Cluster ca certificate" - value = module.eks.cluster_certificate_authority_data -} - -output "cluster_name" { - description = "Cluster name" - value = module.eks.cluster_name -} - -output "cluster_region" { - description = "Cluster region" - value = local.region -} diff --git a/patterns/cell-based-eks/1.cell1/variables.tf b/patterns/cell-based-eks/1.cell1/variables.tf deleted file mode 100644 index d71fa165b9..0000000000 --- a/patterns/cell-based-eks/1.cell1/variables.tf +++ /dev/null @@ -1,12 +0,0 @@ - -variable "name" { - description = "cluster name" - type = string - default = "cell-1" -} - -variable "region" { - description = "cluster name" - type = string - default = "us-west-2" -} diff --git a/patterns/cell-based-eks/1.cell1/versions.tf b/patterns/cell-based-eks/1.cell1/versions.tf deleted file mode 100644 index 9577780c9b..0000000000 --- a/patterns/cell-based-eks/1.cell1/versions.tf +++ /dev/null @@ -1,29 +0,0 @@ -terraform { - required_version = ">= 1.0" - - required_providers { - aws = { - source = "hashicorp/aws" - version = ">= 4.47" - } - helm = { - source = "hashicorp/helm" - version = ">= 2.9" - } - kubernetes = { - source = "hashicorp/kubernetes" - version = ">= 2.20" - } - kubectl = { - source = "gavinbunney/kubectl" - version = ">= 1.14" - } - } - - # ## Used for end-to-end testing on project; update to suit your needs - # backend "s3" { - # bucket = "" - # region = "" - # key = "e2e/istio-multi-cluster-vpc/terraform.tfstate" - # } -} diff --git a/patterns/cell-based-eks/2.az2.tf b/patterns/cell-based-eks/2.az2.tf 
new file mode 100644 index 0000000000..0535bf2ce4 --- /dev/null +++ b/patterns/cell-based-eks/2.az2.tf @@ -0,0 +1,123 @@ +provider "kubernetes" { + alias = "k8s-az2" + host = module.eks_az2.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks_az2.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks_az2.cluster_name] + } +} + +provider "helm" { + alias = "helm-az2" + kubernetes { + host = module.eks_az2.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks_az2.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks_az2.cluster_name] + } + } +} + +locals { + cell2_name = format("%s-%s", local.name, "az2") +} + +################################################################################ +# Cluster +################################################################################ + +module "eks_az2" { + source = "terraform-aws-modules/eks/aws" + version = "~> 19.18" + + providers = { + kubernetes = kubernetes.k8s-az2 + } + + cluster_name = local.cell2_name + cluster_version = "1.28" + cluster_endpoint_public_access = true + + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + + manage_aws_auth_configmap = true + aws_auth_roles = [ + # We need to add in the Karpenter node IAM role for nodes launched by Karpenter + { + rolearn = module.eks_blueprints_addons_az2.karpenter.node_iam_role_arn + username = "system:node:{{EC2PrivateDNSName}}" + groups = [ + "system:bootstrappers", + "system:nodes", + ] + }, + ] + + eks_managed_node_groups = { + cell1 = { + instance_types = ["m5.large"] + + min_size = 1 + max_size = 5 + desired_size = 2 + + subnet_ids = [module.vpc.private_subnets[1]] + } + } + + tags = merge(local.tags, { + # NOTE - if creating multiple security groups with this module, only tag the + # security group that Karpenter should utilize with the following tag + # (i.e. 
- at most, only one security group should have this tag in your account) + "karpenter.sh/discovery" = local.cell2_name + }) +} + +################################################################################ +# EKS Blueprints Addons +################################################################################ + +module "eks_blueprints_addons_az2" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.11" + + providers = { + helm = helm.helm-az2 + kubernetes = kubernetes.k8s-az2 + } + + cluster_name = module.eks_az2.cluster_name + cluster_endpoint = module.eks_az2.cluster_endpoint + cluster_version = module.eks_az2.cluster_version + oidc_provider_arn = module.eks_az2.oidc_provider_arn + + # We want to wait for the EKS Managed Nodegroups to be deployed first + create_delay_dependencies = [for group in module.eks_az2.eks_managed_node_groups : group.node_group_arn] + + eks_addons = { + coredns = {} + vpc-cni = {} + kube-proxy = {} + } + + enable_karpenter = true + karpenter = { + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } + karpenter_node = { + # Use static name so that it matches what is defined in `az2.yaml` example manifest + iam_role_use_name_prefix = false + } + + tags = local.tags +} diff --git a/patterns/cell-based-eks/2.cell2/README.md b/patterns/cell-based-eks/2.cell2/README.md deleted file mode 100644 index 3d0a1a36d5..0000000000 --- a/patterns/cell-based-eks/2.cell2/README.md +++ /dev/null @@ -1,181 +0,0 @@ -# Cell-Based Architecture for Amazon EKS - -This example shows how to provision a cell based Amazon EKS cluster. - -* Deploy EKS Cluster with one managed node group in a VPC and AZ -* Deploy Fargate profiles to run `coredns`, `aws-load-balancer-controller`, and `karpenter` addons -* Deploy Karpenter `Provisioner` and `AWSNodeTemplate` resources and configure them to run in AZ2 -* Deploy sample deployment `inflate` with 0 replicas - -Refer to the [AWS Solution Guidance](https://aws.amazon.com/solutions/guidance/cell-based-architecture-for-amazon-eks/) for more details. - -## Prerequisites: - -Ensure that you have the following tools installed locally: - -1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) -2. [kubectl](https://Kubernetes.io/docs/tasks/tools/) -3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) -4. [helm](https://helm.sh/docs/helm/helm_install/) - -## Deploy - -To provision this example: - -```sh -terraform init -terraform apply -``` - -Enter `yes` at command prompt to apply - -## Validate - -The following command will update the `kubeconfig` on your local machine and allow you to interact with your EKS Cluster using `kubectl` to validate the deployment. - -1. Run `update-kubeconfig` command: - -```sh -aws eks --region update-kubeconfig --name -``` - -2. 
List the nodes running currently - -```sh -kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' -``` - -``` -# Output should look like below -NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP -fargate-ip-10-0-22-6.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-23-139.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-23-59.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-24-236.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-25-116.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-31-31.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -ip-10-0-30-113.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.30.113 -ip-10-0-31-158.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.31.158 -``` - -3. List out the pods running currently: - -```sh -kubectl get pods,svc -n kube-system -``` - -``` -# Output should look like below -NAME READY STATUS RESTARTS AGE -pod/aws-load-balancer-controller-8758bf745-grj9s 1/1 Running 0 3h42m -pod/aws-load-balancer-controller-8758bf745-j5m5j 1/1 Running 0 3h42m -pod/aws-node-crst2 2/2 Running 0 3h42m -pod/aws-node-dbs2f 2/2 Running 0 3h42m -pod/coredns-5c9679c87-fsxtt 1/1 Running 0 3h42m -pod/coredns-5c9679c87-fttcc 1/1 Running 0 3h42m -pod/kube-proxy-lrsd9 1/1 Running 0 3h42m -pod/kube-proxy-rc49k 1/1 Running 0 3h42m - -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -service/aws-load-balancer-webhook-service ClusterIP 172.20.134.154 443/TCP 3h42m -service/kube-dns ClusterIP 172.20.0.10 53/UDP,53/TCP 3h52m -``` - -4. Verify all the helm releases installed: - -```sh -helm list -A -``` - -``` -# Output should look like below -NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION -aws-load-balancer-controller kube-system 1 2023-10-19 09:01:45.053426 -0400 EDT deployed aws-load-balancer-controller-1.6.1 v2.6.1 -karpenter karpenter 4 2023-10-19 09:56:07.225133 -0400 EDT deployed karpenter-v0.30.0 0.30.0 -``` - -## Test - -1. 
Verify both Fargate nodes and EKS Managed Nodegroup worker nodes are deployed to single AZ - -```sh -kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' -``` - -``` -NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP -fargate-ip-10-0-22-6.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-23-139.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-23-59.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-24-236.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-25-116.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-31-31.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -ip-10-0-30-113.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.30.113 -ip-10-0-31-158.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.31.158 -``` - -2. Scale the `inflate` deployment to 20 replicas and watch for Karpenter to launch EKS worker nodes in correct AZ. - -```sh -kubectl scale deployment inflate --replicas 20 -``` - -``` -deployment.apps/inflate scaled -``` - -3. Wait for the pods become ready - -```sh -kubectl wait --for=condition=ready pods --all --timeout 2m -``` - -``` -pod/inflate-75d744d4c6-26nfh condition met -pod/inflate-75d744d4c6-4hfxf condition met -pod/inflate-75d744d4c6-4tvzr condition met -pod/inflate-75d744d4c6-5jkdp condition met -pod/inflate-75d744d4c6-5lpkg condition met -pod/inflate-75d744d4c6-6kv28 condition met -pod/inflate-75d744d4c6-7k5k5 condition met -pod/inflate-75d744d4c6-b7mm4 condition met -pod/inflate-75d744d4c6-kq9z7 condition met -pod/inflate-75d744d4c6-kslkq condition met -pod/inflate-75d744d4c6-mfps6 condition met -pod/inflate-75d744d4c6-s6h2j condition met -pod/inflate-75d744d4c6-s9db9 condition met -pod/inflate-75d744d4c6-sbmlz condition met -pod/inflate-75d744d4c6-slqhw condition met -pod/inflate-75d744d4c6-t9z27 condition met -pod/inflate-75d744d4c6-tqrjd condition met -pod/inflate-75d744d4c6-w9w8b condition met -pod/inflate-75d744d4c6-wk2jb condition met -pod/inflate-75d744d4c6-z54wg condition met -``` - -4. 
Check all the nodes are in the correct AZ - -```sh -kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type=="Ready")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip' -``` -``` -NODE_NAME READY INSTANCE-TYPE AZ VERSION OS-IMAGE INTERNAL-IP -fargate-ip-10-0-22-6.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-23-139.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-23-59.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-24-236.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-25-116.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -fargate-ip-10-0-31-31.us-west-2.compute.internal True us-west-2b v1.28.2-eks-f8587cb Amazon Linux 2 -ip-10-0-27-134.us-west-2.compute.internal True c6g.8xlarge us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.27.134 -ip-10-0-30-113.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.30.113 -ip-10-0-31-158.us-west-2.compute.internal True m5.large us-west-2b v1.28.1-eks-43840fb Amazon Linux 2 10.0.31.158 -``` - -## Destroy - -To teardown and remove the resources created in this example: - -```sh -terraform destroy -target="module.eks_blueprints_addons" -auto-approve -terraform destroy -auto-approve -``` diff --git a/patterns/cell-based-eks/2.cell2/main.tf b/patterns/cell-based-eks/2.cell2/main.tf deleted file mode 100644 index 8f3c8b8d00..0000000000 --- a/patterns/cell-based-eks/2.cell2/main.tf +++ /dev/null @@ -1,345 +0,0 @@ -provider "aws" { - region = local.region -} - -# Required for public ECR where Karpenter artifacts are hosted -provider "aws" { - region = "us-east-1" - alias = "virginia" -} - -data "terraform_remote_state" "vpc" { - backend = "local" - - config = { - path = "${path.module}/../0.vpc/terraform.tfstate" - } -} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.virginia -} - -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } - } -} - -provider "kubectl" { - apply_retry_count = 5 - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - load_config_file = false - - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } -} - -data 
"aws_availability_zones" "available" {} - -locals { - cluster_name = var.name - region = var.region - azs = slice(data.aws_availability_zones.available.names, 0, 3) - - tags = { - Blueprint = local.cluster_name - GithubRepo = "github.com/aws-ia/terraform-aws-eks-blueprints" - } -} - -################################################################################ -# Cluster -################################################################################ - -module "eks" { - source = "terraform-aws-modules/eks/aws" - version = "~> 19.16" - - cluster_name = local.cluster_name - cluster_version = "1.28" - cluster_endpoint_public_access = true - - vpc_id = data.terraform_remote_state.vpc.outputs.vpc_id - subnet_ids = data.terraform_remote_state.vpc.outputs.subnet_ids - - manage_aws_auth_configmap = true - aws_auth_roles = [ - # We need to add in the Karpenter node IAM role for nodes launched by Karpenter - { - rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn - username = "system:node:{{EC2PrivateDNSName}}" - groups = [ - "system:bootstrappers", - "system:nodes", - ] - }, - ] - - fargate_profiles = { - karpenter = { - selectors = [ - { namespace = "karpenter" } - ] - subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[1]] - } - kube_system = { - name = "kube-system" - selectors = [ - { namespace = "kube-system" } - ] - subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[1]] - } - } - - eks_managed_node_groups = { - cell2 = { - instance_types = ["m5.large"] - - min_size = 1 - max_size = 5 - desired_size = 2 - - subnet_ids = [data.terraform_remote_state.vpc.outputs.subnet_ids[1]] - } - } - - tags = local.tags -} - -################################################################################ -# EKS Blueprints Addons -################################################################################ - -module "eks_blueprints_addons" { - source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.0" - - cluster_name = module.eks.cluster_name - cluster_endpoint = module.eks.cluster_endpoint - cluster_version = module.eks.cluster_version - oidc_provider_arn = module.eks.oidc_provider_arn - - # We want to wait for the Fargate profiles to be deployed first - create_delay_dependencies = [for prof in module.eks.fargate_profiles : prof.fargate_profile_arn] - - eks_addons = { - coredns = { - configuration_values = jsonencode({ - computeType = "Fargate" - # Ensure that the we fully utilize the minimum amount of resources that are supplied by - # Fargate https://docs.aws.amazon.com/eks/latest/userguide/fargate-pod-configuration.html - # Fargate adds 256 MB to each pod's memory reservation for the required Kubernetes - # components (kubelet, kube-proxy, and containerd). Fargate rounds up to the following - # compute configuration that most closely matches the sum of vCPU and memory requests in - # order to ensure pods always have the resources that they need to run. 
- resources = { - limits = { - cpu = "0.25" - # We are targetting the smallest Task size of 512Mb, so we subtract 256Mb from the - # request/limit to ensure we can fit within that task - memory = "256M" - } - requests = { - cpu = "0.25" - # We are targetting the smallest Task size of 512Mb, so we subtract 256Mb from the - # request/limit to ensure we can fit within that task - memory = "256M" - } - } - }) - } - vpc-cni = {} - kube-proxy = {} - } - - enable_karpenter = true - karpenter = { - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - } - - enable_aws_load_balancer_controller = true - aws_load_balancer_controller = { - chart_version = "1.6.1" # min version required to use SG for NLB feature - set = [ - { - name = "vpcId" - value = data.terraform_remote_state.vpc.outputs.vpc_id - }, - { - name = "podDisruptionBudget.maxUnavailable" - value = 1 - }, - ] - } - - tags = local.tags -} - -################################################################################ -# Karpenter -################################################################################ - -resource "aws_security_group" "karpenter_sg" { - name = "${local.cluster_name}_karpenter_sg" - description = "${local.cluster_name} Karpenter SG" - vpc_id = data.terraform_remote_state.vpc.outputs.vpc_id - tags = { - "Name" = "${local.cluster_name}_karpenter_sg" - "karpenter.sh/discovery" = local.cluster_name - } -} - -resource "aws_vpc_security_group_egress_rule" "karpenter_sg_allow_all_4" { - security_group_id = aws_security_group.karpenter_sg.id - - ip_protocol = "-1" - cidr_ipv4 = "0.0.0.0/0" -} - -resource "aws_vpc_security_group_egress_rule" "karpenter_sg_allow_all_6" { - security_group_id = aws_security_group.karpenter_sg.id - - ip_protocol = "-1" - cidr_ipv6 = "::/0" -} - -resource "aws_vpc_security_group_ingress_rule" "karpenter_sg_allow_cluster_ing" { - security_group_id = aws_security_group.karpenter_sg.id - - ip_protocol = "-1" - referenced_security_group_id = module.eks.cluster_security_group_id -} - -resource "aws_vpc_security_group_ingress_rule" "karpenter_sg_allow_mng_ing" { - security_group_id = aws_security_group.karpenter_sg.id - - ip_protocol = "-1" - referenced_security_group_id = module.eks.node_security_group_id -} - -resource "aws_vpc_security_group_ingress_rule" "cluster_sg_allow_karpenter_ing" { - security_group_id = module.eks.cluster_security_group_id - - ip_protocol = "-1" - referenced_security_group_id = aws_security_group.karpenter_sg.id -} - -resource "kubectl_manifest" "karpenter_provisioner" { - yaml_body = <<-YAML - apiVersion: karpenter.sh/v1alpha5 - kind: Provisioner - metadata: - name: default - spec: - requirements: - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["c", "m"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: In - values: ["8", "16", "32"] - - key: "karpenter.k8s.aws/instance-hypervisor" - operator: In - values: ["nitro"] - - key: "topology.kubernetes.io/zone" - operator: In - values: [${jsonencode(local.azs[1])}] - - key: "kubernetes.io/arch" - operator: In - values: ["arm64", "amd64"] - - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand - operator: In - values: ["spot", "on-demand"] - kubeletConfiguration: - containerRuntime: containerd - maxPods: 110 - limits: - resources: - cpu: 1000 - consolidation: - enabled: true - providerRef: - name: default - ttlSecondsUntilExpired: 
604800 # 7 Days = 7 * 24 * 60 * 60 Seconds - YAML - - depends_on = [ - module.eks_blueprints_addons - ] -} - -resource "kubectl_manifest" "karpenter_node_template" { - yaml_body = <<-YAML - apiVersion: karpenter.k8s.aws/v1alpha1 - kind: AWSNodeTemplate - metadata: - name: default - spec: - subnetSelector: - aws-ids: ${data.terraform_remote_state.vpc.outputs.subnet_ids[1]} - securityGroupSelector: - karpenter.sh/discovery: ${module.eks.cluster_name} - instanceProfile: ${module.eks_blueprints_addons.karpenter.node_instance_profile_name} - tags: - karpenter.sh/discovery: ${module.eks.cluster_name} - YAML -} - -# Example deployment using the [pause image](https://www.ianlewis.org/en/almighty-pause-container) -# and starts with zero replicas -resource "kubectl_manifest" "karpenter_example_deployment" { - yaml_body = <<-YAML - apiVersion: apps/v1 - kind: Deployment - metadata: - name: inflate - spec: - replicas: 0 - selector: - matchLabels: - app: inflate - template: - metadata: - labels: - app: inflate - spec: - terminationGracePeriodSeconds: 0 - containers: - - name: inflate - image: public.ecr.aws/eks-distro/kubernetes/pause:3.7 - resources: - requests: - cpu: 1 - YAML - - depends_on = [ - kubectl_manifest.karpenter_node_template - ] -} diff --git a/patterns/cell-based-eks/2.cell2/outputs.tf b/patterns/cell-based-eks/2.cell2/outputs.tf deleted file mode 100644 index d3a1ef5eb0..0000000000 --- a/patterns/cell-based-eks/2.cell2/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "configure_kubectl" { - description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" -} - -output "cluster_endpoint" { - description = "Cluster endpoint" - value = module.eks.cluster_endpoint -} - -output "cluster_certificate_authority_data" { - description = "Cluster ca certificate" - value = module.eks.cluster_certificate_authority_data -} - -output "cluster_name" { - description = "Cluster name" - value = module.eks.cluster_name -} - -output "cluster_region" { - description = "Cluster region" - value = local.region -} diff --git a/patterns/cell-based-eks/2.cell2/variables.tf b/patterns/cell-based-eks/2.cell2/variables.tf deleted file mode 100644 index 9969dd7b67..0000000000 --- a/patterns/cell-based-eks/2.cell2/variables.tf +++ /dev/null @@ -1,12 +0,0 @@ - -variable "name" { - description = "cluster name" - type = string - default = "cell-2" -} - -variable "region" { - description = "cluster name" - type = string - default = "us-west-2" -} diff --git a/patterns/cell-based-eks/3.az3.tf b/patterns/cell-based-eks/3.az3.tf new file mode 100644 index 0000000000..2797e3700d --- /dev/null +++ b/patterns/cell-based-eks/3.az3.tf @@ -0,0 +1,123 @@ +provider "kubernetes" { + alias = "k8s-az3" + host = module.eks_az3.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks_az3.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks_az3.cluster_name] + } +} + +provider "helm" { + alias = "helm-az3" + kubernetes { + host = module.eks_az3.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks_az3.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This 
requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks_az3.cluster_name] + } + } +} + +locals { + cell3_name = format("%s-%s", local.name, "az3") +} + +################################################################################ +# Cluster +################################################################################ + +module "eks_az3" { + source = "terraform-aws-modules/eks/aws" + version = "~> 19.18" + + providers = { + kubernetes = kubernetes.k8s-az3 + } + + cluster_name = local.cell3_name + cluster_version = "1.28" + cluster_endpoint_public_access = true + + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + + manage_aws_auth_configmap = true + aws_auth_roles = [ + # We need to add in the Karpenter node IAM role for nodes launched by Karpenter + { + rolearn = module.eks_blueprints_addons_az3.karpenter.node_iam_role_arn + username = "system:node:{{EC2PrivateDNSName}}" + groups = [ + "system:bootstrappers", + "system:nodes", + ] + }, + ] + + eks_managed_node_groups = { + cell1 = { + instance_types = ["m5.large"] + + min_size = 1 + max_size = 5 + desired_size = 2 + + subnet_ids = [module.vpc.private_subnets[2]] + } + } + + tags = merge(local.tags, { + # NOTE - if creating multiple security groups with this module, only tag the + # security group that Karpenter should utilize with the following tag + # (i.e. - at most, only one security group should have this tag in your account) + "karpenter.sh/discovery" = local.cell3_name + }) +} + +################################################################################ +# EKS Blueprints Addons +################################################################################ + +module "eks_blueprints_addons_az3" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.11" + + providers = { + helm = helm.helm-az3 + kubernetes = kubernetes.k8s-az3 + } + + cluster_name = module.eks_az3.cluster_name + cluster_endpoint = module.eks_az3.cluster_endpoint + cluster_version = module.eks_az3.cluster_version + oidc_provider_arn = module.eks_az3.oidc_provider_arn + + # We want to wait for the EKS Managed Nodegroups to be deployed first + create_delay_dependencies = [for group in module.eks_az3.eks_managed_node_groups : group.node_group_arn] + + eks_addons = { + coredns = {} + vpc-cni = {} + kube-proxy = {} + } + + enable_karpenter = true + karpenter = { + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } + karpenter_node = { + # Use static name so that it matches what is defined in `az3.yaml` example manifest + iam_role_use_name_prefix = false + } + + tags = local.tags +} diff --git a/patterns/cell-based-eks/README.md b/patterns/cell-based-eks/README.md index c3306d5090..4d4b3c9fd4 100644 --- a/patterns/cell-based-eks/README.md +++ b/patterns/cell-based-eks/README.md @@ -1,92 +1,95 @@ # Cell-Based Architecture for Amazon EKS -This pattern demonstrates how to configure a cell-based architecture for Amazon Elastic Kubernetes Service (Amazon EKS). It moves away from typical multiple Availability Zone (AZ) clusters to a single Availability Zone cluster. These single AZ clusters are called cells, and the aggregation of these cells in each Region is called a supercell. 
These cells help to ensure that a failure in one cell doesn't affect the cells in another, reducing data transfer costs and improving both the availability and resiliency against AZ wide failures for Amazon EKS workloads. +This pattern demonstrates how to configure a cell-based architecture for Amazon Elastic Kubernetes Service (Amazon EKS) workloads. It moves away from typical multiple Availability Zone (AZ) clusters to a single Availability Zone cluster. These single AZ clusters are called cells, and the aggregation of these cells in each Region is called a supercell. These cells help to ensure that a failure in one cell doesn't affect the cells in another, reducing data transfer costs and improving both the availability and resiliency against AZ wide failures for Amazon EKS workloads. Refer to the [AWS Solution Guidance](https://aws.amazon.com/solutions/guidance/cell-based-architecture-for-amazon-eks/) for more details. -## Notable configuration - -* This sample rely on reading data from Terraform Remote State in the different folders. In a production setup, Terraform Remote State is stored in a persistent backend such as Terraform Cloud or S3. For more information, please refer to the Terraform [Backends](https://developer.hashicorp.com/terraform/language/settings/backends/configuration) documentation - -## Folder structure - -### [`0.vpc`](0.vpc/) - -This folder creates the VPC for all clusters. In this demonstration we are creating 2 cells sharing the same VPC. So, the VPC creation is not part of the cluster provisionig and therefore lives in a seperate folder. You could also explore a VPC per cluster depending on your needs. - -### [`1.cell1`](1.cell1/) - -This folder creates an Amazon EKS Cluster, named by default `cell-1` (see [`variables.tf`](1.cell1/variables.tf)), with AWS Load Balancer Controller, and Karpenter installation. -Configurations in this folder to be aware of: +## Deploy -* The cluster is configured to use the subnet-1 (AZ-1) created in the `0.vpc` folder. -* Karpenter `Provisioner` and `AWSNodeTemplate` resources are pointing to AZ-1 subnet. -* Essential operational addons like `coredns`, `aws-load-balancer-controller`, and `karpenter` are deployed to Fargate configured with AZ-1 subnet. +See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#prerequisites) for the prerequisites. This pattern consists of 1 VPC with 3 public and 3 private subnets across 3 AZs. Also, 3 Amazon EKS clusters are deployed, each in a single AZ. -### [`2.cell2`](2.cell2/) +```bash +terraform init +terraform apply -target="module.vpc" -auto-approve +terraform apply -target="module.eks_az1" -auto-approve +terraform apply -target="module.eks_az2" -auto-approve +terraform apply -target="module.eks_az3" -auto-approve +terraform apply -auto-approve +``` -Same configuration as in `1.cell1` except the name of the cluster is `cell-2` and deployed in `az-2` +## Validate -### [`3.test-setup`](3.test-setup/) +1. Export the necessary environment variables and update the local kubeconfig file. -This folder test the installation setup. It does by scaling the sample `inflate` application replicas and watch for Karpenter to launch EKS worker nodes in respective AZs.
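+
+If any of the exported values below come back empty, you can first list this pattern's Terraform outputs (an optional sanity check; it assumes the `subnet_id_az1`/`subnet_id_az2`/`subnet_id_az3` outputs defined in this pattern's `outputs.tf` have been created by the apply above):
+
+```bash
+# List the subnet ID outputs that the export commands below read with `terraform output -raw`
+terraform output | grep subnet_id
+```
+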
+```bash +export CELL_1=cell-based-eks-az1 +export CELL_2=cell-based-eks-az2 +export CELL_3=cell-based-eks-az3 +export AWS_REGION=$(aws configure get region) #AWS region of the EKS clusters +export AWS_ACCOUNT_NUMBER=$(aws sts get-caller-identity --query "Account" --output text) +export SUBNET_ID_CELL1=$(terraform output -raw subnet_id_az1) +export SUBNET_ID_CELL2=$(terraform output -raw subnet_id_az2) +export SUBNET_ID_CELL3=$(terraform output -raw subnet_id_az3) +alias kgn="kubectl get node -o custom-columns='NODE_NAME:.metadata.name,READY:.status.conditions[?(@.type==\"Ready\")].status,INSTANCE-TYPE:.metadata.labels.node\.kubernetes\.io/instance-type,AZ:.metadata.labels.topology\.kubernetes\.io/zone,CAPACITY-TYPE:.metadata.labels.karpenter\.sh/capacity-type,VERSION:.status.nodeInfo.kubeletVersion,OS-IMAGE:.status.nodeInfo.osImage,INTERNAL-IP:.metadata.annotations.alpha\.kubernetes\.io/provided-node-ip'" +``` -## Prerequisites +```bash +aws eks update-kubeconfig --name $CELL_1 --region $AWS_REGION --alias $CELL_1 +aws eks update-kubeconfig --name $CELL_2 --region $AWS_REGION --alias $CELL_2 +aws eks update-kubeconfig --name $CELL_3 --region $AWS_REGION --alias $CELL_3 +``` -Ensure that you have the following tools installed locally: +2. Let's start the validation with Cell 1, which is running in AZ1. Verify that the existing nodes are deployed in AZ1 (us-west-2a). -1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) -2. [kubectl](https://Kubernetes.io/docs/tasks/tools/) -3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) +```bash +kgn --context ${CELL_1} +``` +```output +NODE_NAME READY INSTANCE-TYPE AZ CAPACITY-TYPE VERSION OS-IMAGE INTERNAL-IP +ip-10-0-12-83.us-west-2.compute.internal True m5.large us-west-2a v1.28.3-eks-e71965b Amazon Linux 2 10.0.12.83 +ip-10-0-7-191.us-west-2.compute.internal True m5.large us-west-2a v1.28.3-eks-e71965b Amazon Linux 2 10.0.7.191 +``` -## Deploy +3. Deploy the necessary Karpenter resources (`EC2NodeClass` and `NodePool`) and configure them to launch any EC2 instances in AZ1 only. -### Step 0 - Create the VPC +```bash +sed -i'.bak' -e 's/SUBNET_ID_CELL1/'"${SUBNET_ID_CELL1}"'/g' az1.yaml -```shell -cd 0.vpc -terraform init -terraform apply -auto-approve -cd.. +kubectl apply -f az1.yaml --context ${CELL_1} ``` -### Step 1 - Deploy cell-1 - -```shell -cd 1.cell1 -terraform init -terraform apply -auto-approve -cd.. -``` +4. Deploy the sample application `inflate` with 20 replicas and watch Karpenter launch the EC2 worker nodes in AZ1. -### Step 2 - Deploy cell-2 +```bash +kubectl apply -f inflate.yaml --context ${CELL_1} -```shell -cd 2.cell2 -terraform init -terraform apply -auto-approve -cd.. +kubectl wait --for=condition=ready pods --all --timeout 2m --context ${CELL_1} ``` -### Step 3 - test installation +5. List the EKS worker nodes to verify that all of them are deployed in AZ1. -```shell -cd 3.test-setup -./test_setup.sh -cd.. +```bash +kgn --context ${CELL_1} +``` +```output +NODE_NAME READY INSTANCE-TYPE AZ CAPACITY-TYPE VERSION OS-IMAGE INTERNAL-IP +ip-10-0-11-154.us-west-2.compute.internal True c7g.8xlarge us-west-2a spot v1.28.3-eks-e71965b Amazon Linux 2 10.0.11.154 +ip-10-0-12-83.us-west-2.compute.internal True m5.large us-west-2a v1.28.3-eks-e71965b Amazon Linux 2 10.0.12.83 +ip-10-0-7-191.us-west-2.compute.internal True m5.large us-west-2a v1.28.3-eks-e71965b Amazon Linux 2 10.0.7.191 ``` -This script scale the sample application `inflate` to 20 replicas in both cells.
As replica pods go into pending state due to insufficient compute capacity, Karpenter will kick-in and bring up the EC2 worker nodes in respective AZs. +6. Repeat steps 2 to 5 for Cell 2 and Cell 3, using `--context $CELL_2` and `--context $CELL_3` respectively. ## Destroy -To teardown and remove the resources created in this example: +To tear down and remove the resources created by this pattern, the typical order of execution is as follows: -```shell -cd 2.cell2 -terraform apply -destroy -auto-approve -cd ../1.cell1 -terraform apply -destroy -auto-approve -cd ../0.vpc -terraform apply -destroy -auto-approve +```bash +terraform destroy -target="module.eks_blueprints_addons_az1" -auto-approve +terraform destroy -target="module.eks_blueprints_addons_az2" -auto-approve +terraform destroy -target="module.eks_blueprints_addons_az3" -auto-approve +terraform destroy -target="module.eks_az1" -auto-approve +terraform destroy -target="module.eks_az2" -auto-approve +terraform destroy -target="module.eks_az3" -auto-approve +terraform destroy -auto-approve ``` diff --git a/patterns/cell-based-eks/az1.yaml b/patterns/cell-based-eks/az1.yaml new file mode 100644 index 0000000000..4c821d91e8 --- /dev/null +++ b/patterns/cell-based-eks/az1.yaml @@ -0,0 +1,43 @@ +--- +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass +metadata: + name: default +spec: + amiFamily: AL2 + role: karpenter-cell-based-eks-az1 + subnetSelectorTerms: + - id: SUBNET_ID_CELL1 + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: cell-based-eks-az1 + tags: + karpenter.sh/discovery: cell-based-eks-az1 +--- +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +metadata: + name: default +spec: + template: + spec: + nodeClassRef: + name: default + requirements: + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c", "m", "r"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s diff --git a/patterns/cell-based-eks/az2.yaml b/patterns/cell-based-eks/az2.yaml new file mode 100644 index 0000000000..484fe169ad --- /dev/null +++ b/patterns/cell-based-eks/az2.yaml @@ -0,0 +1,43 @@ +--- +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass +metadata: + name: default +spec: + amiFamily: AL2 + role: karpenter-cell-based-eks-az2 + subnetSelectorTerms: + - id: SUBNET_ID_CELL2 + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: cell-based-eks-az2 + tags: + karpenter.sh/discovery: cell-based-eks-az2 +--- +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +metadata: + name: default +spec: + template: + spec: + nodeClassRef: + name: default + requirements: + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c", "m", "r"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s diff --git a/patterns/cell-based-eks/az3.yaml b/patterns/cell-based-eks/az3.yaml new file mode 100644 index 0000000000..ecc1d7c696 --- /dev/null +++ b/patterns/cell-based-eks/az3.yaml @@ -0,0 +1,43 @@ +--- +apiVersion: 
karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass +metadata: + name: default +spec: + amiFamily: AL2 + role: karpenter-cell-based-eks-az3 + subnetSelectorTerms: + - id: SUBNET_ID_CELL3 + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: cell-based-eks-az3 + tags: + karpenter.sh/discovery: cell-based-eks-az3 +--- +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +metadata: + name: default +spec: + template: + spec: + nodeClassRef: + name: default + requirements: + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c", "m", "r"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s diff --git a/patterns/cell-based-eks/inflate.yaml b/patterns/cell-based-eks/inflate.yaml new file mode 100644 index 0000000000..f755e5f244 --- /dev/null +++ b/patterns/cell-based-eks/inflate.yaml @@ -0,0 +1,21 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inflate +spec: + replicas: 20 + selector: + matchLabels: + app: inflate + template: + metadata: + labels: + app: inflate + spec: + terminationGracePeriodSeconds: 0 + containers: + - name: inflate + image: public.ecr.aws/eks-distro/kubernetes/pause:3.7 + resources: + requests: + cpu: 1 diff --git a/patterns/cell-based-eks/outputs.tf b/patterns/cell-based-eks/outputs.tf new file mode 100644 index 0000000000..a87926d032 --- /dev/null +++ b/patterns/cell-based-eks/outputs.tf @@ -0,0 +1,34 @@ +output "vpc_id" { + description = "Amazon EKS VPC ID" + value = module.vpc.vpc_id +} + +output "subnet_id_az1" { + description = "Amazon EKS AZ1 Cluster Subnet ID" + value = module.vpc.private_subnets[0] +} + +output "subnet_id_az2" { + description = "Amazon EKS AZ2 Cluster Subnet ID" + value = module.vpc.private_subnets[1] +} + +output "subnet_id_az3" { + description = "Amazon EKS AZ3 Cluster Subnet ID" + value = module.vpc.private_subnets[2] +} + +output "configure_kubectl_az1" { + description = "Configure kubectl for AZ1 Cluster: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks_az1.cluster_name}" +} + +output "configure_kubectl_az2" { + description = "Configure kubectl for AZ2 Cluster: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks_az2.cluster_name}" +} + +output "configure_kubectl_az3" { + description = "Configure kubectl for AZ3 Cluster: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks_az3.cluster_name}" +} diff --git a/patterns/cell-based-eks/3.test-setup/test_setup.sh b/patterns/cell-based-eks/test_setup.sh similarity index 59% rename from patterns/cell-based-eks/3.test-setup/test_setup.sh rename to patterns/cell-based-eks/test_setup.sh index 48cb1036ed..41e6390aec 100755 --- a/patterns/cell-based-eks/3.test-setup/test_setup.sh +++ b/patterns/cell-based-eks/test_setup.sh @@ -1,13 +1,19 @@ -export CELL_1=cell-1 -export CELL_2=cell-2 +export CELL_1=cell-based-eks-az1 +export CELL_2=cell-based-eks-az2 +export 
CELL_3=cell-based-eks-az3 export AWS_DEFAULT_REGION=$(aws configure get region) export AWS_ACCOUNT_NUMBER=$(aws sts get-caller-identity --query "Account" --output text) +export SUBNET_ID_CELL1=$(terraform output -raw subnet_id_az1) +export SUBNET_ID_CELL2=$(terraform output -raw subnet_id_az2) +export SUBNET_ID_CELL3=$(terraform output -raw subnet_id_az3) aws eks update-kubeconfig --name $CELL_1 --region $AWS_DEFAULT_REGION aws eks update-kubeconfig --name $CELL_2 --region $AWS_DEFAULT_REGION +aws eks update-kubeconfig --name $CELL_3 --region $AWS_DEFAULT_REGION export CTX_CELL_1=arn:aws:eks:$AWS_DEFAULT_REGION:${AWS_ACCOUNT_NUMBER}:cluster/$CELL_1 export CTX_CELL_2=arn:aws:eks:$AWS_DEFAULT_REGION:${AWS_ACCOUNT_NUMBER}:cluster/$CELL_2 +export CTX_CELL_3=arn:aws:eks:$AWS_DEFAULT_REGION:${AWS_ACCOUNT_NUMBER}:cluster/$CELL_3 bold=$(tput bold) normal=$(tput sgr0) @@ -20,6 +26,10 @@ echo "${bold}Cell-1: Nodes before the scaling event${normal}" kgn --context="${CTX_CELL_1}" +sed -i'.bak' -e 's/SUBNET_ID_CELL1/'"${SUBNET_ID_CELL1}"'/g' az1.yaml + +kubectl apply -f az1.yaml,inflate.yaml --context="${CTX_CELL_1}" + echo "${bold}Cell-1: Scaling the inflate deployment to 50 replicas${normal}" kubectl scale deployment inflate --replicas 20 --context="${CTX_CELL_1}" @@ -38,14 +48,40 @@ echo "${bold}Cell-2: Nodes before the scaling event${normal}" kgn --context="${CTX_CELL_2}" +sed -i'.bak' -e 's/SUBNET_ID_CELL2/'"${SUBNET_ID_CELL2}"'/g' az2.yaml + +kubectl apply -f az2.yaml,inflate.yaml --context="${CTX_CELL_2}" + echo "${bold}Cell-2: Scaling the inflate deployment to 50 replicas${normal}" kubectl scale deployment inflate --replicas 20 --context="${CTX_CELL_2}" echo "${bold}Cell-2: Wait for karpenter to launch the worker nodes and pods become ready......${normal}" -kubectl wait --for=condition=ready pods --all --timeout 2m --context="${CTX_CELL_2}" +kubectl wait --for=condition=ready pods --all --timeout 2m --context="${CTX_CELL_2}" echo "${bold}Cell-2: Nodes after the scaling event${normal}" kgn --context="${CTX_CELL_2}" + +echo "------------${bold}Test the Cell-3 Setup${normal}-------------" + +echo "${bold}Cell-3: Nodes before the scaling event${normal}" + +kgn --context="${CTX_CELL_3}" + +sed -i'.bak' -e 's/SUBNET_ID_CELL3/'"${SUBNET_ID_CELL3}"'/g' az3.yaml + +kubectl apply -f az3.yaml,inflate.yaml --context="${CTX_CELL_3}" + +echo "${bold}Cell-3: Scaling the inflate deployment to 50 replicas${normal}" + +kubectl scale deployment inflate --replicas 20 --context="${CTX_CELL_3}" + +echo "${bold}Cell-3: Wait for karpenter to launch the worker nodes and pods become ready......${normal}" + +kubectl wait --for=condition=ready pods --all --timeout 2m --context="${CTX_CELL_3}" + +echo "${bold}Cell-3: Nodes after the scaling event${normal}" + +kgn --context="${CTX_CELL_3}" diff --git a/patterns/cell-based-eks/0.vpc/variables.tf b/patterns/cell-based-eks/variables.tf similarity index 100% rename from patterns/cell-based-eks/0.vpc/variables.tf rename to patterns/cell-based-eks/variables.tf diff --git a/patterns/cell-based-eks/2.cell2/versions.tf b/patterns/cell-based-eks/versions.tf similarity index 76% rename from patterns/cell-based-eks/2.cell2/versions.tf rename to patterns/cell-based-eks/versions.tf index 9577780c9b..e11e084c1d 100644 --- a/patterns/cell-based-eks/2.cell2/versions.tf +++ b/patterns/cell-based-eks/versions.tf @@ -14,16 +14,12 @@ terraform { source = "hashicorp/kubernetes" version = ">= 2.20" } - kubectl = { - source = "gavinbunney/kubectl" - version = ">= 1.14" - } } # ## Used for 
end-to-end testing on project; update to suit your needs # backend "s3" { # bucket = "" # region = "" - # key = "e2e/istio-multi-cluster-vpc/terraform.tfstate" + # key = "e2e/cell-based-eks/terraform.tfstate" # } }
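If you enable the commented-out `backend "s3"` block above, the bucket and region do not have to be hard-coded in the file; Terraform also accepts a partial backend configuration supplied at init time. A minimal sketch, assuming an existing S3 bucket (the bucket name below is illustrative only):

```bash
# Sketch: supply the remaining S3 backend settings at init time (partial configuration).
# Assumes the backend "s3" block above has been uncommented; the bucket name is a placeholder.
terraform init \
  -backend-config="bucket=my-terraform-state-bucket" \
  -backend-config="region=us-west-2" \
  -backend-config="key=e2e/cell-based-eks/terraform.tfstate"
```

If local state from earlier applies already exists, running `terraform init -migrate-state` after enabling the backend copies it into the bucket.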