From 102430b8ecc66f9219c4501755611af3573c5a51 Mon Sep 17 00:00:00 2001
From: Bryant Biggs
Date: Fri, 3 May 2024 13:04:49 -0400
Subject: [PATCH] feat: Add pattern that demonstrates using ML capacity block reservation with self-managed node group

---
 .github/workflows/publish-docs.yml   |   4 +-
 .pre-commit-config.yaml              |   2 +-
 docs/patterns/ml-capacity-block.md   |   7 ++
 patterns/ml-capacity-block/README.md |  31 ++++++
 patterns/ml-capacity-block/eks.tf    | 144 +++++++++++++++++++++++++++
 patterns/ml-capacity-block/main.tf   |  87 ++++++++++++++++
 patterns/nvidia-gpu-efa/README.md    |  12 +-
 patterns/nvidia-gpu-efa/eks.tf       |   2 +-
 patterns/nvidia-gpu-efa/main.tf      |   2 +-
 patterns/targeted-odcr/README.md     |   8 +-
 patterns/targeted-odcr/eks.tf        |   2 +-
 11 files changed, 288 insertions(+), 13 deletions(-)
 create mode 100644 docs/patterns/ml-capacity-block.md
 create mode 100644 patterns/ml-capacity-block/README.md
 create mode 100644 patterns/ml-capacity-block/eks.tf
 create mode 100644 patterns/ml-capacity-block/main.tf

diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml
index bf07bc4abc..28573e543c 100644
--- a/.github/workflows/publish-docs.yml
+++ b/.github/workflows/publish-docs.yml
@@ -35,8 +35,8 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install mkdocs-material==9.3.1 \
-            mkdocs-include-markdown-plugin==6.0.1 \
+          python -m pip install mkdocs-material==9.5.21 \
+            mkdocs-include-markdown-plugin==6.0.6 \
             mkdocs-awesome-pages-plugin==2.9.2

       - name: git config
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d9b2283918..cd3e8a60c5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/streetsidesoftware/cspell-cli
-    rev: v8.7.0
+    rev: v8.8.0
     hooks:
       - id: cspell
         args: [--exclude, 'ADOPTERS.md', --exclude, '.pre-commit-config.yaml', --exclude, '.gitignore', --exclude, '*.drawio', --exclude, 'mkdocs.yml', --exclude, '.helmignore', --exclude, '.github/workflows/*', --exclude, 'patterns/istio-multi-cluster/*', --exclude, 'patterns/blue-green-upgrade/*']
diff --git a/docs/patterns/ml-capacity-block.md b/docs/patterns/ml-capacity-block.md
new file mode 100644
index 0000000000..2d1528bcd3
--- /dev/null
+++ b/docs/patterns/ml-capacity-block.md
@@ -0,0 +1,7 @@
+---
+title: ML Capacity Block Reservation (CBR)
+---
+
+{%
+   include-markdown "../../patterns/ml-capacity-block/README.md"
+%}
diff --git a/patterns/ml-capacity-block/README.md b/patterns/ml-capacity-block/README.md
new file mode 100644
index 0000000000..58f21ee7c6
--- /dev/null
+++ b/patterns/ml-capacity-block/README.md
@@ -0,0 +1,31 @@
+# EKS w/ ML Capacity Block Reservation (CBR)
+
+This pattern demonstrates how to consume/utilize ML capacity block reservations (CBR) with Amazon EKS. The solution consists primarily of two components:
+
+!!! warning
+    The use of self-managed node group(s) is required at this time to support capacity block reservations within EKS. This pattern will be updated to demonstrate EKS managed node groups once support has been implemented by the EKS service.
+
+1. The self-managed node group that will utilize the CBR should only be provided subnets that reside in the availability zone where the CBR has been allocated. For example, if the CBR is allocated to `us-west-2b`, the node group should only have subnet IDs provided to it that reside in `us-west-2b`. 
If subnets that reside in other AZs are provided, it's possible to encounter an error such as `InvalidParameterException: The following supplied instance types do not exist ...`. This error is not guaranteed to surface, and it may appear to occur at random, since the underlying autoscaling group(s) provision nodes across the provided AZs at random. It only occurs when the autoscaling group tries to provision instances into an AZ where no capacity has been allocated and there is insufficient on-demand capacity for the desired instance type.
+
+2. The launch template used by the node group should specify the `instance_market_options` and `capacity_reservation_specification` arguments. This is how the CBR is consumed by the node group (i.e., it tells the autoscaling group to launch instances using the provided capacity reservation).
+
+Links:
+
+- [EKS - Capacity Blocks for ML](https://docs.aws.amazon.com/eks/latest/userguide/capacity-blocks.html)
+- [EC2 - Capacity Blocks for ML](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-capacity-blocks.html)
+
+## Code
+
+```terraform hl_lines="53-93"
+{% include "../../patterns/ml-capacity-block/eks.tf" %}
+```
+
+## Deploy
+
+See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#prerequisites) for the prerequisites and steps to deploy this pattern.
+
+## Destroy
+
+{%
+   include-markdown "../../docs/_partials/destroy.md"
+%}
diff --git a/patterns/ml-capacity-block/eks.tf b/patterns/ml-capacity-block/eks.tf
new file mode 100644
index 0000000000..da0d7235e3
--- /dev/null
+++ b/patterns/ml-capacity-block/eks.tf
@@ -0,0 +1,144 @@
+################################################################################
+# Required Input
+################################################################################
+
+# See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/capacity-blocks-using.html
+# on how to obtain an ML capacity block reservation. 
Once acquired, you can provide +# the reservation ID through this input to deploy the pattern +variable "capacity_reservation_id" { + description = "The ID of the ML capacity block reservation to use for the node group" + type = string +} + +################################################################################ +# Cluster +################################################################################ + +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 20.9" + + cluster_name = local.name + cluster_version = "1.29" + + # Give the Terraform identity admin access to the cluster + # which will allow it to deploy resources into the cluster + enable_cluster_creator_admin_permissions = true + cluster_endpoint_public_access = true + + cluster_addons = { + coredns = {} + kube-proxy = {} + vpc-cni = {} + } + + # Add security group rules on the node group security group to + # allow EFA traffic + enable_efa_support = true + + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + + eks_managed_node_groups = { + # This node group is for core addons such as CoreDNS + default = { + instance_types = ["m5.large"] + + min_size = 1 + max_size = 2 + desired_size = 2 + } + } + + # Note: ML capacity block reservations are only supported + # on self-managed node groups at this time + self_managed_node_groups = { + odcr = { + # The EKS AL2 GPU AMI provides all of the necessary components + # for accelerated workloads w/ EFA + ami_type = "AL2_x86_64_GPU" + instance_type = "p5.48xlarge" + + pre_bootstrap_user_data = <<-EOT + # Mount instance store volumes in RAID-0 for kubelet and containerd + # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 + /bin/setup-local-disks raid0 + + # Ensure only GPU workloads are scheduled on this node group + export KUBELET_EXTRA_ARGS='--node-labels=vpc.amazonaws.com/efa.present=true,nvidia.com/gpu.present=true \ + --register-with-taints=nvidia.com/gpu=true:NoSchedule' + + EOT + + min_size = 2 + max_size = 2 + desired_size = 2 + + # This will: + # 1. Create a placement group to place the instances close to one another + # 2. Ignore subnets that reside in AZs that do not support the instance type + # 3. 
Expose all of the available EFA interfaces on the launch template + enable_efa_support = true + + # ML capacity block reservation + instance_market_options = { + market_type = "capacity-block" + } + capacity_reservation_specification = { + capacity_reservation_target = { + capacity_reservation_id = var.capacity_reservation_id + } + } + } + } + + tags = local.tags +} + +################################################################################ +# Helm charts +################################################################################ + +resource "helm_release" "nvidia_device_plugin" { + name = "nvidia-device-plugin" + repository = "https://nvidia.github.io/k8s-device-plugin" + chart = "nvidia-device-plugin" + version = "0.14.5" + namespace = "nvidia-device-plugin" + create_namespace = true + wait = false + + values = [ + <<-EOT + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: 'nvidia.com/gpu.present' + operator: In + values: + - 'true' + EOT + ] +} + +resource "helm_release" "aws_efa_device_plugin" { + name = "aws-efa-k8s-device-plugin" + repository = "https://aws.github.io/eks-charts" + chart = "aws-efa-k8s-device-plugin" + version = "v0.4.4" + namespace = "kube-system" + wait = false + + values = [ + <<-EOT + nodeSelector: + vpc.amazonaws.com/efa.present: 'true' + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + EOT + ] +} diff --git a/patterns/ml-capacity-block/main.tf b/patterns/ml-capacity-block/main.tf new file mode 100644 index 0000000000..4fd6d95715 --- /dev/null +++ b/patterns/ml-capacity-block/main.tf @@ -0,0 +1,87 @@ +terraform { + required_version = ">= 1.3" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.34" + } + helm = { + source = "hashicorp/helm" + version = ">= 2.9" + } + } + + # ## Used for end-to-end testing on project; update to suit your needs + # backend "s3" { + # bucket = "terraform-ssp-github-actions-state" + # region = "us-west-2" + # key = "e2e/ml-capacity-block/terraform.tfstate" + # } +} + +provider "aws" { + region = local.region +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } +} + +################################################################################ +# Common data/locals +################################################################################ + +data "aws_availability_zones" "available" {} + +locals { + name = basename(path.cwd) + region = "us-west-2" + + vpc_cidr = "10.0.0.0/16" + azs = slice(data.aws_availability_zones.available.names, 0, 3) + + tags = { + Blueprint = local.name + GithubRepo = "github.com/aws-ia/terraform-aws-eks-blueprints" + } +} + +################################################################################ +# Supporting Resources +################################################################################ + +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = local.name + cidr = local.vpc_cidr + + azs = local.azs + private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 4, k)] + public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k 
+ 48)] + + enable_nat_gateway = true + single_nat_gateway = true + + public_subnet_tags = { + "kubernetes.io/role/elb" = 1 + } + + private_subnet_tags = { + "kubernetes.io/role/internal-elb" = 1 + } + + tags = local.tags +} diff --git a/patterns/nvidia-gpu-efa/README.md b/patterns/nvidia-gpu-efa/README.md index ce3a16bbdc..e43128c7e2 100644 --- a/patterns/nvidia-gpu-efa/README.md +++ b/patterns/nvidia-gpu-efa/README.md @@ -6,12 +6,12 @@ The following components are demonstrated in this pattern: - A "default" node group that supports addons and components that do not require GPUs nor EFA devices. Any pods that do not tolerate the taints of the GPU node group will be scheduled on instances within this node group. - A node group of `p5.48xlarge` instances with - - all x32 [EFA network interfaces](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html) enabled - - provisioned within a placement group so that the instances are provisioned close to one another in a single availability zone that supports the instance type. - - a common NVIDIA taint of `"nvidia.com/gpu:NoSchedule"` to ensure only the intended applications are allowed to run on the nodes created - - two labels to identify that this nodegroup supports NVIDIA GPUs and EFA devices and allow pods to use node selectors with these labels - - the NVME instance store volumes are mounted in a RAID-0 array to provide a single, large, high-performance storage volume for the GPU workloads - - kubelet and containerd are configured to utilize the RAID-0 volume, allowing kubelet to discover the additional storage as ephemeral storage that can be utilized by pods + - all x32 [EFA network interfaces](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html) enabled + - provisioned within a placement group so that the instances are provisioned close to one another in a single availability zone that supports the instance type. + - a common NVIDIA taint of `"nvidia.com/gpu:NoSchedule"` to ensure only the intended applications are allowed to run on the nodes created + - two labels to identify that this nodegroup supports NVIDIA GPUs and EFA devices and allow pods to use node selectors with these labels + - the NVME instance store volumes are mounted in a RAID-0 array to provide a single, large, high-performance storage volume for the GPU workloads + - kubelet and containerd are configured to utilize the RAID-0 volume, allowing kubelet to discover the additional storage as ephemeral storage that can be utilized by pods - A Helm chart deployment for the [NVIDIA device plugin](https://github.com/NVIDIA/k8s-device-plugin) to expose and mount the GPUs provided by the instances to the pods that request them - A Helm chart deployment for the EFA device plugin to expose and mount the EFA network interfaces provided by the instances to the pods that request them. Since the EFA network interfaces are only found on the instances that provide NVIDIA GPUs in this pattern, we do not apply an additional taint for the EFA network interfaces to avoid over-constraining. 
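For a concrete picture of how workloads consume what these two device plugins expose, the sketch below shows one way a pod could request a GPU and an EFA interface while tolerating the GPU taint and selecting on the labels applied by the node group's kubelet arguments. This is a minimal illustration rather than part of either pattern: it assumes the `hashicorp/kubernetes` provider is configured alongside the `helm` provider shown above, and the pod name, namespace, and container image are placeholders.

```terraform
# Minimal smoke-test pod: requests one GPU and one EFA interface, tolerates the
# GPU taint, and targets nodes labeled by the GPU/EFA node group
resource "kubernetes_manifest" "gpu_efa_smoke_test" {
  manifest = {
    apiVersion = "v1"
    kind       = "Pod"
    metadata = {
      name      = "gpu-efa-smoke-test" # placeholder name
      namespace = "default"
    }
    spec = {
      restartPolicy = "Never"
      nodeSelector = {
        "nvidia.com/gpu.present" = "true"
      }
      tolerations = [{
        key      = "nvidia.com/gpu"
        operator = "Exists"
        effect   = "NoSchedule"
      }]
      containers = [{
        name    = "cuda"
        image   = "nvidia/cuda:12.2.0-base-ubuntu22.04" # placeholder image
        command = ["nvidia-smi"]
        resources = {
          limits = {
            "nvidia.com/gpu"        = "1"
            "vpc.amazonaws.com/efa" = "1"
          }
        }
      }]
    }
  }
}
```

Note that `kubernetes_manifest` requires access to the cluster API at plan time, so a resource like this is typically applied in a second step after the cluster exists, or the equivalent manifest is applied with `kubectl`.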
diff --git a/patterns/nvidia-gpu-efa/eks.tf b/patterns/nvidia-gpu-efa/eks.tf index 38259e872d..51c927a725 100644 --- a/patterns/nvidia-gpu-efa/eks.tf +++ b/patterns/nvidia-gpu-efa/eks.tf @@ -4,7 +4,7 @@ module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 20.8" + version = "~> 20.9" cluster_name = local.name cluster_version = "1.29" diff --git a/patterns/nvidia-gpu-efa/main.tf b/patterns/nvidia-gpu-efa/main.tf index eaa0bae637..e11150c06f 100644 --- a/patterns/nvidia-gpu-efa/main.tf +++ b/patterns/nvidia-gpu-efa/main.tf @@ -16,7 +16,7 @@ terraform { # backend "s3" { # bucket = "terraform-ssp-github-actions-state" # region = "us-west-2" - # key = "e2e/nvida-gpu-efa/terraform.tfstate" + # key = "e2e/nvidia-gpu-efa/terraform.tfstate" # } } diff --git a/patterns/targeted-odcr/README.md b/patterns/targeted-odcr/README.md index bbcc651064..7b2238907e 100644 --- a/patterns/targeted-odcr/README.md +++ b/patterns/targeted-odcr/README.md @@ -11,11 +11,17 @@ This pattern demonstrates how to consume/utilize on-demand capacity reservations 3. A resource group will need to be created for the capacity reservations. The resource group acts like a container, allowing for ODCRs to be added or removed as needed to adjust the available capacity. Utilizing the resource group allows for this additional capacity to be adjusted without any modification or disruption to the existing node group or launch template. As soon as the ODCR has been associated to the resource group, the node group can scale up to start utilizing that capacity. -Links: +Links: - [Tutorial: Launch On-Demand Instances using targeted Capacity Reservations](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-fleet-launch-on-demand-instances-using-targeted-capacity-reservations-walkthrough.html) - [Target a group of Amazon EC2 On-Demand Capacity Reservations](https://aws.amazon.com/blogs/mt/target-a-group-of-amazon-ec2-on-demand-capacity-reservations/) +## Code + +```terraform hl_lines="34-51" +{% include "../../patterns/targeted-odcr/eks.tf" %} +``` + ## Deploy See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#prerequisites) for the prerequisites and steps to deploy this pattern. diff --git a/patterns/targeted-odcr/eks.tf b/patterns/targeted-odcr/eks.tf index 0560e5fffa..e7214a742a 100644 --- a/patterns/targeted-odcr/eks.tf +++ b/patterns/targeted-odcr/eks.tf @@ -4,7 +4,7 @@ module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 20.8" + version = "~> 20.9" cluster_name = local.name cluster_version = "1.29"
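Relatedly, the targeted ODCR pattern above leans on a capacity reservation resource group so that reservations can be added or removed without touching the node group or its launch template. As a rough sketch of what adding capacity could look like, assuming the pattern exposes its resource group as `aws_resourcegroups_group.odcr` (a hypothetical reference, as are the instance type, AZ, and count below):

```terraform
# An additional targeted ODCR for capacity the node group should be able to use
resource "aws_ec2_capacity_reservation" "additional" {
  instance_type           = "p5.48xlarge" # placeholder
  instance_platform       = "Linux/UNIX"
  availability_zone       = "us-west-2a" # placeholder
  instance_count          = 2
  instance_match_criteria = "targeted"
}

# Associate the new reservation with the existing capacity reservation resource
# group; the node group's launch template already targets the group, so it can
# scale into this capacity without any further changes
resource "aws_resourcegroups_resource" "additional" {
  group_arn    = aws_resourcegroups_group.odcr.arn # hypothetical reference
  resource_arn = aws_ec2_capacity_reservation.additional.arn
}
```

If the pattern exposes the group differently (for example via an output or a data source), the `group_arn` reference would change accordingly; removing the association works the same way in reverse.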