diff --git a/.github/workflows/plan-examples.yml b/.github/workflows/plan-examples.yml index c28e77ca89..60d02b36e3 100644 --- a/.github/workflows/plan-examples.yml +++ b/.github/workflows/plan-examples.yml @@ -5,6 +5,7 @@ on: pull_request_target: branches: - main + - v4 workflow_dispatch: concurrency: diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 4e4ac382d1..137a7efa27 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -4,6 +4,7 @@ on: pull_request: branches: - main + - v4 paths: - '**.tf' - '**.yml' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 44a5a23a56..cc20962986 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,7 @@ repos: - id: detect-aws-credentials args: ['--allow-missing-credentials'] - repo: https://github.com/antonbabenko/pre-commit-terraform - rev: v1.77.2 + rev: v1.80.0 hooks: - id: terraform_fmt - id: terraform_docs diff --git a/ADOPTERS.md b/ADOPTERS.md index 05ddcc849f..00f0e55820 100644 --- a/ADOPTERS.md +++ b/ADOPTERS.md @@ -13,5 +13,6 @@ If you are using EKS Blueprints for Terraform please consider adding yourself as | Organization | Description | Contacts | Link | | --- | --- | --- | --- | +| PITS Global Data Recovery Services | PITS Global Data Recovery Services is a leading global recovery company based in the United States. With a team of highly skilled and experienced data recovery experts, we are dedicated to providing top-notch data recovery services to businesses and individuals across the country. Our team understands how critical data is and takes pride in recovering lost, corrupted, or deleted files quickly and effectively. Regardless of the data loss scenario, our team of experts has the tools and expertise to get your data back.| info@pitsdatarecovery.net | https://www.pitsdatarecovery.net/ | | AlgoDx AB | At AlgoDx, we are dedicated to developing and implementing clinically validated ML-based clinical decision support software that save lives and reduce costs in healthcare, primarily targeting intensive and critical care. We achieve this through the integration of our state-of-the-art AI precision medicine platform with health systems’ electronic medical records. | sebastian.olsson@algodx.com, info+engineering@algodx.com | https://www.algodx.com/ | | Swyft Logistics | Swyft is an award-winning, tech-enabled last-mile e-Parcel delivery service provider, primarily operating in the fashion, electronics, and personal care industries. Swyft aspires to reinvent logistics in Pakistan and connecting people beyond possibilities. Our focus is on innovation, development of technologies and processes. | abdul.rauf@swyftlogistics.com, haider.aslam@swyftlogistics.com, osama.maruf@swyftlogistics.com, | https://www.swyftlogistics.com/ | diff --git a/DIRECTION_v5.md b/DIRECTION_v5.md new file mode 100644 index 0000000000..2997cf007d --- /dev/null +++ b/DIRECTION_v5.md @@ -0,0 +1,102 @@ +# Direction for v5 of Terraform EKS Blueprints + +## What Has Worked + +- EKS Blueprints was started to [make it easier for customers to adopt Amazon Elastic Kubernetes Service (EKS)](https://aws.amazon.com/blogs/containers/bootstrapping-clusters-with-eks-blueprints/) in a shorter period of time. The project has been quite successful in this regard - hearing from customers stating that EKS Blueprints has helped them get from zero to one or more clusters running with applications in less than 1-2 weeks. 
+ +- EKS Blueprints has also been successful in providing working examples to users that demonstrate common architectural patterns and workload solutions. Some popular examples include: + - Spark on EKS + - Karpenter on EKS Fargate + - Transparent encryption with Wireguard and Cilium + - Fully serverless cluster with EKS Fargate + +## What Has Not + +- Scaling and managing addons that are created through EKS Blueprints. With almost [1,200 projects on the CNCF roadmap](https://landscape.cncf.io/), the number of various ways and methods that a project allows for deploying onto a cluster (i.e. - Datadog offers 5 different Helm charts for its service, Prometheus hosts over 30 Helm charts for its services), as well as the number of different tools used to provision addons (i.e. - Terraform, ArgoCD, FluxCD, etc.), supporting both the number of addons and their different forms has been extremely challenging for the team. In addition to managing just the sheer number of addons, supporting the different configurations that users wish to have exposed in conjunction with testing and validating those various configurations is only compounded by the number of addons and their methods of creation. + +- Managing resources provisioned on the cluster using Terraform. Terraform is a fantastic tool for provisioning infrastructure and it is the tool of choice for many customers when it comes to creating resources in AWS. However, there are a number of downsides with Terraform when it comes to provisioning resources on a Kubernetes cluster. These include: + + - Ordering of dependencies when relationships live outside of Terraform's HCL syntax. Terraform wants to evaluate the current state of what it controls and be able to plan a series of actions to align the current state with the desired state *in one action*. It does this once for each `terraform plan` or `terraform apply`, and if any issues are encountered, it simply fails and halts execution. When Terraform cannot infer the ordering of dependencies across resources (i.e. - through passing outputs of parent resources to arguments of child resources using the Terraform `..` syntax), it will view this as no relationship between the resources and attempt to execute their provisioning in parallel and asynchronously. Any resources that are left waiting for a dependency will eventually timeout and fail, causing Terraform itself to timeout and fail the apply. This is where the reconciliation loop of a Kubernetes controller or operator on the cluster is better suited - continuously trying to reconcile the state over and over again as dependencies are eventually resolved. (To be clear - the issue of dependency ordering still exists, but the controller/operator will keep retrying and on each retry, some resources will succeed which will move the execution along with each cycle until everything is fully deployed. Terraform could do this if it kept re-trying, but it does not do this today) + + - Publicly exposing access to the EKS endpoints in order to provision resources defined outside of the VPC onto the cluster. When using Terraform, the resource provisioning operation is a "push" model where Terraform will send requests to the EKS API Server to create resources. Coupled with the fact that the Terraform operation typically resides outside of the VPC where the cluster is running, this results in users enabling public access to the EKS endpoints to provision resources. 
However, the more widely accepted approach by the Kubernetes community has been the adoption of GitOps which uses a "pull" based model, where an operator or controller running on the cluster will pull the resource definitions from a Git repository and reconcile state from within the cluster itself. This approach is more secure as it does not require public access to the EKS endpoints and instead relies on the cluster's internal network to communicate with the EKS API Server. + + - The nesting of multiple sub-modules in conjunction with the necessity to even require a module to be able to support an addon. When we compare and contrast the Terraform approach to addons versus the GitOps approach, the Terraform approach has a glaring disadvantage - the need to create a module that wraps the addon's Helm chart in order to provision the addon via Terraform. As opposed to the GitOps approach, where users simply consume the charts from where they are stored as needed. This creates a bottleneck on the team to review, test, and validate each new addon as well as the overhead then added for maintaining and updating those addons going forward. This also opens up more areas where breaking changes are encountered which is compounded by the fact that Terraform addons are grouped under an "umbrella" module which obfuscates versioning. + +- Being able to support a combination of various tools, modules, frameworks, etc., to meet the needs of customers. The [`terraform-aws-eks`](https://github.com/terraform-aws-modules/terraform-aws-eks) was created long before EKS Blueprints, and many customers had already adopted this module for creating their clusters. In addition, Amazon has since adopted the [`eksctl`](https://github.com/weaveworks/eksctl) as the official CLI for Amazon EKS. When EKS Blueprints was first announced, many customers raised questions asking if they needed to abandon their current clusters created through those other tools in order to adopt EKS Blueprints. The answer is no - users can and should be able to use their existing clusters while EKS Blueprints can help augment that process through its supporting modules (addons, teams, etc.). This left the team with the question - why create a Terraform module for creating an EKS cluster when the [`terraform-aws-eks`](https://github.com/terraform-aws-modules/terraform-aws-eks) already exists and the EKS Blueprints implementation already uses that module for creating the control plane and security groups? + +## What Is Changing + +The direction for EKS Blueprints in v5 will shift from providing an all-encompassing, monolithic "framework" and instead focus more on how users can organize a set of modular components to create the desired solution on Amazon EKS. This will allow customers to use the components of their choosing in a way that is more familiar to them and their organization instead of having to adopt and conform to a framework. + +With this shift in direction, the cluster definition will be removed from the project and instead examples will reference the [`terraform-aws-eks`](https://github.com/terraform-aws-modules/terraform-aws-eks) module for cluster creation. The remaining modules will be moved out to their own respective repositories as standalone projects. 
This leaves the EKS Blueprint project as the canonical place where users can receive guidance on how to configure their clusters to meet a desired architecture, how best to setup their clusters following well-architected practices, as well as references on the various ways that different workloads can be deployed on Amazon EKS. + +### Notable Changes + +1. EKS Blueprints will remove its Amazon EKS cluster Terraform module components (control plane, EKS managed node group, self-managed node group, and Fargate profile modules) from the project. In its place, users are encouraged to utilize the [`terraform-aws-eks`](https://github.com/terraform-aws-modules/terraform-aws-eks) module which meets or exceeds nearly all of the functionality of the EKS Blueprints v4.x cluster module. This includes the Terraform code contained at the root of the project as well as the `aws-eks-fargate-profiles`, `aws-eks-managed-node-groups`, `aws-eks-self-managed-node-groups`, and `launch-templates` modules which will all be removed from the project. +2. The `aws-kms` module will be removed entirely. This was consumed in the root project module for cluster secret encryption. In its place, users can utilize the KMS key creation functionality of the [`terraform-aws-eks`](https://github.com/terraform-aws-modules/terraform-aws-eks) module or the [`terraform-aws-kms`](https://github.com/terraform-aws-modules/terraform-aws-kms) module if they wish to control the key separately from the cluster itself. +3. The `emr-on-eks` module will be removed entirely; its replacement can be found in the new external module [`terraform-aws-emr`](https://github.com/terraform-aws-modules/terraform-aws-emr/tree/master/modules/serverless). +4. The `irsa` and `helm-addon` modules will be removed entirely; we have released a new external module [`terraform-aws-eks-blueprints-addon`](https://github.com/aws-ia/terraform-aws-eks-blueprints-addon) that is available on the Terraform registry that replicates/replaces the functionality of these two modules. This will now allow users, as well as partners, to create their own addons that are not natively supported by EKS Blueprints more easily and following the same process as EKS Blueprints. +5. The `aws-eks-teams` module will be removed entirely; its replacement will be the new external module [`terraform-aws-eks-blueprints-teams`](https://github.com/aws-ia/terraform-aws-eks-blueprints-teams) that incorporates the changes customers have been asking for in https://github.com/aws-ia/terraform-aws-eks-blueprints/issues/842 +6. The integration between Terraform and ArgoCD has been removed in the initial release of v5. The team is currently investigating better patterns and solutions in conjunction with the ArgoCD and FluxCD teams that will provide a better, more integrated experience when using a GitOps based approach for cluster management. 
This will be released in a future version of EKS Blueprints v5 and is tracked [here](https://github.com/aws-ia/terraform-aws-eks-blueprints-addons/issues/114)
+
+### Resulting Project Structure
+
+Previously, under the v4.x structure, the EKS Blueprints project consisted of various repositories across multiple AWS organizations that looked roughly like the following:
+
+#### v4.x Structure
+
+```
+├── aws-ia/
+|   ├── terraform-aws-eks-ack-addons/
+|   └── terraform-aws-eks-blueprints/
+|       ├── aws-auth-configmap.tf
+|       ├── data.tf
+|       ├── eks-worker.tf
+|       ├── locals.tf
+|       ├── main.tf
+|       ├── outputs.tf
+|       ├── variables.tf
+|       ├── versions.tf
+|       ├── examples/
+|       └── modules
+|           ├── aws-eks-fargate-profiles/
+|           ├── aws-eks-managed-node-groups/
+|           ├── aws-eks-self-managed-node-groups/
+|           ├── aws-eks-teams/
+|           ├── aws-kms/
+|           ├── emr-on-eks/
+|           ├── irsa/
+|           ├── kubernetes-addons/
+|           └── launch-templates/
+├── awslabs/
+|   ├── crossplane-on-eks/
+|   └── data-on-eks/
+└── aws-samples/
+    ├── eks-blueprints-add-ons/      # Previously shared with the CDK based EKS Blueprints project
+    └── eks-blueprints-workloads/    # Previously shared with the CDK based EKS Blueprints project
+```
+
+Under the new v5.x structure, the Terraform based EKS Blueprints project will consist of the following repositories:
+
+#### v5.x Structure
+
+```
+├── aws-ia/
+|   ├── terraform-aws-eks-ack-addons/
+|   ├── terraform-aws-eks-blueprints/        # Will contain only example/blueprint implementations; no modules
+|   ├── terraform-aws-eks-blueprints-addon   # Module for creating Terraform based addon (IRSA + Helm chart)
+|   ├── terraform-aws-eks-blueprints-addons  # Will contain a select set of addons supported by the EKS Blueprints
+|   └── terraform-aws-eks-blueprints-teams   # Was previously `aws-eks-teams/` EKS Blueprint sub-module; updated based on customer feedback
+└── awslabs/
+    ├── crossplane-on-eks/
+    └── data-on-eks/                         # Data related patterns that used to be located in `terraform-aws-eks-blueprints/` are now located here
+```
+
+## What Can Users Expect
+
+With these changes, the team intends to provide a better experience for users of the Terraform EKS Blueprints project as well as new and improved reference architectures. Following the v5 changes, the team intends to deliver:
+
+1. Improved quality of the examples provided - more information on the intent of the example, why it might be useful for users, in what scenarios the pattern is applicable, etc. Where applicable, architectural diagrams and supporting material will be provided to highlight the intent of the example and how it is constructed.
+2. A clearer distinction between a blueprint and a usage reference. For example - the Karpenter on EKS Fargate blueprint should demonstrate all of the various aspects that users should be aware of and consider in order to take full advantage of this pattern (recommended practices, observability, logging, monitoring, security, day 2 operations, etc.); this is what makes it a blueprint. In contrast, a usage reference would be an example that shows how users can pass configuration values to the Karpenter provisioner. Such an example is less focused on the holistic architecture and more focused on how one might configure Karpenter using the implementation. The EKS Blueprints repository will focus mostly on holistic architecture and patterns, and any usage references should be saved for the repository that contains that implementation definition (i.e. - the `terraform-aws-eks-blueprints-addons` repository where the addon implementation is defined).
+3. 
Faster and more responsive feedback. The first part of this is going to be improved documentation on how to contribute, which should help clarify whether a contribution is likely to be accepted by the team before any effort is spent by the contributor. However, the goal of v5 is to focus more on the value-added benefits that EKS Blueprints was created to provide, as opposed to simply mass producing Helm chart wrappers (addons) and trying to keep up with that operationally intensive process.
+4. Lastly, more examples and blueprints that demonstrate various architectures and workloads that run on top of Amazon EKS, as well as integrations with other AWS services.
diff --git a/docs/add-ons/index.md b/docs/add-ons/index.md
index ceb3885c8f..425da74426 100644
--- a/docs/add-ons/index.md
+++ b/docs/add-ons/index.md
@@ -69,7 +69,7 @@ If you would like to use private repositories, you can download Docker images fo
 To indicate that you would like to manage add-ons via ArgoCD, you must do the following:
 
-1. Enable the ArgoCD add-on by setting `argocd_enable` to `true`.
+1. Enable the ArgoCD add-on by setting `enable_argocd` to `true`.
 2. Specify you would like ArgoCD to be responsible for deploying your add-ons by setting `argocd_manage_add_ons` to `true`. This will prevent the individual Terraform add-on modules from deploying Helm charts.
 3. Pass Application configuration for your add-ons repository via the `argocd_applications` property.
diff --git a/docs/add-ons/kubecost.md b/docs/add-ons/kubecost.md
index 5ca662312e..0f97210c5a 100644
--- a/docs/add-ons/kubecost.md
+++ b/docs/add-ons/kubecost.md
@@ -24,7 +24,7 @@ Deploy Kubecost with custom `values.yaml`
   name       = "kubecost"                      # (Required) Release name.
   repository = "oci://public.ecr.aws/kubecost" # (Optional) Repository URL where to locate the requested chart.
   chart      = "cost-analyzer"                 # (Required) Chart name to be installed.
-  version    = "1.96.0"                        # (Optional) Specify the exact chart version to install. If this is not specified, it defaults to the version set within default_helm_config: https://github.com/aws-ia/terraform-aws-eks-blueprints/blob/main/modules/kubernetes-addons/kubecost/locals.tf
+  version    = "1.103.3"                       # (Optional) Specify the exact chart version to install. If this is not specified, it defaults to the version set within default_helm_config: https://github.com/aws-ia/terraform-aws-eks-blueprints/blob/main/modules/kubernetes-addons/kubecost/locals.tf
   namespace  = "kubecost"                      # (Optional) The namespace to install the release into.
values = [templatefile("${path.module}/kubecost-values.yaml", {})] } diff --git a/examples/eks-efa/.gitignore b/examples/eks-efa/.gitignore new file mode 100644 index 0000000000..55e2910e38 --- /dev/null +++ b/examples/eks-efa/.gitignore @@ -0,0 +1,6 @@ +tfplan +*.tfstate +*.backup +TODO*.* +.terraform +*.hcl diff --git a/examples/eks-efa/README.md b/examples/eks-efa/README.md new file mode 100644 index 0000000000..de6a0ddc9d --- /dev/null +++ b/examples/eks-efa/README.md @@ -0,0 +1,659 @@ +# EKS Blueprint Example with Elastic Fabric Adapter + +## Table of Contents + +- [EKS Blueprint Example with Elastic Fabric Adapter](#eks-blueprint-example-with-elastic-fabric-adapter) + - [Table of Contents](#table-of-contents) + - [Elastic Fabric Adapter Overview](#elastic-fabric-adapter-overview) + - [Setup Details](#setup-details) +- [Terraform Doc](#terraform-doc) + - [Requirements](#requirements) + - [Providers](#providers) + - [Modules](#modules) + - [Resources](#resources) + - [Inputs](#inputs) + - [Outputs](#outputs) +- [Example Walkthrough](#example-walkthrough) + - [1. Clone Repository](#1-clone-repository) + - [2. Configure Terraform Plan](#2-configure-terraform-plan) + - [3. Initialize Terraform Plan](#3-initialize-terraform-plan) + - [4. Create Terraform Plan](#4-create-terraform-plan) + - [5. Apply Terraform Plan](#5-apply-terraform-plan) + - [6. Connect to EKS](#6-connect-to-eks) + - [7. Deploy Kubeflow MPI Operator](#7-deploy-kubeflow-mpi-operator) + - [8. Test EFA](#8-test-efa) + - [8.1. EFA Info Test](#81-efa-info-test) + - [8.2. EFA NCCL Test](#82-efa-nccl-test) + - [9. Cleanup](#9-cleanup) +- [Conclusion](#conclusion) + +## Elastic Fabric Adapter Overview + +[Elastic Fabric Adapter (EFA)](https://aws.amazon.com/hpc/efa/) is a network interface supported by [some Amazon EC2 instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types) that provides high-performance network communications at scale on AWS. Commonly, high-performance computing, simulation, and large AI model training jobs require EFA, in order to minimize the time to job completion. This example provides a blueprint for deploying an [Amazon EKS](https://aws.amazon.com/eks/) cluster with EFA-enabled nodes, which can be used to run such jobs. + +## Setup Details + +There are three requirements that need to be satisfied, in order for EFA to work: + +1. The EC2 instance type must support EFA and the EFA adapter must be enabled. +2. The EFA software must be installed +3. The security group attached to the EC2 instance must allow all incoming and outgoing traffic to itself + +In the provided Terraform EKS Blueprint example here, these requirements are satisfied automatically. + +# Terraform Doc + +The main Terraform doc [main.tf](main.tf) contains local variables, local data, vpc and eks definitions, device plugins, and addons. + +## Requirements + +Requirements are specified in the [providers.tf](providers.tf) file. This file is used to install all needed providers when `terraform init` is executed. + +## Providers + +Providers are defined in [main.tf](main.tf#L3). They include `aws`, `kubernetes`, `helm`, and `kubectl`. + +## Modules + +The following modules are included in the template: + +1. [vpc](main.tf#L240) - defines the VPC which will be used to host the EKS cluster + +2. 
[eks](main.tf#L92) - defines the EKS cluster
+   The EKS cluster contains a managed nodegroup called `sys` for running system pods,
+   and an unmanaged nodegroup called `efa` which has the necessary configuration to enable EFA on the nodes in that group.
+
+3. [eks_blueprints_kubernetes_addons](main.tf#L220) - defines the EKS cluster addons to be deployed
+
+## Resources
+
+The [resources section of main.tf](main.tf#L69) creates a placement group and deploys the [EFA](https://github.com/aws-samples/aws-efa-eks) and [NVIDIA](https://github.com/NVIDIA/k8s-device-plugin) device plugins.
+
+## Inputs
+
+There are no required user inputs.
+The template comes with default inputs which create an EKS cluster called `eks-efa` in region `us-east-1`.
+These settings can be adjusted in the [variables.tf](variables.tf) file.
+
+## Outputs
+
+When `terraform apply` completes successfully, the EKS cluster id and the command to connect to the cluster are provided as outputs, as described in [outputs.tf](outputs.tf).
+
+# Example Walkthrough
+
+## 1. Clone Repository
+
+```bash
+git clone https://github.com/aws-ia/terraform-aws-eks-blueprints.git
+cd terraform-aws-eks-blueprints/examples/eks-efa
+```
+
+## 2. Configure Terraform Plan
+
+Edit [variables.tf](variables.tf) and the [locals section of main.tf](main.tf#L54) as needed.
+
+## 3. Initialize Terraform Plan
+
+```bash
+terraform init
+```
+
+Output: +Initializing the backend... +Initializing modules... +Downloading registry.terraform.io/terraform-aws-modules/eks/aws 19.13.1 for eks... +- eks in .terraform/modules/eks +- eks.eks_managed_node_group in .terraform/modules/eks/modules/eks-managed-node-group +- eks.eks_managed_node_group.user_data in .terraform/modules/eks/modules/_user_data +- eks.fargate_profile in .terraform/modules/eks/modules/fargate-profile +Downloading registry.terraform.io/terraform-aws-modules/kms/aws 1.1.0 for eks.kms... +- eks.kms in .terraform/modules/eks.kms +- eks.self_managed_node_group in .terraform/modules/eks/modules/self-managed-node-group +- eks.self_managed_node_group.user_data in .terraform/modules/eks/modules/_user_data +- eks_blueprints_kubernetes_addons in ../../modules/kubernetes-addons +- eks_blueprints_kubernetes_addons.adot_collector_haproxy in ../../modules/kubernetes-addons/adot-collector-haproxy +- eks_blueprints_kubernetes_addons.adot_collector_haproxy.helm_addon in ../../modules/kubernetes-addons/helm-addon +- eks_blueprints_kubernetes_addons.adot_collector_haproxy.helm_addon.irsa in ../../modules/irsa +- eks_blueprints_kubernetes_addons.adot_collector_java in ../../modules/kubernetes-addons/adot-collector-java +- eks_blueprints_kubernetes_addons.adot_collector_java.helm_addon in ../../modules/kubernetes-addons/helm-addon +- ... +- eks_blueprints_kubernetes_addons.opentelemetry_operator in ../../modules/kubernetes-addons/opentelemetry-operator +- eks_blueprints_kubernetes_addons.opentelemetry_operator.cert_manager in ../../modules/kubernetes-addons/cert-manager +- eks_blueprints_kubernetes_addons.opentelemetry_operator.cert_manager.helm_addon in ../../modules/kubernetes-addons/helm-addon +- eks_blueprints_kubernetes_addons.opentelemetry_operator.cert_manager.helm_addon.irsa in ../../modules/irsa +- eks_blueprints_kubernetes_addons.opentelemetry_operator.helm_addon in ../../modules/kubernetes-addons/helm-addon +- eks_blueprints_kubernetes_addons.opentelemetry_operator.helm_addon.irsa in ../../modules/irsa +Downloading registry.terraform.io/portworx/portworx-addon/eksblueprints 0.0.6 for eks_blueprints_kubernetes_addons.portworx... +- eks_blueprints_kubernetes_addons.portworx in .terraform/modules/eks_blueprints_kubernetes_addons.portworx +Downloading git::https://github.com/aws-ia/terraform-aws-eks-blueprints.git for eks_blueprints_kubernetes_addons.portworx.helm_addon... +- eks_blueprints_kubernetes_addons.portworx.helm_addon in .terraform/modules/eks_blueprints_kubernetes_addons.portworx.helm_addon/modules/kubernetes-addons/helm-addon +- eks_blueprints_kubernetes_addons.portworx.helm_addon.irsa in .terraform/modules/eks_blueprints_kubernetes_addons.portworx.helm_addon/modules/irsa +- eks_blueprints_kubernetes_addons.prometheus in ../../modules/kubernetes-addons/prometheus +-... +- eks_blueprints_kubernetes_addons.yunikorn.helm_addon in ../../modules/kubernetes-addons/helm-addon +- eks_blueprints_kubernetes_addons.yunikorn.helm_addon.irsa in ../../modules/irsa +Downloading registry.terraform.io/terraform-aws-modules/vpc/aws 4.0.1 for vpc... +- vpc in .terraform/modules/vpc + +Initializing provider plugins... +- Finding latest version of hashicorp/random... +- Finding hashicorp/kubernetes versions matching ">= 2.6.1, >= 2.10.0, >= 2.16.1"... +- Finding latest version of hashicorp/http... +- Finding hashicorp/helm versions matching ">= 2.4.1, >= 2.5.1, >= 2.8.0"... +- Finding gavinbunney/kubectl versions matching ">= 1.14.0"... 
+- Finding hashicorp/aws versions matching ">= 3.72.0, >= 4.10.0, >= 4.13.0, >= 4.35.0, >= 4.47.0, >= 4.57.0"... +- Finding hashicorp/time versions matching ">= 0.7.0, >= 0.8.0, >= 0.9.0"... +- Finding hashicorp/null versions matching ">= 3.0.0"... +- Finding hashicorp/tls versions matching ">= 3.0.0"... +- Finding hashicorp/cloudinit versions matching ">= 2.0.0"... +- Installing hashicorp/helm v2.9.0... +- Installed hashicorp/helm v2.9.0 (signed by HashiCorp) +- Installing gavinbunney/kubectl v1.14.0... +- Installed gavinbunney/kubectl v1.14.0 (self-signed, key ID AD64217B5ADD572F) +- Installing hashicorp/tls v4.0.4... +- Installed hashicorp/tls v4.0.4 (signed by HashiCorp) +- Installing hashicorp/cloudinit v2.3.2... +- Installed hashicorp/cloudinit v2.3.2 (signed by HashiCorp) +- Installing hashicorp/random v3.5.1... +- Installed hashicorp/random v3.5.1 (signed by HashiCorp) +- Installing hashicorp/http v3.3.0... +- Installed hashicorp/http v3.3.0 (signed by HashiCorp) +- Installing hashicorp/time v0.9.1... +- Installed hashicorp/time v0.9.1 (signed by HashiCorp) +- Installing hashicorp/null v3.2.1... +- Installed hashicorp/null v3.2.1 (signed by HashiCorp) +- Installing hashicorp/kubernetes v2.20.0... +- Installed hashicorp/kubernetes v2.20.0 (signed by HashiCorp) +- Installing hashicorp/aws v4.66.1... +- Installed hashicorp/aws v4.66.1 (signed by HashiCorp) + +Partner and community providers are signed by their developers. +If you'd like to know more about provider signing, you can read about it here: +https://www.terraform.io/docs/cli/plugins/signing.html + +Terraform has created a lock file .terraform.lock.hcl to record the provider +selections it made above. Include this file in your version control repository +so that Terraform can guarantee to make the same selections by default when +you run "terraform init" in the future. + +Terraform has been successfully initialized! + +You may now begin working with Terraform. Try running "terraform plan" to see +any changes that are required for your infrastructure. All Terraform commands +should now work. + +If you ever set or change modules or backend configuration for Terraform, +rerun this command to reinitialize your working directory. If you forget, other +commands will detect it and remind you to do so if necessary. +
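+The provider constraints resolved during `terraform init` come from [providers.tf](providers.tf). As a rough sketch only (the version bounds below are taken from the init log above; treat the file in this example as the authoritative source), the relevant `required_providers` block looks something like this:
+
+```hcl
+terraform {
+  required_providers {
+    # Versions shown are illustrative lower bounds taken from the init output above
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 4.47.0"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = ">= 2.16.1"
+    }
+    helm = {
+      source  = "hashicorp/helm"
+      version = ">= 2.8.0"
+    }
+    kubectl = {
+      source  = "gavinbunney/kubectl"
+      version = ">= 1.14.0"
+    }
+  }
+}
+```
+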
+ +## 4. Create Terraform Plan + +```bash +terraform plan -out tfplan +``` + +
+Output: + +```text +... +# module.vpc.aws_vpc.this[0] will be created + + resource "aws_vpc" "this" { + + arn = (known after apply) + + cidr_block = "10.11.0.0/16" + + default_network_acl_id = (known after apply) + + default_route_table_id = (known after apply) + + default_security_group_id = (known after apply) +... + +Plan: 80 to add, 0 to change, 0 to destroy. + +Changes to Outputs: + + configure_kubectl = "aws eks update-kubeconfig --region us-east-1 --name eks-efa" + + eks_cluster_id = (known after apply) + +─────────────────────────────────────────────────────────────────────────────── + +Saved the plan to: tfplan + +To perform exactly these actions, run the following command to apply: + terraform apply "tfplan" +``` +
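+Optionally, you can inspect the saved plan before applying it. This is a standard Terraform workflow step rather than something specific to this example (the second command assumes `jq` is installed):
+
+```bash
+# Human-readable view of the saved plan file
+terraform show tfplan
+
+# List only the addresses of resources that will be created (requires jq)
+terraform show -json tfplan | jq -r '.resource_changes[].address'
+```
+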
+ +## 5. Apply Terraform Plan + +```bash +terraform apply tfplan +``` + +
+ +Output: + +```text +aws_placement_group.efa_pg: Creating... +module.eks.aws_cloudwatch_log_group.this[0]: Creating... +module.vpc.aws_vpc.this[0]: Creating... +module.eks.module.eks_managed_node_group["sys"].aws_iam_role.this[0]: Creating... +module.vpc.aws_eip.nat[0]: Creating... +module.eks.aws_iam_role.this[0]: Creating... +... +module.eks.aws_eks_cluster.this[0]: Still creating... [1m40s elapsed] +module.eks.aws_eks_cluster.this[0]: Still creating... [1m50s elapsed] +module.eks.aws_eks_cluster.this[0]: Still creating... [2m0s elapsed] +... +module.eks.aws_eks_addon.this["kube-proxy"]: Still creating... [30s elapsed] +module.eks_blueprints_kubernetes_addons.module.aws_fsx_csi_driver[0].module.helm_addon.helm_release.addon[0]: Still creating... [20s elapsed] +module.eks_blueprints_kubernetes_addons.module.aws_efs_csi_driver[0].module.helm_addon.helm_release.addon[0]: Still creating... [20s elapsed] +module.eks.aws_eks_addon.this["vpc-cni"]: Creation complete after 35s [id=eks-efa:vpc-cni] +module.eks.aws_eks_addon.this["kube-proxy"]: Creation complete after 35s [id=eks-efa:kube-proxy] +module.eks_blueprints_kubernetes_addons.module.aws_fsx_csi_driver[0].module.helm_addon.helm_release.addon[0]: Still creating... [30s elapsed] +module.eks_blueprints_kubernetes_addons.module.aws_efs_csi_driver[0].module.helm_addon.helm_release.addon[0]: Still creating... [30s elapsed] +module.eks_blueprints_kubernetes_addons.module.aws_efs_csi_driver[0].module.helm_addon.helm_release.addon[0]: Creation complete after 36s [id=aws-efs-csi-driver] +module.eks_blueprints_kubernetes_addons.module.aws_fsx_csi_driver[0].module.helm_addon.helm_release.addon[0]: Creation complete after 36s [id=aws-fsx-csi-driver] +╷ +│ Warning: "default_secret_name" is no longer applicable for Kubernetes v1.24.0 and above +│ +│ with module.eks_blueprints_kubernetes_addons.module.aws_efs_csi_driver[0].module.helm_addon.module.irsa[0].kubernetes_service_account_v1.irsa[0], +│ on ../../modules/irsa/main.tf line 37, in resource "kubernetes_service_account_v1" "irsa": +│ 37: resource "kubernetes_service_account_v1" "irsa" { +│ +│ Starting from version 1.24.0 Kubernetes does not automatically generate a token for service accounts, in this case, "default_secret_name" will be empty +│ +│ (and one more similar warning elsewhere) +╵ + +Apply complete! Resources: 80 added, 0 changed, 0 destroyed. + +Outputs: + +configure_kubectl = "aws eks update-kubeconfig --region us-east-1 --name eks-efa" + +``` +
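+The outputs can be re-printed at any time after the apply, which is handy if you need the `configure_kubectl` command again later:
+
+```bash
+# Show all outputs, or just the kubeconfig update command
+terraform output
+terraform output -raw configure_kubectl
+```
+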
+ +> **_Note:_** If the plan apply operation fails, you can repeat `terraform plan -out tfplan` and `terraform apply tfplan` + +It takes about 15 minutes to create the cluster. + +## 6. Connect to EKS + +Copy the value of the `configure_kubectl` output and execute it in your shell to connect to your EKS cluster. + +```bash +aws eks update-kubeconfig --region us-east-1 --name eks-efa +``` + +Output: +```text +Updated context arn:aws:eks:us-east-1:xxxxxxxxxxxx:cluster/eks-efa in /root/.kube/config +``` + +Allow 5 minutes after the plan is applied for the EFA nodes to finish initializing and join the EKS cluster, then execute: + +```bash +kubectl get nodes +kubectl get nodes -o yaml | grep instance-type | grep node | grep -v f: +``` + +Your nodes and node types will be listed: + +```text +# kubectl get nodes +NAME STATUS ROLES AGE VERSION +ip-10-11-10-103.ec2.internal Ready 4m1s v1.25.7-eks-a59e1f0 +ip-10-11-19-28.ec2.internal Ready 11m v1.25.7-eks-a59e1f0 +ip-10-11-2-151.ec2.internal Ready 11m v1.25.7-eks-a59e1f0 +ip-10-11-2-18.ec2.internal Ready 5m1s v1.25.7-eks-a59e1f0 +# kubectl get nodes -o yaml | grep instance-type | grep node | grep -v f: + node.kubernetes.io/instance-type: g4dn.metal + node.kubernetes.io/instance-type: m5.large + node.kubernetes.io/instance-type: m5.large + node.kubernetes.io/instance-type: g4dn.metal +``` + +You should see two EFA-enabled (in this example `g4dn.metal`) nodes in the list. +This verifies that you are connected to your EKS cluster and it is configured with EFA nodes. + +## 7. Deploy Kubeflow MPI Operator + +Kubeflow MPI Operator is required for running MPIJobs on EKS. We will use an MPIJob to test EFA. +To deploy the MPI operator execute the following: + +```bash +kubectl apply -f https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.3.0/deploy/v2beta1/mpi-operator.yaml +``` + +Output: + +```text +namespace/mpi-operator created +customresourcedefinition.apiextensions.k8s.io/mpijobs.kubeflow.org created +serviceaccount/mpi-operator created +clusterrole.rbac.authorization.k8s.io/kubeflow-mpijobs-admin created +clusterrole.rbac.authorization.k8s.io/kubeflow-mpijobs-edit created +clusterrole.rbac.authorization.k8s.io/kubeflow-mpijobs-view created +clusterrole.rbac.authorization.k8s.io/mpi-operator created +clusterrolebinding.rbac.authorization.k8s.io/mpi-operator created +deployment.apps/mpi-operator created +``` + +In addition to deploying the operator, please apply a patch to the mpi-operator clusterrole +to allow the mpi-operator service account access to `leases` resources in the `coordination.k8s.io` apiGroup. + +```bash +kubectl apply -f https://raw.githubusercontent.com/aws-samples/aws-do-eks/main/Container-Root/eks/deployment/kubeflow/mpi-operator/clusterrole-mpi-operator.yaml +``` + +Output: + +```text +clusterrole.rbac.authorization.k8s.io/mpi-operator configured +``` + +## 8. Test EFA + +We will run two tests. The first one will show the presence of EFA adapters on our EFA-enabled nodes. The second will test EFA performance. + +### 8.1. 
EFA Info Test + +To run the EFA info test, execute the following commands: + +```bash +kubectl apply -f https://raw.githubusercontent.com/aws-samples/aws-do-eks/main/Container-Root/eks/deployment/efa-device-plugin/test-efa.yaml +``` + +Output: + +```text +mpijob.kubeflow.org/efa-info-test created +``` + +```bash +kubectl get pods +``` + +Output: + +```text +NAME READY STATUS RESTARTS AGE +efa-info-test-launcher-hckkj 0/1 Completed 2 37s +efa-info-test-worker-0 1/1 Running 0 38s +efa-info-test-worker-1 1/1 Running 0 38s +``` + +Once the test launcher pod enters status `Running` or `Completed`, see the test logs using the command below: + +```bash +kubectl logs -f $(kubectl get pods | grep launcher | cut -d ' ' -f 1) +``` + +Output: + +```text +Warning: Permanently added 'efa-info-test-worker-1.efa-info-test-worker.default.svc,10.11.13.224' (ECDSA) to the list of known hosts. +Warning: Permanently added 'efa-info-test-worker-0.efa-info-test-worker.default.svc,10.11.4.63' (ECDSA) to the list of known hosts. +[1,1]:provider: efa +[1,1]: fabric: efa +[1,1]: domain: rdmap197s0-rdm +[1,1]: version: 116.10 +[1,1]: type: FI_EP_RDM +[1,1]: protocol: FI_PROTO_EFA +[1,0]:provider: efa +[1,0]: fabric: efa +[1,0]: domain: rdmap197s0-rdm +[1,0]: version: 116.10 +[1,0]: type: FI_EP_RDM +[1,0]: protocol: FI_PROTO_EFA +``` + +This result shows that two EFA adapters are available (one for each worker pod). + +Lastly, delete the test job: + +```bash +kubectl delete mpijob efa-info-test +``` + +Output: + +```text +mpijob.kubeflow.org "efa-info-test" deleted +``` + +### 8.2. EFA NCCL Test + +To run the EFA NCCL test please execute the following kubectl command: + +```bash +kubectl apply -f https://raw.githubusercontent.com/aws-samples/aws-do-eks/main/Container-Root/eks/deployment/efa-device-plugin/test-nccl-efa.yaml +``` + +Output: + +```text +mpijob.kubeflow.org/test-nccl-efa created +``` + +Then display the pods in the current namespace: + +```bash +kubectl get pods +``` + +Output: + +```text +NAME READY STATUS RESTARTS AGE +test-nccl-efa-launcher-tx47t 1/1 Running 2 (31s ago) 33s +test-nccl-efa-worker-0 1/1 Running 0 33s +test-nccl-efa-worker-1 1/1 Running 0 33s +``` + +Once the launcher pod enters `Running` or `Completed` state, execute the following to see the test logs: + +```bash +kubectl logs -f $(kubectl get pods | grep launcher | cut -d ' ' -f 1) +``` + +
+ +Output: + +```text +Warning: Permanently added 'test-nccl-efa-worker-1.test-nccl-efa-worker.default.svc,10.11.5.31' (ECDSA) to the list of known hosts. +Warning: Permanently added 'test-nccl-efa-worker-0.test-nccl-efa-worker.default.svc,10.11.13.106' (ECDSA) to the list of known hosts. +[1,0]:# nThread 1 nGpus 1 minBytes 1 maxBytes 1073741824 step: 2(factor) warmup iters: 5 iters: 100 agg iters: 1 validation: 1 graph: 0 +[1,0]:# +[1,0]:# Using devices +[1,0]:# Rank 0 Group 0 Pid 21 on test-nccl-efa-worker-0 device 0 [0x35] Tesla T4 +[1,0]:# Rank 1 Group 0 Pid 21 on test-nccl-efa-worker-1 device 0 [0xf5] Tesla T4 +[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO Bootstrap : Using eth0:10.11.13.106<0> +[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. +[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. +[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.5.0aws +[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO NET/OFI Configuring AWS-specific options +[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO NET/OFI Setting NCCL_PROTO to "simple" +[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO NET/OFI Setting FI_EFA_FORK_SAFE environment variable to 1 +[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO NET/OFI Selected Provider is efa (found 1 nics) +[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO Using network AWS Libfabric +[1,0]:NCCL version 2.12.7+cuda11.4 +[1,1]:test-nccl-efa-worker-1:21:21 [0] NCCL INFO Bootstrap : Using eth0:10.11.5.31<0> +[1,1]:test-nccl-efa-worker-1:21:21 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol. +[1,1]:test-nccl-efa-worker-1:21:21 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol. +[1,1]:test-nccl-efa-worker-1:21:21 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.5.0aws +[1,1]:test-nccl-efa-worker-1:21:21 [0] NCCL INFO NET/OFI Configuring AWS-specific options +[1,1]:test-nccl-efa-worker-1:21:21 [0] NCCL INFO NET/OFI Setting NCCL_PROTO to "simple" +[1,1]:test-nccl-efa-worker-1:21:21 [0] NCCL INFO NET/OFI Setting FI_EFA_FORK_SAFE environment variable to 1 +[1,1]:test-nccl-efa-worker-1:21:21 [0] NCCL INFO NET/OFI Selected Provider is efa (found 1 nics) +[1,1]:test-nccl-efa-worker-1:21:21 [0] NCCL INFO Using network AWS Libfabric +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffff0000,00ffffff +[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO Setting affinity for GPU 0 to ffffff00,0000ffff,ff000000 +[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] 0/-1/-1->1->-1 +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO Channel 00/02 : 0 1 +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO Channel 01/02 : 0 1 +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] -1/-1/-1->0->1 +[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO NCCL_SHM_DISABLE set by environment to 0. +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO NCCL_SHM_DISABLE set by environment to 0. 
+[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO Channel 00/0 : 0[35000] -> 1[f5000] [receive] via NET/AWS Libfabric/0 +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO Channel 00/0 : 1[f5000] -> 0[35000] [receive] via NET/AWS Libfabric/0 +[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO Channel 01/0 : 0[35000] -> 1[f5000] [receive] via NET/AWS Libfabric/0 +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO Channel 01/0 : 1[f5000] -> 0[35000] [receive] via NET/AWS Libfabric/0 +[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO Channel 00/0 : 1[f5000] -> 0[35000] [send] via NET/AWS Libfabric/0 +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO Channel 00/0 : 0[35000] -> 1[f5000] [send] via NET/AWS Libfabric/0 +[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO Channel 01/0 : 1[f5000] -> 0[35000] [send] via NET/AWS Libfabric/0 +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO Channel 01/0 : 0[35000] -> 1[f5000] [send] via NET/AWS Libfabric/0 +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO Connected all rings +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO Connected all trees +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 8/8/512 +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO Connected all rings +[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO Connected all trees +[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 8/8/512 +[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +[1,1]:test-nccl-efa-worker-1:21:26 [0] NCCL INFO comm 0x7f9c0c000f60 rank 1 nranks 2 cudaDev 0 busId f5000 - Init COMPLETE +[1,0]:test-nccl-efa-worker-0:21:27 [0] NCCL INFO comm 0x7fde98000f60 rank 0 nranks 2 cudaDev 0 busId 35000 - Init COMPLETE +[1,0]:# +[1,0]:# out-of-place in-place +[1,0]:# size count type redop root time algbw busbw #wrong time algbw busbw #wrong +[1,0]:# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) +[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO Launch mode Parallel +[1,0]: 0 0 float sum -1 6.36 0.00 0.00 0 6.40 0.00 0.00 0 +[1,0]: 0 0 float sum -1 6.43 0.00 0.00 0 6.35 0.00 0.00 0 +[1,0]: 4 1 float sum -1 65.70 0.00 0.00 0 64.84 0.00 0.00 0 +[1,0]: 8 2 float sum -1 64.88 0.00 0.00 0 64.18 0.00 0.00 0 +[1,0]: 16 4 float sum -1 64.33 0.00 0.00 0 65.02 0.00 0.00 0 +[1,0]: 32 8 float sum -1 65.95 0.00 0.00 0 64.78 0.00 0.00 0 +[1,0]: 64 16 float sum -1 65.19 0.00 0.00 0 64.66 0.00 0.00 0 +[1,0]: 128 32 float sum -1 65.30 0.00 0.00 0 64.76 0.00 0.00 0 +[1,0]: 256 64 float sum -1 65.30 0.00 0.00 0 64.90 0.00 0.00 0 +[1,0]: 512 128 float sum -1 65.71 0.01 0.01 0 64.75 0.01 0.01 0 +[1,0]: 1024 256 float sum -1 67.15 0.02 0.02 0 66.82 0.02 0.02 0 +[1,0]: 2048 512 float sum -1 68.22 0.03 0.03 0 67.55 0.03 0.03 0 +[1,0]: 4096 1024 float sum -1 70.65 0.06 0.06 0 71.20 0.06 0.06 0 +[1,0]: 8192 2048 float sum -1 76.15 0.11 0.11 0 75.36 0.11 0.11 0 +[1,0]: 16384 4096 float sum -1 87.65 0.19 0.19 0 87.87 0.19 0.19 0 +[1,0]: 32768 8192 float sum -1 98.94 0.33 0.33 0 98.14 0.33 0.33 0 +[1,0]: 65536 16384 float sum -1 115.8 0.57 0.57 0 115.7 0.57 0.57 0 +[1,0]: 131072 32768 float sum -1 149.3 0.88 0.88 0 148.7 0.88 0.88 0 +[1,0]: 262144 65536 float sum -1 195.0 1.34 1.34 0 194.0 1.35 1.35 0 +[1,0]: 524288 131072 float sum -1 296.9 1.77 1.77 0 291.1 1.80 1.80 0 +[1,0]: 1048576 262144 float sum -1 583.4 1.80 1.80 0 579.6 1.81 1.81 0 +[1,0]: 2097152 524288 float sum -1 983.3 
2.13 2.13 0 973.9 2.15 2.15 0 +[1,0]: 4194304 1048576 float sum -1 1745.4 2.40 2.40 0 1673.2 2.51 2.51 0 +[1,0]: 8388608 2097152 float sum -1 3116.1 2.69 2.69 0 3092.6 2.71 2.71 0 +[1,0]: 16777216 4194304 float sum -1 5966.3 2.81 2.81 0 6008.9 2.79 2.79 0 +[1,0]: 33554432 8388608 float sum -1 11390 2.95 2.95 0 11419 2.94 2.94 0 +[1,0]: 67108864 16777216 float sum -1 21934 3.06 3.06 0 21930 3.06 3.06 0 +[1,0]: 134217728 33554432 float sum -1 43014 3.12 3.12 0 42619 3.15 3.15 0 +[1,0]: 268435456 67108864 float sum -1 85119 3.15 3.15 0 85743 3.13 3.13 0 +[1,0]: 536870912 134217728 float sum -1 171351 3.13 3.13 0 171823 3.12 3.12 0 +[1,0]: 1073741824 268435456 float sum -1 344981 3.11 3.11 0 344454 3.12 3.12 0 +[1,1]:test-nccl-efa-worker-1:21:21 [0] NCCL INFO comm 0x7f9c0c000f60 rank 1 nranks 2 cudaDev 0 busId f5000 - Destroy COMPLETE +[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO comm 0x7fde98000f60 rank 0 nranks 2 cudaDev 0 busId 35000 - Destroy COMPLETE +[1,0]:# Out of bounds values : 0 OK +[1,0]:# Avg bus bandwidth : 1.15327 +[1,0]:# +[1,0]: +``` +
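+Since the launcher log is long, the bandwidth figures discussed below can optionally be filtered out with a simple `grep` (a convenience only, not part of the test itself):
+
+```bash
+# Print only the table header lines and the average bus bandwidth summary
+kubectl logs $(kubectl get pods | grep launcher | cut -d ' ' -f 1) | grep -E 'busbw|Avg bus bandwidth'
+```
+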
+
+
+The following section from the beginning of the log indicates that the test is being performed using EFA:
+
+```text
+[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO NET/OFI Selected Provider is efa (found 1 nics)
+[1,0]:test-nccl-efa-worker-0:21:21 [0] NCCL INFO Using network AWS Libfabric
+[1,0]:NCCL version 2.12.7+cuda11.4
+```
+
+Columns 8 and 12 in the output table show the out-of-place and in-place bus bandwidth calculated for the data size listed in column 1. In this case they are 3.13 and 3.12 GB/s respectively.
+Your actual results may be slightly different. The calculated average bus bandwidth is displayed at the bottom of the log when the test finishes, after it reaches the maximum data size
+specified in the MPIJob manifest. In this result the average bus bandwidth is 1.15 GB/s.
+
+```
+[1,0]:# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
+[1,0]:# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
+...
+[1,0]: 262144 65536 float sum -1 195.0 1.34 1.34 0 194.0 1.35 1.35 0
+[1,0]: 524288 131072 float sum -1 296.9 1.77 1.77 0 291.1 1.80 1.80 0
+[1,0]: 1048576 262144 float sum -1 583.4 1.80 1.80 0 579.6 1.81 1.81 0
+[1,0]: 2097152 524288 float sum -1 983.3 2.13 2.13 0 973.9 2.15 2.15 0
+[1,0]: 4194304 1048576 float sum -1 1745.4 2.40 2.40 0 1673.2 2.51 2.51 0
+...
+[1,0]:# Avg bus bandwidth : 1.15327
+```
+
+Finally, delete the test MPIJob:
+
+```bash
+kubectl delete mpijob test-nccl-efa
+```
+
+Output:
+
+```text
+mpijob.kubeflow.org "test-nccl-efa" deleted
+```
+
+## 9. Cleanup
+
+```bash
+terraform destroy
+```
+
+Output: + +```text +... + # module.eks.module.self_managed_node_group["efa"].aws_iam_role.this[0] will be destroyed +... + +Plan: 0 to add, 0 to change, 80 to destroy. + +Changes to Outputs: + - configure_kubectl = "aws eks update-kubeconfig --region us-east-1 --name eks-efa" -> null + +Do you really want to destroy all resources? + Terraform will destroy all your managed infrastructure, as shown above. + There is no undo. Only 'yes' will be accepted to confirm. + + Enter a value: yes + ... + module.eks.aws_iam_role.this[0]: Destruction complete after 1s +module.eks.aws_security_group_rule.node["ingress_self_coredns_udp"]: Destruction complete after 2s +module.eks.aws_security_group_rule.node["ingress_cluster_9443_webhook"]: Destruction complete after 3s +module.eks.aws_security_group_rule.node["ingress_cluster_443"]: Destruction complete after 3s +module.eks.aws_security_group_rule.node["egress_all"]: Destruction complete after 2s +module.eks.aws_security_group_rule.node["egress_self_all"]: Destruction complete after 3s +module.eks.aws_security_group_rule.node["ingress_nodes_ephemeral"]: Destruction complete after 3s +module.eks.aws_security_group_rule.node["ingress_cluster_8443_webhook"]: Destruction complete after 3s +module.eks.aws_security_group_rule.node["ingress_self_coredns_tcp"]: Destruction complete after 4s +module.eks.aws_security_group.cluster[0]: Destroying... [id=sg-05516650e2f2ed6c1] +module.eks.aws_security_group.node[0]: Destroying... [id=sg-0e421877145f36d48] +module.eks.aws_security_group.cluster[0]: Destruction complete after 1s +module.eks.aws_security_group.node[0]: Destruction complete after 1s +module.vpc.aws_vpc.this[0]: Destroying... [id=vpc-04677b1ab4eac3ca7] +module.vpc.aws_vpc.this[0]: Destruction complete after 0s +╷ +│ Warning: EC2 Default Network ACL (acl-0932148c7d86482e0) not deleted, removing from state +╵ + +Destroy complete! Resources: 80 destroyed. +``` + +
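+If you would like to confirm that the cleanup completed, a couple of read-only checks such as the following can be run. They assume the default region `us-east-1` and the `Blueprint = eks-efa` tag applied in the locals of this example:
+
+```bash
+# The eks-efa cluster should no longer be listed
+aws eks list-clusters --region us-east-1
+
+# No VPC carrying this example's Blueprint tag should remain
+aws ec2 describe-vpcs --region us-east-1 \
+  --filters "Name=tag:Blueprint,Values=eks-efa" \
+  --query 'Vpcs[].VpcId'
+```
+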
+ +The cleanup process takes about 15 minutes. + +# Conclusion + +With this example, we have demonstrated how AWS EKS Blueprints can be used to create an EKS cluster with an +EFA-enabled nodegroup. Futhermore, we have shown how to run MPI Jobs to validate that EFA works and check its performance. +Use this example as a starting point to bootstrap your own infrastructure-as-code terraform projects that require use +of high-performance networking on AWS with Elastic Fabric Adapter. diff --git a/examples/eks-efa/main.tf b/examples/eks-efa/main.tf new file mode 100644 index 0000000000..3fe9178e9c --- /dev/null +++ b/examples/eks-efa/main.tf @@ -0,0 +1,261 @@ +# Providers + +provider "aws" { + region = var.aws_region +} + +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token + } +} + +provider "kubectl" { + apply_retry_count = 10 + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = data.aws_eks_cluster_auth.this.token +} + +# Data + +data "aws_eks_cluster_auth" "this" { + name = module.eks.cluster_name +} + +data "aws_availability_zones" "available" {} + +data "http" "efa_device_plugin_yaml" { + url = "https://raw.githubusercontent.com/aws-samples/aws-efa-eks/main/manifest/efa-k8s-device-plugin.yml" +} + +data "aws_ami" "eks_gpu_node" { + most_recent = true + owners = ["amazon"] + + filter { + name = "name" + values = ["amazon-eks-gpu-node-${local.cluster_version}-*"] + } +} + +# Local config + +locals { + name = var.cluster_name + cluster_version = "1.25" + + vpc_cidr = "10.11.0.0/16" + azs = slice(data.aws_availability_zones.available.names, 0, 2) + + tags = { + Blueprint = local.name + GithubRepo = "github.com/aws-ia/terraform-aws-eks-blueprints" + } + +} + +# Resources + +resource "aws_placement_group" "efa_pg" { + name = "efa_pg" + strategy = "cluster" +} + +resource "kubectl_manifest" "efa_device_plugin" { + yaml_body = < 0 ~} export SERVICE_IPV4_CIDR=${service_ipv4_cidr} %{ endif ~} -%{ if length(service_ipv4_cidr) > 0 ~} +%{ if length(service_ipv6_cidr) > 0 ~} export SERVICE_IPV6_CIDR=${service_ipv6_cidr} %{ endif ~} %{ if length(custom_ami_id) > 0 ~} diff --git a/modules/kubernetes-addons/README.md b/modules/kubernetes-addons/README.md index 58cdcff208..e062865cf3 100644 --- a/modules/kubernetes-addons/README.md +++ b/modules/kubernetes-addons/README.md @@ -74,7 +74,7 @@ | [nvidia\_device\_plugin](#module\_nvidia\_device\_plugin) | ./nvidia-device-plugin | n/a | | [ondat](#module\_ondat) | ./ondat | n/a | | [opentelemetry\_operator](#module\_opentelemetry\_operator) | ./opentelemetry-operator | n/a | -| [portworx](#module\_portworx) | portworx/portworx-addon/eksblueprints | 0.0.6 | +| [portworx](#module\_portworx) | ./portworx | n/a | | [prometheus](#module\_prometheus) | ./prometheus | n/a | | [promtail](#module\_promtail) | ./promtail | n/a | | [reloader](#module\_reloader) | ./reloader | n/a | @@ -83,7 +83,7 @@ | [spark\_history\_server](#module\_spark\_history\_server) | ./spark-history-server | n/a | | [spark\_k8s\_operator](#module\_spark\_k8s\_operator) | ./spark-k8s-operator | n/a | | [strimzi\_kafka\_operator](#module\_strimzi\_kafka\_operator) | 
./strimzi-kafka-operator | n/a | -| [sysdig\_agent](#module\_sysdig\_agent) | sysdiglabs/sysdig-addon/eksblueprints | 0.0.3 | +| [sysdig\_agent](#module\_sysdig\_agent) | ./sysdig | n/a | | [tetrate\_istio](#module\_tetrate\_istio) | ./tetrate-istio | n/a | | [thanos](#module\_thanos) | ./thanos | n/a | | [traefik](#module\_traefik) | ./traefik | n/a | diff --git a/modules/kubernetes-addons/aws-efs-csi-driver/data.tf b/modules/kubernetes-addons/aws-efs-csi-driver/data.tf index eacf140092..3d312e740f 100644 --- a/modules/kubernetes-addons/aws-efs-csi-driver/data.tf +++ b/modules/kubernetes-addons/aws-efs-csi-driver/data.tf @@ -37,6 +37,19 @@ data "aws_iam_policy_document" "aws_efs_csi_driver" { } } + statement { + sid = "TagResource" + effect = "Allow" + resources = ["arn:${var.addon_context.aws_partition_id}:elasticfilesystem:${var.addon_context.aws_region_name}:${var.addon_context.aws_caller_identity_account_id}:file-system/*"] + actions = ["elasticfilesystem:TagResource"] + + condition { + test = "StringLike" + variable = "aws:RequestTag/efs.csi.aws.com/cluster" + values = ["true"] + } + } + statement { sid = "AllowDeleteAccessPoint" effect = "Allow" @@ -50,6 +63,22 @@ data "aws_iam_policy_document" "aws_efs_csi_driver" { } } + statement { + sid = "AllowTagResource" + effect = "Allow" + resources = [ + "arn:${var.addon_context.aws_partition_id}:elasticfilesystem:${var.addon_context.aws_region_name}:${var.addon_context.aws_caller_identity_account_id}:file-system/*", + "arn:${var.addon_context.aws_partition_id}:elasticfilesystem:${var.addon_context.aws_region_name}:${var.addon_context.aws_caller_identity_account_id}:access-point/*" + ] + actions = ["elasticfilesystem:TagResource"] + + condition { + test = "StringLike" + variable = "aws:ResourceTag/efs.csi.aws.com/cluster" + values = ["true"] + } + } + statement { actions = [ "elasticfilesystem:ClientRootAccess", diff --git a/modules/kubernetes-addons/datadog-operator/main.tf b/modules/kubernetes-addons/datadog-operator/main.tf index 89b851ab79..6e9d587fc6 100644 --- a/modules/kubernetes-addons/datadog-operator/main.tf +++ b/modules/kubernetes-addons/datadog-operator/main.tf @@ -11,7 +11,7 @@ module "helm_addon" { name = local.name chart = local.name repository = "https://helm.datadoghq.com" - version = "0.8.8" + version = "1.0.2" namespace = local.name create_namespace = true description = "Datadog Operator" diff --git a/modules/kubernetes-addons/karpenter/locals.tf b/modules/kubernetes-addons/karpenter/locals.tf index edb52300f2..9b5f988529 100644 --- a/modules/kubernetes-addons/karpenter/locals.tf +++ b/modules/kubernetes-addons/karpenter/locals.tf @@ -17,7 +17,7 @@ locals { name = local.name chart = local.name repository = "oci://public.ecr.aws/karpenter" - version = "v0.27.2" + version = "v0.27.3" namespace = local.name values = [ <<-EOT diff --git a/modules/kubernetes-addons/kubecost/main.tf b/modules/kubernetes-addons/kubecost/main.tf index c57f30e732..5e0e5f2084 100644 --- a/modules/kubernetes-addons/kubecost/main.tf +++ b/modules/kubernetes-addons/kubecost/main.tf @@ -7,7 +7,7 @@ module "helm_addon" { name = "kubecost" chart = "cost-analyzer" repository = "oci://public.ecr.aws/kubecost" - version = "1.97.0" + version = "1.103.3" namespace = "kubecost" values = [file("${path.module}/values.yaml")] create_namespace = true diff --git a/modules/kubernetes-addons/kubecost/values.yaml b/modules/kubernetes-addons/kubecost/values.yaml index 505ef6f230..7aceaca501 100644 --- a/modules/kubernetes-addons/kubecost/values.yaml +++ 
b/modules/kubernetes-addons/kubecost/values.yaml @@ -4,7 +4,7 @@ global: enabled: false proxy: false -imageVersion: prod-1.97.0 +imageVersion: prod-1.103.3 kubecostFrontend: image: public.ecr.aws/kubecost/frontend diff --git a/modules/kubernetes-addons/main.tf b/modules/kubernetes-addons/main.tf index 62080c9e4d..2fee971912 100644 --- a/modules/kubernetes-addons/main.tf +++ b/modules/kubernetes-addons/main.tf @@ -389,12 +389,14 @@ module "kube_prometheus_stack" { } module "portworx" { - count = var.enable_portworx ? 1 : 0 - source = "portworx/portworx-addon/eksblueprints" - version = "0.0.6" + source = "./portworx" + + count = var.enable_portworx ? 1 : 0 + helm_config = var.portworx_helm_config addon_context = local.addon_context } + module "prometheus" { count = var.enable_prometheus ? 1 : 0 source = "./prometheus" @@ -441,8 +443,7 @@ module "strimzi_kafka_operator" { } module "sysdig_agent" { - source = "sysdiglabs/sysdig-addon/eksblueprints" - version = "0.0.3" + source = "./sysdig" count = var.enable_sysdig_agent ? 1 : 0 helm_config = var.sysdig_agent_helm_config @@ -450,11 +451,6 @@ module "sysdig_agent" { } module "tetrate_istio" { - # source = "tetratelabs/tetrate-istio-addon/eksblueprints" - # version = "0.0.7" - - # TODO - remove local source and revert to remote once - # https://github.com/tetratelabs/terraform-eksblueprints-tetrate-istio-addon/pull/12 is merged source = "./tetrate-istio" count = var.enable_tetrate_istio ? 1 : 0 diff --git a/modules/kubernetes-addons/portworx/README.md b/modules/kubernetes-addons/portworx/README.md new file mode 100644 index 0000000000..f1994d004a --- /dev/null +++ b/modules/kubernetes-addons/portworx/README.md @@ -0,0 +1,3 @@ +# Portworx add-on for EKS Blueprints + +Local copy of https://github.com/portworx/terraform-eksblueprints-portworx-addon diff --git a/modules/kubernetes-addons/portworx/locals.tf b/modules/kubernetes-addons/portworx/locals.tf new file mode 100644 index 0000000000..7e3bbb85be --- /dev/null +++ b/modules/kubernetes-addons/portworx/locals.tf @@ -0,0 +1,86 @@ +resource "random_string" "id" { + length = 4 + special = false + upper = false +} + +locals { + name = "portworx-${random_string.id.result}" + namespace = "kube-system" + service_account_name = "${local.name}-sa-${random_string.id.result}" + + aws_marketplace_config = try(var.helm_config["set"][index(var.helm_config.set[*].name, "aws.marketplace")], null) + use_aws_marketplace = local.aws_marketplace_config != null ? local.aws_marketplace_config["value"] : false + + default_helm_config = { + name = local.name + description = "A Helm chart for portworx" + chart = "portworx" + repository = "https://raw.githubusercontent.com/portworx/eks-blueprint-helm/main/repo/stable" + version = "2.11.0" + namespace = local.namespace + values = local.default_helm_values + } + + helm_config = merge( + local.default_helm_config, + var.helm_config + ) + + irsa_iam_policies_list = local.use_aws_marketplace != false ? 
[aws_iam_policy.portworx_eksblueprint_metering[0].arn] : [] + + irsa_config = { + create_kubernetes_namespace = false + kubernetes_namespace = local.namespace + create_kubernetes_service_account = true + kubernetes_service_account = local.service_account_name + irsa_iam_policies = local.irsa_iam_policies_list + } + + default_helm_values = [templatefile("${path.module}/values.yaml", { + imageVersion = "2.11.0" + clusterName = local.name + drives = "type=gp2,size=200" + useInternalKVDB = true + kvdbDevice = "type=gp2,size=150" + envVars = "" + maxStorageNodesPerZone = 3 + useOpenshiftInstall = false + etcdEndPoint = "" + dataInterface = "" + managementInterface = "" + useStork = true + storkVersion = "2.11.0" + customRegistryURL = "" + registrySecret = "" + licenseSecret = "" + monitoring = false + enableCSI = false + enableAutopilot = false + KVDBauthSecretName = "" + eksServiceAccount = local.service_account_name + awsAccessKeyId = "" + awsSecretAccessKey = "" + deleteType = "UninstallAndWipe" + } + )] +} + +resource "aws_iam_policy" "portworx_eksblueprint_metering" { + count = try(local.use_aws_marketplace, false) ? 1 : 0 + name = "portworx_eksblueprint_metering-${random_string.id.result}" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = [ + "aws-marketplace:MeterUsage", + "aws-marketplace:RegisterUsage" + ], + Effect = "Allow", + Resource = "*" + }, + ] + }) +} diff --git a/modules/kubernetes-addons/portworx/main.tf b/modules/kubernetes-addons/portworx/main.tf new file mode 100644 index 0000000000..e764030b4b --- /dev/null +++ b/modules/kubernetes-addons/portworx/main.tf @@ -0,0 +1,7 @@ +module "helm_addon" { + source = "../helm-addon" + + addon_context = var.addon_context + helm_config = local.helm_config + irsa_config = local.irsa_config +}
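For context, a minimal sketch of how the relocated Portworx add-on might be enabled through the kubernetes-addons module, using the `enable_portworx` and `portworx_helm_config` inputs wired in modules/kubernetes-addons/main.tf above. The module source path and cluster wiring are illustrative assumptions; the `aws.marketplace` set entry is the value that portworx/locals.tf inspects to decide whether to create the Marketplace metering IAM policy.

```hcl
# Sketch only: not part of this change. Source path and cluster wiring are assumptions.
module "kubernetes_addons" {
  source = "./modules/kubernetes-addons" # illustrative path

  eks_cluster_id = module.eks_blueprints.eks_cluster_id # hypothetical cluster module output

  enable_portworx = true

  # The "aws.marketplace" entry is what portworx/locals.tf looks up; when true,
  # the aws-marketplace metering IAM policy is created and attached via IRSA.
  portworx_helm_config = {
    set = [
      {
        name  = "aws.marketplace"
        value = true
      }
    ]
  }
}
```

If the `aws.marketplace` entry is omitted, `use_aws_marketplace` stays false, so no metering policy is created and the IRSA service account is provisioned without extra policies.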
For eg: "/dev/sda;/dev/sdb;/dev/sdc" or + # "type=gp2,size=200;type=gp3,size=500". Defaults to use -A switch. +journalDevice: +maxStorageNodesPerZone: ${maxStorageNodesPerZone} # The maximum number of storage nodes desired per zone, in case of cloud drive provisioning + +secretType: k8s # Defaults to k8s, but can be kvdb/k8s/aws-kms/vault/ibm-kp. It is autopopulated to ibm-kp + # if the environment is IKS. + +dataInterface: ${dataInterface} # Name of the interface +managementInterface: none # Name of the interface +serviceType: none # Kubernetes service type for services deployed by the Operator. Direct Values like + # 'LoadBalancer', 'NodePort' will change all services. To change the types of specific + # services, value can be specified as 'portworx-service:LoadBalancer;portworx-api:ClusterIP' + +envVars: ${envVars} # DEPRECATED: Use envs section to set env variables + # NOTE: This is a ";" seperated list of environment variables. + # For eg: MYENV1=myvalue1;MYENV2=myvalue2 + +envs: # Add environment variables to the Portworx container in all Kubernetes supported formats + # - name: AWS_CA_BUNDLE + # value: "/etc/pwx/objectstore-cert/objectstore.pem" + # - name: AWS_ACCESS_KEY_ID + # valueFrom: + # secretKeyRef: + # name: aws-creds + # key: access-key + +miscArgs: none # Miscellaneous arguments that will be passed to portworx verbatim. Only use this if there is + # no equivalent way to specify these options directly via a StorageCluster spec field. + +disableStorageClass: false # Instructs Operator to not install the default Portworx StorageClasses. + +stork: ${useStork} # Use Stork https://docs.portworx.com/scheduler/kubernetes/stork.html for hyperconvergence. +storkVersion: ${storkVersion} # Optional: version of Stork. For eg: 2.7.0, when it's empty Portworx operator will pick up + # version according to Portworx version. + +storkSpec: # Optional Stork configurations + args: # Pass arguments to Stork container. Example: verbose='true';webhook-controller='false' + volumes: # Add volumes to Stork container. Refer the top level volumes for schema. + +customRegistryURL: ${customRegistryURL} #Url wherre to pull Portworx image from +registrySecret: ${registrySecret} #Image registery credentials to pull Portworx Images from a secure registry +licenseSecret: ${licenseSecret} #Kubernetes secret name that has Portworx licensing information + +monitoring: ${monitoring} + +deployOnMaster: false # For POC only +csi: ${enableCSI} # Enable CSI +aut: ${enableAutopilot} # Enable AutoPilot + + +internalKVDB: ${useInternalKVDB} # internal KVDB +kvdbDevice: ${kvdbDevice} # specify a separate device to store KVDB data, only used when internalKVDB is set to true + +etcd: # DEPRECATED: Use kvdb.authSecretName for configuring secure etcd + credentials: none:none # Username and password for ETCD authentication in the form user:password + certPath: none # Base path where the certificates are placed. (example: if the certificates ca,crt and the key are in /etc/pwx/etcdcerts the value should be provided as /etc/pwx/etcdcerts) + ca: none # Location of CA file for ETCD authentication. Should be /path/to/server.ca + cert: none # Location of certificate for ETCD authentication. Should be /path/to/server.crt + key: none # Location of certificate key for ETCD authentication Should be /path/to/servery.key + +consul: # DEPRECATED: Use kvdb.authSecretName for configuring secure consul + token: none # ACL token value used for Consul authentication. 
(example: 398073a8-5091-4d9c-871a-bbbeb030d1f6) + +kvdb: + authSecretName: ${KVDBauthSecretName} # Refer https://docs.portworx.com/reference/etcd/#securing-with-certificates-in-kubernetes to + # create a kvdb secret and specify the name of the secret here + +volumes: # Add volumes to Portworx container. Supported volume types: Host, Secret, ConfigMap + # - name: objectstore-cert + # mountPath: /etc/pwx/objectstore-cert + # secret: + # secretName: objectstore-cert + # items: + # - key: objectstore.pem + # path: objectstore.pem + +tolerations: # Add tolerations + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + +serviceAccount: + hook: + create: true + name: + +aws: + marketplace: + eksServiceAccount: ${eksServiceAccount} + accessKeyId: ${awsAccessKeyId} + secretAccessKey: ${awsSecretAccessKey} + +deleteType: ${deleteType} + +clusterToken: + create: true # Create cluster token + secretName: px-vol-encryption # Name of kubernetes secret to be created. Requires clusterToken.create to be true. + serviceAccountName: px-create-cluster-token # Service account name to use for post-install hook to create cluster token + +#requirePxEnabledTag: true # if set to true, portworx will only install on nodes with px/enabled: true label. Not required in most scenarios. + +deleteStrategy: # Optional: Delete strategy for the portworx cluster + type: # Valid values: Uninstall, UninstallAndWipe diff --git a/modules/kubernetes-addons/portworx/variables.tf b/modules/kubernetes-addons/portworx/variables.tf new file mode 100644 index 0000000000..9c99f07327 --- /dev/null +++ b/modules/kubernetes-addons/portworx/variables.tf @@ -0,0 +1,11 @@ +variable "helm_config" { + description = "Helm chart config. Repository and version required. 
See https://registry.terraform.io/providers/hashicorp/helm/latest/docs" + type = any + default = {} +} + +variable "addon_context" { + description = "Input configuration for the addon" + type = any + default = {} +} diff --git a/modules/kubernetes-addons/portworx/versions.tf b/modules/kubernetes-addons/portworx/versions.tf new file mode 100644 index 0000000000..4ec4fc423b --- /dev/null +++ b/modules/kubernetes-addons/portworx/versions.tf @@ -0,0 +1,18 @@ +terraform { + required_version = ">= 1.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 4.67" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.10" + } + random = { + source = "hashicorp/random" + version = ">= 3.0" + } + } +} diff --git a/modules/kubernetes-addons/sysdig/README.md b/modules/kubernetes-addons/sysdig/README.md new file mode 100644 index 0000000000..042c8ea6a5 --- /dev/null +++ b/modules/kubernetes-addons/sysdig/README.md @@ -0,0 +1,3 @@ +# Sysdig Addon for EKS Blueprints + +Local copy of https://github.com/sysdiglabs/terraform-eksblueprints-sysdig-addon diff --git a/modules/kubernetes-addons/sysdig/locals.tf b/modules/kubernetes-addons/sysdig/locals.tf new file mode 100644 index 0000000000..71522d64c3 --- /dev/null +++ b/modules/kubernetes-addons/sysdig/locals.tf @@ -0,0 +1,27 @@ +locals { + name = "sysdig" + namespace = "sysdig" + + set_values = [] + + default_helm_config = { + name = local.name + chart = "sysdig-deploy" + repository = "https://charts.sysdig.com" + version = "1.5.71" + namespace = local.namespace + create_namespace = true + values = local.default_helm_values + set = [] + description = "Sysdig HelmChart Sysdig-Deploy configuration" + wait = false + } + + helm_config = merge( + local.default_helm_config, + var.helm_config + ) + + default_helm_values = [templatefile("${path.module}/values-sysdig.yaml", {}, )] + +} diff --git a/modules/kubernetes-addons/sysdig/main.tf b/modules/kubernetes-addons/sysdig/main.tf new file mode 100644 index 0000000000..a913bdcae0 --- /dev/null +++ b/modules/kubernetes-addons/sysdig/main.tf @@ -0,0 +1,7 @@ +module "helm_addon" { + source = "../helm-addon" + + addon_context = var.addon_context + set_values = local.set_values + helm_config = local.helm_config +} diff --git a/modules/kubernetes-addons/sysdig/outputs.tf b/modules/kubernetes-addons/sysdig/outputs.tf new file mode 100644 index 0000000000..b5d714acb4 --- /dev/null +++ b/modules/kubernetes-addons/sysdig/outputs.tf @@ -0,0 +1,4 @@ +output "argocd_gitops_config" { + description = "Configuration used for managing the add-on with ArgoCD" + value = var.manage_via_gitops ? { enable = true } : null +}
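Similarly, a hedged sketch of enabling the now-local Sysdig agent via the `enable_sysdig_agent` and `sysdig_agent_helm_config` inputs referenced in modules/kubernetes-addons/main.tf; the surrounding wiring is assumed, and any keys supplied in the helm config are merged over the defaults in sysdig/locals.tf.

```hcl
# Sketch only: not part of this change. Source path and cluster wiring are assumptions.
module "kubernetes_addons" {
  source = "./modules/kubernetes-addons" # illustrative path

  eks_cluster_id = module.eks_blueprints.eks_cluster_id # hypothetical cluster module output

  # Uses the local ./sysdig module (chart sysdig-deploy 1.5.71) instead of the
  # previously remote sysdiglabs registry module.
  enable_sysdig_agent = true

  # Optional: keys given here are merged over the defaults in
  # modules/kubernetes-addons/sysdig/locals.tf.
  sysdig_agent_helm_config = {
    version = "1.5.71" # shown only as an override example
  }
}
```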
diff --git a/modules/kubernetes-addons/sysdig/values-sysdig.yaml b/modules/kubernetes-addons/sysdig/values-sysdig.yaml new file mode 100644 index 0000000000..8628751261 --- /dev/null +++ b/modules/kubernetes-addons/sysdig/values-sysdig.yaml @@ -0,0 +1,17 @@ +global: + kspm: + deploy: true +agent: + sysdig: + settings: + collector_port: 6443 +nodeAnalyzer: + nodeAnalyzer: + benchmarkRunner: + deploy: false + runtimeScanner: + settings: + eveEnabled: true + secure: + vulnerabilityManagement: + newEngineOnly: true diff --git a/modules/kubernetes-addons/sysdig/variables.tf b/modules/kubernetes-addons/sysdig/variables.tf new file mode 100644 index 0000000000..9a81f7fb51 --- /dev/null +++ b/modules/kubernetes-addons/sysdig/variables.tf @@ -0,0 +1,35 @@ +# Helm config +variable "helm_config" { + type = any + description = "Helm Configuration for Sysdig Agent" + default = {} +} + +# Manage via gitops +variable "manage_via_gitops" { + description = "Determines if the add-on should be managed via GitOps" + type = bool + default = false +} + +# tflint-ignore: terraform_unused_declarations +variable "cluster_name" { + description = "Cluster name" + type = string + default = "" +} + +variable "addon_context" { + description = "Input configuration for the addon" + type = object({ + aws_caller_identity_account_id = string + aws_caller_identity_arn = string + aws_eks_cluster_endpoint = string + aws_partition_id = string + aws_region_name = string + eks_cluster_id = string + eks_oidc_issuer_url = string + eks_oidc_provider_arn = string + tags = map(string) + }) +} diff --git a/modules/kubernetes-addons/sysdig/versions.tf b/modules/kubernetes-addons/sysdig/versions.tf new file mode 100644 index 0000000000..d2ddf87cc2 --- /dev/null +++ b/modules/kubernetes-addons/sysdig/versions.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.0.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 3.72" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.10" + } + } +} diff --git a/variables.tf b/variables.tf index 1500741937..e9167393ca 100644 --- a/variables.tf +++ b/variables.tf @@ -108,6 +108,12 @@ variable "cluster_security_group_tags" { default = {} } +variable "create_cluster_primary_security_group_tags" { + description = "Indicates whether or not to tag the cluster's primary security group. This security group is created by the EKS service, not the module, and therefore tagging is handled after cluster creation" + type = bool + default = true +} + #------------------------------- # EKS Cluster VPC Config #-------------------------------
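To round out the change to variables.tf, a short, hedged example of the new flag from a consuming root module; apart from `create_cluster_primary_security_group_tags`, which is added above with a default of true, the arguments shown are placeholders. Setting it to false skips tagging the EKS-created primary security group after cluster creation, for cases where those tags are managed elsewhere.

```hcl
# Sketch only: the module source and most arguments are placeholders;
# create_cluster_primary_security_group_tags is the new input added in variables.tf.
module "eks_blueprints" {
  source = "github.com/aws-ia/terraform-aws-eks-blueprints" # pin to a v4 release tag in practice

  cluster_name    = "example-cluster"
  cluster_version = "1.26"

  vpc_id             = module.vpc.vpc_id          # hypothetical VPC module outputs
  private_subnet_ids = module.vpc.private_subnets

  # Defaults to true. Set to false when tags on the EKS-created primary
  # security group are managed outside of this module.
  create_cluster_primary_security_group_tags = false
}
```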