diff --git a/governance/steering.md b/governance/steering.md index 3540af136..4b6a244ec 100644 --- a/governance/steering.md +++ b/governance/steering.md @@ -1,49 +1,59 @@ # Steering Committee and Module Leads + ## Steering Commitee Members + The Steering Committee is a 6 member body, overseeing the governance of the EKS Workshop. + ### Terms end in February 2024 -|Name|Profile|Role| -|:----|:-------|:----| -|Sai Vennam|[@svennam92](https://github.com/svennam92)|Principal EKS DA -|Niall Thomson|[@niallthomson](https://github.com/niallthomson)|Specialist Solution Architect, Containers| -|Ray Krueger|[@raykrueger](https://github.com/raykrueger)|Principal Container Specialist| -|Ameet Naik|[@ameetnaik](https://github.com/ameetnaik)|Technical Account Manager| -|Kamran Habib|[@kmhabib](https://github.com/kmhabib)|Solution Architect (TFC at large)| -|Theo Salvo|[@buzzsurfr](https://github.com/buzzsurfr)|Container Specialist (TFC core team member)| + +| Name | Profile | Role | +| :------------ | :----------------------------------------------- | :------------------------------------------ | +| Sai Vennam | [@svennam92](https://github.com/svennam92) | Principal EKS DA | +| Niall Thomson | [@niallthomson](https://github.com/niallthomson) | Specialist Solution Architect, Containers | +| Ray Krueger | [@raykrueger](https://github.com/raykrueger) | Principal Container Specialist | +| Ameet Naik | [@ameetnaik](https://github.com/ameetnaik) | Technical Account Manager | +| Kamran Habib | [@kmhabib](https://github.com/kmhabib) | Solution Architect (TFC at large) | +| Theo Salvo | [@buzzsurfr](https://github.com/buzzsurfr) | Container Specialist (TFC core team member) | ## Working Groups + The working groups are led by chairs (6 month terms) and maintainers (6 month terms). 
-|Working Group|Chair|Maintainers| -|:----|:-------|:----| -|Infrastructure|[Niall Thomson](https://github.com/niallthomson)|| -|Fundamentals|[Sai Vennam](https://github.com/svennam92)|[Bijith Nair](https://github.com/bijithnair), [Tolu Okuboyejo](https://github.com/oktab1), [Hemanth AVS](https://github.com/hemanth-avs)| -|Autoscaling|[Sanjeev Ganjihal](https://github.com/sanjeevrg89)|| -|Automation|[Carlos Santana](https://github.com/csantanapr)|[Tsahi Duek](https://github.com/tsahiduek), [Christina Andonov](https://github.com/candonov), [Sébastien Allamand](https://github.com/allamand)| -|Machine Learning|[Masatoshi Hayashi](https://github.com/literalice)|| -|Networking|[Sheetal Joshi](https://github.com/sheetaljoshi)|[Umair Ishaq](https://github.com/umairishaq)| -|Observability|[Nirmal Mehta](https://github.com/normalfaults)|[Steven David](https://github.com/StevenDavid)| -|Security|[Rodrigo Bersa](https://github.com/rodrigobersa)| | -|Storage|[Eric Heinrichs](https://github.com/heinrichse)|[Andrew Peng](https://github.com/pengc99)| +| Working Group | Chair | Maintainers | +| :--------------- | :------------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------- | +| Infrastructure | [Niall Thomson](https://github.com/niallthomson) | | +| Fundamentals | [Sai Vennam](https://github.com/svennam92) | [Bijith Nair](https://github.com/bijithnair), [Tolu Okuboyejo](https://github.com/oktab1), [Hemanth AVS](https://github.com/hemanth-avs) | +| Autoscaling | [Sanjeev Ganjihal](https://github.com/sanjeevrg89) | | +| Automation | [Carlos Santana](https://github.com/csantanapr) | [Tsahi Duek](https://github.com/tsahiduek), [Christina Andonov](https://github.com/candonov), [Sébastien Allamand](https://github.com/allamand) | +| Machine Learning | [Masatoshi Hayashi](https://github.com/literalice) | [Benjamin Gardiner](https://github.com/bkgardiner) 
| +| Networking | [Sheetal Joshi](https://github.com/sheetaljoshi) | [Umair Ishaq](https://github.com/umairishaq) | +| Observability | [Nirmal Mehta](https://github.com/normalfaults) | [Steven David](https://github.com/StevenDavid) | +| Security | [Rodrigo Bersa](https://github.com/rodrigobersa) | | +| Storage | [Eric Heinrichs](https://github.com/heinrichse) | [Andrew Peng](https://github.com/pengc99) | ## Wranglers + Wranglers will work across all topic areas and serve for at least 6 months. |Name|Profile|Role| |:----|:-------|:----| |Math Bruneau|[@ROunofF](https://github.com/ROunofF)|Specialist Solution Architect, Containers| - ## Emeritus -|Name|Profile|Role| -|:----|:-------|:----| -|Jeremy Cowan|[@jicowan](https://github.com/jicowan)|EKS DA manager| + +| Name | Profile | Role | +| :----------- | :------------------------------------- | :------------- | +| Jeremy Cowan | [@jicowan](https://github.com/jicowan) | EKS DA manager | ## Meetings + ### Schedule and Cadence + The steering committee will host a public meeting every third Thursday of the month at 9AM CT. ### Resources -* + +- ## Contact -* Mailing List: + +- Mailing List: diff --git a/manifests/modules/aiml/inferentia/.workshop/cleanup.sh b/manifests/modules/aiml/inferentia/.workshop/cleanup.sh new file mode 100644 index 000000000..25b5c37aa --- /dev/null +++ b/manifests/modules/aiml/inferentia/.workshop/cleanup.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +set -e + +echo "Deleting AIML resources..." + +kubectl delete namespace aiml > /dev/null + +echo "Deleting Karpenter provisioners..." + +kubectl delete provisioner --all > /dev/null +kubectl delete awsnodetemplate --all > /dev/null + +echo "Waiting for Karpenter nodes to be removed..." + +EXIT_CODE=0 + +timeout --foreground -s TERM 30 bash -c \ + 'while [[ $(kubectl get nodes --selector=type=karpenter -o json | jq -r ".items | length") -gt 0 ]];\ + do sleep 5;\ + done' || EXIT_CODE=$? 
+ +if [ $EXIT_CODE -ne 0 ]; then + echo "Warning: Karpenter nodes did not clean up" +fi \ No newline at end of file diff --git a/manifests/modules/aiml/inferentia/.workshop/terraform/addon.tf b/manifests/modules/aiml/inferentia/.workshop/terraform/addon.tf new file mode 100644 index 000000000..5d872d5ce --- /dev/null +++ b/manifests/modules/aiml/inferentia/.workshop/terraform/addon.tf @@ -0,0 +1,128 @@ +data "aws_subnets" "private" { + tags = { + created-by = "eks-workshop-v2" + env = local.addon_context.eks_cluster_id + } + + filter { + name = "tag:Name" + values = ["*Private*"] + } +} + +module "iam_assumable_role_inference" { + source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc" + version = "~> v5.5.0" + create_role = true + role_name = "${local.addon_context.eks_cluster_id}-inference" + provider_url = local.addon_context.eks_oidc_issuer_url + role_policy_arns = [aws_iam_policy.inference.arn] + oidc_fully_qualified_subjects = ["system:serviceaccount:aiml:inference"] + + tags = local.tags +} + + +resource "aws_iam_policy" "inference" { + name = "${local.addon_context.eks_cluster_id}-inference" + path = "/" + description = "IAM policy for the inferenct workload" + + policy = < aten::_convolution: 53 +INFO:Neuron: => aten::adaptive_avg_pool2d: 1 +INFO:Neuron: => aten::add_: 16 +INFO:Neuron: => aten::batch_norm: 53 +INFO:Neuron: => aten::flatten: 1 +INFO:Neuron: => aten::linear: 1 +INFO:Neuron: => aten::max_pool2d: 1 +INFO:Neuron: => aten::relu_: 49 + +``` + +Finally, upload the model to the S3 bucket that has been created for you. This will ensure we can use the model later in the lab. 
+ +```bash +$ kubectl -n aiml exec -it compiler -- aws s3 cp ./resnet50_neuron.pt s3://$AIML_NEURON_BUCKET_NAME/ + +upload: ./resnet50_neuron.pt to s3://eksworkshop-inference20230511204343601500000001/resnet50_neuron.pt +``` diff --git a/website/docs/aiml/inferentia/index.md b/website/docs/aiml/inferentia/index.md new file mode 100644 index 000000000..0d199d55b --- /dev/null +++ b/website/docs/aiml/inferentia/index.md @@ -0,0 +1,40 @@ +--- +title: "Inference with AWS Inferentia" +sidebar_position: 10 +chapter: true +sidebar_custom_props: { "module": true } +--- + +:::tip Before you start +Prepare your environment for this section: + +```bash timeout=300 wait=30 +$ prepare-environment aiml/inferentia +``` + +This will make the following changes to your lab environment: + +- Installs Karpenter in the Amazon EKS cluster +- Creates an S3 Bucket to store results +- Creates an IAM Role for the Pods to use +- Installs the [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/dlc-then-eks-devflow.html) device plugin + +You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/aiml/inferentia/.workshop/terraform). + +::: + +[AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/?nc1=h_ls) is the purpose-built accelerator designed to accelerate deep learning workloads. + +Inferentia has processing cores called Neuron Cores, which have high-speed access to models stored in on-chip memory. + +You can easily use the accelerator on EKS. The Neuron device plugin exposes Neuron cores and devices to Kubernetes as a resource. When your workloads require Neuron cores, the Kubernetes scheduler can assign the Inferentia node to the workloads. You can even provision the node automatically using Karpenter. + +This lab provides a tutorial on how to use Inferentia to accelerate deep learning inference workloads on EKS. 
+In this lab we will: + +1. Compile a ResNet-50 pre-trained model for use with AWS Inferentia +2. Upload this model to an S3 Bucket for later use +3. Create a Karpenter Provisioner to provision Inferentia EC2 instances +4. Launch an inference Pod that uses our previous model to run our inference against + +Let's get started. diff --git a/website/docs/aiml/inferentia/inference.md b/website/docs/aiml/inferentia/inference.md new file mode 100644 index 000000000..cfe123a7b --- /dev/null +++ b/website/docs/aiml/inferentia/inference.md @@ -0,0 +1,140 @@ +--- +title: "Run Inference on an AWS Inferentia Node using Amazon EKS" +sidebar_position: 30 +--- + +Now we can use the compiled model to run an inference workload on an AWS Inferentia node. + +### Install Device Plugin for AWS Inferentia + +In order for our DLC to use the Neuron cores they need to be exposed. The [Neuron device plugin Kubernetes manifest files](https://github.com/aws-neuron/aws-neuron-sdk/tree/master/src/k8) expose the Neuron cores to the DLC. These manifest files have been pre-installed into the EKS Cluster. + +When a Pod requires the exposed Neuron cores, the Kubernetes scheduler can provision an Inferentia node to schedule the Pod to. This is the Pod that we will schedule. Note that we have a resource requirement of `aws.amazon.com/neuron`. + +```file +manifests/modules/aiml/inferentia/inference/inference.yaml +``` + +### Set up a provisioner of Karpenter for launching a node which has the Inferentia chip + +The lab uses Karpenter to provision an Inferentia node. Karpenter can detect the pending pod which requires Neuron cores and launch an inf1 instance which has the required Neuron cores. + +:::tip +You can learn more about Karpenter in the [Karpenter module](../../autoscaling/compute/karpenter/index.md) that's provided in this workshop. 
+::: + +Karpenter has been installed in our EKS cluster, and runs as a deployment: + +```bash +$ kubectl get deployment -n karpenter +NAME READY UP-TO-DATE AVAILABLE AGE +karpenter 1/1 1 1 5m52s +``` + +The only setup that we will need to do is to update our EKS IAM mappings to allow Karpenter nodes to join the cluster: + +```bash +$ eksctl create iamidentitymapping --cluster $EKS_CLUSTER_NAME \ + --region=$AWS_REGION --arn $KARPENTER_NODE_ROLE \ + --group system:bootstrappers --group system:nodes \ + --username system:node:{{EC2PrivateDNSName}} +``` + +Karpenter requires a provisioner to provision nodes. This is the Karpenter provisioner that we will create: + +```file +manifests/modules/aiml/inferentia/provisioner/provisioner.yaml +``` + +Apply the provisioner manifest: + +```bash +$ kubectl apply -k ~/environment/eks-workshop/modules/aiml/inferentia/provisioner/ +``` + +### Create a pod for inference + +Now we can deploy a Pod for inference: + +```bash +$ kubectl apply -k ~/environment/eks-workshop/modules/aiml/inferentia/inference/ +``` + +Karpenter detects the pending pod which needs Neuron cores and launches an inf1 instance which has the Inferentia chip. 
Monitor the instance provisioning with the following command: + +```bash test=false +$ kubectl logs -f -n karpenter deploy/karpenter -c controller + +2022-10-28T08:24:42.704Z DEBUG controller.provisioning.cloudprovider Created launch template, Karpenter-eks-workshop-cluster-3507260904097783831 {"commit": "37c8653", "provisioner": "default"} +2022-10-28T08:24:45.125Z INFO controller.provisioning.cloudprovider Launched instance: i-09ddba6280017ae4d, hostname: ip-100-64-10-250.ap-northeast-1.compute.internal, type: inf1.xlarge, zone: ap-northeast-1a, capacityType: spot {"commit": "37c8653", "provisioner": "default"} +2022-10-28T08:24:45.136Z INFO controller.provisioning Created node with 1 pods requesting {"aws.amazon.com/neuron":"1","cpu":"125m","pods":"6"} from types inf1.xlarge, inf1.2xlarge, inf1.6xlarge, inf1.24xlarge {"commit": "37c8653", "provisioner": "default"} +2022-10-28T08:24:45.136Z INFO controller.provisioning Waiting for unschedulable pods {"commit": "37c8653"} +``` + +The inference pod should be scheduled on the node provisioned by Karpenter. Check if the Pod is in its ready state: + +:::note +It can take up to 8 minutes to provision the node, add it to the EKS cluster, and start the pod. +::: + +```bash timeout=360 +$ kubectl -n aiml wait --for=condition=Ready --timeout=8m pod/inference +``` + +We can use the following command to get more details on the node that was provisioned to schedule our pod onto: + +```bash +$ kubectl get node -l karpenter.sh/provisioner-name=aiml -o jsonpath='{.items[0].status.capacity}' | jq . 
+``` + +This output shows the capacity this node has: + +```json +{ + "attachable-volumes-aws-ebs": "39", + "aws.amazon.com/neuron": "1", + "aws.amazon.com/neuroncore": "4", + "aws.amazon.com/neurondevice": "1", + "cpu": "4", + "ephemeral-storage": "104845292Ki", + "hugepages-1Gi": "0", + "hugepages-2Mi": "0", + "memory": "7832960Ki", + "pods": "38", + "vpc.amazonaws.com/pod-eni": "38" +} +``` + +We can see that this node has a `aws.amazon.com/neuron` of 1. Karpenter provisioned this node for us as that's how many neurons the pod requested. + +### Run an inference + +This is the code that we will be using to run inference using a Neuron core on Inferentia: + +```file +manifests/modules/aiml/inferentia/inference/inference.py +``` + +This Python code does the following tasks: + +1. It downloads and stores an image of a small kitten. +2. It fetches the labels for classifying the image. +3. It then imports this image and normalizes it into a tensor. +4. It loads our previously created model. +5. It runs the prediction on our small kitten image. +6. It gets the top 5 results from the prediction and prints these to the command-line. + +We copy this code to the Pod, download our previously uploaded model, and run the code: + +```bash +$ kubectl -n aiml cp ~/environment/eks-workshop/modules/aiml/inferentia/inference/inference.py inference:/ +$ kubectl -n aiml exec -it inference -- aws s3 cp s3://$AIML_NEURON_BUCKET_NAME/resnet50_neuron.pt ./ +$ kubectl -n aiml exec -it inference -- python /inference.py + +Top 5 labels: + ['tiger', 'lynx', 'tiger_cat', 'Egyptian_cat', 'tabby'] +``` + +As output we get the top 5 labels back. We are running the inference on an image of a small kitten using ResNet-50's pre-trained model, so these results are expected. As a possible next step to improve performance we could create our own data set of images and train our own model for our specific use case. This could improve our prediction results. 
+ +This concludes this lab on using AWS Inferentia with Amazon EKS. diff --git a/website/docs/aiml/inferentia/wrapup.md b/website/docs/aiml/inferentia/wrapup.md new file mode 100644 index 000000000..8d70b2bd1 --- /dev/null +++ b/website/docs/aiml/inferentia/wrapup.md @@ -0,0 +1,14 @@ +--- +title: "Real World Implementation" +sidebar_position: 40 +--- + +In the previous sections we've seen how we can use Amazon EKS to build models for AWS Inferentia and deploy models on EKS using Inferentia nodes. In both these examples we've executed Python code inside our containers from our command-line. In a real world scenario we do not want to run these commands manually, but rather have the container execute the commands. + +For building the model we would want to use the DLC container as our base image and add our Python code to it. We would then store this container image in our container repository like Amazon ECR. We would use a Kubernetes Job to run this container image on EKS and store the generated model to S3. + +![Build Model](./assets/CreateModel.png) + +For running inference against our model we would want to modify our code to allow other applications or users to retrieve the classification results from the model. This could be done by creating a REST API that we can call and that responds with our classification results. We would run this application as a Kubernetes Deployment within our cluster using the AWS Inferentia resource requirement: `aws.amazon.com/neuron`. 
+ +![Inference Model](./assets/Inference.png) diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 7cde82a4e..1243e151e 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -31,7 +31,7 @@ const config = { onBrokenLinks: 'throw', onBrokenMarkdownLinks: 'warn', favicon: 'img/favicon.png', - noIndex: process.env.ENABLE_INDEX!=="1", + noIndex: process.env.ENABLE_INDEX !== "1", organizationName: 'aws-samples', projectName: 'eks-workshop-v2', @@ -134,6 +134,12 @@ const config = { position: 'left', label: 'Automation', }, + { + type: 'doc', + docId: 'aiml/index', + position: 'left', + label: 'AIML', + }, { href: 'https://github.com/aws-samples/eks-workshop-v2', position: 'right', diff --git a/website/sidebars.js b/website/sidebars.js index c80f8393c..3f432d0b5 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -14,13 +14,14 @@ /** @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} */ const sidebars = { // By default, Docusaurus generates a sidebar from the docs folder structure - introduction: [{type: 'autogenerated', dirName: 'introduction'}], - fundamentals: [{type: 'autogenerated', dirName: 'fundamentals'}], - security: [{type: 'autogenerated', dirName: 'security'}], - networking: [{type: 'autogenerated', dirName: 'networking'}], - autoscaling: [{type: 'autogenerated', dirName: 'autoscaling'}], + introduction: [{ type: 'autogenerated', dirName: 'introduction' }], + fundamentals: [{ type: 'autogenerated', dirName: 'fundamentals' }], + security: [{ type: 'autogenerated', dirName: 'security' }], + networking: [{ type: 'autogenerated', dirName: 'networking' }], + autoscaling: [{ type: 'autogenerated', dirName: 'autoscaling' }], observability: [{ type: 'autogenerated', dirName: 'observability' }], automation: [{ type: 'autogenerated', dirName: 'automation' }], + aiml: [{ type: 'autogenerated', dirName: 'aiml' }], }; module.exports = sidebars;