diff --git a/hack/run-tests.sh b/hack/run-tests.sh index 54b72f851..660606ba6 100755 --- a/hack/run-tests.sh +++ b/hack/run-tests.sh @@ -76,6 +76,10 @@ RESOURCES_PRECREATED=${RESOURCES_PRECREATED:-""} echo "Running test suite..." +# get current IDs +USER_ID=$(id -u) +GROUP_ID=$(id -g) + exit_code=0 $CONTAINER_CLI run $background_args $dns_args \ diff --git a/hack/validate-terraform.sh b/hack/validate-terraform.sh old mode 100644 new mode 100755 diff --git a/lab/iam/iam-role-cfn.yaml b/lab/iam/iam-role-cfn.yaml index 5232703ea..4d2f3642b 100644 --- a/lab/iam/iam-role-cfn.yaml +++ b/lab/iam/iam-role-cfn.yaml @@ -62,3 +62,13 @@ Resources: ManagedPolicyName: ${Env}-ide-labs2 PolicyDocument: file: ./iam/policies/labs2.yaml + EksWorkshopLabsPolicy3: + Type: AWS::IAM::ManagedPolicy + DependsOn: + - EksWorkshopIdeRole + Properties: + Roles: + - !Ref EksWorkshopIdeRole + ManagedPolicyName: ${Env}-ide-labs3 + PolicyDocument: + file: ./iam/policies/labs3.yaml diff --git a/lab/iam/policies/ec2.yaml b/lab/iam/policies/ec2.yaml index 4f588cd50..4ea240f2c 100644 --- a/lab/iam/policies/ec2.yaml +++ b/lab/iam/policies/ec2.yaml @@ -9,6 +9,7 @@ Statement: Resource: ["*"] - Effect: Allow Action: + - ec2:StopInstances - ec2:TerminateInstances Resource: ["*"] Condition: diff --git a/lab/iam/policies/iam.yaml b/lab/iam/policies/iam.yaml index ab0cb2c24..b8a15252e 100644 --- a/lab/iam/policies/iam.yaml +++ b/lab/iam/policies/iam.yaml @@ -23,12 +23,19 @@ Statement: - iam:CreatePolicy - iam:DeletePolicy - iam:GetPolicyVersion + - iam:DeletePolicyVersion - iam:ListPolicyVersions - iam:TagPolicy - iam:GetPolicy Resource: - !Sub arn:aws:iam::${AWS::AccountId}:policy/${Env}* - !Sub arn:aws:iam::${AWS::AccountId}:policy/eksctl-${Env}* + - Effect: Allow + Action: + - s3:ListAllMyBuckets + - iam:ListPolicies + - iam:ListRoles + Resource: ["*"] - Effect: Allow Action: - iam:CreateInstanceProfile diff --git a/lab/iam/policies/labs3.yaml b/lab/iam/policies/labs3.yaml new file mode 100644 index 000000000..db59d58a0 --- /dev/null +++ b/lab/iam/policies/labs3.yaml @@ -0,0 +1,70 @@ +Version: "2012-10-17" +Statement: + - Effect: Allow + Action: + - s3:CreateBucket + - s3:DeleteBucket + - s3:PutObject + - s3:List* + - s3:Get* + - s3:GetObjectVersion + - s3:PutBucketPublicAccessBlock + - s3:PutBucketTagging + - s3:DeleteObject + - s3:DeleteObjectVersion + Resource: + - arn:aws:s3:::eks-workshop-canary-artifacts* + - arn:aws:s3:::aws-synthetics-library* + - Effect: Allow + Action: + - fis:CreateExperimentTemplate + - fis:CreateExperimentTemplate + - fis:GetExperimentTemplate + - fis:ListExperimentTemplates + - fis:DeleteExperimentTemplate + - fis:UpdateExperimentTemplate + - fis:TagResource + - fis:UntagResource + - fis:StartExperiment + - fis:GetExperiment + - fis:ListExperiments + Resource: + - !Sub arn:aws:fis:${AWS::Region}:${AWS::AccountId}:action/aws:eks:* + - !Sub arn:aws:fis:${AWS::Region}:${AWS::AccountId}:action/aws:ssm:* + - !Sub arn:aws:fis:${AWS::Region}:${AWS::AccountId}:experiment-template/* + - !Sub arn:aws:fis:${AWS::Region}:${AWS::AccountId}:experiment/* + + - Effect: Allow + Action: + - synthetics:CreateCanary + - synthetics:DeleteCanary + - synthetics:DescribeCanaries + - synthetics:StartCanary + - synthetics:StopCanary + - synthetics:UpdateCanary + Resource: + - !Sub arn:aws:synthetics:${AWS::Region}:${AWS::AccountId}:canary:${Env}* + - Effect: Allow + Action: + - cloudwatch:PutMetricAlarm + - cloudwatch:PutMetricData + - cloudwatch:GetMetricStatistics + - cloudwatch:ListMetrics + Resource: + - !Sub 
arn:aws:cloudwatch:${AWS::Region}:${AWS::AccountId}:alarm:${Env}* + - Effect: Allow + Action: + - lambda:CreateFunction + - lambda:UpdateFunctionCode + - lambda:GetFunctionConfiguration + - lambda:UpdateFunctionConfiguration + - lambda:GetFunction + - lambda:DeleteFunction + - lambda:InvokeFunction + - lambda:AddPermission + - lambda:RemovePermission + - lambda:PublishLayerVersion + - lambda:PublishVersion + Resource: + - !Sub arn:aws:lambda:${AWS::Region}:${AWS::AccountId}:function:*${Env}* + - !Sub arn:aws:lambda:${AWS::Region}:${AWS::AccountId}:layer:*${Env}* diff --git a/manifests/.workshop/terraform/base.tf b/manifests/.workshop/terraform/base.tf index 6e0be63d4..d05b67020 100644 --- a/manifests/.workshop/terraform/base.tf +++ b/manifests/.workshop/terraform/base.tf @@ -4,7 +4,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "5.66.0" + version = "5.72.0" } kubernetes = { source = "hashicorp/kubernetes" diff --git a/manifests/modules/observability/resiliency/.workshop/cleanup.sh b/manifests/modules/observability/resiliency/.workshop/cleanup.sh new file mode 100755 index 000000000..1bb63ce1e --- /dev/null +++ b/manifests/modules/observability/resiliency/.workshop/cleanup.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +set -e + +echo "Starting cleanup process..." + +# Function to safely delete a resource +safe_delete() { + local cmd=$1 + local resource=$2 + echo "Attempting to delete $resource..." + if $cmd 2>/dev/null; then + echo "$resource deleted successfully." + else + echo "Failed to delete $resource or it doesn't exist. Continuing..." + fi +} + +# Delete Kubernetes resources +echo "Cleaning up Kubernetes resources..." +kubectl delete ingress,deployment,service -n ui --all --ignore-not-found +kubectl delete role,rolebinding -n ui --all --ignore-not-found +kubectl delete namespace chaos-mesh --ignore-not-found + +# Uninstall Helm charts +echo "Uninstalling Helm charts..." +helm uninstall aws-load-balancer-controller -n kube-system || true +helm uninstall chaos-mesh -n chaos-mesh || true + +# Delete ALBs +echo "Cleaning up ALBs..." +for alb_arn in $(aws elbv2 describe-load-balancers --query "LoadBalancers[?starts_with(LoadBalancerName, 'k8s-ui-ui-') || starts_with(LoadBalancerName, 'k8s-default-ui-')].LoadBalancerArn" --output text); do + safe_delete "aws elbv2 delete-load-balancer --load-balancer-arn $alb_arn" "ALB $alb_arn" +done + +# Delete IAM Roles and Policies +echo "Cleaning up IAM roles and policies..." 
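+# IAM requires managed policies to be detached and inline policies to be deleted
+# before a role itself can be removed, so each role below is processed in that order.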
+for role_prefix in "fis-execution-role-eks-workshop" "canary-execution-role-eks-workshop"; do + for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${role_prefix}')].RoleName" --output text); do + echo "Processing role: $role" + for policy in $(aws iam list-attached-role-policies --role-name $role --query "AttachedPolicies[*].PolicyArn" --output text); do + safe_delete "aws iam detach-role-policy --role-name $role --policy-arn $policy" "attached policy $policy from role $role" + done + for policy in $(aws iam list-role-policies --role-name $role --query "PolicyNames" --output text); do + safe_delete "aws iam delete-role-policy --role-name $role --policy-name $policy" "inline policy $policy from role $role" + done + safe_delete "aws iam delete-role --role-name $role" "IAM role $role" + done +done + +for policy_prefix in "eks-resiliency-fis-policy" "eks-resiliency-canary-policy"; do + for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${policy_prefix}')].Arn" --output text); do + safe_delete "aws iam delete-policy --policy-arn $policy_arn" "IAM policy $policy_arn" + done +done + +# Delete S3 buckets +echo "Cleaning up S3 buckets..." +for bucket in $(aws s3api list-buckets --query "Buckets[?starts_with(Name, 'eks-workshop-canary-artifacts-')].Name" --output text); do + aws s3 rm s3://$bucket --recursive + safe_delete "aws s3api delete-bucket --bucket $bucket" "S3 bucket $bucket" +done + +# Delete CloudWatch Synthetics canary and alarm +CANARY_NAME="eks-workshop-canary" +ALARM_NAME="eks-workshop-canary-alarm" + +echo "Cleaning up CloudWatch Synthetics canary and alarm..." +if aws synthetics get-canary --name $CANARY_NAME &>/dev/null; then + aws synthetics stop-canary --name $CANARY_NAME || true + sleep 30 + safe_delete "aws synthetics delete-canary --name $CANARY_NAME" "CloudWatch Synthetics canary $CANARY_NAME" +fi + +safe_delete "aws cloudwatch delete-alarms --alarm-names $ALARM_NAME" "CloudWatch alarm $ALARM_NAME" + +echo "Cleanup process completed. Please check for any remaining resources manually." 
\ No newline at end of file diff --git a/manifests/modules/observability/resiliency/.workshop/terraform/main.tf b/manifests/modules/observability/resiliency/.workshop/terraform/main.tf new file mode 100644 index 000000000..30bfaf442 --- /dev/null +++ b/manifests/modules/observability/resiliency/.workshop/terraform/main.tf @@ -0,0 +1,404 @@ +module "eks_blueprints_addons" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "1.16.3" + + cluster_name = var.addon_context.eks_cluster_id + cluster_endpoint = var.addon_context.aws_eks_cluster_endpoint + cluster_version = var.eks_cluster_version + oidc_provider_arn = var.addon_context.eks_oidc_provider_arn + + enable_aws_load_balancer_controller = true + aws_load_balancer_controller = { + wait = true + role_name = "${var.addon_context.eks_cluster_id}-alb-controller" + policy_name = "${var.addon_context.eks_cluster_id}-alb-controller" + } + create_kubernetes_resources = false + +} + + +# ALB creation +resource "kubernetes_manifest" "ui_alb" { + manifest = { + "apiVersion" = "networking.k8s.io/v1" + "kind" = "Ingress" + "metadata" = { + "name" = "ui" + "namespace" = "ui" + "annotations" = { + "alb.ingress.kubernetes.io/scheme" = "internet-facing" + "alb.ingress.kubernetes.io/target-type" = "ip" + "alb.ingress.kubernetes.io/healthcheck-path" = "/actuator/health/liveness" + } + } + "spec" = { + ingressClassName = "alb", + "rules" = [{ + "http" = { + paths = [{ + path = "/" + pathType = "Prefix" + "backend" = { + service = { + name = "ui" + port = { + number = 80 + } + } + } + }] + } + }] + } + } +} + +# Create RBAC and Rolebinding +resource "kubernetes_role" "chaos_mesh_role" { + metadata { + name = "chaos-mesh-role" + namespace = "ui" + } + + rule { + api_groups = ["chaos-mesh.org"] + resources = ["podchaos"] + verbs = ["create", "delete", "get", "list", "patch", "update", "watch"] + } + + rule { + api_groups = [""] + resources = ["pods"] + verbs = ["get", "list", "watch"] + } +} + +data "aws_caller_identity" "current" {} + +resource "kubernetes_role_binding" "chaos_mesh_rolebinding" { + metadata { + name = "chaos-mesh-rolebinding" + namespace = "ui" + } + + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "Role" + name = kubernetes_role.chaos_mesh_role.metadata[0].name + } + + subject { + kind = "User" + name = data.aws_caller_identity.current.arn + namespace = "ui" + } +} + +# Add AWS Load Balancer controller +resource "helm_release" "aws_load_balancer_controller" { + name = "aws-load-balancer-controller" + repository = "https://aws.github.io/eks-charts" + chart = "aws-load-balancer-controller" + namespace = "kube-system" + version = var.load_balancer_controller_chart_version + + set { + name = "clusterName" + value = var.addon_context.eks_cluster_id + } + + set { + name = "serviceAccount.name" + value = "aws-load-balancer-controller-sa" + } + + set { + name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn + } +} + + +# Chaos Mesh Helm Release +#resource "helm_release" "chaos_mesh" { +# name = "chaos-mesh" +# repository = "https://charts.chaos-mesh.org" +# chart = "chaos-mesh" +# namespace = "chaos-mesh" +# version = "2.5.1" +# +# create_namespace = true +#} + +# FIS IAM role +resource "random_id" "suffix" { + byte_length = 8 +} + +resource "aws_iam_role" "fis_role" { + name = "${var.addon_context.eks_cluster_id}-fis_role-${random_id.suffix.hex}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + 
Effect = "Allow" + Principal = { + Service = [ + "fis.amazonaws.com" + ] + } + Action = "sts:AssumeRole" + }, + { + Effect = "Allow" + Principal = { + Federated = var.addon_context.eks_oidc_provider_arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${trimprefix(var.addon_context.eks_oidc_provider_arn, "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/")}:sub" = [ + "system:serviceaccount:ui:chaos-mesh-sa" + ] + } + } + }, + { + Effect = "Allow" + Principal = { + Service = "ssm.amazonaws.com" + } + Action = "sts:AssumeRole" + } + ] + }) + + lifecycle { + create_before_destroy = true + } + + depends_on = [kubernetes_role_binding.chaos_mesh_rolebinding] +} + +# Attach FIS Access Policy +resource "aws_iam_role_policy_attachment" "fis_eks_access" { + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEKSAccess" + role = aws_iam_role.fis_role.name +} + +resource "aws_iam_role_policy_attachment" "fis_network_access" { + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorNetworkAccess" + role = aws_iam_role.fis_role.name +} + +# Attach to FIS for EKS node group +resource "aws_iam_role_policy_attachment" "fis_node_policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + role = aws_iam_role.fis_role.name +} + +resource "aws_iam_role_policy_attachment" "fis_ecr_policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + role = aws_iam_role.fis_role.name +} + +resource "aws_iam_role_policy_attachment" "fis_cni_policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" + role = aws_iam_role.fis_role.name +} + +# Policy for creating FIS experiment templates +resource "aws_iam_policy" "eks_resiliency_fis_policy" { + name = "${var.addon_context.eks_cluster_id}-resiliency_fis_policy-${random_id.suffix.hex}" + path = "/" + description = "Custom policy for EKS resiliency FIS experiments" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + # FIS + "fis:CreateExperimentTemplate", + "fis:GetExperimentTemplate", + "fis:ListExperimentTemplates", + "fis:DeleteExperimentTemplate", + "fis:UpdateExperimentTemplate", + "fis:TagResource", + "fis:UntagResource", + "fis:StartExperiment", + "fis:GetExperiment", + "fis:ListExperiments", + "ec2:DescribeInstances", + "ec2:DescribeInstanceStatus", + "ec2:TerminateInstances", + "ec2:StartInstances", + "ec2:StopInstances", + "eks:DescribeCluster", + "eks:ListNodegroups", + "eks:DescribeNodegroup", + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:SetDesiredCapacity", + "autoscaling:SuspendProcesses", + "autoscaling:ResumeProcesses", + "logs:CreateLogDelivery", + "logs:GetLogDelivery", + "logs:UpdateLogDelivery", + "logs:DeleteLogDelivery", + "logs:ListLogDeliveries", + "ssm:StartAutomationExecution", + "ssm:GetAutomationExecution", + "cloudwatch:DescribeAlarms", + "cloudwatch:GetMetricData", + "iam:PassRole" + ] + Resource = "*" + }, + { + Effect = "Allow" + Action = "iam:PassRole" + Resource = aws_iam_role.fis_role.arn + } + ] + }) +} + +# Attach custom policy to the role +resource "aws_iam_role_policy_attachment" "eks_resiliency_fis_policy_attachment" { + policy_arn = aws_iam_policy.eks_resiliency_fis_policy.arn + role = aws_iam_role.fis_role.name +} + + +# Canary IAM role +resource "aws_iam_role" "canary_role" { + name = "${var.addon_context.eks_cluster_id}-canary_role-${random_id.suffix.hex}" 
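+  # Both synthetics.amazonaws.com and lambda.amazonaws.com are trusted in the policy
+  # below because a CloudWatch Synthetics canary executes as a Lambda function.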
+ + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = [ + "lambda.amazonaws.com", + "synthetics.amazonaws.com" + ] + } + Action = "sts:AssumeRole" + } + ] + }) + + lifecycle { + create_before_destroy = true + } +} + +# Attach Lambda Basic Execution Role to Canary role +resource "aws_iam_role_policy_attachment" "canary_lambda_basic_execution" { + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" + role = aws_iam_role.canary_role.name +} + +# Policy for Canary +resource "aws_iam_policy" "eks_resiliency_canary_policy" { + name = "${var.addon_context.eks_cluster_id}-resiliency_canary_policy-${random_id.suffix.hex}" + path = "/" + description = "Custom policy for EKS resiliency Canary" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "synthetics:CreateCanary", + "synthetics:DeleteCanary", + "synthetics:DescribeCanaries", + "synthetics:StartCanary", + "synthetics:StopCanary", + "synthetics:UpdateCanary", + "s3:PutObject", + "s3:GetBucketLocation", + "s3:ListAllMyBuckets", + "s3:GetObject", + "s3:ListBucket", + "cloudwatch:PutMetricData", + "cloudwatch:GetMetricStatistics", + "cloudwatch:ListMetrics", + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents", + "logs:DescribeLogGroups", + "logs:DescribeLogStreams", + "lambda:CreateFunction", + "lambda:UpdateFunctionCode", + "lambda:UpdateFunctionConfiguration", + "lambda:GetFunction", + "lambda:DeleteFunction", + "lambda:InvokeFunction", + "lambda:AddPermission", + "lambda:RemovePermission", + "lambda:PublishLayerVersion", + "lambda:PublishVersion", + "iam:PassRole" + ] + Resource = "*" + } + ] + }) +} + +# Attach custom policy to the Canary role +resource "aws_iam_role_policy_attachment" "eks_resiliency_canary_policy_attachment" { + policy_arn = aws_iam_policy.eks_resiliency_canary_policy.arn + role = aws_iam_role.canary_role.name +} + +# EKS Cluster IAM Role +resource "aws_iam_role" "eks_cluster_role" { + name = "eks-workshop-cluster-role-${var.addon_context.eks_cluster_id}-${random_id.suffix.hex}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = "eks.amazonaws.com" + } + Action = "sts:AssumeRole" + } + ] + }) + + lifecycle { + create_before_destroy = true + } +} + +# Attach required policies to EKS Cluster role +resource "aws_iam_role_policy_attachment" "eks_cluster_policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy" + role = aws_iam_role.eks_cluster_role.name +} + +resource "aws_iam_role_policy_attachment" "eks_vpc_resource_controller" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController" + role = aws_iam_role.eks_cluster_role.name +} + +# Executable Scripts +resource "null_resource" "chmod_all_scripts_bash" { + provisioner "local-exec" { + command = "find ${var.script_dir} -type f -exec chmod +x {} + || true" + } +} + +# Add Region terraform +data "aws_region" "current" {} \ No newline at end of file diff --git a/manifests/modules/observability/resiliency/.workshop/terraform/outputs.tf b/manifests/modules/observability/resiliency/.workshop/terraform/outputs.tf new file mode 100644 index 000000000..45313ce78 --- /dev/null +++ b/manifests/modules/observability/resiliency/.workshop/terraform/outputs.tf @@ -0,0 +1,13 @@ +output "environment_variables" { + description = "Environment variables to be added to the IDE shell" + value = { + 
LBC_CHART_VERSION = var.load_balancer_controller_chart_version + LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn + FIS_ROLE_ARN = aws_iam_role.fis_role.arn + RANDOM_SUFFIX = random_id.suffix.hex + SCRIPT_DIR = var.script_dir + CANARY_ROLE_ARN = aws_iam_role.canary_role.arn + EKS_CLUSTER_ROLE_ARN = aws_iam_role.eks_cluster_role.arn + AWS_REGION = data.aws_region.current.name + } +} \ No newline at end of file diff --git a/manifests/modules/observability/resiliency/.workshop/terraform/vars.tf b/manifests/modules/observability/resiliency/.workshop/terraform/vars.tf new file mode 100644 index 000000000..ee4b68ade --- /dev/null +++ b/manifests/modules/observability/resiliency/.workshop/terraform/vars.tf @@ -0,0 +1,49 @@ +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_id" { + description = "EKS cluster name" + type = string +} + +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_version" { + description = "EKS cluster version" + type = string +} + +# tflint-ignore: terraform_unused_declarations +variable "cluster_security_group_id" { + description = "EKS cluster security group ID" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "addon_context" { + description = "Addon context that can be passed directly to blueprints addon modules" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "tags" { + description = "Tags to apply to AWS resources" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "resources_precreated" { + description = "Have expensive resources been created already" + type = bool +} + +variable "load_balancer_controller_chart_version" { + description = "The chart version of aws-load-balancer-controller to use" + type = string + # renovate-helm: depName=aws-load-balancer-controller + default = "1.8.1" +} + +# Executable Scripts +variable "script_dir" { + description = "Directory where scripts are located" + type = string + default = "environment/eks-workshop/modules/observability/resiliency/scripts" +} \ No newline at end of file diff --git a/manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml b/manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml new file mode 100644 index 000000000..deae0ee7f --- /dev/null +++ b/manifests/modules/observability/resiliency/high-availability/config/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../../../../../manifests/base-application/ui + +patches: + - path: scale_and_affinity_patch.yaml diff --git a/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml b/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml new file mode 100644 index 000000000..c4bffa2ec --- /dev/null +++ b/manifests/modules/observability/resiliency/high-availability/config/scale_and_affinity_patch.yaml @@ -0,0 +1,28 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ui + namespace: ui +spec: + replicas: 5 + selector: + matchLabels: + app: ui + template: + metadata: + labels: + app: ui + spec: + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: ui + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: ui diff 
--git a/manifests/modules/observability/resiliency/scripts/AZ-verify-clusters.sh b/manifests/modules/observability/resiliency/scripts/AZ-verify-clusters.sh new file mode 100755 index 000000000..a136332b2 --- /dev/null +++ b/manifests/modules/observability/resiliency/scripts/AZ-verify-clusters.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# verify-cluster.sh - Verifies cluster state and corrects replica count + +DESIRED_REPLICAS=9 +MAX_WAIT_TIME=300 # 5 minutes +POLL_INTERVAL=10 # 10 seconds +NAMESPACE="ui" +EXPECTED_READY_NODES=6 + +print_header() { + echo -e "\n==== $1 ====\n" +} + +wait_for_condition() { + local end_time=$((SECONDS + MAX_WAIT_TIME)) + while [ $SECONDS -lt $end_time ]; do + if eval "$1"; then + return 0 + fi + echo -n "." + sleep $POLL_INTERVAL + done + echo " Timeout!" + return 1 +} + +print_header "Checking Current Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +print_header "Waiting for nodes to be Ready" +total_nodes=$(kubectl get nodes --no-headers | wc -l) +echo "Total nodes in the cluster: $total_nodes" +echo "Waiting for $EXPECTED_READY_NODES nodes to be in Ready state" +if wait_for_condition "[ \$(kubectl get nodes --no-headers | grep ' Ready ' | wc -l) -eq $EXPECTED_READY_NODES ]"; then + echo -e "\n✅ $EXPECTED_READY_NODES nodes are in Ready state." +else + echo -e "\n⚠️ Warning: $EXPECTED_READY_NODES nodes did not reach Ready state within the timeout period." + exit 1 +fi + +print_header "Checking Current Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +print_header "Node Information" +kubectl get nodes -o wide + +print_header "Verifying Cluster State" +node_count=$(kubectl get nodes --no-headers | grep " Ready " | grep -vc "SchedulingDisabled") +current_pod_count=$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep -v Terminating | wc -l) + +echo "Ready and schedulable nodes: $node_count" +echo "Current active ui pods: $current_pod_count" +echo "Desired ui pods: $DESIRED_REPLICAS" + +if [ $current_pod_count -ne $DESIRED_REPLICAS ]; then + print_header "Adjusting Replica Count" + echo "Scaling deployment to $DESIRED_REPLICAS replicas..." + kubectl scale deployment ui -n $NAMESPACE --replicas=$DESIRED_REPLICAS + + echo -n "Waiting for pod count to stabilize" + if wait_for_condition "[ \$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep -v Terminating | wc -l) -eq $DESIRED_REPLICAS ]"; then + echo -e "\n✅ Pod count has reached the desired number." + else + echo -e "\n⚠️ Warning: Failed to reach desired pod count within the timeout period." + fi +else + echo "✅ Number of replicas is correct." +fi + +print_header "Checking Pod Distribution" +if [ $node_count -gt 0 ]; then + max_pods_per_node=$((DESIRED_REPLICAS / node_count + 1)) + uneven_distribution=false + + for node in $(kubectl get nodes -o name | grep -v "SchedulingDisabled"); do + pods_on_node=$(kubectl get pods -n $NAMESPACE -l app=ui --field-selector spec.nodeName=${node#node/} --no-headers | grep -v Terminating | wc -l) + if [ $pods_on_node -gt $max_pods_per_node ]; then + uneven_distribution=true + break + fi + done + + if $uneven_distribution; then + echo "⚠️ Pod distribution is uneven. Rebalancing..." + kubectl scale deployment ui -n $NAMESPACE --replicas=0 + sleep $POLL_INTERVAL + kubectl scale deployment ui -n $NAMESPACE --replicas=$DESIRED_REPLICAS + + echo -n "Waiting for pods to be ready" + if wait_for_condition "[ \$(kubectl get pods -n $NAMESPACE -l app=ui --no-headers | grep Running | wc -l) -eq $DESIRED_REPLICAS ]"; then + echo -e "\n✅ Pods are ready and balanced." 
+ else + echo -e "\n⚠️ Warning: Pods did not reach ready state within the timeout period." + fi + else + echo "✅ Pod distribution is balanced." + fi +else + echo "⚠️ Warning: No Ready and schedulable nodes found. Cannot check pod distribution." +fi + +print_header "Final Pod Distribution" +$SCRIPT_DIR/get-pods-by-az.sh + +echo +if [ $node_count -gt 0 ] && [ $current_pod_count -eq $DESIRED_REPLICAS ]; then + echo "✅ Cluster verification and correction complete." +else + echo "⚠️ Cluster verification complete, but some issues may require attention." +fi \ No newline at end of file diff --git a/manifests/modules/observability/resiliency/scripts/create-blueprint.sh b/manifests/modules/observability/resiliency/scripts/create-blueprint.sh new file mode 100755 index 000000000..4f8ab5112 --- /dev/null +++ b/manifests/modules/observability/resiliency/scripts/create-blueprint.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +# Get Ingress URL +INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + +# Create the required directory structure +mkdir -p nodejs/node_modules + +# Create the Node.js canary script with heartbeat blueprint +cat << EOF > nodejs/node_modules/canary.js +const { URL } = require('url'); +const synthetics = require('Synthetics'); +const log = require('SyntheticsLogger'); +const syntheticsConfiguration = synthetics.getConfiguration(); +const syntheticsLogHelper = require('SyntheticsLogHelper'); + +const loadBlueprint = async function () { + const urls = ['http://${INGRESS_URL}']; + + // Set screenshot option + const takeScreenshot = true; + + // Configure synthetics settings + syntheticsConfiguration.disableStepScreenshots(); + syntheticsConfiguration.setConfig({ + continueOnStepFailure: true, + includeRequestHeaders: true, + includeResponseHeaders: true, + restrictedHeaders: [], + restrictedUrlParameters: [] + }); + + let page = await synthetics.getPage(); + + for (const url of urls) { + await loadUrl(page, url, takeScreenshot); + } +}; + +// Reset the page in-between +const resetPage = async function(page) { + try { + await page.goto('about:blank', {waitUntil: ['load', 'networkidle0'], timeout: 30000}); + } catch (e) { + synthetics.addExecutionError('Unable to open a blank page. ', e); + } +}; + +const loadUrl = async function (page, url, takeScreenshot) { + let stepName = null; + let domcontentloaded = false; + + try { + stepName = new URL(url).hostname; + } catch (e) { + const errorString = \`Error parsing url: \${url}. \${e}\`; + log.error(errorString); + throw e; + } + + await synthetics.executeStep(stepName, async function () { + const sanitizedUrl = syntheticsLogHelper.getSanitizedUrl(url); + + const response = await page.goto(url, { waitUntil: ['domcontentloaded'], timeout: 30000}); + if (response) { + domcontentloaded = true; + const status = response.status(); + const statusText = response.statusText(); + + logResponseString = \`Response from url: \${sanitizedUrl} Status: \${status} Status Text: \${statusText}\`; + + if (response.status() < 200 || response.status() > 299) { + throw new Error(\`Failed to load url: \${sanitizedUrl} \${response.status()} \${response.statusText()}\`); + } + } else { + const logNoResponseString = \`No response returned for url: \${sanitizedUrl}\`; + log.error(logNoResponseString); + throw new Error(logNoResponseString); + } + }); + + // Wait for 15 seconds to let page load fully before taking screenshot. 
+ if (domcontentloaded && takeScreenshot) { + await new Promise(r => setTimeout(r, 15000)); + await synthetics.takeScreenshot(stepName, 'loaded'); + } + + // Reset page + await resetPage(page); +}; + +exports.handler = async () => { + return await loadBlueprint(); +}; +EOF + +# Zip the Node.js script +python3 - << EOL +import zipfile +with zipfile.ZipFile('canary.zip', 'w') as zipf: + zipf.write('nodejs/node_modules/canary.js', arcname='nodejs/node_modules/canary.js') +EOL + +# Ensure BUCKET_NAME is set +if [ -z "$BUCKET_NAME" ]; then + echo "Error: BUCKET_NAME environment variable is not set." + exit 1 +fi + +# Upload the zipped canary script to S3 +aws s3 cp canary.zip "s3://${BUCKET_NAME}/canary-scripts/canary.zip" + +echo "Canary script has been zipped and uploaded to s3://${BUCKET_NAME}/canary-scripts/canary.zip" +echo "The script is configured to check the URL: http://${INGRESS_URL}" diff --git a/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh b/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh new file mode 100755 index 000000000..9b73c21d8 --- /dev/null +++ b/manifests/modules/observability/resiliency/scripts/get-pods-by-az.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Modified from "Disaster recovery, high availability, and resiliency on Amazon EKS" +# https://catalog.us-east-1.prod.workshops.aws/workshops/6140457f-53b2-48b8-a007-2d4be06ba2fc + +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +CURRENT_CONTEXT=$(kubectl config current-context) +REGION=$(kubectl config view -o jsonpath="{.contexts[?(@.name == \"$CURRENT_CONTEXT\")].context.cluster}" | cut -d : -f 4) + +# Function to clear the screen and move cursor to top-left +clear_screen() { + echo -e "\033[2J\033[H" +} + +# Function to generate the output +generate_output() { + echo -e '\n\n\n' + for az in a b c + do + AZ=$REGION$az + echo -n "------" + echo -n -e "${GREEN}$AZ${NC}" + echo "------" + for node in $(kubectl get nodes -l topology.kubernetes.io/zone=$AZ --no-headers | grep -v NotReady | cut -d " " -f1) + do + echo -e " ${RED}$node:${NC}" + kubectl get pods -n ui --no-headers --field-selector spec.nodeName=${node} 2>&1 | while read line; do echo " ${line}"; done + done + echo "" + done + echo -e '\n\n\n' +} + +# Initial clear screen +# clear_screen + +# Main loop +while true; do + # Generate output to a temporary file + generate_output > temp_output.txt + + #generate_output + # Clear screen and display the new output + # clear_screen + cat temp_output.txt + # clear_screen + + # Wait before next update + sleep 1 +done diff --git a/manifests/modules/observability/resiliency/scripts/node-failure.sh b/manifests/modules/observability/resiliency/scripts/node-failure.sh new file mode 100755 index 000000000..80d3fc3b9 --- /dev/null +++ b/manifests/modules/observability/resiliency/scripts/node-failure.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# node-failure.sh - Simulates node failure by stopping an EC2 instance with running pods + +# Get a list of nodes with running pods +node_with_pods=$(kubectl get pods --all-namespaces -o wide | awk 'NR>1 {print $8}' | sort | uniq) + +if [ -z "$node_with_pods" ]; then + echo "No nodes with running pods found. 
Please run this script: $SCRIPT_DIR/verify-cluster.sh" + exit 1 +fi + +# Select a random node from the list +selected_node=$(echo "$node_with_pods" | shuf -n 1) + +# Get the EC2 instance ID for the selected node +instance_id=$(aws ec2 describe-instances \ + --filters "Name=private-dns-name,Values=$selected_node" \ + --query "Reservations[*].Instances[*].InstanceId" \ + --output text) + +# Stop the instance to simulate a node failure +echo "Stopping instance: $instance_id (Node: $selected_node)" +aws ec2 stop-instances --instance-ids $instance_id + +echo "Instance $instance_id is being stopped. Monitoring pod distribution..." diff --git a/manifests/modules/observability/resiliency/scripts/pod-failure.sh b/manifests/modules/observability/resiliency/scripts/pod-failure.sh new file mode 100755 index 000000000..fd7ea7b49 --- /dev/null +++ b/manifests/modules/observability/resiliency/scripts/pod-failure.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# pod-failure.sh - Simulates pod failure using Chaos Mesh + +# Generates a unique identifier for the pod failure experiment +unique_id=$(date +%s) + +# Create a YAML configuration for the PodChaos resource +kubectl apply -f - < +ui-6dfb84cf67-6d5lq 1/1 Running 0 46s 10.42.121.36 ip-10-42-119-94.us-west-2.compute.internal +ui-6dfb84cf67-hqccq 1/1 Running 0 46s 10.42.154.216 ip-10-42-146-130.us-west-2.compute.internal +ui-6dfb84cf67-qqltz 1/1 Running 0 46s 10.42.185.149 ip-10-42-176-213.us-west-2.compute.internal +ui-6dfb84cf67-rzbvl 1/1 Running 0 46s 10.42.188.96 ip-10-42-176-213.us-west-2.compute.internal +``` + +Note that all pods have similar start times (shown in the AGE column). + +### Step 2: Simulate Pod Failure + +Now, let's simulate a pod failure: + +```bash +$ ~/$SCRIPT_DIR/pod-failure.sh +``` + +This script will use Chaos Mesh to terminate one of the pods. + +### Step 3: Observe Recovery + +Wait for a couple of seconds to allow Kubernetes to detect the failure and initiate recovery. Then, check the pod status again: + +```bash timeout=5 +$ kubectl get pods -n ui -o wide +``` + +You should now see output similar to this: + +```text +NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES +ui-6dfb84cf67-44hc9 1/1 Running 0 2m57s 10.42.121.37 ip-10-42-119-94.us-west-2.compute.internal +ui-6dfb84cf67-6d5lq 1/1 Running 0 2m57s 10.42.121.36 ip-10-42-119-94.us-west-2.compute.internal +ui-6dfb84cf67-ghp5z 1/1 Running 0 6s 10.42.185.150 ip-10-42-176-213.us-west-2.compute.internal +ui-6dfb84cf67-hqccq 1/1 Running 0 2m57s 10.42.154.216 ip-10-42-146-130.us-west-2.compute.internal +ui-6dfb84cf67-rzbvl 1/1 Running 0 2m57s 10.42.188.96 ip-10-42-176-213.us-west-2.compute.internal +[ec2-user@bc44085aafa9 environment]$ +``` + +Notice that one of the pods (in this example, `ui-6dfb84cf67-ghp5z`) has a much lower AGE value. This is the pod that Kubernetes automatically created to replace the one that was terminated by our simulation. + +This will show you the status, IP addresses, and nodes for each pod in the `ui` namespace. + +## Verify Retail Store Availability + +An essential aspect of this experiment is to ensure that your retail store application remains operational throughout the pod failure and recovery process. To verify the availability of the retail store, use the following command to fetch and access the store's URL: + +```bash timeout=900 +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... 
+You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com +``` + +Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated pod failure. + +## Conclusion + +This pod failure simulation demonstrates the resilience of your Kubernetes-based application. By intentionally causing a pod to fail, you can observe: + +1. The system's ability to detect failures quickly +2. Kubernetes' automatic rescheduling and recovery of Deployments or StatefulSets failed pods. +3. The application's continued availability during pod failures + +Remember that the retail store should remain operational even when a pod fails, showcasing the high availability and fault tolerance of your Kubernetes setup. This experiment helps validate your application's resilience and can be repeated as needed to ensure consistent behavior across different scenarios or after making changes to your infrastructure. + +By regularly performing such chaos engineering experiments, you can build confidence in your system's ability to withstand and recover from various types of failures, ultimately leading to a more robust and reliable application. diff --git a/website/docs/observability/high-availability/03-node-failure-no-fis.md b/website/docs/observability/high-availability/03-node-failure-no-fis.md new file mode 100644 index 000000000..71aa50ba3 --- /dev/null +++ b/website/docs/observability/high-availability/03-node-failure-no-fis.md @@ -0,0 +1,130 @@ +--- +title: "Simulating Node Failure without FIS" +sidebar_position: 130 +description: "Manually simulate a node failure in your Kubernetes environment to test the resilience of your applications without using AWS FIS." +--- + +## Overview + +This experiment simulates a node failure manually in your Kubernetes cluster to understand the impact on your deployed applications, particularly focusing on the retail store application's availability. By deliberately causing a node to fail, we can observe how Kubernetes handles the failure and maintains the overall health of the cluster. + +The `node-failure.sh` script will manually stop an EC2 instance to simulate node failure. Here is the script we will use: + +```file +manifests/modules/observability/resiliency/scripts/node-failure.sh +``` + +It's important to note that this experiment is repeatable, allowing you to run it multiple times to ensure consistent behavior and to test various scenarios or configurations. + +## Running the Experiment + +To simulate the node failure and monitor its effects, run the following command: + +```bash timeout=240 +$ ~/$SCRIPT_DIR/node-failure.sh && timeout 180s ~/$SCRIPT_DIR/get-pods-by-az.sh + +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-dsp55 1/1 Running 0 10m + ui-6dfb84cf67-gzd9s 1/1 Running 0 8m19s + +------us-west-2b------ + ip-10-42-133-195.us-west-2.compute.internal: + No resources found in ui namespace. + +------us-west-2c------ + ip-10-42-186-246.us-west-2.compute.internal: + ui-6dfb84cf67-4bmjm 1/1 Running 0 44s + ui-6dfb84cf67-n8x4f 1/1 Running 0 10m + ui-6dfb84cf67-wljth 1/1 Running 0 10m +``` + +This command will stop the selected EC2 instance and monitor the pod distribution for 2 minutes, observing how the system redistributes workloads. + +During the experiment, you should observe the following sequence of events: + +1. After about 1 minute, you'll see one node disappear from the list. This represents the simulated node failure. +2. 
Shortly after the node failure, you'll notice pods being redistributed to the remaining healthy nodes. Kubernetes detects the node failure and automatically reschedules the affected pods. +3. Approximately 2 minutes after the initial failure, the failed node will come back online. + +Throughout this process, the total number of running pods should remain constant, ensuring application availability. + +## Verifying Cluster Recovery + +While waiting for the node to finish coming back online, we will verify the cluster's self-healing capabilities and potentially recycle pods again if necessary. Since the cluster often recovers on its own, we'll focus on checking the current state and ensuring an optimal distribution of pods across AZs. + +First let's ensure all nodes are in the `Ready` state: + +```bash timeout=300 +$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done +``` + +This command counts the total number of nodes in the `Ready` state and continuously checks until all 3 active nodes are ready. + +Once all nodes are ready, we'll redeploy the pods to ensure they are balanced across the nodes: + +```bash timeout=900 wait=60 +$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql +$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql +$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb +$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis +$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service +$ sleep 180 +$ kubectl rollout status -n ui deployment/ui --timeout 180s +$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30 +``` + +These commands perform the following actions: + +1. Delete the existing ui pods. +2. Wait for ui pods to be provisioned automatically. +3. Use the `get-pods-by-az.sh` script to check the distribution of pods across availability zones. + +## Verify Retail Store Availability + +After simulating the node failure, we can verify that the retail store application remains accessible. Use the following command to check its availability: + +```bash timeout=900 +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... +You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com +``` + +This command retrieves the load balancer hostname for the ingress and waits for it to become available. 
Once ready, you can access the retail store through this URL to confirm that it's still functioning correctly despite the simulated node failure.
+
+:::caution
+The retail URL may take 10 minutes to become operational. You can optionally continue with the lab by pressing `ctrl` + `z` to suspend the `wait-for-lb` command. To bring it back to the foreground, input:
+
+```bash test=false
+$ fg %1
+```
+
+The URL may not become operational by the time `wait-for-lb` times out. In that case, it should become operational after running the command again:
+
+```bash test=false
+$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
+```
+
+:::
+
+## Conclusion
+
+This node failure simulation demonstrates the robustness and self-healing capabilities of your Kubernetes cluster. Key observations and lessons from this experiment include:
+
+1. Kubernetes' ability to quickly detect node failures and respond accordingly.
+2. The automatic rescheduling of pods from the failed node to healthy nodes, ensuring continuity of service.
+3. The EKS cluster's self-healing process, using the EKS managed node group, brings the failed node back online after a short period.
+4. The importance of proper resource allocation and pod distribution to maintain application availability during node failures.
+
+By regularly performing such experiments, you can:
+
+- Validate your cluster's resilience to node failures.
+- Identify potential weaknesses in your application's architecture or deployment strategy.
+- Gain confidence in your system's ability to handle unexpected infrastructure issues.
+- Refine your incident response procedures and automation.
diff --git a/website/docs/observability/high-availability/04-node-failure-partial-fis.md b/website/docs/observability/high-availability/04-node-failure-partial-fis.md
new file mode 100644
index 000000000..96b6d80d1
--- /dev/null
+++ b/website/docs/observability/high-availability/04-node-failure-partial-fis.md
@@ -0,0 +1,148 @@
+---
+title: "Simulating Partial Node Failure with FIS"
+sidebar_position: 150
+description: "Simulate partial node failures in your Kubernetes environment using AWS Fault Injection Simulator to test application resiliency."
+---
+
+## AWS Fault Injection Simulator (FIS) Overview
+
+AWS Fault Injection Simulator (FIS) is a fully managed service that enables you to perform controlled fault injection experiments on your AWS workloads. FIS allows you to simulate various failure scenarios, which is crucial for:
+
+1. Validating high availability configurations
+2. Testing auto-scaling and self-healing capabilities
+3. Identifying potential single points of failure
+4. Improving incident response procedures
+
+By using FIS, you can:
+
+- Discover hidden bugs and performance bottlenecks
+- Observe how your systems behave under stress
+- Implement and validate automated recovery procedures
+- Conduct repeatable experiments to ensure consistent behavior
+
+In our FIS experiment, we'll simulate a partial node failure in our EKS cluster and observe how our application responds, providing practical insights into building resilient systems.
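+
+If you would like to see which EKS-scoped fault actions FIS exposes before building the experiment, the AWS CLI can list them. This is an optional check rather than a lab step; the `list-actions` call and JMESPath filter below are illustrative and assume your CLI is already configured for the workshop account:
+
+```bash test=false
+$ aws fis list-actions --query "actions[?starts_with(id, 'aws:eks:')].[id,description]" --output table
+```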
+ +:::info +For more information on AWS FIS, check out: + +- [What is AWS Fault Injection Service?](https://docs.aws.amazon.com/fis/latest/userguide/what-is.html) +- [AWS Fault Injection Simulator Console](https://console.aws.amazon.com/fis/home) +- [AWS Systems Manager, Automation](https://console.aws.amazon.com/systems-manager/automation/executions) + +::: + +## Experiment Details + +This experiment differs from the previous manual node failure simulation in several ways: + +1. **Automated execution**: FIS manages the experiment, allowing for more controlled and repeatable tests compared to the manual script execution in the previous experiment. +2. **Partial failure**: Instead of simulating a complete failure of a single node, FIS allows us to simulate a partial failure across multiple nodes. This provides a more nuanced and realistic failure scenario. +3. **Scale**: FIS allows us to target multiple nodes simultaneously. This allows us to test the resilience of our application at a larger scale compared to the single-node failure in the manual experiment. +4. **Precision**: We can specify exact percentages of instances to terminate, giving us fine-grained control over the experiment. This level of control wasn't possible in the manual experiment, where we were limited to terminating entire nodes. +5. **Minimal disruption**: The FIS experiment is designed to maintain service availability throughout the test, whereas the manual node failure might have caused temporary disruptions to the retail store's accessibility. + +These differences allows for a more comprehensive and realistic test of our application's resilience to failures, while maintaining better control over the experiment parameters. In this experiment, FIS will terminate 66% of the instances in two node groups, simulating a significant partial failure of our cluster. Similar to previous experiments, this experiment is also repeatable + +## Creating the Node Failure Experiment + +Create a new AWS FIS experiment template to simulate the partial node failure: + +```bash wait=30 +$ export NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"NodeDeletion","targets":{"Nodegroups-Target-1":{"resourceType":"aws:eks:nodegroup","resourceTags":{"eksctl.cluster.k8s.io/v1alpha1/cluster-name":"eks-workshop"},"selectionMode":"COUNT(2)"}},"actions":{"nodedeletion":{"actionId":"aws:eks:terminate-nodegroup-instances","parameters":{"instanceTerminationPercentage":"66"},"targets":{"Nodegroups":"Nodegroups-Target-1"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix": "'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id') +``` + +## Running the Experiment + +Execute the FIS experiment to simulate the node failure and monitor the response: + +```bash timeout=240 +$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && timeout 240s ~/$SCRIPT_DIR/get-pods-by-az.sh + +------us-west-2a------ + ip-10-42-127-82.us-west-2.compute.internal: + ui-6dfb84cf67-s6kw4 1/1 Running 0 2m16s + ui-6dfb84cf67-vwk4x 1/1 Running 0 4m54s + +------us-west-2b------ + +------us-west-2c------ + ip-10-42-180-16.us-west-2.compute.internal: + ui-6dfb84cf67-29xtf 1/1 Running 0 79s + ui-6dfb84cf67-68hbw 1/1 Running 0 79s + ui-6dfb84cf67-plv9f 1/1 Running 0 79s + +``` + +This command triggers the node failure and monitors the pods for 4 minutes, allowing you to observe how the cluster responds to losing a significant portion of its capacity. 
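+
+If you also want to follow the experiment itself rather than only the pods, you can look up the experiment started from this template and poll its state. This is an optional sketch; the `EXPERIMENT_ID` variable is introduced here purely for illustration and is not used elsewhere in the lab:
+
+```bash test=false
+$ EXPERIMENT_ID=$(aws fis list-experiments --query "experiments[?experimentTemplateId=='$NODE_EXP_ID'] | [0].id" --output text)
+$ aws fis get-experiment --id $EXPERIMENT_ID --query 'experiment.state' --output json
+```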
+ +During the experiment, you should observe the following: + +1. After about 1 minute, you'll see one or more nodes disappear from the list, representing the simulated partial node failure. +2. Over the next 2 minutes, you'll notice pods being rescheduled and redistributed to the remaining healthy nodes. +3. Shortly after you'll see the new node coming online to replace the terminated one. + +Your retail url should stay operational unlike the node failure without FIS. + +:::note +To verify nodes and re-balance pods, you can run: + +```bash timeout=900 wait=60 +$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done +$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql +$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql +$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb +$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis +$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service +$ sleep 180 +$ kubectl rollout status -n ui deployment/ui --timeout 180s +$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30 +``` + +::: + +## Verifying Retail Store Availability + +Ensure that your retail store application remains operational throughout the partial node failure. Use the following command to check its availability: + +```bash timeout=900 +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... +You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com +``` + +:::tip +The retail url may take 10 minutes to become operational. +::: + +Despite the partial node failure, the retail store should continue to serve traffic, demonstrating the resilience of your deployment setup. + +## Conclusion + +This partial node failure simulation using AWS FIS demonstrates several key aspects of your Kubernetes cluster's resilience: + +1. Automatic detection of node failures by Kubernetes +2. Swift rescheduling of pods from failed nodes to healthy ones +3. The cluster's ability to maintain service availability during significant infrastructure disruptions +4. 
Auto-scaling capabilities to replace failed nodes + +Key takeaways from this experiment: + +- The importance of distributing your workload across multiple nodes and availability zones +- The value of having appropriate resource requests and limits set for your pods +- The effectiveness of Kubernetes' self-healing mechanisms +- The need for robust monitoring and alerting systems to detect and respond to node failures + +By leveraging AWS FIS for such experiments, you gain several advantages: + +1. **Repeatability**: You can run this experiment multiple times to ensure consistent behavior. +2. **Automation**: FIS allows you to schedule regular resilience tests, ensuring your system maintains its fault-tolerant capabilities over time. +3. **Comprehensive testing**: You can create more complex scenarios involving multiple AWS services to test your entire application stack. +4. **Controlled chaos**: FIS provides a safe, managed environment for conducting chaos engineering experiments without risking unintended damage to your production systems. + +Regular execution of such experiments helps build confidence in your system's resilience and provides valuable insights for continuous improvement of your architecture and operational procedures. diff --git a/website/docs/observability/high-availability/05-node-failure-complete-fis.md b/website/docs/observability/high-availability/05-node-failure-complete-fis.md new file mode 100644 index 000000000..449873bd3 --- /dev/null +++ b/website/docs/observability/high-availability/05-node-failure-complete-fis.md @@ -0,0 +1,112 @@ +--- +title: "Simulating Complete Node Failure with FIS" +sidebar_position: 170 +description: "Demonstrates the impact of a complete node failure on a Kubernetes environment using AWS Fault Injection Simulator." +--- + +## Overview + +This experiment extends our previous partial node failure test to simulate a complete failure of all nodes in our EKS cluster. This is essentially a cluster failure. It demonstrates how AWS Fault Injection Simulator (FIS) can be used to test extreme scenarios and validate your system's resilience under catastrophic conditions. + +## Experiment Details + +This experiment is similar to the partial node failure as it is repeatable. Unlike the partial node failure simulation, this experiment: + +1. Terminates 100% of the instances in all node groups. +2. Tests your cluster's ability to recover from a state of complete failure. +3. Allows observation of the full recovery process, from total outage to full restoration. 
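+
+Because every node is terminated, expect the `ui` pods to sit in a `Pending` state until replacement nodes join the cluster. As an optional check (not part of the lab steps), you can re-run the following one-off commands while the cluster recovers to watch that progression:
+
+```bash test=false
+$ kubectl get nodes
+$ kubectl get pods -n ui --field-selector status.phase=Pending
+```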
+
+## Creating the Node Failure Experiment
+
+Create a new AWS FIS experiment template to simulate the complete node failure:
+
+```bash wait=30
+$ export FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"NodeDeletion","targets":{"Nodegroups-Target-1":{"resourceType":"aws:eks:nodegroup","resourceTags":{"eksctl.cluster.k8s.io/v1alpha1/cluster-name":"eks-workshop"},"selectionMode":"ALL"}},"actions":{"nodedeletion":{"actionId":"aws:eks:terminate-nodegroup-instances","parameters":{"instanceTerminationPercentage":"100"},"targets":{"Nodegroups":"Nodegroups-Target-1"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix": "'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id')
+```
+
+## Running the Experiment
+
+Execute the FIS experiment and monitor the cluster's response:
+
+```bash timeout=420
+$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && timeout 180s ~/$SCRIPT_DIR/get-pods-by-az.sh
+
+------us-west-2a------
+  ip-10-42-106-250.us-west-2.compute.internal:
+      No resources found in ui namespace.
+
+------us-west-2b------
+  ip-10-42-141-133.us-west-2.compute.internal:
+      ui-6dfb84cf67-n9xns 1/1 Running 0 4m8s
+      ui-6dfb84cf67-slknv 1/1 Running 0 2m48s
+
+------us-west-2c------
+  ip-10-42-179-59.us-west-2.compute.internal:
+      ui-6dfb84cf67-5xht5 1/1 Running 0 4m52s
+      ui-6dfb84cf67-b6xbf 1/1 Running 0 4m10s
+      ui-6dfb84cf67-fpg8j 1/1 Running 0 4m52s
+```
+
+This command will show the pod distribution over 6 minutes while we observe the experiment. We should see:
+
+1. Shortly after the experiment is initiated, all nodes and pods disappear.
+2. After about 2 minutes, the first node and some pods will come back online.
+3. Around 4 minutes, a second node appears and more pods start up.
+4. At 6 minutes, continued recovery as the last node comes online.
+
+Due to the severity of the experiment, the retail store URL will not stay operational during testing. The URL should come back up after the final node is operational. If the last node is not operational after this test, run `~/$SCRIPT_DIR/verify-cluster.sh` to wait for it to reach the running state before proceeding.
+
+:::note
+To verify node recovery and pod redistribution, you can run:
+
+```bash timeout=900 wait=60
+$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready...
(Currently $ready_nodes are ready)"; sleep 10; fi; done +$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql +$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql +$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb +$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis +$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service +$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service +$ sleep 180 +$ kubectl rollout status -n ui deployment/ui --timeout 180s +$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30 +``` + +::: + +## Verifying Retail Store Availability + +Check the retail store application's recovery: + +```bash timeout=900 +$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}') + +Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com... +You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com +``` + +:::tip +The retail url may take 10 minutes to become operational. +::: + +## Conclusion + +This experiment demonstrates: + +1. Your cluster's response to catastrophic failure. +2. Effectiveness of auto-scaling in replacing all failed nodes. +3. Kubernetes' ability to reschedule all pods onto new nodes. +4. Total system recovery time from complete failure. + +Key learnings: + +- Importance of robust auto-scaling configurations. +- Value of effective pod priority and preemption settings. +- Need for architectures that can withstand complete cluster failure. +- Significance of regular testing of extreme scenarios. + +By using FIS for such tests, you can safely simulate catastrophic failures, validate recovery procedures, identify critical dependencies, and measure recovery times. This helps in refining your disaster recovery plans and improving overall system resilience. diff --git a/website/docs/observability/high-availability/06-az-setup.md b/website/docs/observability/high-availability/06-az-setup.md new file mode 100644 index 000000000..49af5109b --- /dev/null +++ b/website/docs/observability/high-availability/06-az-setup.md @@ -0,0 +1,104 @@ +--- +title: "AZ Failure Experiment Setup" +sidebar_position: 190 +description: "Scale your application to two instances and prepare for an AZ failure simulation experiment." +--- + +### Scaling Instances + +To see the full impact of an Availability Zone (AZ) failure, let's first scale up to two instances per AZ as well as increase the number of pods up to 9: + +```bash timeout=120 +$ ASG_NAME=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? 
(Key=='eks:cluster-name') && Value=='eks-workshop']].AutoScalingGroupName" --output text)
+$ aws autoscaling update-auto-scaling-group \
+  --auto-scaling-group-name $ASG_NAME \
+  --desired-capacity 6 \
+  --min-size 6 \
+  --max-size 6
+$ sleep 60
+$ kubectl scale deployment ui --replicas=9 -n ui
+$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30
+
+------us-west-2a------
+ ip-10-42-100-4.us-west-2.compute.internal:
+ ui-6dfb84cf67-xbbj4 0/1 ContainerCreating 0 1s
+ ip-10-42-106-250.us-west-2.compute.internal:
+ ui-6dfb84cf67-4fjhh 1/1 Running 0 5m20s
+ ui-6dfb84cf67-gkrtn 1/1 Running 0 5m19s
+
+------us-west-2b------
+ ip-10-42-139-198.us-west-2.compute.internal:
+ ui-6dfb84cf67-7rfkf 0/1 ContainerCreating 0 4s
+ ip-10-42-141-133.us-west-2.compute.internal:
+ ui-6dfb84cf67-7qnkz 1/1 Running 0 5m23s
+ ui-6dfb84cf67-n58b9 1/1 Running 0 5m23s
+
+------us-west-2c------
+ ip-10-42-175-140.us-west-2.compute.internal:
+ ui-6dfb84cf67-8xfk8 0/1 ContainerCreating 0 8s
+ ui-6dfb84cf67-s55nb 0/1 ContainerCreating 0 8s
+ ip-10-42-179-59.us-west-2.compute.internal:
+ ui-6dfb84cf67-lvdc2 1/1 Running 0 5m26s
+```
+
+### Setting up a Synthetic Canary
+
+Before starting the experiment, set up a synthetic canary for heartbeat monitoring:
+
+1. First, create an S3 bucket for the canary artifacts:
+
+   ```bash wait=30
+   $ export BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)"
+   $ aws s3 mb s3://$BUCKET_NAME --region $AWS_REGION
+
+   make_bucket: eks-workshop-canary-artifacts-1724131402
+   ```
+
+2. Create the blueprint:
+
+   ```file
+   manifests/modules/observability/resiliency/scripts/create-blueprint.sh
+   ```
+
+   Place this canary blueprint into the bucket:
+
+   ```bash wait=30
+   $ ~/$SCRIPT_DIR/create-blueprint.sh
+
+   upload: ./canary.zip to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip
+   Canary script has been zipped and uploaded to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip
+   The script is configured to check the URL: http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com
+   ```
+
+3. Create a synthetic canary with a CloudWatch alarm:
+
+   ```bash wait=60
+   $ aws synthetics create-canary \
+   --name eks-workshop-canary \
+   --artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \
+   --execution-role-arn $CANARY_ROLE_ARN \
+   --runtime-version syn-nodejs-puppeteer-9.0 \
+   --schedule "Expression=rate(1 minute)" \
+   --code "Handler=canary.handler,S3Bucket=$BUCKET_NAME,S3Key=canary-scripts/canary.zip" \
+   --region $AWS_REGION
+   $ sleep 40
+   $ aws synthetics describe-canaries --name eks-workshop-canary --region $AWS_REGION
+   $ aws synthetics start-canary --name eks-workshop-canary --region $AWS_REGION
+   $ aws cloudwatch put-metric-alarm \
+   --alarm-name "eks-workshop-canary-alarm" \
+   --metric-name SuccessPercent \
+   --namespace CloudWatchSynthetics \
+   --statistic Average \
+   --period 60 \
+   --threshold 95 \
+   --comparison-operator LessThanThreshold \
+   --dimensions Name=CanaryName,Value=eks-workshop-canary \
+   --evaluation-periods 1 \
+   --alarm-description "Alarm when Canary success rate drops below 95%" \
+   --unit Percent \
+   --region $AWS_REGION
+   ```
+
+This sets up a canary that checks the health of your application every minute and a CloudWatch alarm that triggers if the success percentage falls below 95%.
+
+With these steps completed, your application is now scaled to two instances per AZ and you've set up the necessary monitoring for the upcoming AZ failure simulation experiment.
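+
+Before moving on, you can optionally confirm that the canary is collecting data and that the alarm exists. This is a quick sanity check using the names created above; the alarm may report `INSUFFICIENT_DATA` for the first couple of minutes after creation:
+
+```bash
+$ aws synthetics get-canary --name eks-workshop-canary --query 'Canary.Status.State' --output text
+
+RUNNING
+
+$ aws cloudwatch describe-alarms --alarm-names "eks-workshop-canary-alarm" --query 'MetricAlarms[0].StateValue' --output text
+
+OK
+```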
diff --git a/website/docs/observability/high-availability/07-az-failure.md b/website/docs/observability/high-availability/07-az-failure.md
new file mode 100644
index 000000000..d5a5aafd2
--- /dev/null
+++ b/website/docs/observability/high-availability/07-az-failure.md
@@ -0,0 +1,119 @@
+---
+title: "Simulating AZ Failure"
+sidebar_position: 210
+description: "This experiment simulates an Availability Zone failure to test the resilience of your Kubernetes environment hosted on AWS EKS."
+---
+
+## Overview
+
+This repeatable experiment simulates an Availability Zone (AZ) failure, demonstrating the resilience of your application when faced with significant infrastructure disruptions. By leveraging AWS Fault Injection Simulator (FIS) and additional AWS services, we'll test how well your system maintains functionality when an entire AZ becomes unavailable.
+
+### Setting up the Experiment
+
+Using the Auto Scaling Group (ASG) name retrieved in the previous section, create the FIS experiment template to simulate the AZ failure:
+
+```bash wait=30
+$ export ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"description":"publicdocument-azfailure","targets":{},"actions":{"azfailure":{"actionId":"aws:ssm:start-automation-execution","parameters":{"documentArn":"arn:aws:ssm:us-west-2::document/AWSResilienceHub-SimulateAzOutageInAsgTest_2020-07-23","documentParameters":"{\"AutoScalingGroupName\":\"'$ASG_NAME'\",\"CanaryAlarmName\":\"eks-workshop-canary-alarm\",\"AutomationAssumeRole\":\"'$FIS_ROLE_ARN'\",\"IsRollback\":\"false\",\"TestDurationInMinutes\":\"2\"}","maxDuration":"PT6M"}}},"stopConditions":[{"source":"none"}],"roleArn":"'$FIS_ROLE_ARN'","tags":{"ExperimentSuffix":"'$RANDOM_SUFFIX'"}}' --output json | jq -r '.experimentTemplate.id')
+```
+
+## Running the Experiment
+
+Execute the FIS experiment to simulate the AZ failure:
+
+```bash timeout=560
+$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && timeout 180s ~/$SCRIPT_DIR/get-pods-by-az.sh
+
+------us-west-2a------
+ ip-10-42-100-4.us-west-2.compute.internal:
+ ui-6dfb84cf67-h57sp 1/1 Running 0 12m
+ ui-6dfb84cf67-h87h8 1/1 Running 0 12m
+ ip-10-42-111-144.us-west-2.compute.internal:
+ ui-6dfb84cf67-4xvmc 1/1 Running 0 11m
+ ui-6dfb84cf67-crl2s 1/1 Running 0 6m23s
+
+------us-west-2b------
+ ip-10-42-141-243.us-west-2.compute.internal:
+ No resources found in ui namespace.
+ ip-10-42-150-255.us-west-2.compute.internal:
+ No resources found in ui namespace.
+
+------us-west-2c------
+ ip-10-42-164-250.us-west-2.compute.internal:
+ ui-6dfb84cf67-fl4hk 1/1 Running 0 11m
+ ui-6dfb84cf67-mptkw 1/1 Running 0 11m
+ ui-6dfb84cf67-zxnts 1/1 Running 0 6m27s
+ ip-10-42-178-108.us-west-2.compute.internal:
+ ui-6dfb84cf67-8vmcz 1/1 Running 0 6m28s
+ ui-6dfb84cf67-wknc5 1/1 Running 0 12m
+```
+
+This command starts the experiment and monitors the distribution and status of pods across different nodes and AZs for 8 minutes so you can observe the immediate impact of the simulated AZ failure.
+
+During the experiment, you should observe the following sequence of events:
+
+1. After about 3 minutes, one AZ will fail.
+2. Looking at the [Synthetic Canary]() you will see it change state to `In Alarm`.
+3. Around 4 minutes after the experiment started, you will see pods reappearing in the other AZs.
+4. After about 7 minutes, the experiment completes, the AZ is marked healthy again, and replacement EC2 instances are launched as a result of an EC2 Auto Scaling action, bringing the number of instances in each AZ back to 2.
+
+During this time, the retail url remains available, showing how resilient EKS is to AZ failures.
+
+:::note
+To verify node and pod redistribution, you can run:
+
+```bash timeout=900 wait=60
+$ EXPECTED_NODES=6 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done
+$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
+$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
+$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql
+$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb
+$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis
+$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service
+$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service
+$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
+$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
+$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
+$ sleep 180
+$ kubectl rollout status -n ui deployment/ui --timeout 180s
+$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30
+```
+
+:::
+
+## Post-Experiment Verification
+
+After the experiment, verify that your application remains operational despite the simulated AZ failure:
+
+```bash timeout=900
+$ wait-for-lb $(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
+
+Waiting for k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com...
+You can now access http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com
+```
+
+This step confirms the effectiveness of your Kubernetes cluster's high availability configuration and its ability to maintain service continuity during significant infrastructure disruptions.
+
+## Conclusion
+
+The AZ failure simulation represents a critical test of your EKS cluster's resilience and your application's high availability design. Through this experiment, you've gained valuable insights into:
+
+1. The effectiveness of your multi-AZ deployment strategy
+2. Kubernetes' ability to reschedule pods across remaining healthy AZs
+3. The impact of an AZ failure on your application's performance and availability
+4. The efficiency of your monitoring and alerting systems in detecting and responding to major infrastructure issues
+
+Key takeaways from this experiment include:
+
+- The importance of distributing your workload across multiple AZs
+- The value of proper resource allocation and pod anti-affinity rules
+- The need for robust monitoring and alerting systems that can quickly detect AZ-level issues
+- The effectiveness of your disaster recovery and business continuity plans
+
+By regularly conducting such experiments, you can:
+
+- Identify potential weaknesses in your infrastructure and application architecture
+- Refine your incident response procedures
+- Build confidence in your system's ability to withstand major failures
+- Continuously improve your application's resilience and reliability
+
+Remember, true resilience comes not just from surviving such failures, but from maintaining performance and user experience even in the face of significant infrastructure disruptions. Use the insights gained from this experiment to further enhance your application's fault tolerance and ensure seamless operations across all scenarios.
diff --git a/website/docs/observability/high-availability/index.md b/website/docs/observability/high-availability/index.md
new file mode 100644
index 000000000..36f322d9c
--- /dev/null
+++ b/website/docs/observability/high-availability/index.md
@@ -0,0 +1,104 @@
+---
+title: "Chaos Engineering with EKS"
+sidebar_position: 70
+sidebar_custom_props: { "module": true }
+description: "Simulating various failure scenarios to check Amazon EKS cluster resiliency."
+---
+
+::required-time
+
+:::tip Before you start
+Prepare your environment for this section:
+
+```bash timeout=900 wait=30
+$ prepare-environment observability/resiliency
+```
+
+This will make the following changes to your lab environment:
+
+- Create the ingress load balancer
+- Create RBAC Roles and RoleBindings
+- Install the AWS Load Balancer Controller
+- Create an IAM role for AWS Fault Injection Simulator (FIS)
+
+You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/observability/resiliency/.workshop/terraform).
+:::
+
+## What is Resiliency?
+
+Resiliency in cloud computing refers to a system's ability to maintain acceptable performance levels in the face of faults and challenges to normal operation. It encompasses:
+
+1. **Fault Tolerance**: The ability to continue operating properly in the event of the failure of some of its components.
+2. **Self-Healing**: The capability to detect and recover from failures automatically.
+3. **Scalability**: The ability to handle increased load by adding resources.
+4. **Disaster Recovery**: The process of preparing for and recovering from potential disasters.
+
+## Why is Resiliency Important in EKS?
+
+Amazon EKS provides a managed Kubernetes platform, but it's still crucial to design and implement resilient architectures. Here's why:
+
+1. **High Availability**: Ensure your applications remain accessible even during partial system failures.
+2. **Data Integrity**: Prevent data loss and maintain consistency during unexpected events.
+3. **User Experience**: Minimize downtime and performance degradation to maintain user satisfaction.
+4. **Cost Efficiency**: Avoid over-provisioning by building systems that can handle variable loads and partial failures.
+5. **Compliance**: Meet regulatory requirements for uptime and data protection in various industries.
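+
+As a quick way to see the multi-AZ foundation these properties rely on, you can list the worker nodes together with their Availability Zone at any point during the lab. This assumes the standard topology labels, which EKS applies to its nodes automatically:
+
+```bash
+$ kubectl get nodes -L topology.kubernetes.io/zone
+```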
+
+## Lab Overview and Resiliency Scenarios
+
+In this lab, we'll explore various high availability scenarios and test the resilience of your EKS environment. Through a series of experiments, you'll gain hands-on experience in handling different types of failures and understanding how your Kubernetes cluster responds to these challenges.
+
+You will simulate and respond to:
+
+1. **Pod Failures**: Using Chaos Mesh to test your application's resilience to individual pod failures.
+2. **Node Failures**: Simulating node failures in two ways:
+
+   - Without AWS Fault Injection Simulator: Manually simulating a node failure to observe Kubernetes' self-healing capabilities.
+   - With AWS Fault Injection Simulator: Leveraging AWS Fault Injection Simulator for partial and complete node failure scenarios.
+
+3. **Availability Zone Failure**: Simulating the loss of an entire AZ to validate your multi-AZ deployment strategy.
+
+## What You'll Learn
+
+By the end of this chapter, you'll be able to:
+
+- Use AWS Fault Injection Simulator (FIS) to simulate and learn from controlled failure scenarios
+- Understand how Kubernetes handles different types of failures (pod, node, and availability zone)
+- Observe the self-healing capabilities of Kubernetes in action
+- Gain practical experience in chaos engineering for EKS environments
+
+These experiments will help you understand:
+
+- How Kubernetes handles different types of failures
+- The importance of proper resource allocation and pod distribution
+- The effectiveness of your monitoring and alerting systems
+- How to improve your application's fault tolerance and recovery strategies
+
+## Tools and Technologies
+
+Throughout this chapter, we'll be using:
+
+- AWS Fault Injection Simulator (FIS) for controlled chaos engineering
+- Chaos Mesh for Kubernetes-native chaos testing
+- AWS CloudWatch Synthetics for creating and monitoring a canary
+- Kubernetes native features for observing pod and node behavior during failures
+
+## Importance of Chaos Engineering
+
+Chaos engineering is the practice of intentionally introducing controlled failures to identify weaknesses in your system. By proactively testing your system's resilience, you can:
+
+1. Uncover hidden issues before they affect users
+2. Build confidence in your system's ability to withstand turbulent conditions
+3. Improve your incident response procedures
+4. Foster a culture of resilience within your organization
+
+By the end of this lab, you'll have a comprehensive understanding of your EKS environment's high availability capabilities and areas for potential improvement.
+
+:::info
+For more information on the features and services used in this lab, we recommend checking out:
+
+- [Ingress Load Balancer](/docs/fundamentals/exposing/ingress/)
+- [Integrating with Kubernetes RBAC](/docs/security/cluster-access-management/kubernetes-rbac)
+- [AWS Fault Injection Simulator](https://aws.amazon.com/fis/)
+- [Operating resilient workloads on Amazon EKS](https://aws.amazon.com/blogs/containers/operating-resilient-workloads-on-amazon-eks/)
+
+:::
diff --git a/website/docs/observability/high-availability/tests/hook-suite.sh b/website/docs/observability/high-availability/tests/hook-suite.sh
new file mode 100644
index 000000000..8b5a4baea
--- /dev/null
+++ b/website/docs/observability/high-availability/tests/hook-suite.sh
@@ -0,0 +1,11 @@
+set -e
+
+before() {
+  echo "noop"
+}
+
+after() {
+  prepare-environment
+}
+
+"$@"
diff --git a/website/test-durations.json b/website/test-durations.json
index ebbbc9f31..cfd2aadf1 100644
--- a/website/test-durations.json
+++ b/website/test-durations.json
@@ -117,6 +117,7 @@
   "/observability/container-insights/index.md": 282125,
   "/observability/container-insights/visualize-application-metrics-cloudwatch.md": 3191,
   "/observability/index.md": 1,
+  "/observability/high-availability/index.md": 1,
   "/observability/kubecost/index.md": 215109,
   "/observability/kubecost/introduction.md": 172233,
   "/observability/logging/cluster-logging/enable-eks-logging.md": 62816,