From a9e5f39dba507a2b6d1843abc3b768a392414f02 Mon Sep 17 00:00:00 2001 From: Udit Gaurav <35391335+uditgaurav@users.noreply.github.com> Date: Wed, 15 Dec 2021 17:53:44 +0530 Subject: [PATCH] [ Cherry-pick for 2.4.0 ] (#473) * Refactor/experiment contributing (#470) * docs: add instructions for building litmus-sdk binary Non Linux AMD64 users will need to build the binary for their target platform. Signed-off-by: Nic Johnson * docs: update generated code & docs to aid experiment contribution It wasn't very clear what generated code needed to be kept, and what generated code needed to be replaced with experiment-specific code. Attempt to make that more clear by expanding README & adding grep-able tags inside generated code. Signed-off-by: Nic Johnson * fix issue-3350 (#468) Signed-off-by: Andrew Hu Co-authored-by: Udit Gaurav <35391335+uditgaurav@users.noreply.github.com> * Remove the stress process on timeout without failure (#472) Signed-off-by: udit * update image tag Signed-off-by: udit Co-authored-by: Nic Johnson Co-authored-by: Andrew Hu <93282581+andrewhu-hcl@users.noreply.github.com> --- build/Dockerfile | 2 +- chaoslib/litmus/node-drain/lib/node-drain.go | 56 +++++++++++++------ .../stress-chaos/helper/stress-helper.go | 10 ++-- contribute/developer-guide/README.md | 32 ++++++++--- .../developer-guide/templates/experiment.tmpl | 4 ++ pkg/status/nodes.go | 8 ++- 6 files changed, 83 insertions(+), 29 deletions(-) diff --git a/build/Dockerfile b/build/Dockerfile index 5b4ed1395..76c614238 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -24,7 +24,7 @@ RUN apk --update add \ # Packaging stage # Image source: https://github.com/litmuschaos/test-tools/blob/master/custom/hardened-alpine/experiment/Dockerfile # The base image is non-root (have litmus user) with default litmus directory. 
-FROM litmuschaos/experiment-alpine:2.3.0 +FROM litmuschaos/experiment-alpine:2.4.0 LABEL maintainer="LitmusChaos" diff --git a/chaoslib/litmus/node-drain/lib/node-drain.go b/chaoslib/litmus/node-drain/lib/node-drain.go index 52a7535fe..65ae77563 100644 --- a/chaoslib/litmus/node-drain/lib/node-drain.go +++ b/chaoslib/litmus/node-drain/lib/node-drain.go @@ -6,6 +6,7 @@ import ( "os/exec" "os/signal" "strconv" + "strings" "syscall" "time" @@ -19,6 +20,8 @@ import ( "github.com/litmuschaos/litmus-go/pkg/utils/common" "github.com/litmuschaos/litmus-go/pkg/utils/retry" "github.com/pkg/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -149,29 +152,50 @@ func drainNode(experimentsDetails *experimentTypes.ExperimentDetails, clients cl // uncordonNode uncordon the application node func uncordonNode(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error { - log.Infof("[Recover]: Uncordon the %v node", experimentsDetails.TargetNode) + targetNodes := strings.Split(experimentsDetails.TargetNode, ",") + for _, targetNode := range targetNodes { - command := exec.Command("kubectl", "uncordon", experimentsDetails.TargetNode) - var out, stderr bytes.Buffer - command.Stdout = &out - command.Stderr = &stderr - if err := command.Run(); err != nil { - log.Infof("Error String: %v", stderr.String()) - return errors.Errorf("unable to uncordon the %v node, err: %v", experimentsDetails.TargetNode, err) - } + //Check node exist before uncordon the node + _, err := clients.KubeClient.CoreV1().Nodes().Get(targetNode, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + log.Infof("[Info]: The %v node is no longer exist, skip uncordon the node", targetNode) + common.SetTargets(targetNode, "noLongerExist", "node", chaosDetails) + continue + } else { + return errors.Errorf("unable to get the %v node, err: %v", 
targetNode, err) + } + } - common.SetTargets(experimentsDetails.TargetNode, "reverted", "node", chaosDetails) + log.Infof("[Recover]: Uncordon the %v node", targetNode) + command := exec.Command("kubectl", "uncordon", targetNode) + var out, stderr bytes.Buffer + command.Stdout = &out + command.Stderr = &stderr + if err := command.Run(); err != nil { + log.Infof("Error String: %v", stderr.String()) + return errors.Errorf("unable to uncordon the %v node, err: %v", targetNode, err) + } + common.SetTargets(targetNode, "reverted", "node", chaosDetails) + } return retry. Times(uint(experimentsDetails.Timeout / experimentsDetails.Delay)). Wait(time.Duration(experimentsDetails.Delay) * time.Second). Try(func(attempt uint) error { - nodeSpec, err := clients.KubeClient.CoreV1().Nodes().Get(experimentsDetails.TargetNode, v1.GetOptions{}) - if err != nil { - return err - } - if nodeSpec.Spec.Unschedulable { - return errors.Errorf("%v node is in unschedulable state", experimentsDetails.TargetNode) + targetNodes := strings.Split(experimentsDetails.TargetNode, ",") + for _, targetNode := range targetNodes { + nodeSpec, err := clients.KubeClient.CoreV1().Nodes().Get(targetNode, v1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + continue + } else { + return err + } + } + if nodeSpec.Spec.Unschedulable { + return errors.Errorf("%v node is in unschedulable state", experimentsDetails.TargetNode) + } } return nil }) diff --git a/chaoslib/litmus/stress-chaos/helper/stress-helper.go b/chaoslib/litmus/stress-chaos/helper/stress-helper.go index 7fc9a3e68..770ddc243 100644 --- a/chaoslib/litmus/stress-chaos/helper/stress-helper.go +++ b/chaoslib/litmus/stress-chaos/helper/stress-helper.go @@ -176,13 +176,15 @@ func prepareStressChaos(experimentsDetails *experimentTypes.ExperimentDetails, c select { case <-timeout: // the stress process gets timeout before completion - log.Infof("[Timeout] Stress output: %v", buf.String()) - log.Info("[Cleanup]: Killing the stress 
process") - terminateProcess(cmd.Process.Pid) + log.Infof("[Chaos] The stress process is not yet completed after the chaos duration of %vs", experimentsDetails.ChaosDuration+30) + log.Info("[Timeout]: Killing the stress process") + if err = terminateProcess(cmd.Process.Pid); err != nil { + return err + } if err = result.AnnotateChaosResult(resultDetails.Name, chaosDetails.ChaosNamespace, "reverted", "pod", experimentsDetails.TargetPods); err != nil { return err } - return errors.Errorf("the stress process is timeout after %vs", experimentsDetails.ChaosDuration+30) + return nil case err := <-done: if err != nil { err, ok := err.(*exec.ExitError) diff --git a/contribute/developer-guide/README.md b/contribute/developer-guide/README.md index 9f5fd5750..21a311df7 100644 --- a/contribute/developer-guide/README.md +++ b/contribute/developer-guide/README.md @@ -34,6 +34,12 @@ The *generate_experiment.go* script is a simple way to bootstrap your experiment $ cd litmus-go/contribute/developer-guide ``` +- Build litmus-sdk + + ``` + go build -o ./litmus-sdk ./bin/main.go + ``` + - Populate the `attributes.yaml` with details of the chaos experiment (or chart). Use the [attributes.yaml.sample](/contribute/developer-guide/attributes.yaml.sample) as reference. As an example, let us consider an experiment to kill one of the replicas of a nginx deployment. 
The attributes.yaml can be constructed like this: @@ -158,17 +164,29 @@ The *generate_experiment.go* script is a simple way to bootstrap your experiment drwxr-xr-x 2 shubham shubham 4096 Jun 10 22:41 icons/ ``` -- Proceed with construction of business logic inside the `sample-exec-chaos.go` file, by making - the appropriate modifications listed below to achieve the desired effect: - - variables - - entry & exit criteria checks for the experiment - - helper utils in either [pkg](/pkg/) or new [base chaos libraries](/chaoslib) +- Proceed with construction of business logic, by making the appropriate modifications listed below + to achieve the desired effect: + + - Pre-Chaos Checks: Additional experiment-specific checks to run before chaos. Checks should be + added at the `@TODO: user PRE-CHAOS-CHECK` marker in the + `experiments/<category>/<name>/experiment/<name>.go` file + + - Inject Chaos: The heart of your experiment, actually enact the chaos. By default, the generated + code will call out to the generated library. However, if your experiment simply makes use of + existing libraries, modify the chaos injection at the `@TODO: user INVOKE-CHAOSLIB` marker in the + `experiments/<category>/<name>/experiment/<name>.go` file + - Library Modifications: This is where the low level chaos execution code should live. Populate + the `runChaos`, `experimentExecution`, and `injectChaos` functions as appropriate in the + `chaoslib/litmus/<name>/lib/<name>.go` file. -- The chaoslib is created at `chaoslib/litmus/sample-exec-chaos/lib/sample-exec-chaos.go` path. It contains some pre-defined steps which runs the `ChaosInject` command (explicitly provided as an ENV var in the experiment CR). Which will induce chaos in the target application. It will wait for the given chaos duration and finally runs the `ChaosKill` command (also provided as an ENV var) for cleanup purposes. Update this chaoslib to achieve the desired effect based on the use-case or reuse the other existing chaoslib. 
+ - Post-Chaos Checks: Additional experiment-specific checks to run after chaos. Checks should be + added at the `@TODO: user POST-CHAOS-CHECK` marker in the + `experiments/<category>/<name>/experiment/<name>.go` file -- Create an experiment README explaining, briefly, the *what*, *why* & *how* of the experiment to aid users of this experiment. +- Create an experiment README explaining, briefly, the *what*, *why* & *how* of the experiment to aid users of this experiment. This README + should live at `experiments/<category>/<name>/README.md` ### Steps to Test Experiment diff --git a/contribute/developer-guide/templates/experiment.tmpl b/contribute/developer-guide/templates/experiment.tmpl index 545926742..99f88b5b0 100644 --- a/contribute/developer-guide/templates/experiment.tmpl +++ b/contribute/developer-guide/templates/experiment.tmpl @@ -16,6 +16,7 @@ import ( "github.com/sirupsen/logrus" ) + // Experiment contains steps to inject chaos func Experiment(clients clients.ClientSets){ @@ -69,6 +70,7 @@ func Experiment(clients clients.ClientSets){ // Calling AbortWatcher go routine, it will continuously watch for the abort signal and generate the required events and result go common.AbortWatcher(experimentsDetails.ExperimentName, clients, &resultDetails, &chaosDetails, &eventsDetails) + // @TODO: user PRE-CHAOS-CHECK // ADD A PRE-CHAOS CHECK OF YOUR CHOICE HERE // POD STATUS CHECKS FOR THE APPLICATION UNDER TEST AND AUXILIARY APPLICATIONS ARE ADDED BY DEFAULT @@ -121,6 +123,7 @@ func Experiment(clients clients.ClientSets){ // INVOKE THE CHAOSLIB OF YOUR CHOICE HERE, WHICH WILL CONTAIN // THE BUSINESS LOGIC OF THE ACTUAL CHAOS // IT CAN BE A NEW CHAOSLIB YOU HAVE CREATED SPECIALLY FOR THIS EXPERIMENT OR ANY EXISTING ONE + // @TODO: user INVOKE-CHAOSLIB // Including the litmus lib switch experimentsDetails.ChaosLib { @@ -138,6 +141,7 @@ func Experiment(clients clients.ClientSets){ return } + // @TODO: user POST-CHAOS-CHECK // ADD A POST-CHAOS CHECK OF YOUR CHOICE HERE // POD STATUS CHECKS FOR THE 
APPLICATION UNDER TEST AND AUXILIARY APPLICATIONS ARE ADDED BY DEFAULT diff --git a/pkg/status/nodes.go b/pkg/status/nodes.go index 8bf2a648d..567a8f1e4 100644 --- a/pkg/status/nodes.go +++ b/pkg/status/nodes.go @@ -10,6 +10,7 @@ import ( "github.com/pkg/errors" logrus "github.com/sirupsen/logrus" apiv1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -26,7 +27,12 @@ func CheckNodeStatus(nodes string, timeout, delay int, clients clients.ClientSet for index := range targetNodes { node, err := clients.KubeClient.CoreV1().Nodes().Get(targetNodes[index], metav1.GetOptions{}) if err != nil { - return err + if apierrors.IsNotFound(err) { + log.Infof("[Info]: The %v node is not exist", targetNodes[index]) + continue + } else { + return err + } } nodeList.Items = append(nodeList.Items, *node) }