Skip to content

Commit

Permalink
[ Cherry-pick for 2.4.0 ] (#473)
Browse files Browse the repository at this point in the history
* Refactor/experiment contributing (#470)

* docs: add instructions for building litmus-sdk binary

Non Linux AMD64 users will need to build the binary for their target
platform.

Signed-off-by: Nic Johnson <[email protected]>

* docs: update generated code & docs to aid experiment contribution

It wasn't very clear what generated code needed to be kept, and what
generated code needed to be replaced with experiment-specific code.
Attempt to make that more clear by expanding README & adding grep-able
tags inside generated code.

Signed-off-by: Nic Johnson <[email protected]>

* fix issue-3350 (#468)

Signed-off-by: Andrew Hu <[email protected]>

Co-authored-by: Udit Gaurav <[email protected]>

* Remove the stress process on timeout without failure (#472)

Signed-off-by: udit <[email protected]>

* update image tag

Signed-off-by: udit <[email protected]>

Co-authored-by: Nic Johnson <[email protected]>
Co-authored-by: Andrew Hu <[email protected]>
  • Loading branch information
3 people authored Dec 15, 2021
1 parent c4fb546 commit a9e5f39
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 29 deletions.
2 changes: 1 addition & 1 deletion build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ RUN apk --update add \
# Packaging stage
# Image source: https://github.com/litmuschaos/test-tools/blob/master/custom/hardened-alpine/experiment/Dockerfile
# The base image is non-root (have litmus user) with default litmus directory.
FROM litmuschaos/experiment-alpine:2.3.0
FROM litmuschaos/experiment-alpine:2.4.0

LABEL maintainer="LitmusChaos"

Expand Down
56 changes: 40 additions & 16 deletions chaoslib/litmus/node-drain/lib/node-drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"os/exec"
"os/signal"
"strconv"
"strings"
"syscall"
"time"

Expand All @@ -19,6 +20,8 @@ import (
"github.com/litmuschaos/litmus-go/pkg/utils/common"
"github.com/litmuschaos/litmus-go/pkg/utils/retry"
"github.com/pkg/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand Down Expand Up @@ -149,29 +152,50 @@ func drainNode(experimentsDetails *experimentTypes.ExperimentDetails, clients cl
// uncordonNode uncordon the application node
func uncordonNode(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, chaosDetails *types.ChaosDetails) error {

log.Infof("[Recover]: Uncordon the %v node", experimentsDetails.TargetNode)
targetNodes := strings.Split(experimentsDetails.TargetNode, ",")
for _, targetNode := range targetNodes {

command := exec.Command("kubectl", "uncordon", experimentsDetails.TargetNode)
var out, stderr bytes.Buffer
command.Stdout = &out
command.Stderr = &stderr
if err := command.Run(); err != nil {
log.Infof("Error String: %v", stderr.String())
return errors.Errorf("unable to uncordon the %v node, err: %v", experimentsDetails.TargetNode, err)
}
//Check node exist before uncordon the node
_, err := clients.KubeClient.CoreV1().Nodes().Get(targetNode, metav1.GetOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
log.Infof("[Info]: The %v node is no longer exist, skip uncordon the node", targetNode)
common.SetTargets(targetNode, "noLongerExist", "node", chaosDetails)
continue
} else {
return errors.Errorf("unable to get the %v node, err: %v", targetNode, err)
}
}

common.SetTargets(experimentsDetails.TargetNode, "reverted", "node", chaosDetails)
log.Infof("[Recover]: Uncordon the %v node", targetNode)
command := exec.Command("kubectl", "uncordon", targetNode)
var out, stderr bytes.Buffer
command.Stdout = &out
command.Stderr = &stderr
if err := command.Run(); err != nil {
log.Infof("Error String: %v", stderr.String())
return errors.Errorf("unable to uncordon the %v node, err: %v", targetNode, err)
}
common.SetTargets(targetNode, "reverted", "node", chaosDetails)
}

return retry.
Times(uint(experimentsDetails.Timeout / experimentsDetails.Delay)).
Wait(time.Duration(experimentsDetails.Delay) * time.Second).
Try(func(attempt uint) error {
nodeSpec, err := clients.KubeClient.CoreV1().Nodes().Get(experimentsDetails.TargetNode, v1.GetOptions{})
if err != nil {
return err
}
if nodeSpec.Spec.Unschedulable {
return errors.Errorf("%v node is in unschedulable state", experimentsDetails.TargetNode)
targetNodes := strings.Split(experimentsDetails.TargetNode, ",")
for _, targetNode := range targetNodes {
nodeSpec, err := clients.KubeClient.CoreV1().Nodes().Get(targetNode, v1.GetOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
continue
} else {
return err
}
}
if nodeSpec.Spec.Unschedulable {
return errors.Errorf("%v node is in unschedulable state", experimentsDetails.TargetNode)
}
}
return nil
})
Expand Down
10 changes: 6 additions & 4 deletions chaoslib/litmus/stress-chaos/helper/stress-helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,13 +176,15 @@ func prepareStressChaos(experimentsDetails *experimentTypes.ExperimentDetails, c
select {
case <-timeout:
// the stress process gets timeout before completion
log.Infof("[Timeout] Stress output: %v", buf.String())
log.Info("[Cleanup]: Killing the stress process")
terminateProcess(cmd.Process.Pid)
log.Infof("[Chaos] The stress process is not yet completed after the chaos duration of %vs", experimentsDetails.ChaosDuration+30)
log.Info("[Timeout]: Killing the stress process")
if err = terminateProcess(cmd.Process.Pid); err != nil {
return err
}
if err = result.AnnotateChaosResult(resultDetails.Name, chaosDetails.ChaosNamespace, "reverted", "pod", experimentsDetails.TargetPods); err != nil {
return err
}
return errors.Errorf("the stress process is timeout after %vs", experimentsDetails.ChaosDuration+30)
return nil
case err := <-done:
if err != nil {
err, ok := err.(*exec.ExitError)
Expand Down
32 changes: 25 additions & 7 deletions contribute/developer-guide/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ The *generate_experiment.go* script is a simple way to bootstrap your experiment
$ cd litmus-go/contribute/developer-guide
```

- Build litmus-sdk

```
go build -o ./litmus-sdk ./bin/main.go
```

- Populate the `attributes.yaml` with details of the chaos experiment (or chart). Use the [attributes.yaml.sample](/contribute/developer-guide/attributes.yaml.sample) as reference.

As an example, let us consider an experiment to kill one of the replicas of a nginx deployment. The attributes.yaml can be constructed like this:
Expand Down Expand Up @@ -158,17 +164,29 @@ The *generate_experiment.go* script is a simple way to bootstrap your experiment
drwxr-xr-x 2 shubham shubham 4096 Jun 10 22:41 icons/
```

- Proceed with construction of business logic inside the `sample-exec-chaos.go` file, by making
the appropriate modifications listed below to achieve the desired effect:

- variables
- entry & exit criteria checks for the experiment
- helper utils in either [pkg](/pkg/) or new [base chaos libraries](/chaoslib)
- Proceed with construction of business logic, by making the appropriate modifications listed below
to achieve the desired effect:

- Pre-Chaos Checks: Additional experiment-specific checks to run before chaos. Checks should be
added at the `@TODO: user PRE-CHAOS-CHECK` marker in the
`experiments/<category>/<name>/experiment/<name>.go` file

- Inject Chaos: The heart of your experiment, actually enact the chaos. By default, the generated
code will call out to the generated library. However, if your experiment simply makes use of
existing libraries, modify the chaos injection at the `@TODO: user INVOKE-CHAOSLIB` marker in the
`experiments/<category>/<name>/experiment/<name>.go` file

- Library Modifications: This is where the low level chaos execution code should live. Populate
the `runChaos`, `experimentExecution`, and `injectChaos` functions as appropriate in the
`chaoslib/litmus/<name>/lib/<name>.go` file.

- The chaoslib is created at `chaoslib/litmus/sample-exec-chaos/lib/sample-exec-chaos.go` path. It contains some pre-defined steps which run the `ChaosInject` command (explicitly provided as an ENV var in the experiment CR), which will induce chaos in the target application. It will wait for the given chaos duration and finally run the `ChaosKill` command (also provided as an ENV var) for cleanup purposes. Update this chaoslib to achieve the desired effect based on the use-case or reuse the other existing chaoslib.
- Post-Chaos Checks: Additional experiment-specific checks to run after chaos. Checks should be
added at the `@TODO: user POST-CHAOS-CHECK` marker in the
`experiments/<category>/<name>/experiment/<name>.go` file

- Create an experiment README explaining, briefly, the *what*, *why* & *how* of the experiment to aid users of this experiment.
- Create an experiment README explaining, briefly, the *what*, *why* & *how* of the experiment to aid users of this experiment. This README
should live at `experiments/<category>/<name>/README.md`

### Steps to Test Experiment

Expand Down
4 changes: 4 additions & 0 deletions contribute/developer-guide/templates/experiment.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/sirupsen/logrus"
)


// Experiment contains steps to inject chaos
func Experiment(clients clients.ClientSets){

Expand Down Expand Up @@ -69,6 +70,7 @@ func Experiment(clients clients.ClientSets){
// Calling AbortWatcher go routine, it will continuously watch for the abort signal and generate the required events and result
go common.AbortWatcher(experimentsDetails.ExperimentName, clients, &resultDetails, &chaosDetails, &eventsDetails)

// @TODO: user PRE-CHAOS-CHECK
// ADD A PRE-CHAOS CHECK OF YOUR CHOICE HERE
// POD STATUS CHECKS FOR THE APPLICATION UNDER TEST AND AUXILIARY APPLICATIONS ARE ADDED BY DEFAULT

Expand Down Expand Up @@ -121,6 +123,7 @@ func Experiment(clients clients.ClientSets){
// INVOKE THE CHAOSLIB OF YOUR CHOICE HERE, WHICH WILL CONTAIN
// THE BUSINESS LOGIC OF THE ACTUAL CHAOS
// IT CAN BE A NEW CHAOSLIB YOU HAVE CREATED SPECIALLY FOR THIS EXPERIMENT OR ANY EXISTING ONE
// @TODO: user INVOKE-CHAOSLIB

// Including the litmus lib
switch experimentsDetails.ChaosLib {
Expand All @@ -138,6 +141,7 @@ func Experiment(clients clients.ClientSets){
return
}

// @TODO: user POST-CHAOS-CHECK
// ADD A POST-CHAOS CHECK OF YOUR CHOICE HERE
// POD STATUS CHECKS FOR THE APPLICATION UNDER TEST AND AUXILIARY APPLICATIONS ARE ADDED BY DEFAULT

Expand Down
8 changes: 7 additions & 1 deletion pkg/status/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/pkg/errors"
logrus "github.com/sirupsen/logrus"
apiv1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand All @@ -26,7 +27,12 @@ func CheckNodeStatus(nodes string, timeout, delay int, clients clients.ClientSet
for index := range targetNodes {
node, err := clients.KubeClient.CoreV1().Nodes().Get(targetNodes[index], metav1.GetOptions{})
if err != nil {
return err
if apierrors.IsNotFound(err) {
log.Infof("[Info]: The %v node is not exist", targetNodes[index])
continue
} else {
return err
}
}
nodeList.Items = append(nodeList.Items, *node)
}
Expand Down

0 comments on commit a9e5f39

Please sign in to comment.