Skip to content

Commit

Permalink
added helper pod scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
bdevcich committed Oct 24, 2023
1 parent 3b54f39 commit 723b56c
Show file tree
Hide file tree
Showing 13 changed files with 200 additions and 27 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Build Remora Image
name: Build Helper Image
on:
push:
branches:
Expand All @@ -25,7 +25,7 @@ jobs:
uses: docker/metadata-action@v5
with:
images: |
ghcr.io/${{ env.REPO_NAME }}-remora
ghcr.io/${{ env.REPO_NAME }}-helper
tags: |
# For merge to master branch, tag example: 'master'
type=ref,event=branch
Expand Down Expand Up @@ -53,6 +53,6 @@ jobs:
id: docker_build
uses: docker/build-push-action@v5
with:
context: remora
context: helper
push: true
tags: ${{ steps.meta.outputs.tags }}
File renamed without changes.
4 changes: 2 additions & 2 deletions remora/Makefile → helper_image/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Image URL to use all building/pushing image targets
#IMG ?= ghcr.io/nearnodeflash/nnf-remora:0.0.1
IMG ?= devcich/nnf-remora:0.0.2
# TODO version needs to come top level makefile or similar
IMG ?= ghcr.io/nearnodeflash/nnf-integration-test-helper:0.0.1

# CONTAINER_TOOL defines the container tool to be used for building images.
# Be aware that the target commands are only tested with Docker which is
Expand Down
5 changes: 5 additions & 0 deletions helper_image/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Helper Image

This image is used to perform extra setup and verification tasks for the
integration test. This image contains scripts that can be run to perform these
actions.
2 changes: 0 additions & 2 deletions remora/scripts/copy-in.sh → helper_image/scripts/copy-in.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,6 @@ fi
# List the basedir contents
ls -alhR "$ROOT_DIR"/"$BASEDIR" #&>"$ROOT_DIR"/"$MYUSER"/test.output

# TODO md5sum

set +e

exit 0
31 changes: 31 additions & 0 deletions helper_image/scripts/copy-out.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash

function usage() {
echo
echo "Verify the md5sums of two files match"
echo "This is used after copy_out directives to ensure the copy_in file matches the copy_out file"
echo
echo "Syntax: copy-out.sh FILE_IN FILE_OUT"
echo
echo "examples:"
echo "copy-out.sh testuser/test.in testuser/test.out"
echo
}

COPY_IN=$1
COPY_OUT=$2

if [[ -z "$COPY_IN" ]]; then
usage
exit 1
elif [[ -z "$COPY_OUT" ]]; then
usage
exit 1
fi

if [[ "$(md5sum "$COPY_IN")" = "$(md5sum "$COPY_OUT")" ]]; then
exit 0
else
echo "md5sums do not match"
exit 1
fi
File renamed without changes.
12 changes: 6 additions & 6 deletions int_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ var tests = []*T{
MakeTest("XFS", "#DW jobdw type=xfs name=xfs capacity=1TB").WithLabels(Simple),
MakeTest("GFS2", "#DW jobdw type=gfs2 name=gfs2 capacity=1TB").WithLabels(Simple),
MakeTest("Lustre", "#DW jobdw type=lustre name=lustre capacity=1TB").WithLabels(Simple),
MakeTest("Raw", "#DW jobdw type=raw name=raw capacity=1TB").WithLabels(Simple),

DuplicateTest(
MakeTest("XFS", "#DW jobdw type=xfs name=xfs capacity=1TB").Pending(), // Will fail for Setup() exceeding time limit; needs investigation
Expand All @@ -75,12 +76,11 @@ var tests = []*T{
// Data Movement
MakeTest("XFS with Data Movement",
"#DW jobdw type=xfs name=xfs-data-movement capacity=1TB",
"#DW copy_in source=/lus/global/test.in destination=$DW_JOB_xfs-data-movement/", // TODO: Create a file "test.in" in the global lustre directory
"#DW copy_out source=$DW_JOB_xfs-data-movement/test.out destination=/lus/global/"). // TODO: Validate file "test.out" in the global lustre directory
WithPersistentLustre("xfs-data-movement-lustre-instance"). // Manage a persistent Lustre instance as part of the test
WithGlobalLustreFromPersistentLustre("global", nil).
Serialized().
Pending(),
"#DW copy_in source=/lus/global/testuser/test.in destination=$DW_JOB_xfs-data-movement/",
"#DW copy_out source=$DW_JOB_xfs-data-movement/test.in destination=/lus/global/testuser/test.out").
WithPersistentLustre("xfs-data-movement-lustre-instance").
WithGlobalLustreFromPersistentLustre("global", []string{"default"}).
Serialized(),

// Containers - MPI
MakeTest("GFS2 with MPI Containers",
Expand Down
6 changes: 6 additions & 0 deletions internal/internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ type T struct {

// Options let you modify the test case with a variety of options and customizations
options TOptions

// Helper pods that run during a test to do extra work (e.g. data
// movement). These need to be cleaned up and tracked.
helperPods []*corev1.Pod
}

func MakeTest(name string, directives ...string) *T {
Expand Down Expand Up @@ -124,6 +128,8 @@ func MakeTest(name string, directives ...string) *T {
},
}

t.helperPods = make([]*corev1.Pod, 0)

return t
}

Expand Down
17 changes: 17 additions & 0 deletions internal/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,13 @@ func (t *T) Prepare(ctx context.Context, k8sClient client.Client) error {

By(fmt.Sprintf("Creating a global lustre file system '%s' @ '%s'", client.ObjectKeyFromObject(lustre), lustre.Spec.MountRoot))
Expect(k8sClient.Create(ctx, lustre)).To(Succeed())

// For our testing purposes, a copy_in directive assumes global lustre.
// With this set, the source path will be created on the global lustre
// filesystem
if len(o.globalLustre.in) > 0 {
SetupCopyIn(ctx, k8sClient, t, t.options)
}
}

return nil
Expand All @@ -422,6 +429,16 @@ func (t *T) Prepare(ctx context.Context, k8sClient client.Client) error {
func (t *T) Cleanup(ctx context.Context, k8sClient client.Client) error {
o := t.options

// Remove any helper pods that may have been used (e.g. copy_in, copy_out)
if len(t.helperPods) > 0 {
CleanupHelperPods(ctx, k8sClient, t)
}

// TODO: If a real lustre filesystem is used rather than persistent, we
// should fire up another helper pod to delete copy_in/copy_out files (i.e.)
// to.globalLustre.in/out. In the meantime, it is assumed the global lustre
// is torn down.

if o.globalLustre != nil {
By(fmt.Sprintf("Deleting global lustre '%s'", o.globalLustre.name))
lustre := &lusv1alpha1.LustreFileSystem{
Expand Down
9 changes: 6 additions & 3 deletions internal/states.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,7 @@ func (t *T) proposal(ctx context.Context, k8sClient client.Client, workflow *dws

func (t *T) setup(ctx context.Context, k8sClient client.Client, workflow *dwsv1alpha2.Workflow) {

// TODO: Move this to a global variable and initialized in the test suite.
systemConfig := &dwsv1alpha2.SystemConfiguration{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "default", Namespace: corev1.NamespaceDefault}, systemConfig)).To(Succeed())
systemConfig := GetSystemConfiguraton(ctx, k8sClient)

By("Assigns Computes")
{
Expand Down Expand Up @@ -186,6 +184,11 @@ func (t *T) postRun(ctx context.Context, k8sClient client.Client, workflow *dwsv

func (t *T) dataOut(ctx context.Context, k8sClient client.Client, workflow *dwsv1alpha2.Workflow) {
t.AdvanceStateAndWaitForReady(ctx, k8sClient, workflow, dwsv1alpha2.StateDataOut)

// If copy_out directive was set, verify that the copy_in file matches the copy_out file on global lustre
if t.options.globalLustre != nil && len(t.options.globalLustre.out) > 0 {
VerifyCopyOut(ctx, k8sClient, t, t.options)
}
}

func (t *T) teardown(ctx context.Context, k8sClient client.Client, workflow *dwsv1alpha2.Workflow) {
Expand Down
124 changes: 124 additions & 0 deletions internal/utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package internal

import (
"context"
"fmt"
"strings"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

dwsv1alpha2 "github.com/DataWorkflowServices/dws/api/v1alpha2"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)

func GetSystemConfiguraton(ctx context.Context, k8sClient client.Client) *dwsv1alpha2.SystemConfiguration {
// TODO: Move this to a global variable and initialized in the test suite.
// Note that putting the GET in Prepare will not work for things like
// WithPersistentLustre() since those options run new MakeTests and do not
// run prepare.

systemConfig := &dwsv1alpha2.SystemConfiguration{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "default", Namespace: corev1.NamespaceDefault}, systemConfig)).To(Succeed())

// Except there to be at least 1 compute and storage node
Expect(systemConfig.Spec.ComputeNodes).ToNot(HaveLen(0))
Expect(systemConfig.Spec.StorageNodes).ToNot(HaveLen(0))

return systemConfig
}

// Start up a pod that accesses the global lustre filesystem and creates a file
// in the location specified by the copy_in directive.
func SetupCopyIn(ctx context.Context, k8sClient client.Client, t *T, o TOptions) {

// remove global lustre filePath/ from the filepath
filePath := strings.Replace(o.globalLustre.in, o.globalLustre.mountRoot+"/", "", 1)

By("Starting copy-in pod and placing file(s) on global lustre")
runHelperPod(ctx, k8sClient, t, "copy-in", "/copy_in.sh", []string{
o.globalLustre.mountRoot,
filePath,
fmt.Sprintf("%d", t.workflow.Spec.UserID),
fmt.Sprintf("%d", t.workflow.Spec.GroupID),
})
}

// Start up a pod that accesses the global lustre filesystem and verifies that
// the files specified by the copy_in and copy_out directives match.
func VerifyCopyOut(ctx context.Context, k8sClient client.Client, t *T, o TOptions) {
lus := t.options.globalLustre

By("Starting copy-out pod and verifying copy out")
runHelperPod(ctx, k8sClient, t, "copy-out", "/copy_out.sh", []string{
fmt.Sprintf("%s/%s", lus.mountRoot, lus.in),
fmt.Sprintf("%s/%s", lus.mountRoot, lus.out),
})
}

// Start up a pod with the given command/args and verify that it runs to completion
func runHelperPod(ctx context.Context, k8sClient client.Client, t *T, name, command string, args []string) {
systemConfig := GetSystemConfiguraton(ctx, k8sClient)
rabbitName := systemConfig.Spec.StorageNodes[0].Name
o := t.options

pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: t.workflow.Name + "-" + name,
Namespace: t.workflow.Namespace,
},
Spec: corev1.PodSpec{
RestartPolicy: corev1.RestartPolicyNever,
NodeName: rabbitName,
Containers: []corev1.Container{{
Name: name,
Image: "ghcr.io/nearnodeflash/nnf-integration-test-helper:copy-in-copy-out",
ImagePullPolicy: corev1.PullAlways,
Command: []string{
command,
},
Args: args,
VolumeMounts: []corev1.VolumeMount{{
Name: o.globalLustre.name,
MountPath: o.globalLustre.mountRoot,
}},
}},
Volumes: []corev1.Volume{{
Name: o.globalLustre.name,
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: fmt.Sprintf("%s-%s-readwritemany-pvc",
o.globalLustre.name, t.workflow.Namespace),
},
},
}},
},
}

dwsv1alpha2.InheritParentLabels(pod, t.workflow)
dwsv1alpha2.AddOwnerLabels(pod, t.workflow)
dwsv1alpha2.AddWorkflowLabels(pod, t.workflow)

Expect(k8sClient.Create(ctx, pod)).To(Succeed())
t.helperPods = append(t.helperPods, pod)

// Wait for successful completion
Eventually(func(g Gomega) bool {
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(pod), pod)).To(Succeed())
return pod.Status.Phase == corev1.PodSucceeded
}).WithTimeout(time.Minute).WithPolling(time.Second).Should(BeTrue())
}

func CleanupHelperPods(ctx context.Context, k8sClient client.Client, t *T) {
for _, p := range t.helperPods {
By(fmt.Sprintf("Deleting helper pod %s", p.Name))
Expect(k8sClient.Delete(ctx, p)).To(Succeed())
Eventually(func() error {
return k8sClient.Get(ctx, client.ObjectKeyFromObject(p), p)
}).ShouldNot(Succeed(), "helper pod should delete")
}
}
11 changes: 0 additions & 11 deletions remora/scripts/copy-out.sh

This file was deleted.

0 comments on commit 723b56c

Please sign in to comment.