Skip to content

Commit

Permalink
Containers: Added PreRunTimeoutSeconds + Error Exit Strategy
Browse files Browse the repository at this point in the history
- Adds a timeout to Prerun and errors out if the containers do not start
- Moved timeout anonymous functions to be normal functions
- Fixed some issues with the example profiles for testing + added
  example-mpi-fail
- Improved error handling in Postrun

Signed-off-by: Blake Devcich <[email protected]>
  • Loading branch information
bdevcich committed Sep 1, 2023
1 parent f3eb87a commit 28cc5b8
Show file tree
Hide file tree
Showing 4 changed files with 262 additions and 126 deletions.
14 changes: 11 additions & 3 deletions api/v1alpha1/nnfcontainerprofile_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,21 @@ type NnfContainerProfileData struct {
// List of possible filesystems supported by this container profile
Storages []NnfContainerProfileStorage `json:"storages,omitempty"`

// Stop any containers after X seconds once a workflow has transitioned to PostRun. Defaults to
// 0. A value of 0 disables this behavior.
// Containers are launched in the PreRun state. Allow this many seconds for the containers to
// start before declaring an error to the workflow.
// Defaults to 60. A value of 0 disables this behavior.
// +kubebuilder:default:=60
// +kubebuilder:validation:Minimum:=0
PreRunTimeoutSeconds int64 `json:"preRunTimeoutSeconds,omitempty"`

// Containers are expected to complete in the PostRun State. Allow this many seconds for the
// containers to exit before declaring an error to the workflow.
// Defaults to 0. A value of 0 disables this behavior.
// +kubebuilder:validation:Minimum:=0
PostRunTimeoutSeconds int64 `json:"postRunTimeoutSeconds,omitempty"`

// Specifies the number of times a container will be retried upon a failure. A new pod is
// deployed on each retry. Defaults to 6 by kubernetes itself and must be set. A value of 0
// deployed on each retry. Defaults to 6 by kubernetes itself and must be set. A value of 0
// disables retries.
// +kubebuilder:validation:Minimum:=0
// +kubebuilder:default:=6
Expand Down
15 changes: 12 additions & 3 deletions config/crd/bases/nnf.cray.hpe.com_nnfcontainerprofiles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8629,16 +8629,25 @@ spec:
description: Pinned is true if this instance is an immutable copy
type: boolean
postRunTimeoutSeconds:
description: Stop any containers after X seconds once a workflow has
transitioned to PostRun. Defaults to 0. A value of 0 disables this
description: Containers are expected to complete in the PostRun State.
Allow this many seconds for the containers to exit before declaring
an error to the workflow. Defaults to 0. A value of 0 disables this
behavior.
format: int64
minimum: 0
type: integer
preRunTimeoutSeconds:
default: 60
description: Containers are launched in the PreRun state. Allow this
many seconds for the containers to start before declaring an error
to the workflow. Defaults to 60. A value of 0 disables this behavior.
format: int64
minimum: 0
type: integer
retryLimit:
default: 6
description: Specifies the number of times a container will be retried
upon a failure. A new pod is deployed on each retry. Defaults to
upon a failure. A new pod is deployed on each retry. Defaults to
6 by kubernetes itself and must be set. A value of 0 disables retries.
format: int32
minimum: 0
Expand Down
180 changes: 112 additions & 68 deletions config/examples/nnf_v1alpha1_nnfcontainerprofiles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,34 @@ metadata:
data:
retryLimit: 6
storages:
- name: DW_JOB_foo_local_storage
optional: false
- name: DW_PERSISTENT_foo_persistent_storage
optional: true
- name: DW_JOB_foo_local_storage
optional: false
- name: DW_PERSISTENT_foo_persistent_storage
optional: true
- name: DW_GLOBAL_foo_global_lustre
optional: true
spec:
containers:
- name: example-success
image: alpine:latest
command:
- /bin/sh
- -c
- "sleep 15 && exit 0"
- name: example-success
image: alpine:latest
command:
- /bin/sh
- -c
- "sleep 10 && exit 0"
---
apiVersion: nnf.cray.hpe.com/v1alpha1
kind: NnfContainerProfile
metadata:
name: example-fail
data:
spec:
containers:
- name: example-fail
image: alpine:latest
command:
- /bin/sh
- -c
- "sleep 10 && exit 1"
---
apiVersion: nnf.cray.hpe.com/v1alpha1
kind: NnfContainerProfile
Expand All @@ -25,23 +41,23 @@ metadata:
data:
retryLimit: 6
storages:
- name: DW_JOB_foo_local_storage
optional: false
- name: DW_PERSISTENT_foo_persistent_storage
optional: true
- name: DW_JOB_foo_local_storage
optional: false
- name: DW_PERSISTENT_foo_persistent_storage
optional: true
spec:
containers:
- name: example-randomly-fail
image: alpine:latest
command:
- /bin/sh
- -c
- |
echo "starting..."
sleep 30
x=$(($RANDOM % 2))
echo "exiting: $x"
exit $x
- name: example-randomly-fail
image: alpine:latest
command:
- /bin/sh
- -c
- |
echo "starting..."
sleep 10
x=$(($RANDOM % 2))
echo "exiting: $x"
exit $x
---
apiVersion: nnf.cray.hpe.com/v1alpha1
kind: NnfContainerProfile
Expand All @@ -50,18 +66,18 @@ metadata:
data:
retryLimit: 6
storages:
- name: DW_JOB_foo_local_storage
optional: false
- name: DW_PERSISTENT_foo_persistent_storage
optional: true
- name: DW_JOB_foo_local_storage
optional: false
- name: DW_PERSISTENT_foo_persistent_storage
optional: true
spec:
containers:
- name: example-forever
image: alpine:latest
command:
- /bin/sh
- -c
- "while true; do date && sleep 5; done"
- name: example-forever
image: alpine:latest
command:
- /bin/sh
- -c
- "while true; do date && sleep 5; done"
---
apiVersion: nnf.cray.hpe.com/v1alpha1
kind: NnfContainerProfile
Expand All @@ -71,13 +87,13 @@ data:
retryLimit: 6
numPorts: 1
storages:
- name: DW_JOB_foo_local_storage
optional: false
- name: DW_PERSISTENT_foo_persistent_storage
optional: true
- name: DW_GLOBAL_foo_global_lustre
optional: true
pvcMode: ReadWriteMany
- name: DW_JOB_foo_local_storage
optional: false
- name: DW_PERSISTENT_foo_persistent_storage
optional: true
- name: DW_GLOBAL_foo_global_lustre
optional: true
pvcMode: ReadWriteMany
mpiSpec:
runPolicy:
cleanPodPolicy: Running
Expand All @@ -86,36 +102,64 @@ data:
template:
spec:
containers:
- name: example-mpi
image: nnf-mfu:latest
command:
- mpirun
- dcmp
- "$(DW_JOB_foo_local_storage)/0"
- "$(DW_JOB_foo_local_storage)/1"
- name: example-mpi
image: nnf-mfu:latest
command:
- mpirun
- dcmp
- "$(DW_JOB_foo_local_storage)/0"
- "$(DW_JOB_foo_local_storage)/1"
Worker:
template:
spec:
containers:
- name: example-mpi
image: nnf-mfu:latest
- name: example-mpi
image: nnf-mfu:latest

---
apiVersion: nnf.cray.hpe.com/v1alpha1
kind: NnfContainerProfile
metadata:
name: example-mpi-fail
data:
numPorts: 1
mpiSpec:
runPolicy:
cleanPodPolicy: Running
mpiReplicaSpecs:
Launcher:
template:
spec:
containers:
- name: example-mpi-fail
image: nnf-mfu:latest
command:
- mpirun
- /bin/sh
- -c
- "sleep 10 && exit 1"
Worker:
template:
spec:
containers:
- name: example-mpi-fail
image: nnf-mfu:latest
---
apiVersion: nnf.cray.hpe.com/v1alpha1
kind: NnfContainerProfile
metadata:
name: example-mpi-webserver
data:
retryLimit: 6
numPorts: 1
storages:
- name: DW_JOB_foo_local_storage
optional: false
- name: DW_PERSISTENT_foo_persistent_storage
optional: true
- name: DW_GLOBAL_foo_global_lustre
optional: true
pvcMode: ReadWriteMany
- name: DW_JOB_foo_local_storage
optional: false
- name: DW_PERSISTENT_foo_persistent_storage
optional: true
- name: DW_GLOBAL_foo_global_lustre
optional: true
pvcMode: ReadWriteMany
mpiSpec:
runPolicy:
cleanPodPolicy: Running
Expand All @@ -124,17 +168,17 @@ data:
template:
spec:
containers:
- name: example-mpi-webserver
image: ghcr.io/nearnodeflash/nnf-container-example:latest
command:
- mpirun
- python3
- -m
- http.server
- $(NNF_CONTAINER_PORTS)
- name: example-mpi-webserver
image: ghcr.io/nearnodeflash/nnf-container-example:latest
command:
- mpirun
- python3
- -m
- http.server
- $(NNF_CONTAINER_PORTS)
Worker:
template:
spec:
containers:
- name: example-mpi-webserver
image: ghcr.io/nearnodeflash/nnf-container-example:latest
- name: example-mpi-webserver
image: ghcr.io/nearnodeflash/nnf-container-example:latest
Loading

0 comments on commit 28cc5b8

Please sign in to comment.