Skip to content

Commit

Permalink
Made Pre/PostRunTimeSeconds a pointer to allow for disabling and defa…
Browse files Browse the repository at this point in the history
…ulting

Signed-off-by: Blake Devcich <[email protected]>
  • Loading branch information
bdevcich committed Sep 6, 2023
1 parent 28cc5b8 commit b34229b
Show file tree
Hide file tree
Showing 15 changed files with 588 additions and 23 deletions.
4 changes: 1 addition & 3 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,10 @@
"mode": "test",
"program": "${relativeFileDirname}",
"args": [
"-v=4",
"-ginkgo.v",
"-ginkgo.progress"
],
"env": {
"KUBEBUILDER_ASSETS": "${workspaceFolder}/bin/k8s/1.25.0-darwin-amd64",
"KUBEBUILDER_ASSETS": "${workspaceFolder}/bin/k8s/1.26.0-darwin-amd64",
"GOMEGA_DEFAULT_EVENTUALLY_TIMEOUT": "10m",
"GOMEGA_DEFAULT_EVENTUALLY_POLLING_INTERVAL": "100ms"
},
Expand Down
9 changes: 5 additions & 4 deletions api/v1alpha1/nnfcontainerprofile_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,17 @@ type NnfContainerProfileData struct {

// Containers are launched in the PreRun state. Allow this many seconds for the containers to
// start before declaring an error to the workflow.
// Defaults to 60. A value of 0 disables this behavior.
// Defaults to 60 if not set. A value of 0 disables this behavior.
// +kubebuilder:default:=60
// +kubebuilder:validation:Minimum:=0
PreRunTimeoutSeconds int64 `json:"preRunTimeoutSeconds,omitempty"`
PreRunTimeoutSeconds *int64 `json:"preRunTimeoutSeconds,omitempty"`

// Containers are expected to complete in the PostRun State. Allow this many seconds for the
// containers to exit before declaring an error the workflow.
// Defaults to 0. A value of 0 disables this behavior.
// Defaults to 60 if not set. A value of 0 disables this behavior.
// +kubebuilder:default:=60
// +kubebuilder:validation:Minimum:=0
PostRunTimeoutSeconds int64 `json:"postRunTimeoutSeconds,omitempty"`
PostRunTimeoutSeconds *int64 `json:"postRunTimeoutSeconds,omitempty"`

// Specifies the number of times a container will be retried upon a failure. A new pod is
// deployed on each retry. Defaults to 6 by kubernetes itself and must be set. A value of 0
Expand Down
17 changes: 12 additions & 5 deletions api/v1alpha1/nnfcontainerprofile_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,14 @@ func (r *NnfContainerProfile) validateContent() error {
}

if mpiJob {
// PostRunTimeoutSeconds will update the Jobs' ActiveDeadlineSeconds once Postrun starts, so we can't set them both
if r.Data.MPISpec.RunPolicy.ActiveDeadlineSeconds != nil && r.Data.PostRunTimeoutSeconds > 0 {
// PreRunTimeoutSeconds will update the Jobs' ActiveDeadlineSeconds once PreRun timeout occurs, so we can't set them both
if r.Data.MPISpec.RunPolicy.ActiveDeadlineSeconds != nil && r.Data.PreRunTimeoutSeconds != nil && *r.Data.PreRunTimeoutSeconds > 0 {
return fmt.Errorf("both PreRunTimeoutSeconds and MPISpec.RunPolicy.ActiveDeadlineSeconds are provided - only 1 can be set")
}
// PostRunTimeoutSeconds will update the Jobs' ActiveDeadlineSeconds once PostRun starts, so we can't set them both
if r.Data.MPISpec.RunPolicy.ActiveDeadlineSeconds != nil && r.Data.PostRunTimeoutSeconds != nil && *r.Data.PostRunTimeoutSeconds > 0 {
return fmt.Errorf("both PostRunTimeoutSeconds and MPISpec.RunPolicy.ActiveDeadlineSeconds are provided - only 1 can be set")
}

// Don't allow users to set the backoff limit directly
if r.Data.MPISpec.RunPolicy.BackoffLimit != nil && r.Data.RetryLimit > 0 {
return fmt.Errorf("MPISpec.RunPolicy.BackoffLimit is set. Use RetryLimit instead")
Expand All @@ -130,8 +133,12 @@ func (r *NnfContainerProfile) validateContent() error {
return fmt.Errorf("MPISpec.MPIReplicaSpecs.Worker must be present with at least 1 container defined")
}
} else {
// PostRunTimeoutSeconds will update the Jobs' ActiveDeadlineSeconds once Postrun starts, so we can't set them both
if r.Data.Spec.ActiveDeadlineSeconds != nil && r.Data.PostRunTimeoutSeconds > 0 {
// PreRunTimeoutSeconds will update the Jobs' ActiveDeadlineSeconds once PreRun timeout occurs, so we can't set them both
if r.Data.Spec.ActiveDeadlineSeconds != nil && r.Data.PreRunTimeoutSeconds != nil && *r.Data.PreRunTimeoutSeconds > 0 {
return fmt.Errorf("both PreRunTimeoutSeconds and Spec.ActiveDeadlineSeconds are provided - only 1 can be set")
}
// PostRunTimeoutSeconds will update the Jobs' ActiveDeadlineSeconds once PostRun starts, so we can't set them both
if r.Data.Spec.ActiveDeadlineSeconds != nil && r.Data.PostRunTimeoutSeconds != nil && *r.Data.PostRunTimeoutSeconds > 0 {
return fmt.Errorf("both PostRunTimeoutSeconds and Spec.ActiveDeadlineSeconds are provided - only 1 can be set")
}

Expand Down
78 changes: 74 additions & 4 deletions api/v1alpha1/nnfcontainerprofile_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
mpiv2beta1 "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"go.openly.dev/pointy"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -120,7 +121,7 @@ var _ = Describe("NnfContainerProfile Webhook", func() {
})

It("Should not allow a negative postRunTimeoutSeconds", func() {
nnfProfile.Data.PostRunTimeoutSeconds = -1
nnfProfile.Data.PostRunTimeoutSeconds = pointy.Int64(-1)
Expect(k8sClient.Create(context.TODO(), nnfProfile)).ToNot(Succeed())
nnfProfile = nil
})
Expand Down Expand Up @@ -207,12 +208,60 @@ var _ = Describe("NnfContainerProfile Webhook", func() {
nnfProfile = nil
})

DescribeTable("Should allow a user to set PreRunTimeoutSeconds",

func(timeout, expected *int64, succeed bool) {
nnfProfile.Data.Spec = &corev1.PodSpec{Containers: []corev1.Container{
{Name: "test", Image: "alpine:latest"},
}}
nnfProfile.Data.MPISpec = nil

nnfProfile.Data.PreRunTimeoutSeconds = timeout
if succeed {
Expect(k8sClient.Create(context.TODO(), nnfProfile)).To(Succeed())
Expect(nnfProfile.Data.PreRunTimeoutSeconds).To(Equal(expected))
} else {
Expect(k8sClient.Create(context.TODO(), nnfProfile)).ToNot(Succeed())
nnfProfile = nil
}

},
Entry("to 0", pointy.Int64(0), pointy.Int64(0), true),
Entry("to 45", pointy.Int64(45), pointy.Int64(45), true),
Entry("to nil and get the default(60)", nil, pointy.Int64(60), true),
Entry("to -1 and fail", pointy.Int64(-1), nil, false),
)

DescribeTable("Should allow a user to set PostRunTimeoutSeconds",

func(timeout, expected *int64, succeed bool) {
nnfProfile.Data.Spec = &corev1.PodSpec{Containers: []corev1.Container{
{Name: "test", Image: "alpine:latest"},
}}
nnfProfile.Data.MPISpec = nil

nnfProfile.Data.PostRunTimeoutSeconds = timeout
if succeed {
Expect(k8sClient.Create(context.TODO(), nnfProfile)).To(Succeed())
Expect(nnfProfile.Data.PostRunTimeoutSeconds).To(Equal(expected))
} else {
Expect(k8sClient.Create(context.TODO(), nnfProfile)).ToNot(Succeed())
nnfProfile = nil
}

},
Entry("to 0", pointy.Int64(0), pointy.Int64(0), true),
Entry("to 45", pointy.Int64(45), pointy.Int64(45), true),
Entry("to nil and get the default(60)", nil, pointy.Int64(60), true),
Entry("to -1 and fail", pointy.Int64(-1), nil, false),
)

It("Should not allow setting both PostRunTimeoutSeconds and MPISpec.RunPolicy.ActiveDeadlineSeconds", func() {
nnfProfile.Data.Spec = nil
nnfProfile.Data.MPISpec = &mpiv2beta1.MPIJobSpec{}

timeout := int64(10)
nnfProfile.Data.PostRunTimeoutSeconds = timeout
nnfProfile.Data.PostRunTimeoutSeconds = &timeout
nnfProfile.Data.MPISpec.RunPolicy.ActiveDeadlineSeconds = &timeout

Expect(k8sClient.Create(context.TODO(), nnfProfile)).ToNot(Succeed())
Expand All @@ -221,7 +270,28 @@ var _ = Describe("NnfContainerProfile Webhook", func() {

It("Should not allow setting both PostRunTimeoutSeconds and Spec.ActiveDeadlineSeconds", func() {
timeout := int64(10)
nnfProfile.Data.PostRunTimeoutSeconds = timeout
nnfProfile.Data.PostRunTimeoutSeconds = &timeout
nnfProfile.Data.Spec.ActiveDeadlineSeconds = &timeout

Expect(k8sClient.Create(context.TODO(), nnfProfile)).ToNot(Succeed())
nnfProfile = nil
})

It("Should not allow setting both PreRunTimeoutSeconds and MPISpec.RunPolicy.ActiveDeadlineSeconds", func() {
nnfProfile.Data.Spec = nil
nnfProfile.Data.MPISpec = &mpiv2beta1.MPIJobSpec{}

timeout := int64(10)
nnfProfile.Data.PreRunTimeoutSeconds = &timeout
nnfProfile.Data.MPISpec.RunPolicy.ActiveDeadlineSeconds = &timeout

Expect(k8sClient.Create(context.TODO(), nnfProfile)).ToNot(Succeed())
nnfProfile = nil
})

It("Should not allow setting both PreRunTimeoutSeconds and Spec.ActiveDeadlineSeconds", func() {
timeout := int64(10)
nnfProfile.Data.PreRunTimeoutSeconds = &timeout
nnfProfile.Data.Spec.ActiveDeadlineSeconds = &timeout

Expect(k8sClient.Create(context.TODO(), nnfProfile)).ToNot(Succeed())
Expand All @@ -240,7 +310,7 @@ var _ = Describe("NnfContainerProfile Webhook", func() {
})

It("Should allow a zero postRunTimeoutSeconds", func() {
nnfProfile.Data.PostRunTimeoutSeconds = 0
nnfProfile.Data.PostRunTimeoutSeconds = pointy.Int64(0)
Expect(k8sClient.Create(context.TODO(), nnfProfile)).To(Succeed())
})

Expand Down
10 changes: 10 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions config/crd/bases/nnf.cray.hpe.com_nnfcontainerprofiles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8629,18 +8629,20 @@ spec:
description: Pinned is true if this instance is an immutable copy
type: boolean
postRunTimeoutSeconds:
default: 60
description: Containers are expected to complete in the PostRun State.
Allow this many seconds for the containers to exit before declaring
an error the workflow. Defaults to 0. A value of 0 disables this
behavior.
an error the workflow. Defaults to 60 if not set. A value of 0 disables
this behavior.
format: int64
minimum: 0
type: integer
preRunTimeoutSeconds:
default: 60
description: Containers are launched in the PreRun state. Allow this
many seconds for the containers to start before declaring an error
to the workflow. Defaults to 60. A value of 0 disables this behavior.
to the workflow. Defaults to 60 if not set. A value of 0 disables
this behavior.
format: int64
minimum: 0
type: integer
Expand Down
15 changes: 12 additions & 3 deletions controllers/nnf_workflow_controller_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -1334,7 +1334,10 @@ func (r *NnfWorkflowReconciler) waitForContainersToStart(ctx context.Context, wo
// timeout is hit. This needs to be handled slightly differently depending on if the job is MPI
// or not. Once set, k8s will take care of stopping the pods for us.
timeoutElapsed := false
timeout := time.Duration(profile.Data.PreRunTimeoutSeconds) * time.Second
timeout := time.Duration(0)
if profile.Data.PreRunTimeoutSeconds != nil {
timeout = time.Duration(*profile.Data.PreRunTimeoutSeconds) * time.Second
}
timeoutMessage := fmt.Sprintf("user container(s) failed to start after %d seconds", int(timeout.Seconds()))

// Check if PreRunTimeoutSeconds has elapsed and set the flag. The logic will check once more to
Expand Down Expand Up @@ -1569,7 +1572,10 @@ func (r *NnfWorkflowReconciler) waitForContainersToFinish(ctx context.Context, w
}
isMPIJob := profile.Data.MPISpec != nil

timeout := time.Duration(profile.Data.PostRunTimeoutSeconds) * time.Second
timeout := time.Duration(0)
if profile.Data.PostRunTimeoutSeconds != nil {
timeout = time.Duration(*profile.Data.PostRunTimeoutSeconds) * time.Second
}

if isMPIJob {
// We should expect at least 2 conditions: created and running
Expand Down Expand Up @@ -1627,7 +1633,10 @@ func (r *NnfWorkflowReconciler) checkContainersResults(ctx context.Context, work
}
isMPIJob := profile.Data.MPISpec != nil

timeout := time.Duration(profile.Data.PostRunTimeoutSeconds) * time.Second
timeout := time.Duration(0)
if profile.Data.PostRunTimeoutSeconds != nil {
timeout = time.Duration(*profile.Data.PostRunTimeoutSeconds) * time.Second
}
timeoutMessage := fmt.Sprintf("user container(s) failed to complete after %d seconds", int(timeout.Seconds()))

if isMPIJob {
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ require (
github.com/onsi/ginkgo/v2 v2.9.1
github.com/onsi/gomega v1.27.3
github.com/prometheus/client_golang v1.14.0
go.openly.dev/pointy v1.3.0
go.uber.org/zap v1.24.0
golang.org/x/sync v0.1.0
k8s.io/api v0.26.1
Expand Down
4 changes: 3 additions & 1 deletion go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -233,8 +233,8 @@ github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU=
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ=
Expand All @@ -247,6 +247,8 @@ go.chromium.org/luci v0.0.0-20230227223707-c4460eb434d8/go.mod h1:vTpW7gzqLQ9mhM
go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk=
go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
go.openly.dev/pointy v1.3.0 h1:keht3ObkbDNdY8PWPwB7Kcqk+MAlNStk5kXZTxukE68=
go.openly.dev/pointy v1.3.0/go.mod h1:rccSKiQDQ2QkNfSVT2KG8Budnfhf3At8IWxy/3ElYes=
go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
Expand Down
12 changes: 12 additions & 0 deletions vendor/go.openly.dev/pointy/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, build with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out
21 changes: 21 additions & 0 deletions vendor/go.openly.dev/pointy/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 Mateusz Wielbut

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Loading

0 comments on commit b34229b

Please sign in to comment.