From 8fe8de2e16ec39a5477df17586a3d212ec63a4bd Mon Sep 17 00:00:00 2001 From: Isitha Subasinghe Date: Tue, 29 Oct 2024 10:19:47 +1100 Subject: [PATCH] fix: mark taskresult complete when failed or error. Fixes #12993, Fixes #13533 (#13798) Signed-off-by: isubasinghe --- docs/environment-variables.md | 2 +- pkg/apis/workflow/v1alpha1/workflow_types.go | 5 ----- workflow/controller/taskresult.go | 10 +++++----- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 42415fd86637..749be582146a 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -45,7 +45,7 @@ most users. Environment variables may be removed at any time. | `OPERATION_DURATION_METRIC_BUCKET_COUNT` | `int` | `6` | The number of buckets to collect the metric for the operation duration. | | `POD_NAMES` | `string` | `v2` | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server. | | `RECENTLY_STARTED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently started. | -| `RECENTLY_DELETED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently deleted. | +| `RECENTLY_DELETED_POD_DURATION` | `time.Duration` | `2m` | The duration of a pod before the pod is considered to be recently deleted. | | `RETRY_BACKOFF_DURATION` | `time.Duration` | `10ms` | The retry back-off duration when retrying API calls. | | `RETRY_BACKOFF_FACTOR` | `float` | `2.0` | The retry back-off factor when retrying API calls. | | `RETRY_BACKOFF_STEPS` | `int` | `5` | The retry back-off steps when retrying API calls. | diff --git a/pkg/apis/workflow/v1alpha1/workflow_types.go b/pkg/apis/workflow/v1alpha1/workflow_types.go index 64cd15814cb7..8d16c9fc3a34 100644 --- a/pkg/apis/workflow/v1alpha1/workflow_types.go +++ b/pkg/apis/workflow/v1alpha1/workflow_types.go @@ -2418,11 +2418,6 @@ func (n NodeStatus) IsExitNode() bool { return strings.HasSuffix(n.DisplayName, ".onExit") } -// IsPodDeleted returns whether node is error with pod deleted. -func (n NodeStatus) IsPodDeleted() bool { - return n.Phase == NodeError && n.Message == "pod deleted" -} - func (n NodeStatus) Succeeded() bool { return n.Phase == NodeSucceeded } diff --git a/workflow/controller/taskresult.go b/workflow/controller/taskresult.go index 2c1f7a2f0b85..a76a01e7eac9 100644 --- a/workflow/controller/taskresult.go +++ b/workflow/controller/taskresult.go @@ -54,7 +54,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer() cache.SharedIndex } func recentlyDeleted(node *wfv1.NodeStatus) bool { - return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 10*time.Second) + return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute) } func (woc *wfOperationCtx) taskResultReconciliation() { @@ -83,8 +83,9 @@ func (woc *wfOperationCtx) taskResultReconciliation() { if err != nil { continue } + // Mark task result as completed if it has no chance to be completed. - if label == "false" && old.IsPodDeleted() { + if label == "false" && old.Completed() && !woc.nodePodExist(*old) { if recentlyDeleted(old) { woc.log.WithField("nodeID", nodeID).Debug("Wait for marking task result as completed because pod is recently deleted.") // If the pod was deleted, then it is possible that the controller never get another informer message about it. @@ -92,10 +93,9 @@ func (woc *wfOperationCtx) taskResultReconciliation() { // workflow will not update for 20m. Requeuing here prevents that happening. woc.requeue() continue - } else { - woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.") - woc.wf.Status.MarkTaskResultComplete(nodeID) } + woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.") + woc.wf.Status.MarkTaskResultComplete(nodeID) } newNode := old.DeepCopy() if result.Outputs.HasOutputs() {