From c6f8cf1bb34b5f573af7a672b63dc9bf482b2635 Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Thu, 12 Dec 2024 15:35:32 +0100 Subject: [PATCH 1/4] OCTRL-951 [core][executor] improve reporting error transition I rearranged two logs correlated to task going to ERROR, with one producing easily-digestible message for an operator. --- core/workflow/taskrole.go | 15 ++++++++++----- executor/executable/controllabletask.go | 3 ++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/core/workflow/taskrole.go b/core/workflow/taskrole.go index bbeaf3e3..fabfde9d 100644 --- a/core/workflow/taskrole.go +++ b/core/workflow/taskrole.go @@ -257,11 +257,16 @@ func (t *taskRole) updateState(s sm.State) { EnvironmentId: t.GetEnvironmentId().String(), }) if t.state.get() == sm.ERROR { - log.WithField("partition", t.GetEnvironmentId().String()). - WithField("level", infologger.IL_Support). - WithField("role", t.Name). - WithField("critical", t.Critical). - Error("task went into ERROR") + if t.Critical { + log.WithField("partition", t.GetEnvironmentId().String()). + WithField("level", infologger.IL_Ops). + Errorf("critical task '%s' on host '%s' went into ERROR, the environment will stop or tear down", t.Name, t.Task.GetHostname()) + } else { + log.WithField("partition", t.GetEnvironmentId().String()). + WithField("level", infologger.IL_Ops). + Errorf("non-critical task '%s' on host '%s' went into ERROR, but the environment might continue", t.Name, t.Task.GetHostname()) + } + } } t.SendEvent(&event.RoleEvent{Name: t.Name, State: t.state.get().String(), RolePath: t.GetPath()}) diff --git a/executor/executable/controllabletask.go b/executor/executable/controllabletask.go index 861ee133..a4cf5b34 100644 --- a/executor/executable/controllabletask.go +++ b/executor/executable/controllabletask.go @@ -550,7 +550,8 @@ func (t *ControllableTask) Launch() error { WithField("taskId", taskId). WithField("taskName", t.ti.Name). WithField("taskPid", t.knownPid). - Warningf("task %s transitioned to ERROR on its own - notifying environment", deo.TaskId.String()) + WithField("level", infologger.IL_Support). + Warningf("task transitioned to ERROR on its own - notifying environment") } } deviceEvent.SetLabels(map[string]string{"detector": t.knownDetector, "environmentId": t.knownEnvironmentId.String()}) From d80a1da5e0b33bca7ff06a41e55d7a966b30c794 Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Thu, 12 Dec 2024 15:37:11 +0100 Subject: [PATCH 2/4] OCTRL-900 [core] demote getConfig warnings containing debug information --- configuration/template/stackutil.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/configuration/template/stackutil.go b/configuration/template/stackutil.go index 1e58fb69..44ce4a29 100644 --- a/configuration/template/stackutil.go +++ b/configuration/template/stackutil.go @@ -27,6 +27,7 @@ package template import ( "encoding/json" "fmt" + "github.com/AliceO2Group/Control/common/logger/infologger" texttemplate "text/template" "time" @@ -51,8 +52,8 @@ func getConfigLegacy(confSvc ConfigurationService, varStack map[string]string, p fields := Fields{WrapPointer(&payload)} err = fields.Execute(confSvc, query.Path(), varStack, nil, nil, make(map[string]texttemplate.Template), nil) - log.Warn(varStack) - log.Warn(payload) + log.WithField("level", infologger.IL_Devel).Debug(varStack) + log.WithField("level", infologger.IL_Devel).Debug(payload) return payload } @@ -68,8 +69,8 @@ func getConfig(confSvc ConfigurationService, varStack map[string]string, path st return fmt.Sprintf("{\"error\":\"%s\"}", err.Error()) } - log.Warn(varStack) - log.Warn(payload) + log.WithField("level", infologger.IL_Devel).Debug(varStack) + log.WithField("level", infologger.IL_Devel).Debug(payload) return payload } From 7195c295592edd04ac0afb7b6ddbdc3a94360efe Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Thu, 12 Dec 2024 18:32:52 +0100 Subject: [PATCH 3/4] OCTRL-759 [core] clearer deployment failure logs for OPS - shortened one phrase and demoted the log to support, as a similar log is always produced for OPS anyway - removed redundant prefix "deployment error", which IMHO does not add anything to error message which follows (e.g "deployment error error=workflow deployment failed..." becomes "workflow deployment failed...") --- core/environment/transition_deploy.go | 4 ++-- core/task/manager.go | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/core/environment/transition_deploy.go b/core/environment/transition_deploy.go index 652b5325..ed7e6e45 100644 --- a/core/environment/transition_deploy.go +++ b/core/environment/transition_deploy.go @@ -342,9 +342,9 @@ func (t DeployTransition) do(env *Environment) (err error) { } if err != nil { - log.WithError(err). + log.WithField("level", infologger.IL_Ops). WithField("partition", env.Id().String()). - Error("deployment error") + Error(err) return } diff --git a/core/task/manager.go b/core/task/manager.go index 83139c69..d04964f2 100644 --- a/core/task/manager.go +++ b/core/task/manager.go @@ -1207,7 +1207,8 @@ func (m *Manager) handleMessage(tm *TaskmanMessage) error { if err != nil { log.WithError(err). WithField("partition", tm.GetEnvironmentId().String()). - Errorf("Failed task creation and Mesos resources allocation during the deployment of the environment. For more details check Devel logs in Info Logger.") + WithField("level", infologger.IL_Support). + Errorf("failed task creation and Mesos resources allocation during the deployment of the environment. More details in Devel logs.") } }() case taskop.ConfigureTasks: From d218d63de407cf4e542a2a59e4303afe5bc5584c Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Fri, 13 Dec 2024 08:53:09 +0100 Subject: [PATCH 4/4] [core] protect from accessing nil t.Task --- core/workflow/taskrole.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/core/workflow/taskrole.go b/core/workflow/taskrole.go index fabfde9d..3b294b2a 100644 --- a/core/workflow/taskrole.go +++ b/core/workflow/taskrole.go @@ -257,14 +257,18 @@ func (t *taskRole) updateState(s sm.State) { EnvironmentId: t.GetEnvironmentId().String(), }) if t.state.get() == sm.ERROR { + host := "unknown" + if t.Task != nil { + host = t.Task.GetHostname() + } if t.Critical { log.WithField("partition", t.GetEnvironmentId().String()). WithField("level", infologger.IL_Ops). - Errorf("critical task '%s' on host '%s' went into ERROR, the environment will stop or tear down", t.Name, t.Task.GetHostname()) + Errorf("critical task '%s' on host '%s' went into ERROR, the environment will stop or tear down", t.Name, host) } else { log.WithField("partition", t.GetEnvironmentId().String()). WithField("level", infologger.IL_Ops). - Errorf("non-critical task '%s' on host '%s' went into ERROR, but the environment might continue", t.Name, t.Task.GetHostname()) + Errorf("non-critical task '%s' on host '%s' went into ERROR, but the environment might continue", t.Name, host) } }