Skip to content

Commit

Permalink
raise orc-dead-tablet after vtorc reparent (#513)
Browse files Browse the repository at this point in the history
  • Loading branch information
pbibra authored Sep 16, 2024
1 parent aac4574 commit 4994dc7
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 4 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ require (
github.com/kr/text v0.2.0
github.com/mitchellh/mapstructure v1.5.0
github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249
github.com/slackhq/vitess-addons v0.19.0
github.com/slackhq/vitess-addons v0.19.1
github.com/slok/noglog v0.2.0
github.com/spf13/afero v1.11.0
github.com/spf13/jwalterweatherman v1.1.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -450,8 +450,8 @@ github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6Mwd
github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
github.com/sjmudd/stopwatch v0.1.1 h1:x45OvxFB5OtCkjvYtzRF5fWB857Jzjjk84Oyd5C5ebw=
github.com/sjmudd/stopwatch v0.1.1/go.mod h1:BLw0oIQJ1YLXBO/q9ufK/SgnKBVIkC2qrm6uy78Zw6U=
github.com/slackhq/vitess-addons v0.19.0 h1:+dWkQENsu8YYgsKesOKWqb3+vj66OY1WMvYOn9lmZ+I=
github.com/slackhq/vitess-addons v0.19.0/go.mod h1:E7i+cxyIY+I4An/JAvalQ9Ze2MjKlEx0u2nFXE4fgR0=
github.com/slackhq/vitess-addons v0.19.1 h1:k8f8pAJ2zqtetN+dnehAs7DFcZnI9IQRSL18ZMwNRCw=
github.com/slackhq/vitess-addons v0.19.1/go.mod h1:ZMzBBtadSA1MEuNIfZerztxLMhRFO+tmBZxv5HuV4lE=
github.com/slok/noglog v0.2.0 h1:1czu4l2EoJ8L92UwdSXXa1Y+c5TIjFAFm2P+mjej95E=
github.com/slok/noglog v0.2.0/go.mod h1:TfKxwpEZPT+UA83bQ6RME146k0MM4e8mwHLf6bhcGDI=
github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
Expand Down
18 changes: 17 additions & 1 deletion go/vt/vtorc/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ import (
"encoding/json"
"fmt"
"math/rand"
"os"
"time"

"github.com/patrickmn/go-cache"
"github.com/slackhq/vitess-addons/go/external"

"vitess.io/vitess/go/stats"
"vitess.io/vitess/go/vt/log"
Expand Down Expand Up @@ -81,6 +83,9 @@ var (

// recoveriesFailureCounter counts the number of failed recoveries that VTOrc has performed
recoveriesFailureCounter = stats.NewCountersWithSingleLabel("FailedRecoveries", "Count of the different failed recoveries performed", "RecoveryType", actionableRecoveriesNames...)

vtopsExec = external.NewExecVTOps(os.Getenv("VTOPS_PATH"), os.Getenv("VTOPS_HTTP_PROXY"), "vtorc", os.Getenv("HOSTNAME"))
vtopsSlackChannel = os.Getenv("SLACK_CHANNEL")
)

// recoveryFunction is the code of the recovery function to be used
Expand Down Expand Up @@ -297,6 +302,7 @@ func postErsCompletion(topologyRecovery *TopologyRecovery, analysisEntry *inst.R
_ = AuditTopologyRecovery(topologyRecovery, message)
_ = inst.AuditOperation(recoveryName, analysisEntry.AnalyzedInstanceAlias, message)
_ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%v: successfully promoted %+v", recoveryName, promotedReplica.InstanceAlias))
vtopsExec.RaiseProblem(analysisEntry.AnalyzedInstanceHostname, "orc-dead-tablet", true)

This comment has been minimized.

Copy link
@timvaillancourt

timvaillancourt Sep 18, 2024

Member

@pbibra I think we should run every one of these vtopsExec calls in the background (in its own goroutine) so they can't block the reparent logic, mainly because we aren't waiting for any response, e.g.:

go vtopsExec.RaiseProblem(analysisEntry.AnalyzedInstanceHostname, "orc-dead-tablet", true)

}
}

Expand Down Expand Up @@ -590,7 +596,6 @@ func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) {
func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (err error) {
countPendingRecoveries.Add(1)
defer countPendingRecoveries.Add(-1)

checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
isActionableRecovery := hasActionableRecovery(checkAndRecoverFunctionCode)
analysisEntry.IsActionableRecovery = isActionableRecovery
Expand All @@ -605,8 +610,11 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
}
}

vtopsExec.SendSlackMessage(fmt.Sprintf("[VTOrc] No recovery available for %s for problem %s.", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis), vtopsSlackChannel, true)

return nil
}

// we have a recovery function; its execution still depends on filters if not disabled.
if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: detection", analysisEntry.AnalyzedInstanceAlias) {
log.Infof("executeCheckAndRecoverFunction: proceeding with %+v detection on %+v; isActionable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery)
Expand Down Expand Up @@ -707,15 +715,22 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceAlias) {
log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery)
}

if !isActionableRecovery {
vtopsExec.SendSlackMessage(fmt.Sprintf("No actionable recovery on %s for problem %s.", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis), vtopsSlackChannel, true)
}

recoveryAttempted, topologyRecovery, err := getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry)
if !recoveryAttempted {
return err
}
recoveryName := getRecoverFunctionName(checkAndRecoverFunctionCode)
recoveriesCounter.Add(recoveryName, 1)
if err != nil {
vtopsExec.SendSlackMessage(fmt.Sprintf("Recovery failed on %s for problem %s. Error: %s", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis, err.Error()), vtopsSlackChannel, true)
recoveriesFailureCounter.Add(recoveryName, 1)
} else {
vtopsExec.SendSlackMessage(fmt.Sprintf("Recovery succeeded on %s for problem %s.", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis), vtopsSlackChannel, true)
recoveriesSuccessfulCounter.Add(recoveryName, 1)
}
if topologyRecovery == nil {
Expand Down Expand Up @@ -813,6 +828,7 @@ func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry *inst.R
_ = AuditTopologyRecovery(topologyRecovery, message)
_ = inst.AuditOperation(string(analysisEntry.Analysis), analysisEntry.AnalyzedInstanceAlias, message)
_ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%+v: successfully promoted %+v", analysisEntry.Analysis, promotedReplica.InstanceAlias))
vtopsExec.RaiseProblem(analysisEntry.AnalyzedInstanceHostname, "orc-dead-tablet", true)
}
}

Expand Down

0 comments on commit 4994dc7

Please sign in to comment.