Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

slack-vitess-r14.0.5: backport vitessio/vitess#13723 #142

Merged
merged 7 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions .github/workflows/static_checks_etc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,6 @@ jobs:
if: steps.skip-workflow.outputs.skip-workflow == 'false'
uses: actions/checkout@v3

- name: Run FOSSA scan and upload build data
if: steps.skip-workflow.outputs.skip-workflow == 'false'
uses: fossa-contrib/fossa-action@v1
with:
fossa-api-key: 76d7483ea206d530d9452e44bffe7ba8

- name: Check for changes in Go files
if: steps.skip-workflow.outputs.skip-workflow == 'false'
uses: frouioui/paths-filter@main
Expand Down Expand Up @@ -211,4 +205,4 @@ jobs:
echo 'We wish to maintain a consistent changelog directory, please run `go run ./go/tools/releases/releases.go`, commit and push again.'
echo 'Running `go run ./go/tools/releases/releases.go` on CI yields the following changes:'
echo "$output"
echo ""
echo ""
4 changes: 3 additions & 1 deletion go/stats/timings.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@ func NewTimings(name, help, label string, categories ...string) *Timings {
return t
}

// Reset will clear histograms: used during testing
// Reset will clear histograms and counters: used during testing
func (t *Timings) Reset() {
t.mu.RLock()
t.histograms = make(map[string]*Histogram)
t.totalCount.Set(0)
t.totalTime.Set(0)
t.mu.RUnlock()
}

Expand Down
24 changes: 18 additions & 6 deletions go/vt/vtctl/reparentutil/emergency_reparenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,14 @@ type EmergencyReparentOptions struct {

// counters for Emergency Reparent Shard
var (
ersCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
ersSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
ersFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")
// TODO(timvaillancourt): remove legacyERS* gauges in v19+.
legacyERSCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
legacyERSSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
legacyERSFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")

ersCounter = stats.NewCountersWithMultiLabels("emergency_reparent_counts", "Number of times Emergency Reparent Shard has been run",
[]string{"Keyspace", "Shard", "Result"},
)
)

// NewEmergencyReparenter returns a new EmergencyReparenter object, ready to
Expand Down Expand Up @@ -98,26 +103,33 @@ func NewEmergencyReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, l
// keyspace and shard.
func (erp *EmergencyReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts EmergencyReparentOptions) (*events.Reparent, error) {
var err error
statsLabels := []string{keyspace, shard}

// First step is to lock the shard for the given operation, if not already locked
if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
var unlock func(*error)
opts.lockAction = erp.getLockAction(opts.NewPrimaryAlias)
ctx, unlock, err = erp.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
if err != nil {
ersCounter.Add(append(statsLabels, failureResult), 1)
return nil, err
}
defer unlock(&err)
}

// dispatch success or failure of ERS
startTime := time.Now()
ev := &events.Reparent{}
defer func() {
reparentShardOpTimings.Add("EmergencyReparentShard", time.Since(startTime))
switch err {
case nil:
ersSuccessCounter.Add(1)
legacyERSSuccessCounter.Add(1)
ersCounter.Add(append(statsLabels, successResult), 1)
event.DispatchUpdate(ev, "finished EmergencyReparentShard")
default:
ersFailureCounter.Add(1)
legacyERSFailureCounter.Add(1)
ersCounter.Add(append(statsLabels, failureResult), 1)
event.DispatchUpdate(ev, "failed EmergencyReparentShard: "+err.Error())
}
}()
Expand All @@ -141,7 +153,7 @@ func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.Tablet
func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) {
// log the starting of the operation and increment the counter
erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard)
ersCounter.Add(1)
legacyERSCounter.Add(1)

var (
stoppedReplicationSnapshot *replicationSnapshot
Expand Down
30 changes: 20 additions & 10 deletions go/vt/vtctl/reparentutil/emergency_reparenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2747,10 +2747,12 @@ func TestEmergencyReparenter_waitForAllRelayLogsToApply(t *testing.T) {
}
}

func TestEmergencyReparenterCounters(t *testing.T) {
ersCounter.Set(0)
ersSuccessCounter.Set(0)
ersFailureCounter.Set(0)
func TestEmergencyReparenterStats(t *testing.T) {
ersCounter.ResetAll()
legacyERSCounter.Reset()
legacyERSSuccessCounter.Reset()
legacyERSFailureCounter.Reset()
reparentShardOpTimings.Reset()

emergencyReparentOps := EmergencyReparentOptions{}
tmc := &testutil.TabletManagerClient{
Expand Down Expand Up @@ -2879,9 +2881,13 @@ func TestEmergencyReparenterCounters(t *testing.T) {
require.NoError(t, err)

// check the counter values
require.EqualValues(t, 1, ersCounter.Get())
require.EqualValues(t, 1, ersSuccessCounter.Get())
require.EqualValues(t, 0, ersFailureCounter.Get())
require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, ersCounter.Counts())
require.EqualValues(t, map[string]int64{"All": 1, "EmergencyReparentShard": 1}, reparentShardOpTimings.Counts())

// check the legacy counter values
require.EqualValues(t, 1, legacyERSCounter.Get())
require.EqualValues(t, 1, legacyERSSuccessCounter.Get())
require.EqualValues(t, 0, legacyERSFailureCounter.Get())

// set emergencyReparentOps to request a non existent tablet
emergencyReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{
Expand All @@ -2894,9 +2900,13 @@ func TestEmergencyReparenterCounters(t *testing.T) {
require.Error(t, err)

// check the counter values
require.EqualValues(t, 2, ersCounter.Get())
require.EqualValues(t, 1, ersSuccessCounter.Get())
require.EqualValues(t, 1, ersFailureCounter.Get())
require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, ersCounter.Counts())
require.EqualValues(t, map[string]int64{"All": 2, "EmergencyReparentShard": 2}, reparentShardOpTimings.Counts())

// check the legacy counter values
require.EqualValues(t, 2, legacyERSCounter.Get())
require.EqualValues(t, 1, legacyERSSuccessCounter.Get())
require.EqualValues(t, 1, legacyERSFailureCounter.Get())
}

func TestEmergencyReparenter_findMostAdvanced(t *testing.T) {
Expand Down
16 changes: 16 additions & 0 deletions go/vt/vtctl/reparentutil/planned_reparenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"vitess.io/vitess/go/event"
"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/stats"
"vitess.io/vitess/go/vt/concurrency"
"vitess.io/vitess/go/vt/logutil"
"vitess.io/vitess/go/vt/topo"
Expand All @@ -39,6 +40,13 @@ import (
"vitess.io/vitess/go/vt/proto/vtrpc"
)

// counters for Planned Reparent Shard
var (
prsCounter = stats.NewCountersWithMultiLabels("planned_reparent_counts", "Number of times Planned Reparent Shard has been run",
[]string{"Keyspace", "Shard", "Result"},
)
)

// PlannedReparenter performs PlannedReparentShard operations.
type PlannedReparenter struct {
ts *topo.Server
Expand Down Expand Up @@ -88,11 +96,14 @@ func NewPlannedReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, log
// both the current and desired primary are reachable and in a good state.
func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts PlannedReparentOptions) (*events.Reparent, error) {
var err error
statsLabels := []string{keyspace, shard}

if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
var unlock func(*error)
opts.lockAction = pr.getLockAction(opts)
ctx, unlock, err = pr.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
if err != nil {
prsCounter.Add(append(statsLabels, failureResult), 1)
return nil, err
}
defer unlock(&err)
Expand All @@ -101,18 +112,23 @@ func (pr *PlannedReparenter) ReparentShard(ctx context.Context, keyspace string,
if opts.NewPrimaryAlias == nil && opts.AvoidPrimaryAlias == nil {
shardInfo, err := pr.ts.GetShard(ctx, keyspace, shard)
if err != nil {
prsCounter.Add(append(statsLabels, failureResult), 1)
return nil, err
}

opts.AvoidPrimaryAlias = shardInfo.PrimaryAlias
}

startTime := time.Now()
ev := &events.Reparent{}
defer func() {
reparentShardOpTimings.Add("PlannedReparentShard", time.Since(startTime))
switch err {
case nil:
prsCounter.Add(append(statsLabels, successResult), 1)
event.DispatchUpdate(ev, "finished PlannedReparentShard")
default:
prsCounter.Add(append(statsLabels, failureResult), 1)
event.DispatchUpdate(ev, "failed PlannedReparentShard: "+err.Error())
}
}()
Expand Down
93 changes: 93 additions & 0 deletions go/vt/vtctl/reparentutil/planned_reparenter_flaky_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3719,3 +3719,96 @@ func AssertReparentEventsEqual(t *testing.T, expected *events.Reparent, actual *

AssertReparentEventsEqualWithMessage(t, expected, actual, "")
}

func TestPlannedReparenterStats(t *testing.T) {
prsCounter.ResetAll()
reparentShardOpTimings.Reset()

tmc := &testutil.TabletManagerClient{
PrimaryPositionResults: map[string]struct {
Position string
Error error
}{
"zone1-0000000100": {
Position: "position1",
Error: nil,
},
},
PopulateReparentJournalResults: map[string]error{
"zone1-0000000100": nil,
},
SetReplicationSourceResults: map[string]error{
"zone1-0000000101": nil,
},
SetReadWriteResults: map[string]error{
"zone1-0000000100": nil,
},
}
shards := []*vtctldatapb.Shard{
{
Keyspace: "testkeyspace",
Name: "-",
},
}
tablets := []*topodatapb.Tablet{
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Type: topodatapb.TabletType_PRIMARY,
Keyspace: "testkeyspace",
Shard: "-",
},
{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Type: topodatapb.TabletType_REPLICA,
Keyspace: "testkeyspace",
Shard: "-",
},
}
plannedReparentOps := PlannedReparentOptions{
NewPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}
keyspace := "testkeyspace"
shard := "-"
ts := memorytopo.NewServer("zone1")

ctx := context.Background()
logger := logutil.NewMemoryLogger()

testutil.AddShards(ctx, t, ts, shards...)
testutil.AddTablets(ctx, t, ts, &testutil.AddTabletOptions{
AlsoSetShardPrimary: true,
SkipShardCreation: false,
}, tablets...)

prp := NewPlannedReparenter(ts, tmc, logger)
// run a successful prs
_, err := prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps)
require.NoError(t, err)

// check the counter values
require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, prsCounter.Counts())
require.EqualValues(t, map[string]int64{"All": 1, "PlannedReparentShard": 1}, reparentShardOpTimings.Counts())

// set plannedReparentOps to request a non existent tablet
plannedReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{
Cell: "bogus",
Uid: 100,
}

// run a failing prs
_, err = prp.ReparentShard(ctx, keyspace, shard, plannedReparentOps)
require.Error(t, err)

// check the counter values
require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, prsCounter.Counts())
require.EqualValues(t, map[string]int64{"All": 2, "PlannedReparentShard": 2}, reparentShardOpTimings.Counts())
}
7 changes: 7 additions & 0 deletions go/vt/vtctl/reparentutil/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"time"

"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/stats"
"vitess.io/vitess/go/vt/concurrency"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/logutil"
Expand All @@ -38,6 +39,12 @@ import (
"vitess.io/vitess/go/vt/proto/vtrpc"
)

var (
reparentShardOpTimings = stats.NewTimings("reparent_shard_operation_timings", "Timings of reparent shard operations", "Operation")
failureResult = "failure"
successResult = "success"
)

// ChooseNewPrimary finds a tablet that should become a primary after reparent.
// The criteria for the new primary-elect are (preferably) to be in the same
// cell as the current primary, and to be different from avoidPrimaryAlias. The
Expand Down