Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for avoidance of hosts taking backup in PRS & ERS (#17300) #572

Merged
merged 2 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 84 additions & 94 deletions go/vt/proto/replicationdata/replicationdata.pb.go

Large diffs are not rendered by default.

38 changes: 2 additions & 36 deletions go/vt/proto/replicationdata/replicationdata_vtproto.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

959 changes: 469 additions & 490 deletions go/vt/proto/tabletmanagerdata/tabletmanagerdata.pb.go

Large diffs are not rendered by default.

72 changes: 2 additions & 70 deletions go/vt/proto/tabletmanagerdata/tabletmanagerdata_vtproto.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 14 additions & 3 deletions go/vt/vtctl/reparentutil/replication.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,9 +251,6 @@ func stopReplicationAndBuildStatusMaps(
logger.Infof("getting replication position from %v", alias)

stopReplicationStatus, err := tmc.StopReplicationAndGetStatus(groupCtx, tabletInfo.Tablet, replicationdatapb.StopReplicationMode_IOTHREADONLY)
m.Lock()
res.tabletsBackupState[alias] = stopReplicationStatus.GetBackupRunning()
m.Unlock()
if err != nil {
sqlErr, isSQLErr := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError)
if isSQLErr && sqlErr != nil && sqlErr.Number() == sqlerror.ERNotReplica {
Expand All @@ -277,6 +274,20 @@ func stopReplicationAndBuildStatusMaps(
err = vterrors.Wrapf(err, "error when getting replication status for alias %v: %v", alias, err)
}
} else {
isTakingBackup := false

// Prefer the most up-to-date information regarding whether the tablet is taking a backup from the After
// replication status, but fall back to the Before status if After is nil.
if stopReplicationStatus.After != nil {
isTakingBackup = stopReplicationStatus.After.BackupRunning
} else if stopReplicationStatus.Before != nil {
isTakingBackup = stopReplicationStatus.Before.BackupRunning
}

m.Lock()
res.tabletsBackupState[alias] = isTakingBackup
m.Unlock()

var sqlThreadRunning bool
// Check if the sql thread was running for the tablet
sqlThreadRunning, err = SQLThreadWasRunning(stopReplicationStatus)
Expand Down
82 changes: 82 additions & 0 deletions go/vt/vtctl/reparentutil/replication_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
waitForAllTablets bool
expectedStatusMap map[string]*replicationdatapb.StopReplicationStatus
expectedPrimaryStatusMap map[string]*replicationdatapb.PrimaryStatus
expectedTakingBackup map[string]bool
expectedTabletsReachable []*topodatapb.Tablet
shouldErr bool
}{
Expand Down Expand Up @@ -375,6 +376,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -443,6 +445,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -527,6 +530,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -621,6 +625,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -714,6 +719,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -786,6 +792,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
Uid: 101,
},
}},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
shouldErr: false,
},
Expand Down Expand Up @@ -847,6 +854,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{
"zone1-0000000100": {
Position: "primary-position-100",
Expand Down Expand Up @@ -921,6 +929,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{}, // zone1-0000000100 fails to demote, so does not appear
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -1044,6 +1053,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
Uid: 101,
},
}},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
shouldErr: false,
},
Expand Down Expand Up @@ -1093,6 +1103,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -1288,8 +1299,78 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
},
}},
stopReplicasTimeout: time.Minute,
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false, "zone1-0000000102": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
shouldErr: false,
}, {
name: "Handle nil replication status After. No segfaulting when determining backup status, and fall back to Before status",
durability: "none",
tmc: &stopReplicationAndBuildStatusMapsTestTMClient{
stopReplicationAndGetStatusResults: map[string]*struct {
StopStatus *replicationdatapb.StopReplicationStatus
Err error
}{
"zone1-0000000100": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429100:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
},
"zone1-0000000101": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
},
},
},
tabletMap: map[string]*topo.TabletInfo{
"zone1-0000000100": {
Tablet: &topodatapb.Tablet{
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
"zone1-0000000101": {
Tablet: &topodatapb.Tablet{
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
},
},
},
ignoredTablets: sets.New[string](),
expectedStatusMap: map[string]*replicationdatapb.StopReplicationStatus{
"zone1-0000000100": {
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429100:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
"zone1-0000000101": {
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": true, "zone1-0000000101": true},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}, {
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
}},
shouldErr: false,
},
}

Expand Down Expand Up @@ -1317,6 +1398,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
for idx, tablet := range res.reachableTablets {
assert.True(t, topoproto.IsTabletInList(tablet, tt.expectedTabletsReachable), "TabletsReached[%d] not found - %s", idx, topoproto.TabletAliasString(tablet.Alias))
}
assert.Equal(t, tt.expectedTakingBackup, res.tabletsBackupState)
})
}
}
Expand Down
Loading
Loading