Skip to content

Commit

Permalink
Errant GTID Counts metric in VTOrc (#16829)
Browse files Browse the repository at this point in the history
Signed-off-by: Manan Gupta <[email protected]>
  • Loading branch information
GuptaManan100 authored Oct 7, 2024
1 parent 30d63d4 commit 45462ca
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 12 deletions.
6 changes: 6 additions & 0 deletions changelog/21.0/21.0.0/summary.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
- **[Errant GTID Detection on VTTablets](#errant-gtid-vttablet)**
- **[Automatically Replace MySQL auto_increment Clauses with Vitess Sequences](#auto-replace-mysql-autoinc-with-seq)**
- **[Experimental MySQL 8.4 support](#experimental-mysql-84)**
- **[Curreny Errant GTIDs Count Metric](#errant-gtid-metric)**


## <a id="major-changes"/>Major Changes</a>

Expand Down Expand Up @@ -223,3 +225,7 @@ work automatically during the [`MoveTables`](https://vitess.io/docs/reference/vr
### <a id="experimental-mysql-84"/>Experimental MySQL 8.4 support

We have added experimental support for MySQL 8.4. It passes the Vitess test suite, but it is otherwise not yet tested. We are looking for feedback from the community to improve this to move support out of the experimental phase in a future release.

### <a id="errant-gtid-metric"/>Current Errant GTIDs Count Metric
A new metric called `CurrentErrantGTIDCount` has been added to the `VTOrc` component.
This metric shows the current count of the errant GTIDs in the tablets.
15 changes: 15 additions & 0 deletions go/mysql/replication/mysql56_gtid_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -715,3 +715,18 @@ func Subtract(lhs, rhs string) (string, error) {
diffSet := lhsSet.Difference(rhsSet)
return diffSet.String(), nil
}

// GTIDCount returns the number of GTIDs in a GTID set.
func GTIDCount(gtidStr string) (int64, error) {
gtidSet, err := ParseMysql56GTIDSet(gtidStr)
if err != nil {
return 0, err
}
var count int64
for _, intervals := range gtidSet {
for _, intvl := range intervals {
count = count + intvl.end - intvl.start + 1
}
}
return count, nil
}
46 changes: 46 additions & 0 deletions go/mysql/replication/mysql56_gtid_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,52 @@ func BenchmarkMySQL56GTIDParsing(b *testing.B) {
}
}

func TestGTIDCount(t *testing.T) {
tests := []struct {
name string
gtidStr string
wantCount int64
wantErr string
}{
{
name: "Empty GTID String",
gtidStr: "",
wantCount: 0,
}, {
name: "Single GTID",
gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:12",
wantCount: 1,
}, {
name: "Single GTID Interval",
gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:1-5",
wantCount: 5,
}, {
name: "Single UUID",
gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:1-5:11-20",
wantCount: 15,
}, {
name: "Multiple UUIDs",
gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:1-5:10-20,00010203-0405-0607-0809-0a0b0c0d0eff:1-5:50",
wantCount: 22,
}, {
name: "Parsing error",
gtidStr: "incorrect set",
wantErr: "invalid MySQL 5.6 GTID set",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
count, err := GTIDCount(tt.gtidStr)
require.EqualValues(t, tt.wantCount, count)
if tt.wantErr != "" {
require.ErrorContains(t, err, tt.wantErr)
} else {
require.NoError(t, err)
}
})
}
}

func TestErrantGTIDsOnReplica(t *testing.T) {
tests := []struct {
name string
Expand Down
15 changes: 13 additions & 2 deletions go/test/endtoend/vtorc/api/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -268,11 +268,13 @@ func TestAPIEndpoints(t *testing.T) {
assert.Equal(t, "Filtering by shard without keyspace isn't supported\n", resp)

// Also verify that the metric for errant GTIDs is reporting the correct count.
waitForErrantGTIDCount(t, vtorc, 1)
waitForErrantGTIDTabletCount(t, vtorc, 1)
// Now we check the errant GTID count for the tablet
verifyErrantGTIDCount(t, vtorc, replica.Alias, 1)
})
}

func waitForErrantGTIDCount(t *testing.T, vtorc *cluster.VTOrcProcess, errantGTIDCountWanted int) {
func waitForErrantGTIDTabletCount(t *testing.T, vtorc *cluster.VTOrcProcess, errantGTIDCountWanted int) {
timeout := time.After(15 * time.Second)
for {
select {
Expand All @@ -293,3 +295,12 @@ func waitForErrantGTIDCount(t *testing.T, vtorc *cluster.VTOrcProcess, errantGTI
}
}
}

func verifyErrantGTIDCount(t *testing.T, vtorc *cluster.VTOrcProcess, tabletAlias string, countWanted int) {
vars := vtorc.GetVars()
errantGTIDCounts := vars["CurrentErrantGTIDCount"].(map[string]interface{})
gtidCountVal, isPresent := errantGTIDCounts[tabletAlias]
require.True(t, isPresent, "Tablet %s not found in errant GTID counts", tabletAlias)
gtidCount := utils.GetIntFromValue(gtidCountVal)
require.EqualValues(t, countWanted, gtidCount, "Tablet %s has %d errant GTIDs, wanted %d", tabletAlias, gtidCount, countWanted)
}
20 changes: 10 additions & 10 deletions go/test/endtoend/vtorc/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -998,15 +998,15 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
successCount := getIntFromValue(successfulRecoveriesMap[recoveryName])
successCount := GetIntFromValue(successfulRecoveriesMap[recoveryName])
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
successCount := getIntFromValue(successfulRecoveriesMap[recoveryName])
successCount := GetIntFromValue(successfulRecoveriesMap[recoveryName])
assert.EqualValues(t, countExpected, successCount)
}

Expand All @@ -1019,15 +1019,15 @@ func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
prsCountsMap := vars["PlannedReparentCounts"].(map[string]interface{})
successCount := getIntFromValue(prsCountsMap[mapKey])
successCount := GetIntFromValue(prsCountsMap[mapKey])
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
prsCountsMap := vars["PlannedReparentCounts"].(map[string]interface{})
successCount := getIntFromValue(prsCountsMap[mapKey])
successCount := GetIntFromValue(prsCountsMap[mapKey])
assert.EqualValues(t, countExpected, successCount)
}

Expand All @@ -1040,15 +1040,15 @@ func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
ersCountsMap := vars["EmergencyReparentCounts"].(map[string]interface{})
successCount := getIntFromValue(ersCountsMap[mapKey])
successCount := GetIntFromValue(ersCountsMap[mapKey])
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
ersCountsMap := vars["EmergencyReparentCounts"].(map[string]interface{})
successCount := getIntFromValue(ersCountsMap[mapKey])
successCount := GetIntFromValue(ersCountsMap[mapKey])
assert.EqualValues(t, countExpected, successCount)
}

Expand All @@ -1067,10 +1067,10 @@ func CheckMetricExists(t *testing.T, vtorcInstance *cluster.VTOrcProcess, metric
assert.Contains(t, metrics, metricName)
}

// getIntFromValue is a helper function to get an integer from the given value.
// GetIntFromValue is a helper function to get an integer from the given value.
// If it is convertible to a float, then we round the number to the nearest integer.
// If the value is not numeric at all, we return 0.
func getIntFromValue(val any) int {
func GetIntFromValue(val any) int {
value := reflect.ValueOf(val)
if value.CanFloat() {
return int(math.Round(value.Float()))
Expand All @@ -1091,7 +1091,7 @@ func WaitForDetectedProblems(t *testing.T, vtorcInstance *cluster.VTOrcProcess,
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
problems := vars["DetectedProblems"].(map[string]interface{})
actual := getIntFromValue(problems[key])
actual := GetIntFromValue(problems[key])
if actual == expect {
return
}
Expand All @@ -1101,7 +1101,7 @@ func WaitForDetectedProblems(t *testing.T, vtorcInstance *cluster.VTOrcProcess,
vars := vtorcInstance.GetVars()
problems := vars["DetectedProblems"].(map[string]interface{})
actual, ok := problems[key]
actual = getIntFromValue(actual)
actual = GetIntFromValue(actual)

assert.True(t, ok,
"The metric DetectedProblems[%s] should exist but does not (all problems: %+v)",
Expand Down
9 changes: 9 additions & 0 deletions go/vt/vtorc/inst/instance_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ var forgetAliases *cache.Cache
var (
readTopologyInstanceCounter = stats.NewCounter("InstanceReadTopology", "Number of times an instance was read from the topology")
readInstanceCounter = stats.NewCounter("InstanceRead", "Number of times an instance was read")
currentErrantGTIDCount = stats.NewGaugesWithSingleLabel("CurrentErrantGTIDCount", "Number of errant GTIDs a vttablet currently has", "TabletAlias")
backendWrites = collection.CreateOrReturnCollection("BACKEND_WRITES")
writeBufferLatency = stopwatch.NewNamedStopwatch()
)
Expand Down Expand Up @@ -378,6 +379,11 @@ Cleanup:
redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID)

instance.GtidErrant, err = replication.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String())
if err == nil {
var gtidCount int64
gtidCount, err = replication.GTIDCount(instance.GtidErrant)
currentErrantGTIDCount.Set(tabletAlias, gtidCount)
}
}
}
}
Expand Down Expand Up @@ -1036,6 +1042,9 @@ func ForgetInstance(tabletAlias string) error {
forgetAliases.Set(tabletAlias, true, cache.DefaultExpiration)
log.Infof("Forgetting: %v", tabletAlias)

// Remove this tablet from errant GTID count metric.
currentErrantGTIDCount.Reset(tabletAlias)

// Delete from the 'vitess_tablet' table.
_, err := db.ExecVTOrc(`
delete
Expand Down

0 comments on commit 45462ca

Please sign in to comment.