From 4d877f9524b1df1a2242a338f97e76795162467a Mon Sep 17 00:00:00 2001
From: Harshit Gangal
Date: Tue, 15 Oct 2024 22:45:28 +0530
Subject: [PATCH] added atomic transaction metrics to vttablet

Signed-off-by: Harshit Gangal
---
 go/vt/vttablet/tabletserver/dt_executor.go | 7 ++++++-
 go/vt/vttablet/tabletserver/tabletenv/stats.go | 11 +++++++++--
 go/vt/vttablet/tabletserver/tx_engine.go | 12 +++++++-----
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/go/vt/vttablet/tabletserver/dt_executor.go b/go/vt/vttablet/tabletserver/dt_executor.go
index 823751df638..9d280c36a41 100644
--- a/go/vt/vttablet/tabletserver/dt_executor.go
+++ b/go/vt/vttablet/tabletserver/dt_executor.go
@@ -159,7 +159,12 @@ func (dte *DTExecutor) CommitPrepared(dtid string) (err error) {
 	defer func() {
 		if err != nil {
 			log.Warningf("failed to commit the prepared transaction '%s' with error: %v", dtid, err)
-			dte.te.checkErrorAndMarkFailed(ctx, dtid, err, "TwopcCommit")
+			fail := dte.te.checkErrorAndMarkFailed(ctx, dtid, err, "TwopcCommit")
+			if fail {
+				dte.te.env.Stats().CommitPreparedFail.Add("NonRetryable", 1)
+			} else {
+				dte.te.env.Stats().CommitPreparedFail.Add("Retryable", 1)
+			}
 		}
 		dte.te.txPool.RollbackAndRelease(ctx, conn)
 	}()
diff --git a/go/vt/vttablet/tabletserver/tabletenv/stats.go b/go/vt/vttablet/tabletserver/tabletenv/stats.go
index 1ad93532719..7b996dec8b0 100644
--- a/go/vt/vttablet/tabletserver/tabletenv/stats.go
+++ b/go/vt/vttablet/tabletserver/tabletenv/stats.go
@@ -34,7 +34,6 @@ type Stats struct {
 	ErrorCounters *stats.CountersWithSingleLabel
 	InternalErrors *stats.CountersWithSingleLabel
 	Warnings *stats.CountersWithSingleLabel
-	Unresolved *stats.GaugesWithSingleLabel // For now, only Prepares are tracked
 	UserTableQueryCount *stats.CountersWithMultiLabels // Per CallerID/table counts
 	UserTableQueryTimesNs *stats.CountersWithMultiLabels // Per CallerID/table latencies
 	UserTransactionCount *stats.CountersWithMultiLabels // Per CallerID transaction counts
@@ -49,6 +48,11 @@ type Stats struct {
 	UserReservedTimesNs *stats.CountersWithSingleLabel // Per CallerID reserved connection duration
 
 	QueryTimingsByTabletType *servenv.TimingsWrapper // Query timings split by current tablet type
+
+	// Atomic Transactions
+	Unresolved *stats.GaugesWithSingleLabel
+	CommitPreparedFail *stats.CountersWithSingleLabel
+	RedoPreparedFail *stats.CountersWithSingleLabel
 }
 
 // NewStats instantiates a new set of stats scoped by exporter.
@@ -83,7 +87,6 @@ func NewStats(exporter *servenv.Exporter) *Stats {
 		),
 		InternalErrors: exporter.NewCountersWithSingleLabel("InternalErrors", "Internal component errors", "type", "Task", "StrayTransactions", "Panic", "HungQuery", "Schema", "TwopcCommit", "TwopcResurrection", "WatchdogFail", "Messages"),
 		Warnings: exporter.NewCountersWithSingleLabel("Warnings", "Warnings", "type", "ResultsExceeded"),
-		Unresolved: exporter.NewGaugesWithSingleLabel("Unresolved", "Unresolved items", "item_type", "Prepares"),
 		UserTableQueryCount: exporter.NewCountersWithMultiLabels("UserTableQueryCount", "Queries received for each CallerID/table combination", []string{"TableName", "CallerID", "Type"}),
 		UserTableQueryTimesNs: exporter.NewCountersWithMultiLabels("UserTableQueryTimesNs", "Total latency for each CallerID/table combination", []string{"TableName", "CallerID", "Type"}),
 		UserTransactionCount: exporter.NewCountersWithMultiLabels("UserTransactionCount", "transactions received for each CallerID", []string{"CallerID", "Conclusion"}),
@@ -98,6 +101,10 @@ func NewStats(exporter *servenv.Exporter) *Stats {
 		UserReservedTimesNs: exporter.NewCountersWithSingleLabel("UserReservedTimesNs", "Total reserved connection latency for each CallerID", "CallerID"),
 
 		QueryTimingsByTabletType: exporter.NewTimings("QueryTimingsByTabletType", "Query timings broken down by active tablet type", "TabletType"),
+
+		Unresolved: exporter.NewGaugesWithSingleLabel("UnresolvedTransaction", "Unresolved items", "ManagerType"),
+		CommitPreparedFail: exporter.NewCountersWithSingleLabel("CommitPreparedFail", "failed prepared transactions commit", "FailureType"),
+		RedoPreparedFail: exporter.NewCountersWithSingleLabel("RedoPreparedFail", "failed prepared transactions on redo", "FailureType"),
 	}
 	stats.QPSRates = exporter.NewRates("QPS", stats.QueryTimings, 15*60/5, 5*time.Second)
 	return stats
diff --git a/go/vt/vttablet/tabletserver/tx_engine.go b/go/vt/vttablet/tabletserver/tx_engine.go
index d581fb79ae4..c9e0feb157c 100644
--- a/go/vt/vttablet/tabletserver/tx_engine.go
+++ b/go/vt/vttablet/tabletserver/tx_engine.go
@@ -83,7 +83,6 @@ type TxEngine struct {
 	// 2. TabletControls have been set in the tablet record, and Query service is going to be disabled.
 	twopcAllowed []bool
 	shutdownGracePeriod time.Duration
-	coordinatorAddress string
 	abandonAge time.Duration
 	ticks *timer.Timer
 
@@ -454,6 +453,9 @@ func (te *TxEngine) prepareFromRedo() error {
 			allErrs = append(allErrs, vterrors.Wrapf(err, "dtid - %v", preparedTx.Dtid))
 			if prepFailed {
 				failedCounter++
+				te.env.Stats().RedoPreparedFail.Add("NonRetryable", 1)
+			} else {
+				te.env.Stats().RedoPreparedFail.Add("Retryable", 1)
 			}
 		} else {
 			preparedCounter++
@@ -580,14 +582,13 @@ func (te *TxEngine) startTransactionWatcher() {
 		ctx, cancel := context.WithTimeout(tabletenv.LocalContext(), te.abandonAge/4)
 		defer cancel()
 
-		// Raise alerts on prepares that have been unresolved for too long.
-		// Use 5x abandonAge to give opportunity for transaction coordinator to resolve these redo logs.
-		count, err := te.twoPC.CountUnresolvedRedo(ctx, time.Now().Add(-te.abandonAge*5))
+		// Track unresolved redo logs.
+		count, err := te.twoPC.CountUnresolvedRedo(ctx, time.Now().Add(-te.abandonAge))
 		if err != nil {
 			te.env.Stats().InternalErrors.Add("RedoWatcherFail", 1)
 			log.Errorf("Error reading prepared transactions: %v", err)
 		}
-		te.env.Stats().Unresolved.Set("Prepares", count)
+		te.env.Stats().Unresolved.Set("ResourceManager", count)
 
 		// Notify lingering distributed transactions.
 		count, err = te.twoPC.CountUnresolvedTransaction(ctx, time.Now().Add(-te.abandonAge))
@@ -596,6 +597,7 @@ func (te *TxEngine) startTransactionWatcher() {
 			log.Errorf("Error reading unresolved transactions: %v", err)
 			return
 		}
+		te.env.Stats().Unresolved.Set("MetadataManager", count)
 		if count > 0 {
 			te.dxNotify()
 		}