From ebbd418809ad4043835cf8cfe90ac9d6d78e740b Mon Sep 17 00:00:00 2001
From: Max Englander
Date: Wed, 20 Sep 2023 10:58:11 +0100
Subject: [PATCH 1/7] metrics: change vtbackup_duration_by_phase to
 binary-valued vtbackup_phase (#12973)

Signed-off-by: Max Englander
---
 changelog/18.0/18.0.0/summary.md              |  26 ++-
 go/cmd/vtbackup/plugin_opentsdb.go            |  25 +++
 go/cmd/vtbackup/vtbackup.go                   |  83 +++++--
 go/flags/endtoend/vtbackup.txt                |   1 +
 go/stats/counters.go                          |  23 ++
 go/stats/export.go                            |  28 ++-
 go/stats/opentsdb/backend.go                  |  58 +++++
 go/stats/opentsdb/by_metric.go                |  54 +++++
 .../opentsdb/{opentsdb.go => collector.go}    | 212 ++++--------------
 go/stats/opentsdb/datapoint.go                |  90 ++++++++
 go/stats/opentsdb/datapoint_reader.go         |  53 +++++
 go/stats/opentsdb/doc.go                      |  18 ++
 go/stats/opentsdb/file_writer.go              |  52 +++++
 go/stats/opentsdb/flags.go                    |  38 ++++
 go/stats/opentsdb/http_writer.go              |  51 +++++
 go/stats/opentsdb/init.go                     | 104 +++++++++
 go/stats/opentsdb/opentsdb_test.go            |  13 +-
 go/stats/opentsdb/writer.go                   |  21 ++
 go/stats/statsd/statsd.go                     |  14 +-
 .../backup/vtbackup/backup_only_test.go       |  94 +++++++-
 go/test/endtoend/cluster/vtbackup_process.go  |   3 +-
 21 files changed, 845 insertions(+), 216 deletions(-)
 create mode 100644 go/cmd/vtbackup/plugin_opentsdb.go
 create mode 100644 go/stats/opentsdb/backend.go
 create mode 100644 go/stats/opentsdb/by_metric.go
 rename go/stats/opentsdb/{opentsdb.go => collector.go} (54%)
 create mode 100644 go/stats/opentsdb/datapoint.go
 create mode 100644 go/stats/opentsdb/datapoint_reader.go
 create mode 100644 go/stats/opentsdb/doc.go
 create mode 100644 go/stats/opentsdb/file_writer.go
 create mode 100644 go/stats/opentsdb/flags.go
 create mode 100644 go/stats/opentsdb/http_writer.go
 create mode 100644 go/stats/opentsdb/init.go
 create mode 100644 go/stats/opentsdb/writer.go

diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md
index 61cf6be1cf8..db2494d3794 100644
--- a/changelog/18.0/18.0.0/summary.md
+++ b/changelog/18.0/18.0.0/summary.md
@@ -16,8 +16,10 @@
     - [Deleted `V3` planner](#deleted-v3)
     - [Deleted `k8stopo`](#deleted-k8stopo)
     - [Deleted `vtgr`](#deleted-vtgr)
+    - [Deprecated VTBackup stat `DurationByPhase`](#deprecated-vtbackup-stat-duration-by-phase)
   - **[New stats](#new-stats)**
     - [VTGate Vindex unknown parameters](#vtgate-vindex-unknown-parameters)
+    - [VTBackup stat `Phase`](#vtbackup-stat-phase)
     - [VTBackup stat `PhaseStatus`](#vtbackup-stat-phase-status)
   - **[VTTablet](#vttablet)**
     - [VTTablet: New ResetSequences RPC](#vttablet-new-rpc-reset-sequences)
@@ -115,17 +117,37 @@ the `k8stopo` has been removed.
 
 The `vtgr` has been deprecated in Vitess 17, also see https://github.com/vitessio/vitess/issues/13300. With Vitess 18 `vtgr` has been removed.
 
+#### Deprecated VTBackup stat `DurationByPhase`
+
+VTBackup stat `DurationByPhase` is deprecated. Use the binary-valued `Phase` stat instead.
+
 ### New stats
 
 #### VTGate Vindex unknown parameters
 
 The VTGate stat `VindexUnknownParameters` gauges unknown Vindex parameters found in the latest VSchema pulled from the topology.
 
+#### VTBackup `Phase` stat
+
+In v17, the `vtbackup` stat `DurationByPhase` was added, measuring the time spent by `vtbackup` in each phase. This stat turned out to be awkward to use in production, and has been replaced in v18 by a binary-valued `Phase` stat.
+
+`Phase` reports a 1 (active) or a 0 (inactive) for each of the following phases:
+
+ * `CatchupReplication`
+ * `InitialBackup`
+ * `RestoreLastBackup`
+ * `TakeNewBackup`
+
+To calculate how long `vtbackup` has spent in a given phase, sum the 1-valued data points over time and multiply by the data collection or reporting interval. For example, in Prometheus:
+
+```
+sum_over_time(vtbackup_phase{phase="TakeNewBackup"}) * <interval>
+```
 
 #### VTBackup `PhaseStatus` stat
 
 `PhaseStatus` reports a 1 (active) or a 0 (inactive) for each of the following phases and statuses:
 
- * `CatchUpReplication` phase has statuses `Stalled` and `Stopped`.
+ * `CatchupReplication` phase has statuses `Stalled` and `Stopped`.
   * `Stalled` is set to `1` when replication stops advancing.
   * `Stopped` is set to `1` when replication stops before `vtbackup` catches up with the primary.
 
@@ -165,4 +187,4 @@ removing Vitess support.
 
 #### New Durability Policies
 
-2 new inbuilt durability policies have been added to Vitess in this release namely `semi_sync_with_rdonly_ack` and `cross_cell_with_rdonly_ack`. These policies are exactly like `semi_sync` and `cross_cell` respectively, and differ just in the part where the rdonly tablets can also send semi-sync ACKs.
\ No newline at end of file
+2 new inbuilt durability policies have been added to Vitess in this release namely `semi_sync_with_rdonly_ack` and `cross_cell_with_rdonly_ack`. These policies are exactly like `semi_sync` and `cross_cell` respectively, and differ just in the part where the rdonly tablets can also send semi-sync ACKs.
diff --git a/go/cmd/vtbackup/plugin_opentsdb.go b/go/cmd/vtbackup/plugin_opentsdb.go
new file mode 100644
index 00000000000..44ac886d420
--- /dev/null
+++ b/go/cmd/vtbackup/plugin_opentsdb.go
@@ -0,0 +1,25 @@
+/*
+Copyright 2023 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import "vitess.io/vitess/go/stats/opentsdb"
+
+// This plugin imports opentsdb to register the opentsdb stats backend.
+
+func init() {
+	opentsdb.Init("vtbackup")
+}
diff --git a/go/cmd/vtbackup/vtbackup.go b/go/cmd/vtbackup/vtbackup.go
index f27a991d35e..ebf83526cad 100644
--- a/go/cmd/vtbackup/vtbackup.go
+++ b/go/cmd/vtbackup/vtbackup.go
@@ -100,9 +100,12 @@ const (
 	// forever for things that should be quick.
 	operationTimeout = 1 * time.Minute
 
-	phaseNameCatchUpReplication          = "CatchUpReplication"
-	phaseStatusCatchUpReplicationStalled = "Stalled"
-	phaseStatusCatchUpReplicationStopped = "Stopped"
+	phaseNameCatchupReplication          = "CatchupReplication"
+	phaseNameInitialBackup               = "InitialBackup"
+	phaseNameRestoreLastBackup           = "RestoreLastBackup"
+	phaseNameTakeNewBackup               = "TakeNewBackup"
+	phaseStatusCatchupReplicationStalled = "Stalled"
+	phaseStatusCatchupReplicationStopped = "Stopped"
 )
 
 var (
@@ -127,20 +130,44 @@ var (
 	detachedMode     bool
 	keepAliveTimeout = 0 * time.Second
 	disableRedoLog   = false
-	durationByPhase  = stats.NewGaugesWithSingleLabel(
+
+	// Deprecated, use "Phase" instead.
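+	// DurationByPhaseSeconds reported seconds spent in each phase; the
+	// replacement Phase gauge instead reports a 0/1 per-phase activity value
+	// (see the changelog entry for deriving durations from it).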
+ deprecatedDurationByPhase = stats.NewGaugesWithSingleLabel( "DurationByPhaseSeconds", - "How long it took vtbackup to perform each phase (in seconds).", + "[DEPRECATED] How long it took vtbackup to perform each phase (in seconds).", + "phase", + ) + + // This gauge is updated 3*N times during the course of a vtbackup run, + // where N is the number of different phases vtbackup transitions through. + // Once to initialize to 0, another time to set the phase to active (1), + // and another to deactivate the phase (back to 0). + // + // At most a single phase is active at a given time. + // + // The sync gauge immediately reports changes to push-backed backends. + // The benefit of the sync gauge is that it makes verifying stats in + // integration tests a lot more tractable. + phase = stats.NewSyncGaugesWithSingleLabel( + "Phase", + "Active phase.", "phase", ) + phaseNames = []string{ + phaseNameCatchupReplication, + phaseNameInitialBackup, + phaseNameRestoreLastBackup, + phaseNameTakeNewBackup, + } phaseStatus = stats.NewGaugesWithMultiLabels( "PhaseStatus", "Internal state of vtbackup phase.", []string{"phase", "status"}, ) phaseStatuses = map[string][]string{ - phaseNameCatchUpReplication: { - phaseStatusCatchUpReplicationStalled, - phaseStatusCatchUpReplicationStopped, + phaseNameCatchupReplication: { + phaseStatusCatchupReplicationStalled, + phaseStatusCatchupReplicationStopped, }, } ) @@ -219,6 +246,9 @@ func main() { defer topoServer.Close() // Initialize stats. + for _, phaseName := range phaseNames { + phase.Set(phaseName, int64(0)) + } for phaseName, statuses := range phaseStatuses { for _, status := range statuses { phaseStatus.Set([]string{phaseName, status}, 0) @@ -294,7 +324,7 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back if err := mysqld.Init(initCtx, mycnf, initDBSQLFile); err != nil { return fmt.Errorf("failed to initialize mysql data dir and start mysqld: %v", err) } - durationByPhase.Set("InitMySQLd", int64(time.Since(initMysqldAt).Seconds())) + deprecatedDurationByPhase.Set("InitMySQLd", int64(time.Since(initMysqldAt).Seconds())) // Shut down mysqld when we're done. defer func() { // Be careful not to use the original context, because we don't want to @@ -358,14 +388,19 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back backupParams.BackupTime = time.Now() // Now we're ready to take the backup. 
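+		// Mark the InitialBackup phase active while the backup runs; the
+		// deferred reset also clears it on the error-return path.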
+ phase.Set(phaseNameInitialBackup, int64(1)) + defer phase.Set(phaseNameInitialBackup, int64(0)) if err := mysqlctl.Backup(ctx, backupParams); err != nil { return fmt.Errorf("backup failed: %v", err) } - durationByPhase.Set("InitialBackup", int64(time.Since(backupParams.BackupTime).Seconds())) + deprecatedDurationByPhase.Set("InitialBackup", int64(time.Since(backupParams.BackupTime).Seconds())) log.Info("Initial backup successful.") + phase.Set(phaseNameInitialBackup, int64(0)) return nil } + phase.Set(phaseNameRestoreLastBackup, int64(1)) + defer phase.Set(phaseNameRestoreLastBackup, int64(0)) backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard) log.Infof("Restoring latest backup from directory %v", backupDir) restoreAt := time.Now() @@ -397,7 +432,8 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back default: return fmt.Errorf("can't restore from backup: %v", err) } - durationByPhase.Set("RestoreLastBackup", int64(time.Since(restoreAt).Seconds())) + deprecatedDurationByPhase.Set("RestoreLastBackup", int64(time.Since(restoreAt).Seconds())) + phase.Set(phaseNameRestoreLastBackup, int64(0)) // As of MySQL 8.0.21, you can disable redo logging using the ALTER INSTANCE // DISABLE INNODB REDO_LOG statement. This functionality is intended for @@ -455,6 +491,9 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back backupParams.BackupTime = time.Now() // Wait for replication to catch up. + phase.Set(phaseNameCatchupReplication, int64(1)) + defer phase.Set(phaseNameCatchupReplication, int64(0)) + var ( lastStatus replication.ReplicationStatus status replication.ReplicationStatus @@ -479,26 +518,27 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back // We're caught up on replication to at least the point the primary // was at when this vtbackup run started. log.Infof("Replication caught up to %v after %v", status.Position, time.Since(waitStartTime)) - durationByPhase.Set("CatchUpReplication", int64(time.Since(waitStartTime).Seconds())) + deprecatedDurationByPhase.Set("CatchUpReplication", int64(time.Since(waitStartTime).Seconds())) break } if !lastStatus.Position.IsZero() { if status.Position.Equal(lastStatus.Position) { - phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStalled}, 1) + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStalled}, 1) } else { - phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStalled}, 0) + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStalled}, 0) } } if !status.Healthy() { log.Warning("Replication has stopped before backup could be taken. Trying to restart replication.") - phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStopped}, 1) + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStopped}, 1) if err := startReplication(ctx, mysqld, topoServer); err != nil { log.Warningf("Failed to restart replication: %v", err) } } else { - phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStopped}, 0) + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStopped}, 0) } } + phase.Set(phaseNameCatchupReplication, int64(0)) // Stop replication and see where we are. 
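+	// The CatchupReplication phase gauge was reset just above, so no phase is
+	// reported active again until TakeNewBackup begins.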
if err := mysqld.StopReplication(nil); err != nil { @@ -514,8 +554,8 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back if !status.Position.AtLeast(primaryPos) && status.Position.Equal(restorePos) { return fmt.Errorf("not taking backup: replication did not make any progress from restore point: %v", restorePos) } - phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStalled}, 0) - phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStopped}, 0) + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStalled}, 0) + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStopped}, 0) // Re-enable redo logging. if disabledRedoLog { @@ -539,15 +579,18 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back if err := mysqld.Start(ctx, mycnf); err != nil { return fmt.Errorf("Could not start MySQL after full shutdown: %v", err) } - durationByPhase.Set("RestartBeforeBackup", int64(time.Since(restartAt).Seconds())) + deprecatedDurationByPhase.Set("RestartBeforeBackup", int64(time.Since(restartAt).Seconds())) } // Now we can take a new backup. backupAt := time.Now() + phase.Set(phaseNameTakeNewBackup, int64(1)) + defer phase.Set(phaseNameTakeNewBackup, int64(0)) if err := mysqlctl.Backup(ctx, backupParams); err != nil { return fmt.Errorf("error taking backup: %v", err) } - durationByPhase.Set("TakeNewBackup", int64(time.Since(backupAt).Seconds())) + deprecatedDurationByPhase.Set("TakeNewBackup", int64(time.Since(backupAt).Seconds())) + phase.Set(phaseNameTakeNewBackup, int64(0)) // Return a non-zero exit code if we didn't meet the replication position // goal, even though we took a backup that pushes the high-water mark up. diff --git a/go/flags/endtoend/vtbackup.txt b/go/flags/endtoend/vtbackup.txt index 44cb2a08462..46e4efea301 100644 --- a/go/flags/endtoend/vtbackup.txt +++ b/go/flags/endtoend/vtbackup.txt @@ -134,6 +134,7 @@ Usage of vtbackup: --mysql_server_version string MySQL server version to advertise. (default "8.0.30-Vitess") --mysql_socket string path to the mysql socket --mysql_timeout duration how long to wait for mysqld startup (default 5m0s) + --opentsdb_uri string URI of opentsdb /api/put method --port int port for the server --pprof strings enable profiling --purge_logs_interval duration how often try to remove old logs (default 1h0m0s) diff --git a/go/stats/counters.go b/go/stats/counters.go index 371cbd53818..a4dfc0dcb1f 100644 --- a/go/stats/counters.go +++ b/go/stats/counters.go @@ -321,6 +321,29 @@ func (g *GaugesWithSingleLabel) Set(name string, value int64) { g.counters.set(name, value) } +// SyncGaugesWithSingleLabel is a GaugesWithSingleLabel that proactively pushes +// stats to push-based backends when Set is called. +type SyncGaugesWithSingleLabel struct { + GaugesWithSingleLabel + name string +} + +// NewSyncGaugesWithSingleLabel creates a new SyncGaugesWithSingleLabel. +func NewSyncGaugesWithSingleLabel(name, help, label string, tags ...string) *SyncGaugesWithSingleLabel { + return &SyncGaugesWithSingleLabel{ + GaugesWithSingleLabel: *NewGaugesWithSingleLabel(name, help, label, tags...), + name: name, + } +} + +// Set sets the value of a named gauge. 
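+// Besides updating the in-memory gauge, it immediately pushes the new value
+// to the configured push backend via pushOne.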
+func (sg *SyncGaugesWithSingleLabel) Set(name string, value int64) { + sg.GaugesWithSingleLabel.Set(name, value) + if sg.name != "" { + _ = pushOne(sg.name, &sg.GaugesWithSingleLabel) + } +} + // GaugesWithMultiLabels is a CountersWithMultiLabels implementation where // the values can go up and down. type GaugesWithMultiLabels struct { diff --git a/go/stats/export.go b/go/stats/export.go index e98ef0a969c..8bda85c87b2 100644 --- a/go/stats/export.go +++ b/go/stats/export.go @@ -121,6 +121,22 @@ func Publish(name string, v expvar.Var) { publish(name, v) } +func pushAll() error { + backend, ok := pushBackends[statsBackend] + if !ok { + return fmt.Errorf("no PushBackend registered with name %s", statsBackend) + } + return backend.PushAll() +} + +func pushOne(name string, v Variable) error { + backend, ok := pushBackends[statsBackend] + if !ok { + return fmt.Errorf("no PushBackend registered with name %s", statsBackend) + } + return backend.PushOne(name, v) +} + // StringMapFuncWithMultiLabels is a multidimensional string map publisher. // // Map keys are compound names made with joining multiple strings with '.', @@ -183,8 +199,10 @@ func publish(name string, v expvar.Var) { // to be pushed to it. It's used to support push-based metrics backends, as expvar // by default only supports pull-based ones. type PushBackend interface { - // PushAll pushes all stats from expvar to the backend + // PushAll pushes all stats from expvar to the backend. PushAll() error + // PushOne pushes a single stat from expvar to the backend. + PushOne(name string, v Variable) error } var pushBackends = make(map[string]PushBackend) @@ -214,13 +232,7 @@ func emitToBackend(emitPeriod *time.Duration) { ticker := time.NewTicker(*emitPeriod) defer ticker.Stop() for range ticker.C { - backend, ok := pushBackends[statsBackend] - if !ok { - log.Errorf("No PushBackend registered with name %s", statsBackend) - return - } - err := backend.PushAll() - if err != nil { + if err := pushAll(); err != nil { // TODO(aaijazi): This might cause log spam... log.Warningf("Pushing stats to backend %v failed: %v", statsBackend, err) } diff --git a/go/stats/opentsdb/backend.go b/go/stats/opentsdb/backend.go new file mode 100644 index 00000000000..e5c766ba797 --- /dev/null +++ b/go/stats/opentsdb/backend.go @@ -0,0 +1,58 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package opentsdb + +import ( + "time" + + "vitess.io/vitess/go/stats" +) + +// backend implements stats.PushBackend +type backend struct { + // The prefix is the name of the binary (vtgate, vttablet, etc.) and will be + // prepended to all the stats reported. + prefix string + // Tags that should be included with every data point. If there's a tag name + // collision between the common tags and a single data point's tags, the data + // point tag will override the common tag. + commonTags map[string]string + // writer is used to send data points somewhere (file, http, ...). 
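+	// The concrete writer is picked from --opentsdb_uri: a file:// URI selects
+	// the file writer, anything else the HTTP writer (see newBackend in init.go).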
+ writer writer +} + +// PushAll pushes all stats to OpenTSDB +func (b *backend) PushAll() error { + collector := b.collector() + collector.collectAll() + return b.writer.Write(collector.data) +} + +// PushOne pushes a single stat to OpenTSDB +func (b *backend) PushOne(name string, v stats.Variable) error { + collector := b.collector() + collector.collectOne(name, v) + return b.writer.Write(collector.data) +} + +func (b *backend) collector() *collector { + return &collector{ + commonTags: b.commonTags, + prefix: b.prefix, + timestamp: time.Now().Unix(), + } +} diff --git a/go/stats/opentsdb/by_metric.go b/go/stats/opentsdb/by_metric.go new file mode 100644 index 00000000000..5177109a18e --- /dev/null +++ b/go/stats/opentsdb/by_metric.go @@ -0,0 +1,54 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package opentsdb + +// byMetric implements sort.Interface for []*DataPoint based on the metric key +// and then tag values (prioritized in tag name order). Having a consistent sort order +// is convenient when refreshing /debug/opentsdb or for encoding and comparing JSON directly +// in the tests. +type byMetric []*DataPoint + +func (m byMetric) Len() int { return len(m) } +func (m byMetric) Swap(i, j int) { m[i], m[j] = m[j], m[i] } +func (m byMetric) Less(i, j int) bool { + if m[i].Metric < m[j].Metric { + return true + } + + if m[i].Metric > m[j].Metric { + return false + } + + // Metric names are the same. We can use tag values to figure out the sort order. + // The deciding tag will be the lexicographically earliest tag name where tag values differ. + decidingTagName := "" + result := false + for tagName, iVal := range m[i].Tags { + jVal, ok := m[j].Tags[tagName] + if !ok { + // We'll arbitrarily declare that if i has any tag name that j doesn't then it sorts earlier. + // This shouldn't happen in practice, though, if metric code is correct... + return true + } + + if iVal != jVal && (tagName < decidingTagName || decidingTagName == "") { + decidingTagName = tagName + result = iVal < jVal + } + } + return result +} diff --git a/go/stats/opentsdb/opentsdb.go b/go/stats/opentsdb/collector.go similarity index 54% rename from go/stats/opentsdb/opentsdb.go rename to go/stats/opentsdb/collector.go index 3e85052b5f4..9b870815067 100644 --- a/go/stats/opentsdb/opentsdb.go +++ b/go/stats/opentsdb/collector.go @@ -14,151 +14,47 @@ See the License for the specific language governing permissions and limitations under the License. */ -// Package opentsdb adds support for pushing stats to opentsdb. 
package opentsdb import ( "bytes" "encoding/json" "expvar" - "net/http" - "sort" "strings" - "time" "unicode" - "github.com/spf13/pflag" - "vitess.io/vitess/go/stats" - "vitess.io/vitess/go/vt/servenv" ) -var openTsdbURI string - -func registerFlags(fs *pflag.FlagSet) { - fs.StringVar(&openTsdbURI, "opentsdb_uri", openTsdbURI, "URI of opentsdb /api/put method") -} - -func init() { - servenv.OnParseFor("vtctld", registerFlags) - servenv.OnParseFor("vtgate", registerFlags) - servenv.OnParseFor("vttablet", registerFlags) -} - -// dataPoint represents a single OpenTSDB data point. -type dataPoint struct { - // Example: sys.cpu.nice - Metric string `json:"metric"` - // Seconds or milliseconds since unix epoch. - Timestamp float64 `json:"timestamp"` - Value float64 `json:"value"` - Tags map[string]string `json:"tags"` -} - -// sendDataPoints pushes a list of data points to openTSDB. -// All other code in this file is just to support getting this function called -// with all stats represented as data points. -func sendDataPoints(data []dataPoint) error { - json, err := json.Marshal(data) - if err != nil { - return err - } - - resp, err := http.Post(openTsdbURI, "application/json", bytes.NewReader(json)) - if err != nil { - return err - } - resp.Body.Close() - return nil -} - -// openTSDBBackend implements stats.PushBackend -type openTSDBBackend struct { - // The prefix is the name of the binary (vtgate, vttablet, etc.) and will be - // prepended to all the stats reported. - prefix string - // Tags that should be included with every data point. If there's a tag name - // collision between the common tags and a single data point's tags, the data - // point tag will override the common tag. +// collector tracks state for a single pass of stats reporting / data collection. +type collector struct { commonTags map[string]string -} - -// dataCollector tracks state for a single pass of stats reporting / data collection. -type dataCollector struct { - settings *openTSDBBackend + data []*DataPoint + prefix string timestamp int64 - dataPoints []dataPoint -} - -// Init attempts to create a singleton openTSDBBackend and register it as a PushBackend. -// If it fails to create one, this is a noop. The prefix argument is an optional string -// to prepend to the name of every data point reported. -func Init(prefix string) { - // Needs to happen in servenv.OnRun() instead of init because it requires flag parsing and logging - servenv.OnRun(func() { - InitWithoutServenv(prefix) - }) -} - -// InitWithoutServenv initializes the opentsdb without servenv -func InitWithoutServenv(prefix string) { - if openTsdbURI == "" { - return - } - - backend := &openTSDBBackend{ - prefix: prefix, - commonTags: stats.ParseCommonTags(stats.CommonTags), - } - - stats.RegisterPushBackend("opentsdb", backend) - - servenv.HTTPHandleFunc("/debug/opentsdb", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json; charset=utf-8") - dataPoints := (*backend).getDataPoints() - sort.Sort(byMetric(dataPoints)) - - if b, err := json.MarshalIndent(dataPoints, "", " "); err != nil { - w.Write([]byte(err.Error())) - } else { - w.Write(b) - } - }) } -// PushAll pushes all stats to OpenTSDB -func (backend *openTSDBBackend) PushAll() error { - return sendDataPoints(backend.getDataPoints()) -} - -// getDataPoints fetches all stats in an opentsdb-compatible format. -// This is separated from PushAll() so it can be reused for the /debug/opentsdb handler. 
-func (backend *openTSDBBackend) getDataPoints() []dataPoint { - dataCollector := &dataCollector{ - settings: backend, - timestamp: time.Now().Unix(), - } - +func (dc *collector) collectAll() { expvar.Do(func(kv expvar.KeyValue) { - dataCollector.addExpVar(kv) + dc.addExpVar(kv) }) - - return dataCollector.dataPoints } -// combineMetricName joins parts of a hierarchical name with a "." -func combineMetricName(parts ...string) string { - return strings.Join(parts, ".") +func (dc *collector) collectOne(name string, v expvar.Var) { + dc.addExpVar(expvar.KeyValue{ + Key: name, + Value: v, + }) } -func (dc *dataCollector) addInt(metric string, val int64, tags map[string]string) { +func (dc *collector) addInt(metric string, val int64, tags map[string]string) { dc.addFloat(metric, float64(val), tags) } -func (dc *dataCollector) addFloat(metric string, val float64, tags map[string]string) { +func (dc *collector) addFloat(metric string, val float64, tags map[string]string) { var fullMetric string - if len(dc.settings.prefix) > 0 { - fullMetric = combineMetricName(dc.settings.prefix, metric) + if len(dc.prefix) > 0 { + fullMetric = combineMetricName(dc.prefix, metric) } else { fullMetric = metric } @@ -182,20 +78,20 @@ func (dc *dataCollector) addFloat(metric string, val float64, tags map[string]st } fullTags := make(map[string]string) - for k, v := range dc.settings.commonTags { + for k, v := range dc.commonTags { fullTags[sanitize(k)] = sanitize(v) } for k, v := range tags { fullTags[sanitize(k)] = sanitize(v) } - dp := dataPoint{ + dp := &DataPoint{ Metric: sanitize(fullMetric), Value: val, Timestamp: float64(dc.timestamp), Tags: fullTags, } - dc.dataPoints = append(dc.dataPoints, dp) + dc.data = append(dc.data, dp) } // addExpVar adds all the data points associated with a particular expvar to the list of @@ -206,7 +102,7 @@ func (dc *dataCollector) addFloat(metric string, val float64, tags map[string]st // // Generic unrecognized expvars are serialized to json and their int/float values are exported. // Strings and lists in expvars are not exported. -func (dc *dataCollector) addExpVar(kv expvar.KeyValue) { +func (dc *collector) addExpVar(kv expvar.KeyValue) { k := kv.Key switch v := kv.Value.(type) { case stats.FloatFunc: @@ -268,24 +164,8 @@ func (dc *dataCollector) addExpVar(kv expvar.KeyValue) { } } -// makeLabel builds a tag list with a single label + value. -func makeLabel(labelName string, labelVal string) map[string]string { - return map[string]string{labelName: labelVal} -} - -// makeLabels takes the vitess stat representation of label values ("."-separated list) and breaks it -// apart into a map of label name -> label value. -func makeLabels(labelNames []string, labelValsCombined string) map[string]string { - tags := make(map[string]string) - labelVals := strings.Split(labelValsCombined, ".") - for i, v := range labelVals { - tags[labelNames[i]] = v - } - return tags -} - // addUnrecognizedExpvars recurses into a json object to pull out float64 variables to report. -func (dc *dataCollector) addUnrecognizedExpvars(prefix string, obj map[string]any) { +func (dc *collector) addUnrecognizedExpvars(prefix string, obj map[string]any) { for k, v := range obj { prefix := combineMetricName(prefix, k) switch v := v.(type) { @@ -298,7 +178,7 @@ func (dc *dataCollector) addUnrecognizedExpvars(prefix string, obj map[string]an } // addTimings converts a vitess Timings stat to something opentsdb can deal with. 
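+// Each histogram bucket is flattened into a separate metric, since OpenTSDB
+// has no native histogram type (see addHistogram below).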
-func (dc *dataCollector) addTimings(labels []string, timings *stats.Timings, prefix string) { +func (dc *collector) addTimings(labels []string, timings *stats.Timings, prefix string) { histograms := timings.Histograms() for labelValsCombined, histogram := range histograms { // If you prefer millisecond timings over nanoseconds you can pass 1000000 here instead of 1. @@ -306,7 +186,7 @@ func (dc *dataCollector) addTimings(labels []string, timings *stats.Timings, pre } } -func (dc *dataCollector) addHistogram(histogram *stats.Histogram, divideBy int64, prefix string, tags map[string]string) { +func (dc *collector) addHistogram(histogram *stats.Histogram, divideBy int64, prefix string, tags map[string]string) { // TODO: OpenTSDB 2.3 doesn't have histogram support, although it's forthcoming. // For simplicity we report each bucket as a different metric. // @@ -335,39 +215,23 @@ func (dc *dataCollector) addHistogram(histogram *stats.Histogram, divideBy int64 ) } -// byMetric implements sort.Interface for []dataPoint based on the metric key -// and then tag values (prioritized in tag name order). Having a consistent sort order -// is convenient when refreshing /debug/opentsdb or for encoding and comparing JSON directly -// in the tests. -type byMetric []dataPoint - -func (m byMetric) Len() int { return len(m) } -func (m byMetric) Swap(i, j int) { m[i], m[j] = m[j], m[i] } -func (m byMetric) Less(i, j int) bool { - if m[i].Metric < m[j].Metric { - return true - } - - if m[i].Metric > m[j].Metric { - return false - } +// combineMetricName joins parts of a hierarchical name with a "." +func combineMetricName(parts ...string) string { + return strings.Join(parts, ".") +} - // Metric names are the same. We can use tag values to figure out the sort order. - // The deciding tag will be the lexicographically earliest tag name where tag values differ. - decidingTagName := "" - result := false - for tagName, iVal := range m[i].Tags { - jVal, ok := m[j].Tags[tagName] - if !ok { - // We'll arbitrarily declare that if i has any tag name that j doesn't then it sorts earlier. - // This shouldn't happen in practice, though, if metric code is correct... - return true - } +// makeLabel builds a tag list with a single label + value. +func makeLabel(labelName string, labelVal string) map[string]string { + return map[string]string{labelName: labelVal} +} - if iVal != jVal && (tagName < decidingTagName || decidingTagName == "") { - decidingTagName = tagName - result = iVal < jVal - } +// makeLabels takes the vitess stat representation of label values ("."-separated list) and breaks it +// apart into a map of label name -> label value. +func makeLabels(labelNames []string, labelValsCombined string) map[string]string { + tags := make(map[string]string) + labelVals := strings.Split(labelValsCombined, ".") + for i, v := range labelVals { + tags[labelNames[i]] = v } - return result + return tags } diff --git a/go/stats/opentsdb/datapoint.go b/go/stats/opentsdb/datapoint.go new file mode 100644 index 00000000000..42e69b84d47 --- /dev/null +++ b/go/stats/opentsdb/datapoint.go @@ -0,0 +1,90 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package opentsdb
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+)
+
+// DataPoint represents a single OpenTSDB data point.
+type DataPoint struct {
+	// Example: sys.cpu.nice
+	Metric string `json:"metric"`
+	// Seconds or milliseconds since unix epoch.
+	Timestamp float64           `json:"timestamp"`
+	Value     float64           `json:"value"`
+	Tags      map[string]string `json:"tags"`
+}
+
+func (dp *DataPoint) MarshalText() (string, error) {
+	var sb strings.Builder
+
+	if _, err := sb.WriteString(fmt.Sprintf("%s %f %f", dp.Metric, dp.Timestamp, dp.Value)); err != nil {
+		return "", err
+	}
+
+	for k, v := range dp.Tags {
+		if _, err := sb.WriteString(fmt.Sprintf(" %s=%s", k, v)); err != nil {
+			return "", err
+		}
+	}
+
+	if _, err := sb.WriteString("\n"); err != nil {
+		return "", err
+	}
+
+	return sb.String(), nil
+}
+
+func unmarshalTextToData(dp *DataPoint, text []byte) error {
+	parts := strings.Split(string(text), " ")
+
+	if len(parts) < 3 {
+		// Technically every OpenTSDB time series requires at least one tag,
+		// but some of the metrics we send have zero.
+		return fmt.Errorf("require format: <metric> <timestamp> <value> [<tag>=<value> ...]")
+	}
+
+	dp.Metric = parts[0]
+
+	timestamp, err := strconv.ParseFloat(parts[1], 64)
+	if err != nil {
+		return err
+	}
+	dp.Timestamp = timestamp
+
+	value, err := strconv.ParseFloat(parts[2], 64)
+	if err != nil {
+		return err
+	}
+	dp.Value = value
+
+	for _, kv := range parts[3:] {
+		tagParts := strings.Split(kv, "=")
+		if len(tagParts) != 2 {
+			return fmt.Errorf("require tag format: <name>=<value>")
+		}
+		if dp.Tags == nil {
+			dp.Tags = make(map[string]string)
+		}
+		dp.Tags[tagParts[0]] = tagParts[1]
+	}
+
+	return nil
+}
diff --git a/go/stats/opentsdb/datapoint_reader.go b/go/stats/opentsdb/datapoint_reader.go
new file mode 100644
index 00000000000..441be9eb7a1
--- /dev/null
+++ b/go/stats/opentsdb/datapoint_reader.go
@@ -0,0 +1,53 @@
+/*
+Copyright 2023 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package opentsdb
+
+import (
+	"bufio"
+	"io"
+)
+
+var newLineDelimiter = byte('\n')
+
+// DataPointReader parses bytes from io.Reader into DataPoints.
+type DataPointReader struct {
+	reader *bufio.Reader
+}
+
+func NewDataPointReader(r io.Reader) *DataPointReader {
+	return &DataPointReader{
+		reader: bufio.NewReader(r),
+	}
+}
+
+// Read returns a DataPoint from the underlying io.Reader.
+//
+// Returns an error if no DataPoint could be parsed.
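+//
+// When the underlying reader is exhausted, the returned error is io.EOF, which
+// callers can use as their stop condition.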
+func (tr *DataPointReader) Read() (*DataPoint, error) { + bs, err := tr.reader.ReadBytes(newLineDelimiter) + if err != nil { + return nil, err + } + + dp := &DataPoint{} + + if err := unmarshalTextToData(dp, bs[:len(bs)-1]); err != nil { + return nil, err + } + + return dp, nil +} diff --git a/go/stats/opentsdb/doc.go b/go/stats/opentsdb/doc.go new file mode 100644 index 00000000000..88c22a58c70 --- /dev/null +++ b/go/stats/opentsdb/doc.go @@ -0,0 +1,18 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package opentsdb adds support for pushing stats to opentsdb. +package opentsdb diff --git a/go/stats/opentsdb/file_writer.go b/go/stats/opentsdb/file_writer.go new file mode 100644 index 00000000000..7f2d2f2ccc7 --- /dev/null +++ b/go/stats/opentsdb/file_writer.go @@ -0,0 +1,52 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package opentsdb + +import ( + "io" + "os" +) + +type fileWriter struct { + writer io.WriteCloser +} + +func newFileWriter(path string) (writer, error) { + f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_SYNC|os.O_WRONLY, 0644) + if err != nil { + return nil, err + } + + return &fileWriter{ + writer: f, + }, nil +} + +func (fw *fileWriter) Write(data []*DataPoint) error { + for _, d := range data { + text, err := d.MarshalText() + if err != nil { + return err + } + + if _, err := fw.writer.Write([]byte(text)); err != nil { + return err + } + } + + return nil +} diff --git a/go/stats/opentsdb/flags.go b/go/stats/opentsdb/flags.go new file mode 100644 index 00000000000..8ccd0279981 --- /dev/null +++ b/go/stats/opentsdb/flags.go @@ -0,0 +1,38 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package opentsdb + +import ( + "github.com/spf13/pflag" + + "vitess.io/vitess/go/vt/servenv" +) + +var ( + openTSDBURI string +) + +func registerFlags(fs *pflag.FlagSet) { + fs.StringVar(&openTSDBURI, "opentsdb_uri", openTSDBURI, "URI of opentsdb /api/put method") +} + +func init() { + servenv.OnParseFor("vtbackup", registerFlags) + servenv.OnParseFor("vtctld", registerFlags) + servenv.OnParseFor("vtgate", registerFlags) + servenv.OnParseFor("vttablet", registerFlags) +} diff --git a/go/stats/opentsdb/http_writer.go b/go/stats/opentsdb/http_writer.go new file mode 100644 index 00000000000..7b7801d7f77 --- /dev/null +++ b/go/stats/opentsdb/http_writer.go @@ -0,0 +1,51 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package opentsdb + +import ( + "bytes" + "encoding/json" + "net/http" +) + +type httpWriter struct { + client *http.Client + uri string +} + +func newHTTPWriter(client *http.Client, uri string) *httpWriter { + return &httpWriter{ + client: client, + uri: uri, + } +} + +func (hw *httpWriter) Write(data []*DataPoint) error { + jsonb, err := json.Marshal(data) + if err != nil { + return err + } + + resp, err := hw.client.Post(hw.uri, "application/json", bytes.NewReader(jsonb)) + if err != nil { + return err + } + + resp.Body.Close() + + return nil +} diff --git a/go/stats/opentsdb/init.go b/go/stats/opentsdb/init.go new file mode 100644 index 00000000000..51186ad7650 --- /dev/null +++ b/go/stats/opentsdb/init.go @@ -0,0 +1,104 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package opentsdb + +import ( + "encoding/json" + "fmt" + "net/http" + "net/url" + "sort" + + "vitess.io/vitess/go/stats" + "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/servenv" +) + +var singletonBackend stats.PushBackend + +// Init attempts to create a singleton *opentsdb.backend and register it as a PushBackend. +// If it fails to create one, this is a noop. The prefix argument is an optional string +// to prepend to the name of every data point reported. 
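+//
+// The real work happens in servenv.OnRun rather than at call time, because the
+// backend depends on --opentsdb_uri, which is only available after flag parsing.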
+func Init(prefix string) { + // Needs to happen in servenv.OnRun() instead of init because it requires flag parsing and logging + servenv.OnRun(func() { + log.Info("Initializing opentsdb backend...") + backend, err := InitWithoutServenv(prefix) + if err != nil { + log.Infof("Failed to initialize singleton opentsdb backend: %v", err) + } else { + singletonBackend = backend + log.Info("Initialized opentsdb backend.") + } + }) +} + +// InitWithoutServenv initializes the opentsdb without servenv +func InitWithoutServenv(prefix string) (stats.PushBackend, error) { + b, err := newBackend(prefix) + + if err != nil { + return nil, err + } + + stats.RegisterPushBackend("opentsdb", b) + + servenv.HTTPHandleFunc("/debug/opentsdb", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json; charset=utf-8") + collector := b.collector() + collector.collectAll() + data := collector.data + sort.Sort(byMetric(data)) + + if b, err := json.MarshalIndent(data, "", " "); err != nil { + w.Write([]byte(err.Error())) + } else { + w.Write(b) + } + }) + + return b, nil +} + +func newBackend(prefix string) (*backend, error) { + if openTSDBURI == "" { + return nil, fmt.Errorf("cannot create opentsdb PushBackend with empty --opentsdb_uri") + } + + var w writer + + // Use the file API when the uri is in format file://... + u, err := url.Parse(openTSDBURI) + if err != nil { + return nil, fmt.Errorf("failed to parse --opentsdb_uri %s: %v", openTSDBURI, err) + } else if u.Scheme == "file" { + fw, err := newFileWriter(u.Path) + if err != nil { + return nil, fmt.Errorf("failed to create file-based writer for --opentsdb_uri %s: %v", openTSDBURI, err) + } else { + w = fw + } + } else { + w = newHTTPWriter(&http.Client{}, openTSDBURI) + } + + return &backend{ + prefix: prefix, + commonTags: stats.ParseCommonTags(stats.CommonTags), + writer: w, + }, nil +} diff --git a/go/stats/opentsdb/opentsdb_test.go b/go/stats/opentsdb/opentsdb_test.go index 0e8ff240500..940ee845ada 100644 --- a/go/stats/opentsdb/opentsdb_test.go +++ b/go/stats/opentsdb/opentsdb_test.go @@ -352,15 +352,16 @@ func TestOpenTsdbTimings(t *testing.T) { } func checkOutput(t *testing.T, statName string, wantJSON string) { - backend := &openTSDBBackend{ + b := &backend{ prefix: "vtgate", commonTags: map[string]string{"host": "localhost"}, } timestamp := int64(1234) - dc := &dataCollector{ - settings: backend, - timestamp: timestamp, + dc := &collector{ + commonTags: b.commonTags, + prefix: b.prefix, + timestamp: timestamp, } found := false expvar.Do(func(kv expvar.KeyValue) { @@ -368,9 +369,9 @@ func checkOutput(t *testing.T, statName string, wantJSON string) { found = true dc.addExpVar(kv) - sort.Sort(byMetric(dc.dataPoints)) + sort.Sort(byMetric(dc.data)) - gotBytes, err := json.MarshalIndent(dc.dataPoints, "", " ") + gotBytes, err := json.MarshalIndent(dc.data, "", " ") if err != nil { t.Errorf("Failed to marshal json: %v", err) return diff --git a/go/stats/opentsdb/writer.go b/go/stats/opentsdb/writer.go new file mode 100644 index 00000000000..49d221cc782 --- /dev/null +++ b/go/stats/opentsdb/writer.go @@ -0,0 +1,21 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package opentsdb + +type writer interface { + Write([]*DataPoint) error +} diff --git a/go/stats/statsd/statsd.go b/go/stats/statsd/statsd.go index 269b185ff7c..f791d7b742d 100644 --- a/go/stats/statsd/statsd.go +++ b/go/stats/statsd/statsd.go @@ -219,7 +219,7 @@ func (sb StatsBackend) addExpVar(kv expvar.KeyValue) { } } -// PushAll flush out the pending metrics +// PushAll flushes out the pending metrics func (sb StatsBackend) PushAll() error { expvar.Do(func(kv expvar.KeyValue) { sb.addExpVar(kv) @@ -229,3 +229,15 @@ func (sb StatsBackend) PushAll() error { } return nil } + +// PushOne pushes the single provided metric. +func (sb StatsBackend) PushOne(name string, v stats.Variable) error { + sb.addExpVar(expvar.KeyValue{ + Key: name, + Value: v, + }) + if err := sb.statsdClient.Flush(); err != nil { + return err + } + return nil +} diff --git a/go/test/endtoend/backup/vtbackup/backup_only_test.go b/go/test/endtoend/backup/vtbackup/backup_only_test.go index 408cc64a21b..5e80d5d3cc3 100644 --- a/go/test/endtoend/backup/vtbackup/backup_only_test.go +++ b/go/test/endtoend/backup/vtbackup/backup_only_test.go @@ -19,7 +19,9 @@ package vtbackup import ( "context" "encoding/json" + "errors" "fmt" + "io" "os" "path" "strings" @@ -30,6 +32,7 @@ import ( "github.com/stretchr/testify/require" "vitess.io/vitess/go/mysql" + "vitess.io/vitess/go/stats/opentsdb" "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/mysqlctl" @@ -59,8 +62,9 @@ func TestTabletInitialBackup(t *testing.T) { waitForReplicationToCatchup([]cluster.Vttablet{*replica1, *replica2}) - vtBackup(t, true, false, false) + dataPointReader := vtBackup(t, true, false, false) verifyBackupCount(t, shardKsName, 1) + verifyBackupStats(t, dataPointReader, true /* initialBackup */) // Initialize the tablets initTablets(t, false, false) @@ -144,11 +148,13 @@ func firstBackupTest(t *testing.T, tabletType string) { // backup the replica log.Infof("taking backup %s", time.Now()) - vtBackup(t, false, true, true) + dataPointReader := vtBackup(t, false, true, true) log.Infof("done taking backup %s", time.Now()) // check that the backup shows up in the listing verifyBackupCount(t, shardKsName, len(backups)+1) + // check that backup stats are what we expect + verifyBackupStats(t, dataPointReader, false /* initialBackup */) // insert more data on the primary _, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test2')", keyspaceName, true) @@ -173,16 +179,24 @@ func firstBackupTest(t *testing.T, tabletType string) { verifyBackupCount(t, shardKsName, 0) } -func vtBackup(t *testing.T, initialBackup bool, restartBeforeBackup, disableRedoLog bool) { +func vtBackup(t *testing.T, initialBackup bool, restartBeforeBackup, disableRedoLog bool) *opentsdb.DataPointReader { mysqlSocket, err := os.CreateTemp("", "vtbackup_test_mysql.sock") require.Nil(t, err) defer os.Remove(mysqlSocket.Name()) + // Prepare opentsdb stats file path. 
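+	// Each test writes to its own file under t.TempDir(), so data points from
+	// one test cannot leak into another test's assertions.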
+	statsPath := path.Join(t.TempDir(), fmt.Sprintf("opentsdb.%s.txt", t.Name()))
+
 	// Take the backup using the vtbackup executable
 	extraArgs := []string{
 		"--allow_first_backup",
 		"--db-credentials-file", dbCredentialFile,
 		"--mysql_socket", mysqlSocket.Name(),
+
+		// Use opentsdb for stats.
+		"--stats_backend", "opentsdb",
+		// Write stats to file for reading afterwards.
+		"--opentsdb_uri", fmt.Sprintf("file://%s", statsPath),
 	}
 	if restartBeforeBackup {
 		extraArgs = append(extraArgs, "--restart_before_backup")
@@ -201,6 +215,10 @@ func vtBackup(t *testing.T, initialBackup bool, restartBeforeBackup, disableRedo
 	log.Infof("starting backup tablet %s", time.Now())
 	err = localCluster.StartVtbackup(newInitDBFile, initialBackup, keyspaceName, shardName, cell, extraArgs...)
 	require.Nil(t, err)
+
+	f, err := os.OpenFile(statsPath, os.O_RDONLY, 0)
+	require.NoError(t, err)
+	return opentsdb.NewDataPointReader(f)
 }
 
 func verifyBackupCount(t *testing.T, shardKsName string, expected int) []string {
@@ -413,3 +431,73 @@ func waitForReplicationToCatchup(tablets []cluster.Vttablet) bool {
 		}
 	}
 }
+
+func verifyBackupStats(t *testing.T, dataPointReader *opentsdb.DataPointReader, initialBackup bool) {
+	// During execution, the following phases will become active, in order.
+	var expectActivePhases []string
+	if initialBackup {
+		expectActivePhases = []string{
+			"initialbackup",
+		}
+	} else {
+		expectActivePhases = []string{
+			"restorelastbackup",
+			"catchupreplication",
+			"takenewbackup",
+		}
+	}
+
+	// Sequence of phase activity.
+	activePhases := make([]string, 0)
+
+	// Last seen phase values.
+	phaseValues := make(map[string]int64)
+
+	// Scan for phase activity until we're out of stats to scan.
+	for dataPoint, err := dataPointReader.Read(); !errors.Is(err, io.EOF); dataPoint, err = dataPointReader.Read() {
+		// We're only interested in "vtbackup.phase" metrics in this test.
+		if dataPoint.Metric != "vtbackup.phase" {
+			continue
+		}
+
+		phase := dataPoint.Tags["phase"]
+		value := int64(dataPoint.Value)
+		lastValue, ok := phaseValues[phase]
+
+		// The value should always be 0 or 1.
+		require.True(t, int64(0) == value || int64(1) == value)
+
+		// The first time the phase is reported, it should be 0.
+		if !ok {
+			require.Equal(t, int64(0), value)
+		}
+
+		// Eventually the phase should go active. The next time it reports,
+		// it should go inactive.
+		if lastValue == 1 {
+			require.Equal(t, int64(0), value)
+		}
+
+		// Record current value.
+		phaseValues[phase] = value
+
+		// Add phase to sequence once it goes from active to inactive.
+		if lastValue == 1 && value == 0 {
+			activePhases = append(activePhases, phase)
+		}
+
+		// Verify at most one phase is active.
+		activeCount := 0
+		for _, value := range phaseValues {
+			if value == int64(0) {
+				continue
+			}
+
+			activeCount++
+			require.LessOrEqual(t, activeCount, 1)
+		}
+	}
+
+	// Verify phase sequences.
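+	// Phases that completed an inactive -> active -> inactive cycle must match
+	// the expected phases exactly, and in the same order.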
+ require.Equal(t, expectActivePhases, activePhases) +} diff --git a/go/test/endtoend/cluster/vtbackup_process.go b/go/test/endtoend/cluster/vtbackup_process.go index ba508e8d593..57350922a21 100644 --- a/go/test/endtoend/cluster/vtbackup_process.go +++ b/go/test/endtoend/cluster/vtbackup_process.go @@ -69,8 +69,7 @@ func (vtbackup *VtbackupProcess) Setup() (err error) { //Backup Arguments are not optional "--backup_storage_implementation", "file", - "--file_backup_storage_root", - path.Join(os.Getenv("VTDATAROOT"), "tmp", "backupstorage"), + "--file_backup_storage_root", path.Join(os.Getenv("VTDATAROOT"), "tmp", "backupstorage"), ) if vtbackup.initialBackup { From b0b3ed68ecfc28d83fcec9ef171361bf81188b53 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Date: Wed, 20 Sep 2023 13:31:39 +0300 Subject: [PATCH 2/7] Endtoend: stress tests for VTGate FOREIGN KEY support (#13799) Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- ...ster_endtoend_vtgate_foreignkey_stress.yml | 148 +++ go.mod | 1 + go.sum | 2 + go/test/endtoend/cluster/cluster_util.go | 51 +- go/test/endtoend/onlineddl/vtgate_util.go | 4 +- .../buffer/reparent/failover_buffer_test.go | 2 +- .../foreignkey/stress/fk_stress_test.go | 1131 +++++++++++++++++ test/ci_workflow_gen.go | 1 + test/config.json | 9 + 9 files changed, 1334 insertions(+), 15 deletions(-) create mode 100644 .github/workflows/cluster_endtoend_vtgate_foreignkey_stress.yml create mode 100644 go/test/endtoend/vtgate/foreignkey/stress/fk_stress_test.go diff --git a/.github/workflows/cluster_endtoend_vtgate_foreignkey_stress.yml b/.github/workflows/cluster_endtoend_vtgate_foreignkey_stress.yml new file mode 100644 index 00000000000..d44389d1497 --- /dev/null +++ b/.github/workflows/cluster_endtoend_vtgate_foreignkey_stress.yml @@ -0,0 +1,148 @@ +# DO NOT MODIFY: THIS FILE IS GENERATED USING "make generate_ci_workflows" + +name: Cluster (vtgate_foreignkey_stress) +on: [push, pull_request] +concurrency: + group: format('{0}-{1}', ${{ github.ref }}, 'Cluster (vtgate_foreignkey_stress)') + cancel-in-progress: true + +permissions: read-all + +env: + LAUNCHABLE_ORGANIZATION: "vitess" + LAUNCHABLE_WORKSPACE: "vitess-app" + GITHUB_PR_HEAD_SHA: "${{ github.event.pull_request.head.sha }}" + +jobs: + build: + name: Run endtoend tests on Cluster (vtgate_foreignkey_stress) + runs-on: gh-hosted-runners-4cores-1 + + steps: + - name: Skip CI + run: | + if [[ "${{contains( github.event.pull_request.labels.*.name, 'Skip CI')}}" == "true" ]]; then + echo "skipping CI due to the 'Skip CI' label" + exit 1 + fi + + - name: Check if workflow needs to be skipped + id: skip-workflow + run: | + skip='false' + if [[ "${{github.event.pull_request}}" == "" ]] && [[ "${{github.ref}}" != "refs/heads/main" ]] && [[ ! "${{github.ref}}" =~ ^refs/heads/release-[0-9]+\.[0-9]$ ]] && [[ ! 
"${{github.ref}}" =~ "refs/tags/.*" ]]; then + skip='true' + fi + echo Skip ${skip} + echo "skip-workflow=${skip}" >> $GITHUB_OUTPUT + + PR_DATA=$(curl \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + -H "Accept: application/vnd.github.v3+json" \ + "https://api.github.com/repos/${{ github.repository }}/pulls/${{ github.event.pull_request.number }}") + draft=$(echo "$PR_DATA" | jq .draft -r) + echo "is_draft=${draft}" >> $GITHUB_OUTPUT + + - name: Check out code + if: steps.skip-workflow.outputs.skip-workflow == 'false' + uses: actions/checkout@v3 + + - name: Check for changes in relevant files + if: steps.skip-workflow.outputs.skip-workflow == 'false' + uses: frouioui/paths-filter@main + id: changes + with: + token: '' + filters: | + end_to_end: + - 'go/**/*.go' + - 'test.go' + - 'Makefile' + - 'build.env' + - 'go.sum' + - 'go.mod' + - 'proto/*.proto' + - 'tools/**' + - 'config/**' + - 'bootstrap.sh' + - '.github/workflows/cluster_endtoend_vtgate_foreignkey_stress.yml' + + - name: Set up Go + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + uses: actions/setup-go@v4 + with: + go-version: 1.21.0 + + - name: Set up python + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + uses: actions/setup-python@v4 + + - name: Tune the OS + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + run: | + # Limit local port range to not use ports that overlap with server side + # ports that we listen on. + sudo sysctl -w net.ipv4.ip_local_port_range="22768 65535" + # Increase the asynchronous non-blocking I/O. More information at https://dev.mysql.com/doc/refman/5.7/en/innodb-parameters.html#sysvar_innodb_use_native_aio + echo "fs.aio-max-nr = 1048576" | sudo tee -a /etc/sysctl.conf + sudo sysctl -p /etc/sysctl.conf + + - name: Get dependencies + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + run: | + + # Get key to latest MySQL repo + sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 467B942D3A79BD29 + # Setup MySQL 8.0 + wget -c https://dev.mysql.com/get/mysql-apt-config_0.8.24-1_all.deb + echo mysql-apt-config mysql-apt-config/select-server select mysql-8.0 | sudo debconf-set-selections + sudo DEBIAN_FRONTEND="noninteractive" dpkg -i mysql-apt-config* + sudo apt-get update + # Install everything else we need, and configure + sudo apt-get install -y mysql-server mysql-client make unzip g++ etcd curl git wget eatmydata xz-utils libncurses5 + + sudo service mysql stop + sudo service etcd stop + sudo ln -s /etc/apparmor.d/usr.sbin.mysqld /etc/apparmor.d/disable/ + sudo apparmor_parser -R /etc/apparmor.d/usr.sbin.mysqld + go mod download + + # install JUnit report formatter + go install github.com/vitessio/go-junit-report@HEAD + + - name: Setup launchable dependencies + if: steps.skip-workflow.outputs.is_draft == 'false' && steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' && github.base_ref == 'main' + run: | + # Get Launchable CLI installed. If you can, make it a part of the builder image to speed things up + pip3 install --user launchable~=1.0 > /dev/null + + # verify that launchable setup is all correct. + launchable verify || true + + # Tell Launchable about the build you are producing and testing + launchable record build --name "$GITHUB_RUN_ID" --no-commit-collection --source . 
+ + - name: Run cluster endtoend test + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + timeout-minutes: 45 + run: | + # We set the VTDATAROOT to the /tmp folder to reduce the file path of mysql.sock file + # which musn't be more than 107 characters long. + export VTDATAROOT="/tmp/" + source build.env + + set -exo pipefail + + # run the tests however you normally do, then produce a JUnit XML file + eatmydata -- go run test.go -docker=false -follow -shard vtgate_foreignkey_stress | tee -a output.txt | go-junit-report -set-exit-code > report.xml + + - name: Print test output and Record test result in launchable if PR is not a draft + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' && always() + run: | + if [[ "${{steps.skip-workflow.outputs.is_draft}}" == "false" ]]; then + # send recorded tests to launchable + launchable record tests --build "$GITHUB_RUN_ID" go-test . || true + fi + + # print test output + cat output.txt diff --git a/go.mod b/go.mod index 639a22edc6b..88e09e70d22 100644 --- a/go.mod +++ b/go.mod @@ -107,6 +107,7 @@ require ( github.com/spf13/jwalterweatherman v1.1.0 github.com/xlab/treeprint v1.2.0 go.uber.org/goleak v1.2.1 + golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 golang.org/x/sync v0.3.0 modernc.org/sqlite v1.20.3 ) diff --git a/go.sum b/go.sum index ffadd6498b9..cb54903b2e9 100644 --- a/go.sum +++ b/go.sum @@ -677,6 +677,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= +golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ= +golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= diff --git a/go/test/endtoend/cluster/cluster_util.go b/go/test/endtoend/cluster/cluster_util.go index 0e3cc2d0c95..1af9504389b 100644 --- a/go/test/endtoend/cluster/cluster_util.go +++ b/go/test/endtoend/cluster/cluster_util.go @@ -223,26 +223,53 @@ func filterResultWhenRunsForCoverage(input string) string { return result } +func ValidateReplicationIsHealthy(t *testing.T, tablet *Vttablet) bool { + query := "show replica status" + rs, err := tablet.VttabletProcess.QueryTablet(query, "", true) + assert.NoError(t, err) + row := rs.Named().Row() + require.NotNil(t, row) + + ioRunning := row.AsString("Replica_IO_Running", "") + require.NotEmpty(t, ioRunning) + ioHealthy := assert.Equalf(t, "Yes", ioRunning, "Replication is broken. Replication status: %v", row) + sqlRunning := row.AsString("Replica_SQL_Running", "") + require.NotEmpty(t, sqlRunning) + sqlHealthy := assert.Equalf(t, "Yes", sqlRunning, "Replication is broken. 
Replication status: %v", row) + + return ioHealthy && sqlHealthy +} + // WaitForReplicationPos will wait for replication position to catch-up -func WaitForReplicationPos(t *testing.T, tabletA *Vttablet, tabletB *Vttablet, hostname string, timeout float64) { +func WaitForReplicationPos(t *testing.T, tabletA *Vttablet, tabletB *Vttablet, validateReplication bool, timeout time.Duration) { + hostname := "localhost" + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + ticker := time.NewTicker(10 * time.Millisecond) + defer ticker.Stop() + replicationPosA, _ := GetPrimaryPosition(t, *tabletA, hostname) for { + if validateReplication { + if !ValidateReplicationIsHealthy(t, tabletB) { + assert.FailNowf(t, "Replication broken on tablet %v. Will not wait for position", tabletB.Alias) + } + if t.Failed() { + return + } + } replicationPosB, _ := GetPrimaryPosition(t, *tabletB, hostname) if positionAtLeast(t, tabletA, replicationPosB, replicationPosA) { - break + return } msg := fmt.Sprintf("%s's replication position to catch up to %s's;currently at: %s, waiting to catch up to: %s", tabletB.Alias, tabletA.Alias, replicationPosB, replicationPosA) - waitStep(t, msg, timeout, 0.01) - } -} - -func waitStep(t *testing.T, msg string, timeout float64, sleepTime float64) float64 { - timeout = timeout - sleepTime - if timeout < 0.0 { - t.Errorf("timeout waiting for condition '%s'", msg) + select { + case <-ctx.Done(): + assert.FailNowf(t, "Timeout waiting for condition '%s'", msg) + return + case <-ticker.C: + } } - time.Sleep(time.Duration(sleepTime) * time.Second) - return timeout } func positionAtLeast(t *testing.T, tablet *Vttablet, a string, b string) bool { diff --git a/go/test/endtoend/onlineddl/vtgate_util.go b/go/test/endtoend/onlineddl/vtgate_util.go index ae214a644b6..3d99a2cef92 100644 --- a/go/test/endtoend/onlineddl/vtgate_util.go +++ b/go/test/endtoend/onlineddl/vtgate_util.go @@ -207,7 +207,7 @@ func CheckLaunchAllMigrations(t *testing.T, vtParams *mysql.ConnParams, expectCo } // CheckMigrationStatus verifies that the migration indicated by given UUID has the given expected status -func CheckMigrationStatus(t *testing.T, vtParams *mysql.ConnParams, shards []cluster.Shard, uuid string, expectStatuses ...schema.OnlineDDLStatus) { +func CheckMigrationStatus(t *testing.T, vtParams *mysql.ConnParams, shards []cluster.Shard, uuid string, expectStatuses ...schema.OnlineDDLStatus) bool { query, err := sqlparser.ParseAndBind("show vitess_migrations like %a", sqltypes.StringBindVariable(uuid), ) @@ -229,7 +229,7 @@ func CheckMigrationStatus(t *testing.T, vtParams *mysql.ConnParams, shards []clu } } } - assert.Equal(t, len(shards), count) + return assert.Equal(t, len(shards), count) } // WaitForMigrationStatus waits for a migration to reach either provided statuses (returns immediately), or eventually time out diff --git a/go/test/endtoend/tabletgateway/buffer/reparent/failover_buffer_test.go b/go/test/endtoend/tabletgateway/buffer/reparent/failover_buffer_test.go index ace652fc1d2..d3828eb8166 100644 --- a/go/test/endtoend/tabletgateway/buffer/reparent/failover_buffer_test.go +++ b/go/test/endtoend/tabletgateway/buffer/reparent/failover_buffer_test.go @@ -51,7 +51,7 @@ func failoverExternalReparenting(t *testing.T, clusterInstance *cluster.LocalPro primary.VttabletProcess.QueryTablet(demoteQuery, keyspaceUnshardedName, true) // Wait for replica to catch up to primary. 
-	cluster.WaitForReplicationPos(t, primary, replica, "localhost", 60.0)
+	cluster.WaitForReplicationPos(t, primary, replica, false, time.Minute)
 
 	duration := time.Since(start)
 	minUnavailabilityInS := 1.0
diff --git a/go/test/endtoend/vtgate/foreignkey/stress/fk_stress_test.go b/go/test/endtoend/vtgate/foreignkey/stress/fk_stress_test.go
new file mode 100644
index 00000000000..600961e6f0c
--- /dev/null
+++ b/go/test/endtoend/vtgate/foreignkey/stress/fk_stress_test.go
@@ -0,0 +1,1131 @@
+/*
+Copyright 2023 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package fkstress
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"math/rand"
+	"os"
+	"path"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/exp/slices"
+
+	"vitess.io/vitess/go/mysql"
+	"vitess.io/vitess/go/mysql/sqlerror"
+	"vitess.io/vitess/go/sqltypes"
+	"vitess.io/vitess/go/test/endtoend/cluster"
+	"vitess.io/vitess/go/test/endtoend/onlineddl"
+	"vitess.io/vitess/go/test/endtoend/utils"
+	"vitess.io/vitess/go/textutil"
+	"vitess.io/vitess/go/vt/log"
+	"vitess.io/vitess/go/vt/schema"
+	"vitess.io/vitess/go/vt/sqlparser"
+)
+
+// This endtoend test is designed to validate VTGate's FOREIGN KEY implementation for unsharded/single-sharded/shard-scope, meaning
+// we expect foreign key constraints to be limited to a shard (related rows can never be on different shards).
+//
+// This test validates NO ACTION, CASCADE and SET NULL reference actions.
+// VTGate's support for foreign keys includes:
+// - Analyzing the foreign key constraints in a keyspace.
+// - Rejecting INSERT statements for a child table when there's no matching row in the parent table.
+// - Handling DELETE and UPDATE statements on a parent table according to the reference action on all children.
+// Specifically, this means for example that VTGate will handle an ON DELETE CASCADE in the Vitess plane. It will first delete rows
+// from the child (recursive operation) before deleting the row on the parent. As a result, the underlying MySQL server will have
+// nothing to cascade.
+//
+// The design of this test is as follows:
+// - Create a cluster with PRIMARY and REPLICA tablets
+// - Given this structure of tables with foreign key constraints:
+//   stress_parent
+//   +- stress_child
+//      +- stress_grandchild
+//   +- stress_child2
+// - Create these tables. Then, on the MySQL replica, remove the foreign key constraints.
+// - Static test:
+//   - Randomly populate all tables via highly contentious INSERT/UPDATE/DELETE statements
+//   - Validate collected metrics match actual table data
+//   - Validate foreign key constraints integrity
+// - Workload test:
+//   - Initially populate tables as above
+//   - Run a high contention workload where multiple connections issue random INSERT/UPDATE/DELETE on all related tables
+//   - Validate collected metrics match actual table data
+//   - Validate foreign key constraints integrity on MySQL primary
+//   - Validate foreign key constraints integrity on MySQL replica
+//   - Compare data on primary & replica
+//
+// We of course know that foreign key integrity is maintained on the MySQL primary. However, the replica does not have the matching
+// constraints. Since cascaded (SET NULL, CASCADE) writes are handled internally by InnoDB and not written to the binary log,
+// any cascaded writes on the primary are lost, and the replica is unaware of those writes. Without VTGate intervention, we expect
+// the replica to quickly diverge from the primary, and in fact in all likelihood replication will break very quickly.
+// However, if VTGate implements the cascading rules correctly, the primary MySQL server will never have any actual cascades, and
+// so cascaded writes are all accounted for in the binary logs, which means we can expect the replica to be compliant with the
+// primary.
+
+type WriteMetrics struct {
+	mu                                                      sync.Mutex
+	insertsAttempts, insertsFailures, insertsNoops, inserts int64
+	updatesAttempts, updatesFailures, updatesNoops, updates int64
+	deletesAttempts, deletesFailures, deletesNoops, deletes int64
+
+	insertsFKErrors, updatesFKErrors, deletesFKErrors             int64
+	sampleInsertFKError, sampleUpdateFKError, sampleDeleteFKError error
+}
+
+func (w *WriteMetrics) Clear() {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	w.inserts = 0
+	w.updates = 0
+	w.deletes = 0
+
+	w.insertsAttempts = 0
+	w.insertsFailures = 0
+	w.insertsNoops = 0
+
+	w.updatesAttempts = 0
+	w.updatesFailures = 0
+	w.updatesNoops = 0
+
+	w.deletesAttempts = 0
+	w.deletesFailures = 0
+	w.deletesNoops = 0
+
+	w.insertsFKErrors = 0
+	w.updatesFKErrors = 0
+	w.deletesFKErrors = 0
+}
+
+func (w *WriteMetrics) String() string {
+	return fmt.Sprintf(`WriteMetrics: inserts-deletes=%d, updates-deletes=%d,
+insertsAttempts=%d, insertsFailures=%d, insertsNoops=%d, inserts=%d,
+updatesAttempts=%d, updatesFailures=%d, updatesNoops=%d, updates=%d,
+deletesAttempts=%d, deletesFailures=%d, deletesNoops=%d, deletes=%d,
+`,
+		w.inserts-w.deletes, w.updates-w.deletes,
+		w.insertsAttempts, w.insertsFailures, w.insertsNoops, w.inserts,
+		w.updatesAttempts, w.updatesFailures, w.updatesNoops, w.updates,
+		w.deletesAttempts, w.deletesFailures, w.deletesNoops, w.deletes,
+	)
+}
+
+var (
+	clusterInstance *cluster.LocalProcessCluster
+	shards          []cluster.Shard
+	primary         *cluster.Vttablet
+	replica         *cluster.Vttablet
+	vtParams        mysql.ConnParams
+
+	onlineDDLStrategy     = "vitess --unsafe-allow-foreign-keys --cut-over-threshold=15s"
+	hostname              = "localhost"
+	keyspaceName          = "ks"
+	cell                  = "zone1"
+	schemaChangeDirectory = ""
+	parentTableName       = "stress_parent"
+	childTableName        = "stress_child"
+	child2TableName       = "stress_child2"
+	grandchildTableName   = "stress_grandchild"
+	tableNames            = []string{parentTableName, childTableName, child2TableName, grandchildTableName}
+	reverseTableNames     []string
+
+	seedOnce sync.Once
+
+	referenceActionMap = map[sqlparser.ReferenceAction]string{
+		sqlparser.NoAction: "NO ACTION",
+		sqlparser.Cascade:  "CASCADE",
+		
sqlparser.SetNull: "SET NULL", + } + referenceActions = []sqlparser.ReferenceAction{sqlparser.NoAction, sqlparser.SetNull, sqlparser.Cascade} + createStatements = []string{ + ` + CREATE TABLE stress_parent ( + id bigint not null, + parent_id bigint, + rand_val varchar(32) null default '', + hint_col varchar(64) not null default '', + created_timestamp timestamp not null default current_timestamp, + updates int unsigned not null default 0, + PRIMARY KEY (id), + key parent_id_idx(parent_id), + key created_idx(created_timestamp), + key updates_idx(updates) + ) ENGINE=InnoDB + `, + ` + CREATE TABLE stress_child ( + id bigint not null, + parent_id bigint, + rand_val varchar(32) null default '', + hint_col varchar(64) not null default '', + created_timestamp timestamp not null default current_timestamp, + updates int unsigned not null default 0, + PRIMARY KEY (id), + key parent_id_idx(parent_id), + key created_idx(created_timestamp), + key updates_idx(updates), + CONSTRAINT child_parent_fk FOREIGN KEY (parent_id) REFERENCES stress_parent (id) ON DELETE %s ON UPDATE %s + ) ENGINE=InnoDB + `, + ` + CREATE TABLE stress_child2 ( + id bigint not null, + parent_id bigint, + rand_val varchar(32) null default '', + hint_col varchar(64) not null default '', + created_timestamp timestamp not null default current_timestamp, + updates int unsigned not null default 0, + PRIMARY KEY (id), + key parent_id_idx(parent_id), + key created_idx(created_timestamp), + key updates_idx(updates), + CONSTRAINT child2_parent_fk FOREIGN KEY (parent_id) REFERENCES stress_parent (id) ON DELETE %s ON UPDATE %s + ) ENGINE=InnoDB + `, + ` + CREATE TABLE stress_grandchild ( + id bigint not null, + parent_id bigint, + rand_val varchar(32) null default '', + hint_col varchar(64) not null default '', + created_timestamp timestamp not null default current_timestamp, + updates int unsigned not null default 0, + PRIMARY KEY (id), + key parent_id_idx(parent_id), + key created_idx(created_timestamp), + key updates_idx(updates), + CONSTRAINT grandchild_child_fk FOREIGN KEY (parent_id) REFERENCES stress_child (id) ON DELETE %s ON UPDATE %s + ) ENGINE=InnoDB + `, + } + dropConstraintsStatements = []string{ + `ALTER TABLE stress_child DROP CONSTRAINT child_parent_fk`, + `ALTER TABLE stress_child2 DROP CONSTRAINT child2_parent_fk`, + `ALTER TABLE stress_grandchild DROP CONSTRAINT grandchild_child_fk`, + } + alterHintStatement = ` + ALTER TABLE %s modify hint_col varchar(64) not null default '%s' + ` + insertRowStatement = ` + INSERT IGNORE INTO %s (id, parent_id, rand_val) VALUES (%d, %d, left(md5(rand()), 8)) + ` + updateRowStatement = ` + UPDATE %s SET rand_val=left(md5(rand()), 8), updates=updates+1 WHERE id=%d + ` + updateRowIdStatement = ` + UPDATE %s SET id=%v, rand_val=left(md5(rand()), 8), updates=updates+1 WHERE id=%d + ` + deleteRowStatement = ` + DELETE FROM %s WHERE id=%d AND updates=1 + ` + // We use CAST(SUM(updates) AS SIGNED) because SUM() returns a DECIMAL datatype, and we want to read a SIGNED INTEGER type + selectCountRowsStatement = ` + SELECT COUNT(*) AS num_rows, CAST(SUM(updates) AS SIGNED) AS sum_updates FROM %s + ` + selectMatchingRowsChild = ` + select stress_child.id from stress_child join stress_parent on (stress_parent.id = stress_child.parent_id) + ` + selectMatchingRowsChild2 = ` + select stress_child2.id from stress_child2 join stress_parent on (stress_parent.id = stress_child2.parent_id) + ` + selectMatchingRowsGrandchild = ` + select stress_grandchild.id from stress_grandchild join stress_child on 
(stress_child.id = stress_grandchild.parent_id) + ` + selectOrphanedRowsChild = ` + select stress_child.id from stress_child left join stress_parent on (stress_parent.id = stress_child.parent_id) where stress_parent.id is null + ` + selectOrphanedRowsChild2 = ` + select stress_child2.id from stress_child2 left join stress_parent on (stress_parent.id = stress_child2.parent_id) where stress_parent.id is null + ` + selectOrphanedRowsGrandchild = ` + select stress_grandchild.id from stress_grandchild left join stress_child on (stress_child.id = stress_grandchild.parent_id) where stress_child.id is null + ` + deleteAllStatement = ` + DELETE FROM %s + ` + writeMetrics = map[string]*WriteMetrics{} +) + +const ( + maxTableRows = 4096 + workloadDuration = 5 * time.Second + migrationWaitTimeout = 60 * time.Second +) + +// The following variables are fit for a local, strong developer box. +// The test overrides these into more relaxed values if running on GITHUB_ACTIONS, +// seeing that GitHub CI is much weaker. +var ( + maxConcurrency = 10 + singleConnectionSleepInterval = 10 * time.Millisecond + countIterations = 3 +) + +func TestMain(m *testing.M) { + defer cluster.PanicHandler(nil) + flag.Parse() + + exitcode, err := func() (int, error) { + clusterInstance = cluster.NewCluster(cell, hostname) + schemaChangeDirectory = path.Join("/tmp", fmt.Sprintf("schema_change_dir_%d", clusterInstance.GetAndReserveTabletUID())) + defer os.RemoveAll(schemaChangeDirectory) + defer clusterInstance.Teardown() + + if _, err := os.Stat(schemaChangeDirectory); os.IsNotExist(err) { + _ = os.Mkdir(schemaChangeDirectory, 0700) + } + + clusterInstance.VtctldExtraArgs = []string{ + "--schema_change_dir", schemaChangeDirectory, + "--schema_change_controller", "local", + "--schema_change_check_interval", "1s", + } + + clusterInstance.VtTabletExtraArgs = []string{ + "--heartbeat_enable", + "--heartbeat_interval", "250ms", + "--heartbeat_on_demand_duration", "5s", + "--watch_replication_stream", + "--vreplication_tablet_type", "primary", + } + clusterInstance.VtGateExtraArgs = []string{} + + if err := clusterInstance.StartTopo(); err != nil { + return 1, err + } + + // Start keyspace + keyspace := &cluster.Keyspace{ + Name: keyspaceName, + VSchema: `{ + "sharded": false, + "foreignKeyMode": "FK_MANAGED" + }`, + } + + // We will use a replica to confirm that vtgate's cascading works correctly. 
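+	// One shard named "1" with replicaCount=1: a primary plus a single replica and no rdonly
+	// tablets, matching the primary/replica pair this test relies on.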
+ if err := clusterInstance.StartKeyspace(*keyspace, []string{"1"}, 1, false); err != nil { + return 1, err + } + + vtgateInstance := clusterInstance.NewVtgateInstance() + // Start vtgate + if err := vtgateInstance.Setup(); err != nil { + return 1, err + } + // ensure it is torn down during cluster TearDown + clusterInstance.VtgateProcess = *vtgateInstance + vtParams = mysql.ConnParams{ + Host: clusterInstance.Hostname, + Port: clusterInstance.VtgateMySQLPort, + } + + return m.Run(), nil + }() + if err != nil { + fmt.Printf("%v\n", err) + os.Exit(1) + } else { + os.Exit(exitcode) + } + +} + +func queryTablet(t *testing.T, tablet *cluster.Vttablet, query string, expectError string) *sqltypes.Result { + rs, err := tablet.VttabletProcess.QueryTablet(query, keyspaceName, true) + if expectError == "" { + assert.NoError(t, err) + } else { + assert.ErrorContains(t, err, expectError) + } + return rs +} + +func tabletTestName(t *testing.T, tablet *cluster.Vttablet) string { + switch tablet { + case primary: + return "primary" + case replica: + return "replica" + default: + assert.FailNowf(t, "unknown tablet", "%v, type=%v", tablet.Alias, tablet.Type) + } + return "" +} + +func waitForReplicaCatchup(t *testing.T) { + cluster.WaitForReplicationPos(t, primary, replica, true, time.Minute) +} + +func validateMetrics(t *testing.T, tcase *testCase) { + for _, workloadTable := range []string{parentTableName, childTableName, child2TableName, grandchildTableName} { + t.Run(workloadTable, func(t *testing.T) { + t.Run("fk errors", func(t *testing.T) { + testSelectTableFKErrors(t, workloadTable, tcase) + }) + var primaryRows, replicaRows int64 + t.Run(tabletTestName(t, primary), func(t *testing.T) { + primaryRows = testSelectTableMetrics(t, primary, workloadTable, tcase) + }) + t.Run(tabletTestName(t, replica), func(t *testing.T) { + replicaRows = testSelectTableMetrics(t, replica, workloadTable, tcase) + }) + t.Run("compare primary and replica", func(t *testing.T) { + assert.Equal(t, primaryRows, replicaRows) + }) + }) + } +} + +func TestInitialSetup(t *testing.T) { + shards = clusterInstance.Keyspaces[0].Shards + require.Equal(t, 1, len(shards)) + require.Equal(t, 2, len(shards[0].Vttablets)) + primary = shards[0].Vttablets[0] + require.NotNil(t, primary) + replica = shards[0].Vttablets[1] + require.NotNil(t, replica) + require.NotEqual(t, primary.Alias, replica.Alias) + + tableNames = []string{parentTableName, childTableName, child2TableName, grandchildTableName} + reverseTableNames = slices.Clone(tableNames) + slices.Reverse(reverseTableNames) + require.ElementsMatch(t, tableNames, reverseTableNames) + + for _, tableName := range tableNames { + writeMetrics[tableName] = &WriteMetrics{} + } + + if val, present := os.LookupEnv("GITHUB_ACTIONS"); present && val != "" { + // This is the place to fine tune the stress parameters if GitHub actions are too slow + maxConcurrency = maxConcurrency * 1 + singleConnectionSleepInterval = singleConnectionSleepInterval * 1 + } + t.Logf("==== test setup: maxConcurrency=%v, singleConnectionSleepInterval=%v", maxConcurrency, singleConnectionSleepInterval) +} + +type testCase struct { + onDeleteAction sqlparser.ReferenceAction + onUpdateAction sqlparser.ReferenceAction + workload bool + onlineDDLTable string +} + +// ExecuteFKTest runs a single test case, which can be: +// - With/out workload +// - Either one of ON DELETE actions +// - Either one of ON UPDATE actions +// - Potentially running an Online DDL on an indicated table (this will not work in Vanilla MySQL, see 
https://vitess.io/blog/2021-06-15-online-ddl-why-no-fk/) +func ExecuteFKTest(t *testing.T, tcase *testCase) { + workloadName := "static data" + if tcase.workload { + workloadName = "workload" + } + testName := fmt.Sprintf("%s/del=%s/upd=%s", workloadName, referenceActionMap[tcase.onDeleteAction], referenceActionMap[tcase.onUpdateAction]) + if tcase.onlineDDLTable != "" { + testName = fmt.Sprintf("%s/ddl=%s", testName, tcase.onlineDDLTable) + } + t.Run(testName, func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + t.Run("create schema", func(t *testing.T) { + createInitialSchema(t, tcase) + }) + t.Run("init tables", func(t *testing.T) { + populateTables(t) + }) + if tcase.workload { + t.Run("workload", func(t *testing.T) { + var wg sync.WaitGroup + for _, workloadTable := range []string{parentTableName, childTableName, child2TableName, grandchildTableName} { + wg.Add(1) + go func(tbl string) { + defer wg.Done() + runMultipleConnections(ctx, t, tbl) + }(workloadTable) + } + timer := time.NewTimer(workloadDuration) + + if tcase.onlineDDLTable != "" { + t.Run("migrating", func(t *testing.T) { + // This cannot work with Vanilla MySQL. We put the code for testing, but we're not actually going to use it + // for now. The test cases all have empty tcase.onlineDDLTable + hint := "hint-alter" + uuid := testOnlineDDLStatement(t, fmt.Sprintf(alterHintStatement, tcase.onlineDDLTable, hint), onlineDDLStrategy, "vtgate", hint) + ok := onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusComplete) + require.True(t, ok) // or else don't attempt to cleanup artifacts + t.Run("cleanup artifacts", func(t *testing.T) { + rs := onlineddl.ReadMigrations(t, &vtParams, uuid) + require.NotNil(t, rs) + row := rs.Named().Row() + require.NotNil(t, row) + + artifacts := textutil.SplitDelimitedList(row.AsString("artifacts", "")) + for _, artifact := range artifacts { + t.Run(artifact, func(t *testing.T) { + err := clusterInstance.VtctlclientProcess.ApplySchema(keyspaceName, "drop table if exists "+artifact) + require.NoError(t, err) + }) + } + }) + }) + } + + <-timer.C + cancel() // will cause runMultipleConnections() to terminate + wg.Wait() + }) + } + t.Run("wait for replica", func(t *testing.T) { + waitForReplicaCatchup(t) + }) + t.Run("validate metrics", func(t *testing.T) { + validateMetrics(t, tcase) + }) + t.Run("validate replication health", func(t *testing.T) { + cluster.ValidateReplicationIsHealthy(t, replica) + }) + t.Run("validate fk", func(t *testing.T) { + testFKIntegrity(t, primary, tcase) + testFKIntegrity(t, replica, tcase) + }) + }) +} + +func TestStressFK(t *testing.T) { + defer cluster.PanicHandler(t) + + t.Run("validate replication health", func(t *testing.T) { + cluster.ValidateReplicationIsHealthy(t, replica) + }) + + runOnlineDDL := false + + // Without workload ; with workload + for _, workload := range []bool{false, true} { + // For any type of ON DELETE action + for _, actionDelete := range referenceActions { + // For any type of ON UPDATE action + for _, actionUpdate := range referenceActions { + tcase := &testCase{ + workload: workload, + onDeleteAction: actionDelete, + onUpdateAction: actionUpdate, + } + ExecuteFKTest(t, tcase) + } + } + } + + if runOnlineDDL { + // Running Online DDL on all test tables. We don't use all of the combinations + // presented above; we will run with workload, and suffice with same ON DELETE - ON UPDATE actions. 
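+		// That is: one Online DDL run per table for each reference action, with the same
+		// action applied to both ON DELETE and ON UPDATE.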
+ for _, action := range referenceActions { + for _, table := range tableNames { + tcase := &testCase{ + workload: true, + onDeleteAction: action, + onUpdateAction: action, + onlineDDLTable: table, + } + ExecuteFKTest(t, tcase) + } + } + } +} + +// createInitialSchema creates the tables from scratch, and drops the foreign key constraints on the replica. +func createInitialSchema(t *testing.T, tcase *testCase) { + ctx := context.Background() + conn, err := mysql.Connect(ctx, &vtParams) + require.Nil(t, err) + defer conn.Close() + + t.Run("dropping tables", func(t *testing.T) { + for _, tableName := range reverseTableNames { + err := clusterInstance.VtctlclientProcess.ApplySchema(keyspaceName, "drop table if exists "+tableName) + require.NoError(t, err) + } + }) + t.Run("creating tables", func(t *testing.T) { + // Create the stress tables + var b strings.Builder + for i, sql := range createStatements { + if i == 0 { + // parent table, no foreign keys + b.WriteString(sql) + } else { + b.WriteString(fmt.Sprintf(sql, referenceActionMap[tcase.onDeleteAction], referenceActionMap[tcase.onUpdateAction])) + } + b.WriteString(";") + } + err := clusterInstance.VtctlclientProcess.ApplySchema(keyspaceName, b.String()) + require.NoError(t, err) + }) + t.Run("wait for replica", func(t *testing.T) { + waitForReplicaCatchup(t) + }) + t.Run("validating tables: vttablet", func(t *testing.T) { + // Check if table is created. Checked on tablets. + checkTable(t, parentTableName, "hint_col") + checkTable(t, childTableName, "hint_col") + checkTable(t, child2TableName, "hint_col") + checkTable(t, grandchildTableName, "hint_col") + }) + t.Run("validating tables: vtgate", func(t *testing.T) { + // Wait for tables to appear on VTGate + waitForTable(t, parentTableName, conn) + waitForTable(t, childTableName, conn) + waitForTable(t, child2TableName, conn) + waitForTable(t, grandchildTableName, conn) + }) + t.Run("waiting for vschema definition to apply", func(t *testing.T) { + for _, tableName := range []string{parentTableName, childTableName, child2TableName, grandchildTableName} { + err := utils.WaitForColumn(t, clusterInstance.VtgateProcess, keyspaceName, tableName, "id") + require.NoError(t, err) + } + }) + + t.Run("dropping foreign keys on replica", func(t *testing.T) { + for _, statement := range dropConstraintsStatements { + _ = queryTablet(t, replica, "set global super_read_only=0", "") + _ = queryTablet(t, replica, statement, "") + _ = queryTablet(t, replica, "set global super_read_only=1", "") + } + }) + t.Run("validate definitions", func(t *testing.T) { + for _, tableName := range []string{childTableName, child2TableName, grandchildTableName} { + t.Run(tableName, func(t *testing.T) { + t.Run(tabletTestName(t, primary), func(t *testing.T) { + stmt := getCreateTableStatement(t, primary, tableName) + assert.Contains(t, stmt, "CONSTRAINT") + }) + t.Run(tabletTestName(t, replica), func(t *testing.T) { + stmt := getCreateTableStatement(t, replica, tableName) + assert.NotContains(t, stmt, "CONSTRAINT") + }) + }) + } + }) +} + +// testOnlineDDLStatement runs an online DDL, ALTER statement +func testOnlineDDLStatement(t *testing.T, alterStatement string, ddlStrategy string, executeStrategy string, expectHint string) (uuid string) { + if executeStrategy == "vtgate" { + row := onlineddl.VtgateExecDDL(t, &vtParams, ddlStrategy, alterStatement, "").Named().Row() + if row != nil { + uuid = row.AsString("uuid", "") + } + } else { + var err error + uuid, err = 
clusterInstance.VtctlclientProcess.ApplySchemaWithOutput(keyspaceName, alterStatement, cluster.VtctlClientParams{DDLStrategy: ddlStrategy}) + assert.NoError(t, err) + } + uuid = strings.TrimSpace(uuid) + fmt.Println("# Generated UUID (for debug purposes):") + fmt.Printf("<%s>\n", uuid) + + strategySetting, err := schema.ParseDDLStrategy(ddlStrategy) + assert.NoError(t, err) + + if !strategySetting.Strategy.IsDirect() { + t.Logf("===== waiting for migration %v to conclude", uuid) + status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, uuid, migrationWaitTimeout, schema.OnlineDDLStatusComplete, schema.OnlineDDLStatusFailed) + fmt.Printf("# Migration status (for debug purposes): <%s>\n", status) + } + + if expectHint != "" { + stmt, err := sqlparser.Parse(alterStatement) + require.NoError(t, err) + ddlStmt, ok := stmt.(sqlparser.DDLStatement) + require.True(t, ok) + tableName := ddlStmt.GetTable().Name.String() + checkTable(t, tableName, expectHint) + } + + if !strategySetting.Strategy.IsDirect() { + // let's see what FK tables have been renamed to + rs := onlineddl.ReadMigrations(t, &vtParams, uuid) + require.NotNil(t, rs) + row := rs.Named().Row() + require.NotNil(t, row) + + artifacts := textutil.SplitDelimitedList(row.AsString("artifacts", "")) + for _, artifact := range artifacts { + checkTable(t, artifact, "") + } + } + + return uuid +} + +// waitForTable waits until table is seen in VTGate +func waitForTable(t *testing.T, tableName string, conn *mysql.Conn) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*10) + defer cancel() + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + query := fmt.Sprintf("select count(*) from %s", tableName) + for { + if _, err := conn.ExecuteFetch(query, 1, false); err == nil { + return // good + } + select { + case <-ticker.C: + case <-ctx.Done(): + t.Fail() + return + } + } +} + +// checkTable checks that the given table exists on all tablets +func checkTable(t *testing.T, showTableName string, expectHint string) { + for _, tablet := range shards[0].Vttablets { + checkTablesCount(t, tablet, showTableName, 1) + if expectHint != "" { + createStatement := getCreateTableStatement(t, tablet, showTableName) + assert.Contains(t, createStatement, expectHint) + } + } +} + +// checkTablesCount checks the number of tables in the given tablet +func checkTablesCount(t *testing.T, tablet *cluster.Vttablet, showTableName string, expectCount int) { + query := fmt.Sprintf(`show tables like '%s';`, showTableName) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + rowcount := 0 + for { + queryResult, err := tablet.VttabletProcess.QueryTablet(query, keyspaceName, true) + require.Nil(t, err) + rowcount = len(queryResult.Rows) + if rowcount > 0 { + break + } + + select { + case <-time.After(time.Second): + case <-ctx.Done(): + break + } + } + assert.Equal(t, expectCount, rowcount) +} + +// getCreateTableStatement returns the CREATE TABLE statement for a given table +func getCreateTableStatement(t *testing.T, tablet *cluster.Vttablet, tableName string) (statement string) { + queryResult := queryTablet(t, tablet, fmt.Sprintf("show create table %s", tableName), "") + + require.Equal(t, len(queryResult.Rows), 1) + row := queryResult.Rows[0] + assert.Equal(t, len(row), 2) // table name, create statement + statement = row[1].ToString() + return statement +} + +func isFKError(err error) bool { + if err == nil { + return false + } + sqlErr, ok := err.(*sqlerror.SQLError) + if !ok { + return false 
+ } + + // Let's try and account for all known errors: + switch sqlErr.Number() { + case sqlerror.ERDupEntry: // happens since we hammer the tables randomly + return false + case sqlerror.ERTooManyUserConnections: // can happen in Online DDL cut-over + return false + case sqlerror.ERUnknownError: // happens when query buffering times out + return false + case sqlerror.ERQueryInterrupted: // cancelled due to context expiration + return false + case sqlerror.ERLockDeadlock: + return false // bummer, but deadlocks can happen, it's a legit error. + case sqlerror.ERNoReferencedRow, + sqlerror.ERRowIsReferenced, + sqlerror.ERRowIsReferenced2, + sqlerror.ErNoReferencedRow2: + return true + case sqlerror.ERNotSupportedYet: + return true + } + // Unknown error + fmt.Printf("Unexpected error detected in isFKError: %v\n", err) + // Treat it as if it's a FK error + return true +} + +func generateInsert(t *testing.T, tableName string, conn *mysql.Conn) error { + id := rand.Int31n(int32(maxTableRows)) + parentId := rand.Int31n(int32(maxTableRows)) + query := fmt.Sprintf(insertRowStatement, tableName, id, parentId) + qr, err := conn.ExecuteFetch(query, 1000, true) + + func() { + writeMetrics[tableName].mu.Lock() + defer writeMetrics[tableName].mu.Unlock() + + writeMetrics[tableName].insertsAttempts++ + if err != nil { + writeMetrics[tableName].insertsFailures++ + if isFKError(err) { + writeMetrics[tableName].insertsFKErrors++ + writeMetrics[tableName].sampleInsertFKError = err + } + return + } + assert.Less(t, qr.RowsAffected, uint64(2)) + if qr.RowsAffected == 0 { + writeMetrics[tableName].insertsNoops++ + return + } + writeMetrics[tableName].inserts++ + }() + return err +} + +func generateUpdate(t *testing.T, tableName string, conn *mysql.Conn) error { + // Most of the UPDATEs we run are "normal" updates, but the minority will actually change the + // `id` column itself, which is the FOREIGN KEY parent column for some of the tables. 
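+	// Updating a parent row's `id` exercises the ON UPDATE reference action on its children:
+	// CASCADE propagates the new value to the children's parent_id, SET NULL orphans them,
+	// and NO ACTION rejects the UPDATE while referencing rows exist.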
+ id := rand.Int31n(int32(maxTableRows)) + query := fmt.Sprintf(updateRowStatement, tableName, id) + if tableName == parentTableName || tableName == childTableName { + if rand.Intn(4) == 0 { + updatedId := rand.Int31n(int32(maxTableRows)) + query = fmt.Sprintf(updateRowIdStatement, tableName, updatedId, id) + } + } + qr, err := conn.ExecuteFetch(query, 1000, true) + + func() { + writeMetrics[tableName].mu.Lock() + defer writeMetrics[tableName].mu.Unlock() + + writeMetrics[tableName].updatesAttempts++ + if err != nil { + writeMetrics[tableName].updatesFailures++ + if isFKError(err) { + writeMetrics[tableName].updatesFKErrors++ + writeMetrics[tableName].sampleUpdateFKError = err + } + return + } + assert.Less(t, qr.RowsAffected, uint64(2)) + if qr.RowsAffected == 0 { + writeMetrics[tableName].updatesNoops++ + return + } + writeMetrics[tableName].updates++ + }() + return err +} + +func generateDelete(t *testing.T, tableName string, conn *mysql.Conn) error { + id := rand.Int31n(int32(maxTableRows)) + query := fmt.Sprintf(deleteRowStatement, tableName, id) + qr, err := conn.ExecuteFetch(query, 1000, true) + + func() { + writeMetrics[tableName].mu.Lock() + defer writeMetrics[tableName].mu.Unlock() + + writeMetrics[tableName].deletesAttempts++ + if err != nil { + writeMetrics[tableName].deletesFailures++ + if isFKError(err) { + writeMetrics[tableName].deletesFKErrors++ + writeMetrics[tableName].sampleDeleteFKError = err + } + return + } + assert.Less(t, qr.RowsAffected, uint64(2)) + if qr.RowsAffected == 0 { + writeMetrics[tableName].deletesNoops++ + return + } + writeMetrics[tableName].deletes++ + }() + return err +} + +func runSingleConnection(ctx context.Context, t *testing.T, tableName string, done *int64) { + log.Infof("Running single connection on %s", tableName) + conn, err := mysql.Connect(ctx, &vtParams) + require.Nil(t, err) + defer conn.Close() + + _, err = conn.ExecuteFetch("set autocommit=1", 1000, true) + require.Nil(t, err) + _, err = conn.ExecuteFetch("set transaction isolation level read committed", 1000, true) + require.Nil(t, err) + + for { + if atomic.LoadInt64(done) == 1 { + log.Infof("Terminating single connection") + return + } + switch rand.Int31n(3) { + case 0: + _ = generateInsert(t, tableName, conn) + case 1: + _ = generateUpdate(t, tableName, conn) + case 2: + _ = generateDelete(t, tableName, conn) + } + time.Sleep(singleConnectionSleepInterval) + } +} + +func runMultipleConnections(ctx context.Context, t *testing.T, tableName string) { + log.Infof("Running multiple connections") + var done int64 + var wg sync.WaitGroup + for i := 0; i < maxConcurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + runSingleConnection(ctx, t, tableName, &done) + }() + } + <-ctx.Done() + atomic.StoreInt64(&done, 1) + log.Infof("Running multiple connections: done") + wg.Wait() + log.Infof("All connections cancelled") +} + +func wrapWithNoFKChecks(sql string) string { + return fmt.Sprintf("set foreign_key_checks=0; %s; set foreign_key_checks=1;", sql) +} + +// populateTables randomly populates all test tables. This is done sequentially. 
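+// Tables are cleared children-first (via reverseTableNames) so the DELETEs never violate
+// the foreign key constraints, and then re-seeded parent-first.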
+func populateTables(t *testing.T) { + log.Infof("initTable begin") + defer log.Infof("initTable complete") + + ctx := context.Background() + conn, err := mysql.Connect(ctx, &vtParams) + require.Nil(t, err) + defer conn.Close() + + t.Logf("===== clearing tables") + for _, tableName := range reverseTableNames { + writeMetrics[tableName].Clear() + deleteQuery := fmt.Sprintf(deleteAllStatement, tableName) + _, err = conn.ExecuteFetch(deleteQuery, 1000, true) + require.Nil(t, err) + } + // In an ideal world we would randomly re-seed the tables in each and every instance of the test. + // In reality, that takes a lot of time, and while the seeding is important, it's not the heart of + // the test. To that effect, the seeding works as follows: + // - First ever time, we randomly seed the tables (running thousands of queries). We then create *_seed + // tables and clone the data in those seed tables. + // - 2nd test and forward: we just copy over the rows from the *_seed tables. + tablesSeeded := false + seedOnce.Do(func() { + for _, tableName := range tableNames { + t.Run(tableName, func(t *testing.T) { + t.Run("populating", func(t *testing.T) { + // populate parent, then child, child2, then grandchild + for i := 0; i < maxTableRows/2; i++ { + generateInsert(t, tableName, conn) + } + for i := 0; i < maxTableRows/4; i++ { + generateUpdate(t, tableName, conn) + } + for i := 0; i < maxTableRows/4; i++ { + generateDelete(t, tableName, conn) + } + }) + t.Run("creating seed", func(t *testing.T) { + // We create the seed table in the likeness of stress_parent, because that's the only table + // that doesn't have FK constraints. + { + createSeedQuery := fmt.Sprintf("create table %s_seed like %s", tableName, parentTableName) + _, err := conn.ExecuteFetch(createSeedQuery, 1000, true) + require.NoError(t, err) + } + { + seedQuery := fmt.Sprintf("insert into %s_seed select * from %s", tableName, tableName) + _, err := conn.ExecuteFetch(seedQuery, 1000, true) + require.NoError(t, err) + } + { + validationQuery := fmt.Sprintf("select count(*) as c from %s_seed", tableName) + rs, err := conn.ExecuteFetch(validationQuery, 1000, true) + require.NoError(t, err) + row := rs.Named().Row() + require.NotNil(t, row) + require.NotZero(t, row.AsInt64("c", 0)) + } + }) + }) + } + tablesSeeded = true + }) + if !tablesSeeded { + t.Run("reseeding", func(t *testing.T) { + for _, tableName := range tableNames { + seedQuery := fmt.Sprintf("insert into %s select * from %s_seed", tableName, tableName) + _, err := conn.ExecuteFetch(seedQuery, 1000, true) + require.NoError(t, err) + } + }) + } + + t.Run("validating table rows", func(t *testing.T) { + for _, tableName := range tableNames { + validationQuery := fmt.Sprintf(selectCountRowsStatement, tableName) + rs, err := conn.ExecuteFetch(validationQuery, 1000, true) + require.NoError(t, err) + row := rs.Named().Row() + require.NotNil(t, row) + numRows := row.AsInt64("num_rows", 0) + sumUpdates := row.AsInt64("sum_updates", 0) + require.NotZero(t, numRows) + if !tablesSeeded { + // We cloned the data from *_seed tables. This means we didn't populate writeMetrics. Now, + // this function only takes care of the base seed. We will later on run a stress workload on + // these tables, at the end of which we will examine the writeMetrics. We thus have to have those + // metrics consistent with the cloned data. It's a bit ugly, but we inject fake writeMetrics. 
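+				// The fake values below are chosen so that the assertions in testSelectTableMetrics
+				// still hold: inserts-deletes equals the row count, and updates-deletes equals SUM(updates).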
+				writeMetrics[tableName].deletes = 1
+				writeMetrics[tableName].inserts = numRows + writeMetrics[tableName].deletes
+				writeMetrics[tableName].updates = sumUpdates + writeMetrics[tableName].deletes
+			}
+		}
+	})
+}
+
+// testSelectTableMetrics cross references the known metrics (number of successful insert/delete/updates) on each table, with the
+// actual number of rows and with the row values on those tables.
+// With CASCADE/SET NULL rules we can't do the comparison, because child tables are implicitly affected by the cascading rules,
+// and the values do not match what was reported to us when we UPDATE/DELETE on the parent tables.
+func testSelectTableMetrics(
+	t *testing.T,
+	tablet *cluster.Vttablet,
+	tableName string,
+	tcase *testCase,
+) int64 {
+	switch tcase.onDeleteAction {
+	case sqlparser.Cascade, sqlparser.SetNull:
+		if tableName != parentTableName {
+			// We can't validate those tables because they will have been affected by cascading rules.
+			return 0
+		}
+	}
+	// metrics are unaffected by value of onUpdateAction.
+
+	writeMetrics[tableName].mu.Lock()
+	defer writeMetrics[tableName].mu.Unlock()
+
+	log.Infof("%s %s", tableName, writeMetrics[tableName].String())
+
+	rs := queryTablet(t, tablet, fmt.Sprintf(selectCountRowsStatement, tableName), "")
+
+	row := rs.Named().Row()
+	require.NotNil(t, row)
+	log.Infof("testSelectTableMetrics, row: %v", row)
+	numRows := row.AsInt64("num_rows", 0)
+	sumUpdates := row.AsInt64("sum_updates", 0)
+	assert.NotZero(t, numRows)
+	assert.NotZero(t, sumUpdates)
+	assert.NotZero(t, writeMetrics[tableName].inserts)
+	assert.NotZero(t, writeMetrics[tableName].deletes)
+	assert.NotZero(t, writeMetrics[tableName].updates)
+	assert.Equal(t, writeMetrics[tableName].inserts-writeMetrics[tableName].deletes, numRows)
+	assert.Equal(t, writeMetrics[tableName].updates-writeMetrics[tableName].deletes, sumUpdates) // because we DELETE WHERE updates=1
+
+	return numRows
+}
+
+// testSelectTableFKErrors validates that CASCADE rules did not produce foreign key errors on the given table.
+func testSelectTableFKErrors(
+	t *testing.T,
+	tableName string,
+	tcase *testCase,
+) {
+	writeMetrics[tableName].mu.Lock()
+	defer writeMetrics[tableName].mu.Unlock()
+
+	if tcase.onDeleteAction == sqlparser.Cascade {
+		assert.Zerof(t, writeMetrics[tableName].deletesFKErrors, "unexpected foreign key errors for DELETEs in ON DELETE CASCADE. Sample error: %v", writeMetrics[tableName].sampleDeleteFKError)
+	}
+	if tcase.onUpdateAction == sqlparser.Cascade {
+		assert.Zerof(t, writeMetrics[tableName].updatesFKErrors, "unexpected foreign key errors for UPDATEs in ON UPDATE CASCADE. Sample error: %v", writeMetrics[tableName].sampleUpdateFKError)
+	}
+}
+
+// testFKIntegrity validates that foreign key consistency is maintained on the given tablet. We cross reference all
+// parent-child relationships.
+// There are two test types:
+// 1. Do a JOIN on parent-child associated rows, expect non-empty
+// 2. Check that there are no orphaned child rows. Notes:
+//   - This applies to NO ACTION and CASCADE, but not to SET NULL, because SET NULL by design creates orphaned rows.
+//   - On the primary database, this test trivially passes because of course MySQL maintains this integrity. But remember
+//     that we remove the foreign key constraints on the replica. Also remember that cascaded writes are not written to
+//     the binary log. And so, if VTGate does not do a proper job, then a parent and child will drift apart in CASCADE writes.
+func testFKIntegrity( + t *testing.T, + tablet *cluster.Vttablet, + tcase *testCase, +) { + testName := tabletTestName(t, tablet) + t.Run(testName, func(t *testing.T) { + t.Run("matching parent-child rows", func(t *testing.T) { + rs := queryTablet(t, tablet, selectMatchingRowsChild, "") + assert.NotZero(t, len(rs.Rows)) + }) + t.Run("matching parent-child2 rows", func(t *testing.T) { + rs := queryTablet(t, tablet, selectMatchingRowsChild2, "") + assert.NotZero(t, len(rs.Rows)) + }) + t.Run("matching child-grandchild rows", func(t *testing.T) { + rs := queryTablet(t, tablet, selectMatchingRowsGrandchild, "") + assert.NotZero(t, len(rs.Rows)) + }) + if tcase.onDeleteAction != sqlparser.SetNull && tcase.onUpdateAction != sqlparser.SetNull { + // Because with SET NULL there _are_ orphaned rows + t.Run("parent-child orphaned rows", func(t *testing.T) { + rs := queryTablet(t, tablet, selectOrphanedRowsChild, "") + assert.Zero(t, len(rs.Rows)) + }) + t.Run("parent-child2 orphaned rows", func(t *testing.T) { + rs := queryTablet(t, tablet, selectOrphanedRowsChild2, "") + assert.Zero(t, len(rs.Rows)) + }) + t.Run("child-grandchild orphaned rows", func(t *testing.T) { + rs := queryTablet(t, tablet, selectOrphanedRowsGrandchild, "") + assert.Zero(t, len(rs.Rows)) + }) + } + }) +} diff --git a/test/ci_workflow_gen.go b/test/ci_workflow_gen.go index 43e874ec182..5a3031d7307 100644 --- a/test/ci_workflow_gen.go +++ b/test/ci_workflow_gen.go @@ -111,6 +111,7 @@ var ( "vtgate_vschema", "vtgate_queries", "vtgate_schema_tracker", + "vtgate_foreignkey_stress", "vtorc", "xb_recovery", "mysql80", diff --git a/test/config.json b/test/config.json index cc5e89f9ce7..e5bf8126ab6 100644 --- a/test/config.json +++ b/test/config.json @@ -860,6 +860,15 @@ "RetryMax": 2, "Tags": [] }, + "vtgate_foreignkey_stress": { + "File": "unused.go", + "Args": ["vitess.io/vitess/go/test/endtoend/vtgate/foreignkey/stress"], + "Command": [], + "Manual": false, + "Shard": "vtgate_foreignkey_stress", + "RetryMax": 1, + "Tags": [] + }, "vtgate_gen4": { "File": "unused.go", "Args": ["vitess.io/vitess/go/test/endtoend/vtgate/gen4"], From 651377220b75b0f96d83365883e034e7ff01740f Mon Sep 17 00:00:00 2001 From: Harshit Gangal Date: Wed, 20 Sep 2023 21:29:54 +0530 Subject: [PATCH 3/7] Add session flag for stream execute grpc api (#14046) Signed-off-by: Harshit Gangal --- changelog/18.0/18.0.0/summary.md | 9 +++ go/flags/endtoend/vtgate.txt | 1 + go/test/endtoend/vtgate/grpc_api/main_test.go | 1 + go/vt/vtgate/grpcvtgateservice/server.go | 22 ++++--- .../tabletserver/exclude_race_test.go | 62 +++++++++++++++++++ .../tabletserver/tabletserver_test.go | 45 -------------- 6 files changed, 87 insertions(+), 53 deletions(-) create mode 100644 go/vt/vttablet/tabletserver/exclude_race_test.go diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md index db2494d3794..6537aeafb43 100644 --- a/changelog/18.0/18.0.0/summary.md +++ b/changelog/18.0/18.0.0/summary.md @@ -9,6 +9,7 @@ - [VTOrc flag `--allow-emergency-reparent`](#new-flag-toggle-ers) - [VTOrc flag `--change-tablets-with-errant-gtid-to-drained`](#new-flag-errant-gtid-convert) - [ERS sub flag `--wait-for-all-tablets`](#new-ers-subflag) + - [VTGate flag `--grpc-send-session-in-streaming`](#new-vtgate-streaming-sesion) - **[VTAdmin](#vtadmin)** - [Updated to node v18.16.0](#update-node) - **[Deprecations and Deletions](#deprecations-and-deletions)** @@ -65,6 +66,14 @@ for a response from all the tablets. 
 Originally `EmergencyReparentShard` was meant to be used when the primary tablet is unreachable.
 We have realized now that there are cases when the replication is broken but all the tablets are reachable. In these cases, it is advisable to call
 `EmergencyReparentShard` with `--wait-for-all-tablets` so that it doesn't ignore one of the tablets.
 
+#### VTGate GRPC stream execute session flag `--grpc-send-session-in-streaming`
+
+This flag enables transaction support on the `StreamExecute` api.
+Once enabled, the VTGate `StreamExecute` grpc api will send the session as the last packet in the response.
+Clients should enable it only once they have made the required changes to expect such a packet.
+
+It is disabled by default.
+
 ### VTAdmin
 
 #### vtadmin-web updated to node v18.16.0 (LTS)
diff --git a/go/flags/endtoend/vtgate.txt b/go/flags/endtoend/vtgate.txt
index 89f6544ca8f..a496b2765d2 100644
--- a/go/flags/endtoend/vtgate.txt
+++ b/go/flags/endtoend/vtgate.txt
@@ -63,6 +63,7 @@ Flags:
       --foreign_key_mode string                    This is to provide how to handle foreign key constraint in create/alter table. Valid values are: allow, disallow (default "allow")
       --gate_query_cache_memory int                gate server query cache size in bytes, maximum amount of memory to be cached. vtgate analyzes every incoming query and generate a query plan, these plans are being cached in a lru cache. This config controls the capacity of the lru cache. (default 33554432)
       --gateway_initial_tablet_timeout duration    At startup, the tabletGateway will wait up to this duration to get at least one tablet per keyspace/shard/tablet type (default 30s)
+      --grpc-send-session-in-streaming             If set, will send the session as last packet in streaming api to support transactions in streaming
       --grpc-use-effective-groups                  If set, and SSL is not used, will set the immediate caller's security groups from the effective caller id's groups.
       --grpc-use-static-authentication-callerid    If set, will set the immediate caller id to the username authenticated by the static auth plugin. 
--grpc_auth_mode string Which auth plugin implementation to use (eg: static) diff --git a/go/test/endtoend/vtgate/grpc_api/main_test.go b/go/test/endtoend/vtgate/grpc_api/main_test.go index a51c6d9e6f2..3c8605f79a0 100644 --- a/go/test/endtoend/vtgate/grpc_api/main_test.go +++ b/go/test/endtoend/vtgate/grpc_api/main_test.go @@ -111,6 +111,7 @@ func TestMain(m *testing.M) { "--grpc_auth_static_password_file", grpcServerAuthStaticPath, "--grpc_use_effective_callerid", "--grpc-use-static-authentication-callerid", + "--grpc-send-session-in-streaming", } // Configure vttablet to use table ACL diff --git a/go/vt/vtgate/grpcvtgateservice/server.go b/go/vt/vtgate/grpcvtgateservice/server.go index 7baff6cefe8..bf00db4ea1c 100644 --- a/go/vt/vtgate/grpcvtgateservice/server.go +++ b/go/vt/vtgate/grpcvtgateservice/server.go @@ -48,12 +48,15 @@ var ( useEffective bool useEffectiveGroups bool useStaticAuthenticationIdentity bool + + sendSessionInStreaming bool ) func registerFlags(fs *pflag.FlagSet) { fs.BoolVar(&useEffective, "grpc_use_effective_callerid", false, "If set, and SSL is not used, will set the immediate caller id from the effective caller id's principal.") fs.BoolVar(&useEffectiveGroups, "grpc-use-effective-groups", false, "If set, and SSL is not used, will set the immediate caller's security groups from the effective caller id's groups.") fs.BoolVar(&useStaticAuthenticationIdentity, "grpc-use-static-authentication-callerid", false, "If set, will set the immediate caller id to the username authenticated by the static auth plugin.") + fs.BoolVar(&sendSessionInStreaming, "grpc-send-session-in-streaming", false, "If set, will send the session as last packet in streaming api to support transactions in streaming") } func init() { @@ -192,19 +195,22 @@ func (vtg *VTGate) StreamExecute(request *vtgatepb.StreamExecuteRequest, stream }) }) - // even if there is an error, session could have been modified. - // So, this needs to be sent back to the client. Session is sent in the last stream response. - lastErr := stream.Send(&vtgatepb.StreamExecuteResponse{ - Session: session, - }) - var errs []error if vtgErr != nil { errs = append(errs, vtgErr) } - if lastErr != nil { - errs = append(errs, lastErr) + + if sendSessionInStreaming { + // even if there is an error, session could have been modified. + // So, this needs to be sent back to the client. Session is sent in the last stream response. + lastErr := stream.Send(&vtgatepb.StreamExecuteResponse{ + Session: session, + }) + if lastErr != nil { + errs = append(errs, lastErr) + } } + return vterrors.ToGRPC(vterrors.Aggregate(errs)) } diff --git a/go/vt/vttablet/tabletserver/exclude_race_test.go b/go/vt/vttablet/tabletserver/exclude_race_test.go new file mode 100644 index 00000000000..6e55671ac96 --- /dev/null +++ b/go/vt/vttablet/tabletserver/exclude_race_test.go @@ -0,0 +1,62 @@ +//go:build !race + +package tabletserver + +import ( + "context" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/sqltypes" + querypb "vitess.io/vitess/go/vt/proto/query" + "vitess.io/vitess/go/vt/sqlparser" + "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" +) + +// TestHandlePanicAndSendLogStatsMessageTruncation tests that when an error truncation +// length is set and a panic occurs, the code in handlePanicAndSendLogStats will +// truncate the error text in logs, but will not truncate the error text in the +// error value. 
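+// It does so by lowering the truncation length to 32 via sqlparser.SetTruncateErrLen, panicking
+// with a long query and bind variables, and asserting on both logStats.Error and the logged line.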
+func TestHandlePanicAndSendLogStatsMessageTruncation(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + tl := newTestLogger() + defer tl.Close() + logStats := tabletenv.NewLogStats(ctx, "TestHandlePanicAndSendLogStatsMessageTruncation") + db, tsv := setupTabletServerTest(t, ctx, "") + defer tsv.StopService() + defer db.Close() + + longSql := "select * from test_table_loooooooooooooooooooooooooooooooooooong" + longBv := map[string]*querypb.BindVariable{ + "bv1": sqltypes.Int64BindVariable(1111111111), + "bv2": sqltypes.Int64BindVariable(2222222222), + "bv3": sqltypes.Int64BindVariable(3333333333), + "bv4": sqltypes.Int64BindVariable(4444444444), + } + origTruncateErrLen := sqlparser.GetTruncateErrLen() + sqlparser.SetTruncateErrLen(32) + defer sqlparser.SetTruncateErrLen(origTruncateErrLen) + + defer func() { + err := logStats.Error + want := "Uncaught panic for Sql: \"select * from test_table_loooooooooooooooooooooooooooooooooooong\", BindVars: {bv1: \"type:INT64 value:\\\"1111111111\\\"\"bv2: \"type:INT64 value:\\\"2222222222\\\"\"bv3: \"type:INT64 value:\\\"3333333333\\\"\"bv4: \"type:INT64 value:\\\"4444444444\\\"\"}" + require.Error(t, err) + assert.Contains(t, err.Error(), want) + want = "Uncaught panic for Sql: \"select * from test_t [TRUNCATED]\", BindVars: {bv1: \"typ [TRUNCATED]" + gotWhatWeWant := false + for _, log := range tl.getLogs() { + if strings.HasPrefix(log, want) { + gotWhatWeWant = true + break + } + } + assert.True(t, gotWhatWeWant) + }() + + defer tsv.handlePanicAndSendLogStats(longSql, longBv, logStats) + panic("panic from TestHandlePanicAndSendLogStatsMessageTruncation") +} diff --git a/go/vt/vttablet/tabletserver/tabletserver_test.go b/go/vt/vttablet/tabletserver/tabletserver_test.go index 2f4988cc2b6..d2fb10e5a77 100644 --- a/go/vt/vttablet/tabletserver/tabletserver_test.go +++ b/go/vt/vttablet/tabletserver/tabletserver_test.go @@ -1492,51 +1492,6 @@ func TestHandleExecUnknownError(t *testing.T) { panic("unknown exec error") } -// TestHandlePanicAndSendLogStatsMessageTruncation tests that when an error truncation -// length is set and a panic occurs, the code in handlePanicAndSendLogStats will -// truncate the error text in logs, but will not truncate the error text in the -// error value. 
-func TestHandlePanicAndSendLogStatsMessageTruncation(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - tl := newTestLogger() - defer tl.Close() - logStats := tabletenv.NewLogStats(ctx, "TestHandlePanicAndSendLogStatsMessageTruncation") - db, tsv := setupTabletServerTest(t, ctx, "") - defer tsv.StopService() - defer db.Close() - - longSql := "select * from test_table_loooooooooooooooooooooooooooooooooooong" - longBv := map[string]*querypb.BindVariable{ - "bv1": sqltypes.Int64BindVariable(1111111111), - "bv2": sqltypes.Int64BindVariable(2222222222), - "bv3": sqltypes.Int64BindVariable(3333333333), - "bv4": sqltypes.Int64BindVariable(4444444444), - } - origTruncateErrLen := sqlparser.GetTruncateErrLen() - sqlparser.SetTruncateErrLen(32) - defer sqlparser.SetTruncateErrLen(origTruncateErrLen) - - defer func() { - err := logStats.Error - want := "Uncaught panic for Sql: \"select * from test_table_loooooooooooooooooooooooooooooooooooong\", BindVars: {bv1: \"type:INT64 value:\\\"1111111111\\\"\"bv2: \"type:INT64 value:\\\"2222222222\\\"\"bv3: \"type:INT64 value:\\\"3333333333\\\"\"bv4: \"type:INT64 value:\\\"4444444444\\\"\"}" - require.Error(t, err) - assert.Contains(t, err.Error(), want) - want = "Uncaught panic for Sql: \"select * from test_t [TRUNCATED]\", BindVars: {bv1: \"typ [TRUNCATED]" - gotWhatWeWant := false - for _, log := range tl.getLogs() { - if strings.HasPrefix(log, want) { - gotWhatWeWant = true - break - } - } - assert.True(t, gotWhatWeWant) - }() - - defer tsv.handlePanicAndSendLogStats(longSql, longBv, logStats) - panic("panic from TestHandlePanicAndSendLogStatsMessageTruncation") -} - func TestQueryAsString(t *testing.T) { longSql := "select * from test_table_loooooooooooooooooooooooooooooooooooong" longBv := map[string]*querypb.BindVariable{ From a657f10ee4806adbe131d79576a34a63408a6754 Mon Sep 17 00:00:00 2001 From: Max Englander Date: Wed, 20 Sep 2023 17:07:45 +0100 Subject: [PATCH 4/7] go/vt/mysqlctl: instrument s3 upload time (#12500) Signed-off-by: Max Englander --- changelog/18.0/18.0.0/summary.md | 12 +++ go/vt/mysqlctl/backup_blackbox_test.go | 30 +++--- go/vt/mysqlctl/backup_test.go | 8 +- go/vt/mysqlctl/backupstats/fake_stats.go | 2 +- go/vt/mysqlctl/s3backupstorage/s3.go | 25 +++-- go/vt/mysqlctl/s3backupstorage/s3_test.go | 124 ++++++++++++++++++++-- 6 files changed, 167 insertions(+), 34 deletions(-) diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md index 6537aeafb43..02caabc6c85 100644 --- a/changelog/18.0/18.0.0/summary.md +++ b/changelog/18.0/18.0.0/summary.md @@ -22,6 +22,7 @@ - [VTGate Vindex unknown parameters](#vtgate-vindex-unknown-parameters) - [VTBackup stat `Phase`](#vtbackup-stat-phase) - [VTBackup stat `PhaseStatus`](#vtbackup-stat-phase-status) + - [Backup and restore metrics for AWS S3](#backup-restore-metrics-aws-s3) - **[VTTablet](#vttablet)** - [VTTablet: New ResetSequences RPC](#vttablet-new-rpc-reset-sequences) - **[Docker](#docker)** @@ -160,6 +161,17 @@ sum_over_time(vtbackup_phase{phase="TakeNewBackup"}) * * `Stalled` is set to `1` when replication stops advancing. * `Stopped` is set to `1` when replication stops before `vtbackup` catches up with the primary. +#### Backup and restore metrics for AWS S3 + +Requests to AWS S3 are instrumented in backup and restore metrics. 
+For example, a backup followed by a restore reports series such as:
+
+```
+vtbackup_backup_count{component="BackupStorage",implementation="S3",operation="AWS:Request:Send"} 823
+vtbackup_backup_duration_nanoseconds{component="BackupStorage",implementation="S3",operation="AWS:Request:Send"} 1.33632421437e+11
+vtbackup_restore_count{component="BackupStorage",implementation="S3",operation="AWS:Request:Send"} 165
+```
+
 ### VTTablet
 
 #### New ResetSequences rpc
diff --git a/go/vt/mysqlctl/backup_blackbox_test.go b/go/vt/mysqlctl/backup_blackbox_test.go
index 62b58f2a5c8..8de6a8679fa 100644
--- a/go/vt/mysqlctl/backup_blackbox_test.go
+++ b/go/vt/mysqlctl/backup_blackbox_test.go
@@ -173,26 +173,25 @@ func TestExecuteBackup(t *testing.T) {
 	var sourceReadStats int
 
 	for _, sr := range fakeStats.ScopeReturns {
-		sfs := sr.(*backupstats.FakeStats)
-		switch sfs.ScopeV[backupstats.ScopeOperation] {
+		switch sr.ScopeV[backupstats.ScopeOperation] {
 		case "Destination:Close":
 			destinationCloseStats++
-			require.Len(t, sfs.TimedIncrementCalls, 1)
+			require.Len(t, sr.TimedIncrementCalls, 1)
 		case "Destination:Open":
 			destinationOpenStats++
-			require.Len(t, sfs.TimedIncrementCalls, 1)
+			require.Len(t, sr.TimedIncrementCalls, 1)
 		case "Destination:Write":
 			destinationWriteStats++
-			require.GreaterOrEqual(t, len(sfs.TimedIncrementBytesCalls), 1)
+			require.GreaterOrEqual(t, len(sr.TimedIncrementBytesCalls), 1)
 		case "Source:Close":
 			sourceCloseStats++
-			require.Len(t, sfs.TimedIncrementCalls, 1)
+			require.Len(t, sr.TimedIncrementCalls, 1)
 		case "Source:Open":
 			sourceOpenStats++
-			require.Len(t, sfs.TimedIncrementCalls, 1)
+			require.Len(t, sr.TimedIncrementCalls, 1)
 		case "Source:Read":
 			sourceReadStats++
-			require.GreaterOrEqual(t, len(sfs.TimedIncrementBytesCalls), 1)
+			require.GreaterOrEqual(t, len(sr.TimedIncrementBytesCalls), 1)
 		}
 	}
 
@@ -529,26 +528,25 @@ func TestExecuteRestoreWithTimedOutContext(t *testing.T) {
 	var sourceReadStats int
 
 	for _, sr := range fakeStats.ScopeReturns {
-		sfs := sr.(*backupstats.FakeStats)
-		switch sfs.ScopeV[backupstats.ScopeOperation] {
+		switch sr.ScopeV[backupstats.ScopeOperation] {
 		case "Destination:Close":
 			destinationCloseStats++
-			require.Len(t, sfs.TimedIncrementCalls, 1)
+			require.Len(t, sr.TimedIncrementCalls, 1)
 		case "Destination:Open":
 			destinationOpenStats++
-			require.Len(t, sfs.TimedIncrementCalls, 1)
+			require.Len(t, sr.TimedIncrementCalls, 1)
 		case "Destination:Write":
 			destinationWriteStats++
-			require.GreaterOrEqual(t, len(sfs.TimedIncrementBytesCalls), 1)
+			require.GreaterOrEqual(t, len(sr.TimedIncrementBytesCalls), 1)
 		case "Source:Close":
 			sourceCloseStats++
-			require.Len(t, sfs.TimedIncrementCalls, 1)
+			require.Len(t, sr.TimedIncrementCalls, 1)
 		case "Source:Open":
 			sourceOpenStats++
-			require.Len(t, sfs.TimedIncrementCalls, 1)
+			require.Len(t, sr.TimedIncrementCalls, 1)
 		case "Source:Read":
 			sourceReadStats++
-			require.GreaterOrEqual(t, len(sfs.TimedIncrementBytesCalls), 1)
+			require.GreaterOrEqual(t, len(sr.TimedIncrementBytesCalls), 1)
 		}
 	}
 
diff --git a/go/vt/mysqlctl/backup_test.go b/go/vt/mysqlctl/backup_test.go
index 5a135c26a30..5b97f709c2f 100644
--- a/go/vt/mysqlctl/backup_test.go
+++ b/go/vt/mysqlctl/backup_test.go
@@ -54,7 +54,7 @@ func TestBackupExecutesBackupWithScopedParams(t *testing.T) {
 	var executeBackupStats *backupstats.FakeStats
 	for _, sr := range env.stats.ScopeReturns {
 		if sr == executeBackupParams.Stats {
-			executeBackupStats = sr.(*backupstats.FakeStats)
+			executeBackupStats = sr
 		}
 	}
 	require.Contains(t,
executeBackupStats.ScopeV, backupstats.ScopeComponent) @@ -87,7 +87,7 @@ func TestBackupParameterizesBackupStorageWithScopedStats(t *testing.T) { var storageStats *backupstats.FakeStats for _, sr := range env.stats.ScopeReturns { if sr == env.backupStorage.WithParamsCalls[0].Stats { - storageStats = sr.(*backupstats.FakeStats) + storageStats = sr } } require.Contains(t, storageStats.ScopeV, backupstats.ScopeComponent) @@ -344,7 +344,7 @@ func TestRestoreExecutesRestoreWithScopedParams(t *testing.T) { var executeRestoreStats *backupstats.FakeStats for _, sr := range env.stats.ScopeReturns { if sr == executeRestoreParams.Stats { - executeRestoreStats = sr.(*backupstats.FakeStats) + executeRestoreStats = sr } } require.Contains(t, executeRestoreStats.ScopeV, backupstats.ScopeComponent) @@ -379,7 +379,7 @@ func TestRestoreParameterizesBackupStorageWithScopedStats(t *testing.T) { var storageStats *backupstats.FakeStats for _, sr := range env.stats.ScopeReturns { if sr == env.backupStorage.WithParamsCalls[0].Stats { - storageStats = sr.(*backupstats.FakeStats) + storageStats = sr } } require.Contains(t, storageStats.ScopeV, backupstats.ScopeComponent) diff --git a/go/vt/mysqlctl/backupstats/fake_stats.go b/go/vt/mysqlctl/backupstats/fake_stats.go index e8e84431eb9..29728d86db5 100644 --- a/go/vt/mysqlctl/backupstats/fake_stats.go +++ b/go/vt/mysqlctl/backupstats/fake_stats.go @@ -13,7 +13,7 @@ type FakeStats struct { Duration time.Duration } ScopeCalls [][]Scope - ScopeReturns []Stats + ScopeReturns []*FakeStats mutex sync.Mutex } diff --git a/go/vt/mysqlctl/s3backupstorage/s3.go b/go/vt/mysqlctl/s3backupstorage/s3.go index 4d10cd7f080..06cff1793b5 100644 --- a/go/vt/mysqlctl/s3backupstorage/s3.go +++ b/go/vt/mysqlctl/s3backupstorage/s3.go @@ -36,6 +36,7 @@ import ( "sort" "strings" "sync" + "time" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/client" @@ -48,6 +49,7 @@ import ( "vitess.io/vitess/go/vt/concurrency" "vitess.io/vitess/go/vt/log" + stats "vitess.io/vitess/go/vt/mysqlctl/backupstats" "vitess.io/vitess/go/vt/mysqlctl/backupstorage" "vitess.io/vitess/go/vt/servenv" ) @@ -170,8 +172,8 @@ func (bh *S3BackupHandle) AddFile(ctx context.Context, filename string, filesize u.PartSize = partSizeBytes }) object := objName(bh.dir, bh.name, filename) - - _, err := uploader.Upload(&s3manager.UploadInput{ + sendStats := bh.bs.params.Stats.Scope(stats.Operation("AWS:Request:Send")) + _, err := uploader.UploadWithContext(ctx, &s3manager.UploadInput{ Bucket: &bucket, Key: object, Body: reader, @@ -179,7 +181,11 @@ func (bh *S3BackupHandle) AddFile(ctx context.Context, filename string, filesize SSECustomerAlgorithm: bh.bs.s3SSE.customerAlg, SSECustomerKey: bh.bs.s3SSE.customerKey, SSECustomerKeyMD5: bh.bs.s3SSE.customerMd5, - }) + }, s3manager.WithUploaderRequestOptions(func(r *request.Request) { + r.Handlers.CompleteAttempt.PushBack(func(r *request.Request) { + sendStats.TimedIncrement(time.Since(r.AttemptTime)) + }) + })) if err != nil { reader.CloseWithError(err) bh.RecordError(err) @@ -212,12 +218,17 @@ func (bh *S3BackupHandle) ReadFile(ctx context.Context, filename string) (io.Rea return nil, fmt.Errorf("ReadFile cannot be called on read-write backup") } object := objName(bh.dir, bh.name, filename) - out, err := bh.client.GetObject(&s3.GetObjectInput{ + sendStats := bh.bs.params.Stats.Scope(stats.Operation("AWS:Request:Send")) + out, err := bh.client.GetObjectWithContext(ctx, &s3.GetObjectInput{ Bucket: &bucket, Key: object, SSECustomerAlgorithm: bh.bs.s3SSE.customerAlg, 
SSECustomerKey: bh.bs.s3SSE.customerKey, SSECustomerKeyMD5: bh.bs.s3SSE.customerMd5, + }, func(r *request.Request) { + r.Handlers.CompleteAttempt.PushBack(func(r *request.Request) { + sendStats.TimedIncrement(time.Since(r.AttemptTime)) + }) }) if err != nil { return nil, err @@ -272,6 +283,7 @@ type S3BackupStorage struct { _client *s3.S3 mu sync.Mutex s3SSE S3ServerSideEncryption + params backupstorage.Params } // ListBackups is part of the backupstorage.BackupStorage interface. @@ -411,8 +423,7 @@ func (bs *S3BackupStorage) Close() error { } func (bs *S3BackupStorage) WithParams(params backupstorage.Params) backupstorage.BackupStorage { - // TODO(maxeng): return a new S3BackupStorage that uses params. - return bs + return &S3BackupStorage{params: params} } var _ backupstorage.BackupStorage = (*S3BackupStorage)(nil) @@ -485,7 +496,7 @@ func objName(parts ...string) *string { } func init() { - backupstorage.BackupStorageMap["s3"] = &S3BackupStorage{} + backupstorage.BackupStorageMap["s3"] = &S3BackupStorage{params: backupstorage.NoParams()} logNameMap = logNameToLogLevel{ "LogOff": aws.LogOff, diff --git a/go/vt/mysqlctl/s3backupstorage/s3_test.go b/go/vt/mysqlctl/s3backupstorage/s3_test.go index 5303d88e5e5..a10432b78c2 100644 --- a/go/vt/mysqlctl/s3backupstorage/s3_test.go +++ b/go/vt/mysqlctl/s3backupstorage/s3_test.go @@ -5,31 +5,135 @@ import ( "crypto/rand" "encoding/base64" "errors" + "fmt" "net/http" + "net/url" "os" "testing" + "time" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/client" "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/service/s3" "github.com/aws/aws-sdk-go/service/s3/s3iface" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/vt/logutil" + stats "vitess.io/vitess/go/vt/mysqlctl/backupstats" + "vitess.io/vitess/go/vt/mysqlctl/backupstorage" ) -type s3ErrorClient struct{ s3iface.S3API } +type s3FakeClient struct { + s3iface.S3API + err error + delay time.Duration +} -func (s3errclient *s3ErrorClient) PutObjectRequest(in *s3.PutObjectInput) (*request.Request, *s3.PutObjectOutput) { +func (sfc *s3FakeClient) PutObjectRequest(in *s3.PutObjectInput) (*request.Request, *s3.PutObjectOutput) { + u, _ := url.Parse("http://localhost:1234") req := request.Request{ - HTTPRequest: &http.Request{}, // without this we segfault \_(ツ)_/¯ (see https://github.com/aws/aws-sdk-go/blob/v1.28.8/aws/request/request_context.go#L13) - Error: errors.New("some error"), // this forces req.Send() (which is called by the uploader) to always return non-nil error + HTTPRequest: &http.Request{ // without this we segfault \_(ツ)_/¯ (see https://github.com/aws/aws-sdk-go/blob/v1.28.8/aws/request/request_context.go#L13) + Header: make(http.Header), + URL: u, + }, + Retryer: client.DefaultRetryer{}, } + req.Handlers.Send.PushBack(func(r *request.Request) { + r.Error = sfc.err + if sfc.delay > 0 { + time.Sleep(sfc.delay) + } + }) + return &req, &s3.PutObjectOutput{} } func TestAddFileError(t *testing.T) { - bh := &S3BackupHandle{client: &s3ErrorClient{}, bs: &S3BackupStorage{}, readOnly: false} + bh := &S3BackupHandle{ + client: &s3FakeClient{err: errors.New("some error")}, + bs: &S3BackupStorage{ + params: backupstorage.NoParams(), + }, + readOnly: false, + } + + wc, err := bh.AddFile(aws.BackgroundContext(), "somefile", 100000) + require.NoErrorf(t, err, "AddFile() expected no error, got %s", err) + assert.NotNil(t, wc, "AddFile() expected non-nil WriteCloser") + + n, err := wc.Write([]byte("here are some bytes")) + 
require.NoErrorf(t, err, "TestAddFile() could not write to uploader, got %d bytes written, err %s", n, err)
+
+	err = wc.Close()
+	require.NoErrorf(t, err, "TestAddFile() could not close writer, got %s", err)
+
+	bh.waitGroup.Wait() // wait for the goroutine to finish, at which point it should have recorded an error
+
+	require.True(t, bh.HasErrors(), "AddFile() expected bh to record async error but did not")
+}
+
+func TestAddFileStats(t *testing.T) {
+	fakeStats := stats.NewFakeStats()
+
+	delay := 10 * time.Millisecond
+
+	bh := &S3BackupHandle{
+		client: &s3FakeClient{delay: delay},
+		bs: &S3BackupStorage{
+			params: backupstorage.Params{
+				Logger: logutil.NewMemoryLogger(),
+				Stats:  fakeStats,
+			},
+		},
+		readOnly: false,
+	}
+
+	for i := 0; i < 4; i++ {
+		wc, err := bh.AddFile(aws.BackgroundContext(), fmt.Sprintf("somefile-%d", i), 100000)
+		require.NoErrorf(t, err, "AddFile() expected no error, got %s", err)
+		assert.NotNil(t, wc, "AddFile() expected non-nil WriteCloser")
+
+		n, err := wc.Write([]byte("here are some bytes"))
+		require.NoErrorf(t, err, "TestAddFile() could not write to uploader, got %d bytes written, err %s", n, err)
+
+		err = wc.Close()
+		require.NoErrorf(t, err, "TestAddFile() could not close writer, got %s", err)
+	}
+
+	bh.waitGroup.Wait() // wait for the upload goroutines to finish; no errors should have been recorded
+
+	require.False(t, bh.HasErrors(), "AddFile() expected bh not to record async errors but did")
+
+	require.Len(t, fakeStats.ScopeCalls, 4)
+	scopedStats := fakeStats.ScopeReturns[0]
+	require.Len(t, scopedStats.ScopeV, 1)
+	require.Equal(t, scopedStats.ScopeV[stats.ScopeOperation], "AWS:Request:Send")
+	require.Len(t, scopedStats.TimedIncrementCalls, 1)
+	require.GreaterOrEqual(t, scopedStats.TimedIncrementCalls[0], delay)
+	require.Len(t, scopedStats.TimedIncrementBytesCalls, 0)
+}
+
+func TestAddFileErrorStats(t *testing.T) {
+	fakeStats := stats.NewFakeStats()
+
+	delay := 10 * time.Millisecond
+
+	bh := &S3BackupHandle{
+		client: &s3FakeClient{
+			delay: delay,
+			err:   errors.New("some error"),
+		},
+		bs: &S3BackupStorage{
+			params: backupstorage.Params{
+				Logger: logutil.NewMemoryLogger(),
+				Stats:  fakeStats,
+			},
+		},
+		readOnly: false,
+	}
 
 	wc, err := bh.AddFile(aws.BackgroundContext(), "somefile", 100000)
 	require.NoErrorf(t, err, "AddFile() expected no error, got %s", err)
@@ -43,7 +147,15 @@ func TestAddFileError(t *testing.T) {
 
 	bh.waitGroup.Wait() // wait for the goroutine to finish, at which point it should have recorded an error
 
-	require.Equal(t, bh.HasErrors(), true, "AddFile() expected bh to record async error but did not")
+	require.True(t, bh.HasErrors(), "AddFile() expected bh to record async error but did not")
+
+	require.Len(t, fakeStats.ScopeCalls, 1)
+	scopedStats := fakeStats.ScopeReturns[0]
+	require.Len(t, scopedStats.ScopeV, 1)
+	require.Equal(t, scopedStats.ScopeV[stats.ScopeOperation], "AWS:Request:Send")
+	require.Len(t, scopedStats.TimedIncrementCalls, 1)
+	require.GreaterOrEqual(t, scopedStats.TimedIncrementCalls[0], delay)
+	require.Len(t, scopedStats.TimedIncrementBytesCalls, 0)
 }
 
 func TestNoSSE(t *testing.T) {

From 6c7f3dc842f6f744d2942ee727671945bc1310c5 Mon Sep 17 00:00:00 2001
From: Andrew Mason
Date: Wed, 20 Sep 2023 19:40:09 -0400
Subject: [PATCH 5/7] remove query_analyzer binary and release (#14055)

---
 changelog/18.0/18.0.0/summary.md        |   5 +
 go/cmd/query_analyzer/query_analyzer.go | 149 ------------------------
 tools/make-release-packages.sh          |   2 +-
 3 files changed, 6 insertions(+), 150 deletions(-)
delete mode 100644 go/cmd/query_analyzer/query_analyzer.go diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md index 02caabc6c85..03e4ffe4640 100644 --- a/changelog/18.0/18.0.0/summary.md +++ b/changelog/18.0/18.0.0/summary.md @@ -17,6 +17,7 @@ - [Deleted `V3` planner](#deleted-v3) - [Deleted `k8stopo`](#deleted-k8stopo) - [Deleted `vtgr`](#deleted-vtgr) + - [Deleted `query_analyzer`](#deleted-query_analyzer) - [Deprecated VTBackup stat `DurationByPhase`](#deprecated-vtbackup-stat-duration-by-phase) - **[New stats](#new-stats)** - [VTGate Vindex unknown parameters](#vtgate-vindex-unknown-parameters) @@ -127,6 +128,10 @@ the `k8stopo` has been removed. The `vtgr` has been deprecated in Vitess 17, also see https://github.com/vitessio/vitess/issues/13300. With Vitess 18 `vtgr` has been removed. +#### Deleted `query_analyzer` + +The undocumented `query_analyzer` binary has been removed in Vitess 18, see https://github.com/vitessio/vitess/issues/14054. + #### Deprecated VTbackup stat `DurationByPhase` VTBackup stat `DurationByPhase` is deprecated. Use the binary-valued `Phase` stat instead. diff --git a/go/cmd/query_analyzer/query_analyzer.go b/go/cmd/query_analyzer/query_analyzer.go deleted file mode 100644 index 2138bde2673..00000000000 --- a/go/cmd/query_analyzer/query_analyzer.go +++ /dev/null @@ -1,149 +0,0 @@ -/* -Copyright 2019 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package main - -import ( - "bufio" - "bytes" - "fmt" - "io" - "os" - "sort" - - "github.com/spf13/pflag" - - "vitess.io/vitess/go/acl" - "vitess.io/vitess/go/exit" - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/logutil" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/sqlparser" - - // Include deprecation warnings for soon-to-be-unsupported flag invocations. 
- _flag "vitess.io/vitess/go/internal/flag" -) - -var ( - ignores = [][]byte{ - []byte("#"), - []byte("/*"), - []byte("SET"), - []byte("use"), - []byte("BEGIN"), - []byte("COMMIT"), - []byte("ROLLBACK"), - } - bindIndex = 0 - queries = make(map[string]int) -) - -type stat struct { - Query string - Count int -} - -type stats []stat - -func (a stats) Len() int { return len(a) } -func (a stats) Swap(i, j int) { a[i], a[j] = a[j], a[i] } -func (a stats) Less(i, j int) bool { return a[i].Count > a[j].Count } - -func main() { - defer exit.Recover() - fs := pflag.NewFlagSet("query_analyzer", pflag.ExitOnError) - log.RegisterFlags(fs) - logutil.RegisterFlags(fs) - acl.RegisterFlags(fs) - servenv.RegisterMySQLServerFlags(fs) - _flag.Parse(fs) - logutil.PurgeLogs() - for _, filename := range _flag.Args() { - fmt.Printf("processing: %s\n", filename) - if err := processFile(filename); err != nil { - log.Errorf("processFile error: %v", err) - exit.Return(1) - } - } - var stats = make(stats, 0, 128) - for k, v := range queries { - stats = append(stats, stat{Query: k, Count: v}) - } - sort.Sort(stats) - for _, s := range stats { - fmt.Printf("%d: %s\n", s.Count, s.Query) - } -} - -func processFile(filename string) error { - f, err := os.Open(filename) - if err != nil { - return err - } - r := bufio.NewReader(f) - for { - line, err := r.ReadBytes('\n') - if err != nil { - if err == io.EOF { - break - } - return err - } - analyze(line) - } - return nil -} - -func analyze(line []byte) { - for _, ignore := range ignores { - if bytes.HasPrefix(line, ignore) { - return - } - } - dml := string(bytes.TrimRight(line, "\n")) - ast, err := sqlparser.Parse(dml) - if err != nil { - log.Errorf("Error parsing %s", dml) - return - } - bindIndex = 0 - buf := sqlparser.NewTrackedBuffer(formatWithBind) - buf.Myprintf("%v", ast) - addQuery(buf.ParsedQuery().Query) -} - -func formatWithBind(buf *sqlparser.TrackedBuffer, node sqlparser.SQLNode) { - v, ok := node.(*sqlparser.Literal) - if !ok { - node.Format(buf) - return - } - switch v.Type { - case sqlparser.StrVal, sqlparser.HexVal, sqlparser.IntVal: - buf.WriteArg(":", fmt.Sprintf("v%d", bindIndex)) - bindIndex++ - default: - node.Format(buf) - } -} - -func addQuery(query string) { - count, ok := queries[query] - if !ok { - count = 0 - } - queries[query] = count + 1 -} diff --git a/tools/make-release-packages.sh b/tools/make-release-packages.sh index 21ecdcda7ee..e1a189d6507 100755 --- a/tools/make-release-packages.sh +++ b/tools/make-release-packages.sh @@ -35,7 +35,7 @@ mkdir -p releases # Copy a subset of binaries from issue #5421 mkdir -p "${RELEASE_DIR}/bin" -for binary in vttestserver mysqlctl mysqlctld query_analyzer topo2topo vtaclcheck vtadmin vtbackup vtbench vtclient vtcombo vtctl vtctldclient vtctlclient vtctld vtexplain vtgate vttablet vtorc zk zkctl zkctld; do +for binary in vttestserver mysqlctl mysqlctld topo2topo vtaclcheck vtadmin vtbackup vtbench vtclient vtcombo vtctl vtctldclient vtctlclient vtctld vtexplain vtgate vttablet vtorc zk zkctl zkctld; do cp "bin/$binary" "${RELEASE_DIR}/bin/" done; From 408c68734a4948f57ba9a14e8beee7109b1970dc Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:08:17 +0530 Subject: [PATCH 6/7] Fix cascading Delete failure while using Prepared statements (#14048) Signed-off-by: Manan Gupta --- go/test/vschemawrapper/vschema_wrapper.go | 5 ++ .../vtgate/engine/exec_prepared_statement.go | 4 +- go/vt/vtgate/engine/fk_cascade_test.go | 10 +++ 
go/vt/vtgate/planbuilder/plan_test.go | 6 +- .../testdata/foreignkey_cases.json | 81 +++++++++++++++++++ 5 files changed, 103 insertions(+), 3 deletions(-) diff --git a/go/test/vschemawrapper/vschema_wrapper.go b/go/test/vschemawrapper/vschema_wrapper.go index e85b18ce36d..11038fd151f 100644 --- a/go/test/vschemawrapper/vschema_wrapper.go +++ b/go/test/vschemawrapper/vschema_wrapper.go @@ -67,6 +67,11 @@ func (vw *VSchemaWrapper) GetPrepareData(stmtName string) *vtgatepb.PrepareData PrepareStatement: "select 1 from user", ParamsCount: 0, } + case "prep_delete": + return &vtgatepb.PrepareData{ + PrepareStatement: "delete from tbl5 where id = :v1", + ParamsCount: 1, + } } return nil } diff --git a/go/vt/vtgate/engine/exec_prepared_statement.go b/go/vt/vtgate/engine/exec_prepared_statement.go index c9a23d89e12..1874350f7db 100644 --- a/go/vt/vtgate/engine/exec_prepared_statement.go +++ b/go/vt/vtgate/engine/exec_prepared_statement.go @@ -31,8 +31,10 @@ var _ Primitive = (*ExecStmt)(nil) type ExecStmt struct { Params []*sqlparser.Variable Input Primitive +} - noTxNeeded +func (e *ExecStmt) NeedsTransaction() bool { + return e.Input.NeedsTransaction() } func (e *ExecStmt) RouteType() string { diff --git a/go/vt/vtgate/engine/fk_cascade_test.go b/go/vt/vtgate/engine/fk_cascade_test.go index 6c89feebf95..6f8eec4265a 100644 --- a/go/vt/vtgate/engine/fk_cascade_test.go +++ b/go/vt/vtgate/engine/fk_cascade_test.go @@ -148,3 +148,13 @@ func TestUpdateCascade(t *testing.T) { `ExecuteMultiShard ks.0: update parent set cola = 1 where foo = 48 {} true true`, }) } + +// TestNeedsTransactionInExecPrepared tests that if we have a foreign key cascade inside an ExecStmt plan, then we do mark the plan to require a transaction. +func TestNeedsTransactionInExecPrepared(t *testing.T) { + // Even if FkCascade is wrapped in ExecStmt, the plan should be marked such that it requires a transaction. + // This is necessary because if we don't run the cascades for DMLs in a transaction, we might end up committing partial writes that should eventually be rolled back. 
+ execPrepared := &ExecStmt{ + Input: &FkCascade{}, + } + require.True(t, execPrepared.NeedsTransaction()) +} diff --git a/go/vt/vtgate/planbuilder/plan_test.go b/go/vt/vtgate/planbuilder/plan_test.go index d79436f3850..eb9e5dacccb 100644 --- a/go/vt/vtgate/planbuilder/plan_test.go +++ b/go/vt/vtgate/planbuilder/plan_test.go @@ -102,7 +102,8 @@ func TestForeignKeyPlanning(t *testing.T) { vschema := loadSchema(t, "vschemas/schema.json", true) setFks(t, vschema) vschemaWrapper := &vschemawrapper.VSchemaWrapper{ - V: vschema, + V: vschema, + TestBuilder: TestBuilder, } testOutputTempDir := makeTestOutput(t) @@ -212,7 +213,8 @@ func TestOne(t *testing.T) { lv := loadSchema(t, "vschemas/schema.json", true) setFks(t, lv) vschema := &vschemawrapper.VSchemaWrapper{ - V: lv, + V: lv, + TestBuilder: TestBuilder, } testFile(t, "onecase.json", "", vschema, false) diff --git a/go/vt/vtgate/planbuilder/testdata/foreignkey_cases.json b/go/vt/vtgate/planbuilder/testdata/foreignkey_cases.json index c9d9097e9a4..0b88ac1e9b2 100644 --- a/go/vt/vtgate/planbuilder/testdata/foreignkey_cases.json +++ b/go/vt/vtgate/planbuilder/testdata/foreignkey_cases.json @@ -1389,5 +1389,86 @@ "unsharded_fk_allow.u_multicol_tbl3" ] } + }, + { + "comment": "Cascaded delete run from prepared statement", + "query": "execute prep_delete using @foo", + "plan": { + "QueryType": "EXECUTE", + "Original": "execute prep_delete using @foo", + "Instructions": { + "OperatorType": "EXECUTE", + "Parameters": [ + "foo" + ], + "Inputs": [ + { + "OperatorType": "FkCascade", + "Inputs": [ + { + "InputName": "Selection", + "OperatorType": "Route", + "Variant": "Scatter", + "Keyspace": { + "Name": "sharded_fk_allow", + "Sharded": true + }, + "FieldQuery": "select col5, t5col5 from tbl5 where 1 != 1", + "Query": "select col5, t5col5 from tbl5 where id = :v1 for update", + "Table": "tbl5" + }, + { + "InputName": "CascadeChild-1", + "OperatorType": "Delete", + "Variant": "Scatter", + "Keyspace": { + "Name": "sharded_fk_allow", + "Sharded": true + }, + "TargetTabletType": "PRIMARY", + "BvName": "fkc_vals", + "Cols": [ + 0 + ], + "Query": "delete from tbl4 where (col4) in ::fkc_vals", + "Table": "tbl4" + }, + { + "InputName": "CascadeChild-2", + "OperatorType": "Delete", + "Variant": "Scatter", + "Keyspace": { + "Name": "sharded_fk_allow", + "Sharded": true + }, + "TargetTabletType": "PRIMARY", + "BvName": "fkc_vals1", + "Cols": [ + 1 + ], + "Query": "delete from tbl4 where (t4col4) in ::fkc_vals1", + "Table": "tbl4" + }, + { + "InputName": "Parent", + "OperatorType": "Delete", + "Variant": "Scatter", + "Keyspace": { + "Name": "sharded_fk_allow", + "Sharded": true + }, + "TargetTabletType": "PRIMARY", + "Query": "delete from tbl5 where id = :v1", + "Table": "tbl5" + } + ] + } + ] + }, + "TablesUsed": [ + "sharded_fk_allow.tbl4", + "sharded_fk_allow.tbl5" + ] + } } ] From 5d9ee02204752df69d8e582625c6d21ff66fabf4 Mon Sep 17 00:00:00 2001 From: Dirkjan Bussink Date: Thu, 21 Sep 2023 11:16:12 +0200 Subject: [PATCH 7/7] evalengine: Mark UUID() function as non-constant (#14051) Signed-off-by: Dirkjan Bussink --- go/vt/vtgate/evalengine/compiler_test.go | 62 ++++++++++++++++++++++++ go/vt/vtgate/evalengine/fn_misc.go | 4 ++ 2 files changed, 66 insertions(+) diff --git a/go/vt/vtgate/evalengine/compiler_test.go b/go/vt/vtgate/evalengine/compiler_test.go index e1e905a6efa..efcf0036acb 100644 --- a/go/vt/vtgate/evalengine/compiler_test.go +++ b/go/vt/vtgate/evalengine/compiler_test.go @@ -587,3 +587,65 @@ func TestBindVarLiteral(t *testing.T) { }) } } + +func 
TestCompilerNonConstant(t *testing.T) {
+	var testCases = []struct {
+		expression string
+	}{
+		{
+			expression: "RANDOM_BYTES(4)",
+		},
+		{
+			expression: "UUID()",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.expression, func(t *testing.T) {
+			expr, err := sqlparser.ParseExpr(tc.expression)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			cfg := &evalengine.Config{
+				Collation:    collations.CollationUtf8mb4ID,
+				Optimization: evalengine.OptimizationLevelCompile,
+			}
+
+			converted, err := evalengine.Translate(expr, cfg)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			env := evalengine.EmptyExpressionEnv()
+			var prev string
+			for i := 0; i < 1000; i++ {
+				expected, err := env.Evaluate(evalengine.Deoptimize(converted))
+				if err != nil {
+					t.Fatal(err)
+				}
+				if expected.String() == prev {
+					t.Fatalf("constant evaluation from eval engine: got %s multiple times", expected.String())
+				}
+				prev = expected.String()
+			}
+
+			if cfg.CompilerErr != nil {
+				t.Fatalf("bad compilation: %v", cfg.CompilerErr)
+			}
+
+			// run the compiled program many times as well to ensure it also produces a fresh value on every evaluation
+			for i := 0; i < 1000; i++ {
+				res, err := env.EvaluateVM(converted.(*evalengine.CompiledExpr))
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				if res.String() == prev {
+					t.Fatalf("constant evaluation from eval engine: got %s multiple times", res.String())
+				}
+				prev = res.String()
+			}
+		})
+	}
+}
diff --git a/go/vt/vtgate/evalengine/fn_misc.go b/go/vt/vtgate/evalengine/fn_misc.go
index 96522a2314f..04770c387af 100644
--- a/go/vt/vtgate/evalengine/fn_misc.go
+++ b/go/vt/vtgate/evalengine/fn_misc.go
@@ -586,6 +586,10 @@ func (call *builtinUUID) compile(c *compiler) (ctype, error) {
 	return ctype{Type: sqltypes.VarChar, Flag: 0, Col: collationUtf8mb3}, nil
 }
 
+func (call *builtinUUID) constant() bool {
+	return false
+}
+
func (call *builtinUUIDToBin) eval(env *ExpressionEnv) (eval, error) {
	arg, err := call.arg1(env)
	if arg == nil || err != nil {
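
A note on the `constant()` contract this last patch relies on: the evalengine folds any subtree whose nodes all report `constant() == true` into a literal at translation time, so a function such as `UUID()` or `RANDOM_BYTES()` must report `false`, or the value folded once at translation would be reused on every later evaluation. Below is a minimal, self-contained sketch of that idea, not the actual evalengine API; all type and function names are illustrative:

```go
package main

import (
	"fmt"
	"math/rand"
)

// expr is a tiny stand-in for an evalengine expression tree.
type expr interface {
	eval() int64
	constant() bool // may the optimizer fold this node at translation time?
}

// literal is always foldable.
type literal struct{ v int64 }

func (l literal) eval() int64    { return l.v }
func (l literal) constant() bool { return true }

// randFn models a function like UUID() or RANDOM_BYTES(): a fresh value per call.
type randFn struct{}

func (randFn) eval() int64    { return rand.Int63() }
func (randFn) constant() bool { return false } // the fix: opt out of folding

// add is foldable only if both of its children are.
type add struct{ l, r expr }

func (a add) eval() int64    { return a.l.eval() + a.r.eval() }
func (a add) constant() bool { return a.l.constant() && a.r.constant() }

// fold replaces constant subtrees with pre-evaluated literals, mirroring
// what a translation/compilation pass would do once, up front.
func fold(e expr) expr {
	if e.constant() {
		return literal{v: e.eval()}
	}
	if a, ok := e.(add); ok {
		return add{l: fold(a.l), r: fold(a.r)}
	}
	return e
}

func main() {
	// 40 + RANDOM() stays dynamic because randFn reports constant() == false.
	e := fold(add{l: literal{40}, r: randFn{}})
	fmt.Println(e.eval(), e.eval()) // two different values
}
```

Had `randFn.constant()` returned `true`, `fold` would have collapsed the whole tree into a single literal and both calls would print the same number, which is exactly the bug this patch fixes for `UUID()`.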