From 194c9e4c1136489dfae5a2112fa7982ef0eec30e Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sat, 16 Sep 2023 10:14:38 -0400 Subject: [PATCH 01/12] migrate vtaclcheck to cobra Signed-off-by: Andrew Mason --- go/cmd/vtaclcheck/cli/vtactlcheck.go | 68 ++++++++++++++++++++++++++++ go/cmd/vtaclcheck/docgen/main.go | 37 +++++++++++++++ go/cmd/vtaclcheck/vtaclcheck.go | 37 ++------------- go/flags/endtoend/vtaclcheck.txt | 9 +++- 4 files changed, 115 insertions(+), 36 deletions(-) create mode 100644 go/cmd/vtaclcheck/cli/vtactlcheck.go create mode 100644 go/cmd/vtaclcheck/docgen/main.go diff --git a/go/cmd/vtaclcheck/cli/vtactlcheck.go b/go/cmd/vtaclcheck/cli/vtactlcheck.go new file mode 100644 index 00000000000..d6a71a23252 --- /dev/null +++ b/go/cmd/vtaclcheck/cli/vtactlcheck.go @@ -0,0 +1,68 @@ +/* +Copyright 2023 The Vitess Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cli + +import ( + "github.com/spf13/cobra" + + "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/vt/logutil" + "vitess.io/vitess/go/vt/servenv" + "vitess.io/vitess/go/vt/vtaclcheck" +) + +var ( + aclFile string + staticAuthFile string + + Main = &cobra.Command{ + Use: "vtaclcheck", + Short: "vtaclcheck checks that the access-control list (ACL) rules in a given file are valid.", + Args: cobra.NoArgs, + Version: servenv.AppVersion.String(), + PreRunE: servenv.CobraPreRunE, + PostRun: func(cmd *cobra.Command, args []string) { + logutil.Flush() + }, + RunE: run, + } +) + +func run(cmd *cobra.Command, args []string) error { + servenv.Init() + defer servenv.Close() + + opts := &vtaclcheck.Options{ + ACLFile: aclFile, + StaticAuthFile: staticAuthFile, + } + + if err := vtaclcheck.Init(opts); err != nil { + return err + } + + return vtaclcheck.Run() +} + +func init() { + servenv.MoveFlagsToCobraCommand(Main) + + Main.Flags().StringVar(&aclFile, "acl-file", aclFile, "The path of the JSON ACL file to check") + Main.Flags().StringVar(&staticAuthFile, "static-auth-file", staticAuthFile, "The path of the auth_server_static JSON file to check") + + acl.RegisterFlags(Main.Flags()) +} diff --git a/go/cmd/vtaclcheck/docgen/main.go b/go/cmd/vtaclcheck/docgen/main.go new file mode 100644 index 00000000000..d3da8b76179 --- /dev/null +++ b/go/cmd/vtaclcheck/docgen/main.go @@ -0,0 +1,37 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "github.com/spf13/cobra" + + "vitess.io/vitess/go/cmd/internal/docgen" + "vitess.io/vitess/go/cmd/vtaclcheck/cli" +) + +func main() { + var dir string + cmd := cobra.Command{ + Use: "docgen [-d <dir>]", + RunE: func(cmd *cobra.Command, args []string) error { + return docgen.GenerateMarkdownTree(cli.Main, dir) + }, + } + + cmd.Flags().StringVarP(&dir, "dir", "d", "doc", "output directory to write documentation") + _ = cmd.Execute() +} diff --git a/go/cmd/vtaclcheck/vtaclcheck.go b/go/cmd/vtaclcheck/vtaclcheck.go index 8b916a8cc0c..bec4cf95fe9 100644 --- a/go/cmd/vtaclcheck/vtaclcheck.go +++ b/go/cmd/vtaclcheck/vtaclcheck.go @@ -19,52 +19,21 @@ package main import ( "fmt" - "github.com/spf13/pflag" - - "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/cmd/vtaclcheck/cli" "vitess.io/vitess/go/exit" "vitess.io/vitess/go/vt/logutil" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/vtaclcheck" ) -var aclFile, staticAuthFile string - func init() { logger := logutil.NewConsoleLogger() - servenv.OnParse(func(fs *pflag.FlagSet) { - fs.StringVar(&aclFile, "acl-file", aclFile, "The path of the JSON ACL file to check") - fs.StringVar(&staticAuthFile, "static-auth-file", staticAuthFile, "The path of the auth_server_static JSON file to check") - - acl.RegisterFlags(fs) - - fs.SetOutput(logutil.NewLoggerWriter(logger)) - }) + cli.Main.SetOutput(logutil.NewLoggerWriter(logger)) } func main() { defer exit.RecoverAll() - defer logutil.Flush() - - servenv.ParseFlags("vtaclcheck") - servenv.Init() - err := run() - if err != nil { + if err := cli.Main.Execute(); err != nil { fmt.Printf("ERROR: %s\n", err) exit.Return(1) } } - -func run() error { - opts := &vtaclcheck.Options{ - ACLFile: aclFile, - StaticAuthFile: staticAuthFile, - } - - if err := vtaclcheck.Init(opts); err != nil { - return err - } - - return vtaclcheck.Run() -} diff --git a/go/flags/endtoend/vtaclcheck.txt b/go/flags/endtoend/vtaclcheck.txt index 001d3a5b192..34bef9a05f9 100644 
--- a/go/flags/endtoend/vtaclcheck.txt +++ b/go/flags/endtoend/vtaclcheck.txt @@ -1,4 +1,9 @@ -Usage of vtaclcheck: +vtaclcheck checks that the access-control list (ACL) rules in a given file are valid. + +Usage: + vtaclcheck [flags] + +Flags: --acl-file string The path of the JSON ACL file to check --alsologtostderr log to standard error as well as files --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. @@ -7,7 +12,7 @@ Usage of vtaclcheck: --config-path strings Paths to search for config files in. (default [{{ .Workdir }}]) --config-persistence-min-interval duration minimum interval between persisting dynamic config changes back to disk (if no change has occurred, nothing is done). (default 1s) --config-type string Config file type (omit to infer config type from file extension). - -h, --help display usage and exit + -h, --help help for vtaclcheck --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) From 2c969f2b950dd94233effff702f8ce71db1d7f57 Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sat, 16 Sep 2023 12:36:20 -0400 Subject: [PATCH 02/12] migrate vtbackup to cobra Signed-off-by: Andrew Mason --- .../{ => cli}/plugin_azblobbackupstorage.go | 2 +- .../{ => cli}/plugin_cephbackupstorage.go | 2 +- .../vtbackup/{ => cli}/plugin_consultopo.go | 2 +- go/cmd/vtbackup/{ => cli}/plugin_etcd2topo.go | 2 +- .../{ => cli}/plugin_filebackupstorage.go | 2 +- .../{ => cli}/plugin_gcsbackupstorage.go | 2 +- go/cmd/vtbackup/{ => cli}/plugin_opentsdb.go | 2 +- .../{ => cli}/plugin_prometheusbackend.go | 2 +- .../{ => cli}/plugin_s3backupstorage.go | 2 +- go/cmd/vtbackup/{ => cli}/plugin_zk2topo.go | 2 +- go/cmd/vtbackup/cli/vtbackup.go | 871 ++++++++++++++++++ 
go/cmd/vtbackup/docgen/main.go | 37 + go/cmd/vtbackup/vtbackup.go | 838 +---------------- go/flags/endtoend/vtbackup.txt | 49 +- 14 files changed, 966 insertions(+), 849 deletions(-) rename go/cmd/vtbackup/{ => cli}/plugin_azblobbackupstorage.go (97%) rename go/cmd/vtbackup/{ => cli}/plugin_cephbackupstorage.go (97%) rename go/cmd/vtbackup/{ => cli}/plugin_consultopo.go (97%) rename go/cmd/vtbackup/{ => cli}/plugin_etcd2topo.go (97%) rename go/cmd/vtbackup/{ => cli}/plugin_filebackupstorage.go (97%) rename go/cmd/vtbackup/{ => cli}/plugin_gcsbackupstorage.go (97%) rename go/cmd/vtbackup/{ => cli}/plugin_opentsdb.go (98%) rename go/cmd/vtbackup/{ => cli}/plugin_prometheusbackend.go (98%) rename go/cmd/vtbackup/{ => cli}/plugin_s3backupstorage.go (97%) rename go/cmd/vtbackup/{ => cli}/plugin_zk2topo.go (97%) create mode 100644 go/cmd/vtbackup/cli/vtbackup.go create mode 100644 go/cmd/vtbackup/docgen/main.go diff --git a/go/cmd/vtbackup/plugin_azblobbackupstorage.go b/go/cmd/vtbackup/cli/plugin_azblobbackupstorage.go similarity index 97% rename from go/cmd/vtbackup/plugin_azblobbackupstorage.go rename to go/cmd/vtbackup/cli/plugin_azblobbackupstorage.go index a4ca64096a9..bdadc894aae 100644 --- a/go/cmd/vtbackup/plugin_azblobbackupstorage.go +++ b/go/cmd/vtbackup/cli/plugin_azblobbackupstorage.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package main +package cli import ( _ "vitess.io/vitess/go/vt/mysqlctl/azblobbackupstorage" diff --git a/go/cmd/vtbackup/plugin_cephbackupstorage.go b/go/cmd/vtbackup/cli/plugin_cephbackupstorage.go similarity index 97% rename from go/cmd/vtbackup/plugin_cephbackupstorage.go rename to go/cmd/vtbackup/cli/plugin_cephbackupstorage.go index 819cb108126..2f5a825f270 100644 --- a/go/cmd/vtbackup/plugin_cephbackupstorage.go +++ b/go/cmd/vtbackup/cli/plugin_cephbackupstorage.go @@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli import ( _ "vitess.io/vitess/go/vt/mysqlctl/cephbackupstorage" diff --git a/go/cmd/vtbackup/plugin_consultopo.go b/go/cmd/vtbackup/cli/plugin_consultopo.go similarity index 97% rename from go/cmd/vtbackup/plugin_consultopo.go rename to go/cmd/vtbackup/cli/plugin_consultopo.go index 2b6f10e2b28..c2f8de3339e 100644 --- a/go/cmd/vtbackup/plugin_consultopo.go +++ b/go/cmd/vtbackup/cli/plugin_consultopo.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli import ( _ "vitess.io/vitess/go/vt/topo/consultopo" diff --git a/go/cmd/vtbackup/plugin_etcd2topo.go b/go/cmd/vtbackup/cli/plugin_etcd2topo.go similarity index 97% rename from go/cmd/vtbackup/plugin_etcd2topo.go rename to go/cmd/vtbackup/cli/plugin_etcd2topo.go index 97412e65755..e4d6d4129ff 100644 --- a/go/cmd/vtbackup/plugin_etcd2topo.go +++ b/go/cmd/vtbackup/cli/plugin_etcd2topo.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package main +package cli import ( _ "vitess.io/vitess/go/vt/topo/etcd2topo" diff --git a/go/cmd/vtbackup/plugin_filebackupstorage.go b/go/cmd/vtbackup/cli/plugin_filebackupstorage.go similarity index 97% rename from go/cmd/vtbackup/plugin_filebackupstorage.go rename to go/cmd/vtbackup/cli/plugin_filebackupstorage.go index 31417781026..68bf790c827 100644 --- a/go/cmd/vtbackup/plugin_filebackupstorage.go +++ b/go/cmd/vtbackup/cli/plugin_filebackupstorage.go @@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli import ( _ "vitess.io/vitess/go/vt/mysqlctl/filebackupstorage" diff --git a/go/cmd/vtbackup/plugin_gcsbackupstorage.go b/go/cmd/vtbackup/cli/plugin_gcsbackupstorage.go similarity index 97% rename from go/cmd/vtbackup/plugin_gcsbackupstorage.go rename to go/cmd/vtbackup/cli/plugin_gcsbackupstorage.go index 2319d0aa7fe..eff9339a318 100644 --- a/go/cmd/vtbackup/plugin_gcsbackupstorage.go +++ b/go/cmd/vtbackup/cli/plugin_gcsbackupstorage.go @@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli import ( _ "vitess.io/vitess/go/vt/mysqlctl/gcsbackupstorage" diff --git a/go/cmd/vtbackup/plugin_opentsdb.go b/go/cmd/vtbackup/cli/plugin_opentsdb.go similarity index 98% rename from go/cmd/vtbackup/plugin_opentsdb.go rename to go/cmd/vtbackup/cli/plugin_opentsdb.go index 44ac886d420..597e426cc09 100644 --- a/go/cmd/vtbackup/plugin_opentsdb.go +++ b/go/cmd/vtbackup/cli/plugin_opentsdb.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package main +package cli import "vitess.io/vitess/go/stats/opentsdb" diff --git a/go/cmd/vtbackup/plugin_prometheusbackend.go b/go/cmd/vtbackup/cli/plugin_prometheusbackend.go similarity index 98% rename from go/cmd/vtbackup/plugin_prometheusbackend.go rename to go/cmd/vtbackup/cli/plugin_prometheusbackend.go index de4ecbb5e9f..3cf256e76c1 100644 --- a/go/cmd/vtbackup/plugin_prometheusbackend.go +++ b/go/cmd/vtbackup/cli/plugin_prometheusbackend.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli // This plugin imports Prometheus to allow for instrumentation // with the Prometheus client library diff --git a/go/cmd/vtbackup/plugin_s3backupstorage.go b/go/cmd/vtbackup/cli/plugin_s3backupstorage.go similarity index 97% rename from go/cmd/vtbackup/plugin_s3backupstorage.go rename to go/cmd/vtbackup/cli/plugin_s3backupstorage.go index 917352f2469..27b4ef06dee 100644 --- a/go/cmd/vtbackup/plugin_s3backupstorage.go +++ b/go/cmd/vtbackup/cli/plugin_s3backupstorage.go @@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli import ( _ "vitess.io/vitess/go/vt/mysqlctl/s3backupstorage" diff --git a/go/cmd/vtbackup/plugin_zk2topo.go b/go/cmd/vtbackup/cli/plugin_zk2topo.go similarity index 97% rename from go/cmd/vtbackup/plugin_zk2topo.go rename to go/cmd/vtbackup/cli/plugin_zk2topo.go index 5819d2d39ed..914a9b924f9 100644 --- a/go/cmd/vtbackup/plugin_zk2topo.go +++ b/go/cmd/vtbackup/cli/plugin_zk2topo.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package main +package cli import ( _ "vitess.io/vitess/go/vt/topo/zk2topo" diff --git a/go/cmd/vtbackup/cli/vtbackup.go b/go/cmd/vtbackup/cli/vtbackup.go new file mode 100644 index 00000000000..20931b7db1c --- /dev/null +++ b/go/cmd/vtbackup/cli/vtbackup.go @@ -0,0 +1,871 @@ +/* +Copyright 2023 The Vitess Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cli + +import ( + "context" + "crypto/rand" + "fmt" + "math" + "math/big" + "os" + "strings" + "syscall" + "time" + + "github.com/spf13/cobra" + + "vitess.io/vitess/go/mysql/replication" + + "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/cmd" + "vitess.io/vitess/go/exit" + "vitess.io/vitess/go/stats" + "vitess.io/vitess/go/vt/dbconfigs" + "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/logutil" + "vitess.io/vitess/go/vt/mysqlctl" + "vitess.io/vitess/go/vt/mysqlctl/backupstats" + "vitess.io/vitess/go/vt/mysqlctl/backupstorage" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/servenv" + "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/topo/topoproto" + "vitess.io/vitess/go/vt/vterrors" + _ "vitess.io/vitess/go/vt/vttablet/grpctmclient" + "vitess.io/vitess/go/vt/vttablet/tmclient" +) + +const ( + // operationTimeout is the timeout for individual operations like fetching + // the primary position. This does not impose an overall timeout on + // long-running processes like taking the backup. It only applies to + // steps along the way that should complete quickly. 
This ensures we don't + // place a hard cap on the overall time for a backup, while also not waiting + // forever for things that should be quick. + operationTimeout = 1 * time.Minute + + phaseNameCatchupReplication = "CatchupReplication" + phaseNameInitialBackup = "InitialBackup" + phaseNameRestoreLastBackup = "RestoreLastBackup" + phaseNameTakeNewBackup = "TakeNewBackup" + phaseStatusCatchupReplicationStalled = "Stalled" + phaseStatusCatchupReplicationStopped = "Stopped" +) + +var ( + minBackupInterval time.Duration + minRetentionTime time.Duration + minRetentionCount = 1 + initialBackup bool + allowFirstBackup bool + restartBeforeBackup bool + upgradeSafe bool + + // vttablet-like flags + initDbNameOverride string + initKeyspace string + initShard string + concurrency = 4 + incrementalFromPos string + + // mysqlctld-like flags + mysqlPort = 3306 + mysqlSocket string + mysqlTimeout = 5 * time.Minute + initDBSQLFile string + detachedMode bool + keepAliveTimeout time.Duration + disableRedoLog bool + + // Deprecated, use "Phase" instead. + deprecatedDurationByPhase = stats.NewGaugesWithSingleLabel( + "DurationByPhaseSeconds", + "[DEPRECATED] How long it took vtbackup to perform each phase (in seconds).", + "phase", + ) + + // This gauge is updated 3*N times during the course of a vtbackup run, + // where N is the number of different phases vtbackup transitions through. + // Once to initialize to 0, another time to set the phase to active (1), + // and another to deactivate the phase (back to 0). + // + // At most a single phase is active at a given time. + // + // The sync gauge immediately reports changes to push-backed backends. + // The benefit of the sync gauge is that it makes verifying stats in + // integration tests a lot more tractable. 
+ phase = stats.NewSyncGaugesWithSingleLabel( + "Phase", + "Active phase.", + "phase", + ) + phaseNames = []string{ + phaseNameCatchupReplication, + phaseNameInitialBackup, + phaseNameRestoreLastBackup, + phaseNameTakeNewBackup, + } + phaseStatus = stats.NewGaugesWithMultiLabels( + "PhaseStatus", + "Internal state of vtbackup phase.", + []string{"phase", "status"}, + ) + phaseStatuses = map[string][]string{ + phaseNameCatchupReplication: { + phaseStatusCatchupReplicationStalled, + phaseStatusCatchupReplicationStopped, + }, + } + + Main = &cobra.Command{ + Use: "vtbackup", + Short: "vtbackup is a batch command to perform a single pass of backup maintenance for a shard.", + Long: `vtbackup is a batch command to perform a single pass of backup maintenance for a shard. + +When run periodically for each shard, vtbackup can ensure these configurable policies: + * There is always a recent backup for the shard. + * Old backups for the shard are removed. + +Whatever system launches vtbackup is responsible for the following: + - Running vtbackup with similar flags that would be used for a vttablet and + mysqlctld in the target shard to be backed up. + - Provisioning as much disk space for vtbackup as would be given to vttablet. + The data directory MUST be empty at startup. Do NOT reuse a persistent disk. + - Running vtbackup periodically for each shard, for each backup storage location. + - Ensuring that at most one instance runs at a time for a given pair of shard + and backup storage location. + - Retrying vtbackup if it fails. + - Alerting human operators if the failure is persistent. + +The process vtbackup follows to take a new backup is as follows: + 1. Restore from the most recent backup. + 2. Start a mysqld instance (but no vttablet) from the restored data. + 3. Instruct mysqld to connect to the current shard primary and replicate any + transactions that are new since the last backup. + 4. 
Ask the primary for its current replication position and set that as the goal + for catching up on replication before taking the backup, so the goalposts + don't move. + 5. Wait until replication is caught up to the goal position or beyond. + 6. Stop mysqld and take a new backup. + +Aside from additional replication load while vtbackup's mysqld catches up on +new transactions, the shard should be otherwise unaffected. Existing tablets +will continue to serve, and no new tablets will appear in topology, meaning no +query traffic will ever be routed to vtbackup's mysqld. This silent operation +mode helps make backups minimally disruptive to serving capacity and orthogonal +to the handling of the query path. + +The command-line parameters to vtbackup specify a policy for when a new backup +is needed, and when old backups should be removed. If the existing backups +already satisfy the policy, then vtbackup will do nothing and return success +immediately.`, + Version: servenv.AppVersion.String(), + Args: cobra.NoArgs, + PreRunE: servenv.CobraPreRunE, + RunE: run, + } +) + +func init() { + servenv.RegisterDefaultFlags() + dbconfigs.RegisterFlags(dbconfigs.All...) + mysqlctl.RegisterFlags() + + servenv.MoveFlagsToCobraCommand(Main) + + Main.Flags().DurationVar(&minBackupInterval, "min_backup_interval", minBackupInterval, "Only take a new backup if it's been at least this long since the most recent backup.") + Main.Flags().DurationVar(&minRetentionTime, "min_retention_time", minRetentionTime, "Keep each old backup for at least this long before removing it. Set to 0 to disable pruning of old backups.") + Main.Flags().IntVar(&minRetentionCount, "min_retention_count", minRetentionCount, "Always keep at least this many of the most recent backups in this backup storage location, even if some are older than the min_retention_time. 
This must be at least 1 since a backup must always exist to allow new backups to be made") + Main.Flags().BoolVar(&initialBackup, "initial_backup", initialBackup, "Instead of restoring from backup, initialize an empty database with the provided init_db_sql_file and upload a backup of that for the shard, if the shard has no backups yet. This can be used to seed a brand new shard with an initial, empty backup. If any backups already exist for the shard, this will be considered a successful no-op. This can only be done before the shard exists in topology (i.e. before any tablets are deployed).") + Main.Flags().BoolVar(&allowFirstBackup, "allow_first_backup", allowFirstBackup, "Allow this job to take the first backup of an existing shard.") + Main.Flags().BoolVar(&restartBeforeBackup, "restart_before_backup", restartBeforeBackup, "Perform a mysqld clean/full restart after applying binlogs, but before taking the backup. Only makes sense to work around xtrabackup bugs.") + Main.Flags().BoolVar(&upgradeSafe, "upgrade-safe", upgradeSafe, "Whether to use innodb_fast_shutdown=0 for the backup so it is safe to use for MySQL upgrades.") + + // vttablet-like flags + Main.Flags().StringVar(&initDbNameOverride, "init_db_name_override", initDbNameOverride, "(init parameter) override the name of the db used by vttablet") + Main.Flags().StringVar(&initKeyspace, "init_keyspace", initKeyspace, "(init parameter) keyspace to use for this tablet") + Main.Flags().StringVar(&initShard, "init_shard", initShard, "(init parameter) shard to use for this tablet") + Main.Flags().IntVar(&concurrency, "concurrency", concurrency, "(init restore parameter) how many concurrent files to restore at once") + Main.Flags().StringVar(&incrementalFromPos, "incremental_from_pos", incrementalFromPos, "Position of previous backup. Default: empty. If given, then this backup becomes an incremental backup from given position. 
If value is 'auto', backup taken from last successful backup position") + + // mysqlctld-like flags + Main.Flags().IntVar(&mysqlPort, "mysql_port", mysqlPort, "mysql port") + Main.Flags().StringVar(&mysqlSocket, "mysql_socket", mysqlSocket, "path to the mysql socket") + Main.Flags().DurationVar(&mysqlTimeout, "mysql_timeout", mysqlTimeout, "how long to wait for mysqld startup") + Main.Flags().StringVar(&initDBSQLFile, "init_db_sql_file", initDBSQLFile, "path to .sql file to run after mysql_install_db") + Main.Flags().BoolVar(&detachedMode, "detach", detachedMode, "detached mode - run backups detached from the terminal") + Main.Flags().DurationVar(&keepAliveTimeout, "keep-alive-timeout", keepAliveTimeout, "Wait until timeout elapses after a successful backup before shutting down.") + Main.Flags().BoolVar(&disableRedoLog, "disable-redo-log", disableRedoLog, "Disable InnoDB redo log during replication-from-primary phase of backup.") + + acl.RegisterFlags(Main.Flags()) +} + +func run(_ *cobra.Command, args []string) error { + servenv.Init() + defer servenv.Close() + + ctx, cancel := context.WithCancel(context.Background()) + servenv.OnClose(func() { + cancel() + }) + + defer func() { + servenv.ExitChan <- syscall.SIGTERM + <-ctx.Done() + }() + + go servenv.RunDefault() + + if detachedMode { + // this method will call os.Exit and kill this process + cmd.DetachFromTerminalAndExit() + } + + defer logutil.Flush() + + if minRetentionCount < 1 { + log.Errorf("min_retention_count must be at least 1 to allow restores to succeed") + exit.Return(1) + } + + // Open connection backup storage. + backupStorage, err := backupstorage.GetBackupStorage() + if err != nil { + return fmt.Errorf("Can't get backup storage: %w", err) + } + defer backupStorage.Close() + // Open connection to topology server. + topoServer := topo.Open() + defer topoServer.Close() + + // Initialize stats. 
+ for _, phaseName := range phaseNames { + phase.Set(phaseName, int64(0)) + } + for phaseName, statuses := range phaseStatuses { + for _, status := range statuses { + phaseStatus.Set([]string{phaseName, status}, 0) + } + } + + // Try to take a backup, if it's been long enough since the last one. + // Skip pruning if backup wasn't fully successful. We don't want to be + // deleting things if the backup process is not healthy. + backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard) + doBackup, err := shouldBackup(ctx, topoServer, backupStorage, backupDir) + if err != nil { + return fmt.Errorf("Can't take backup: %w", err) + } + if doBackup { + if err := takeBackup(ctx, topoServer, backupStorage); err != nil { + return fmt.Errorf("Failed to take backup: %w", err) + } + } + + // Prune old backups. + if err := pruneBackups(ctx, backupStorage, backupDir); err != nil { + return fmt.Errorf("Couldn't prune old backups: %w", err) + } + + if keepAliveTimeout > 0 { + log.Infof("Backup was successful, waiting %s before exiting (or until context expires).", keepAliveTimeout) + select { + case <-time.After(keepAliveTimeout): + case <-ctx.Done(): + } + } + log.Info("Exiting.") + + return nil +} + +func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage backupstorage.BackupStorage) error { + // This is an imaginary tablet alias. The value doesn't matter for anything, + // except that we generate a random UID to ensure the target backup + // directory is unique if multiple vtbackup instances are launched for the + // same shard, at exactly the same second, pointed at the same backup + // storage location. 
+ bigN, err := rand.Int(rand.Reader, big.NewInt(math.MaxUint32)) + if err != nil { + return fmt.Errorf("can't generate random tablet UID: %v", err) + } + tabletAlias := &topodatapb.TabletAlias{ + Cell: "vtbackup", + Uid: uint32(bigN.Uint64()), + } + + // Clean up our temporary data dir if we exit for any reason, to make sure + // every invocation of vtbackup starts with a clean slate, and it does not + // accumulate garbage (and run out of disk space) if it's restarted. + tabletDir := mysqlctl.TabletDir(tabletAlias.Uid) + defer func() { + log.Infof("Removing temporary tablet directory: %v", tabletDir) + if err := os.RemoveAll(tabletDir); err != nil { + log.Warningf("Failed to remove temporary tablet directory: %v", err) + } + }() + + // Start up mysqld as if we are mysqlctld provisioning a fresh tablet. + mysqld, mycnf, err := mysqlctl.CreateMysqldAndMycnf(tabletAlias.Uid, mysqlSocket, mysqlPort) + if err != nil { + return fmt.Errorf("failed to initialize mysql config: %v", err) + } + initCtx, initCancel := context.WithTimeout(ctx, mysqlTimeout) + defer initCancel() + initMysqldAt := time.Now() + if err := mysqld.Init(initCtx, mycnf, initDBSQLFile); err != nil { + return fmt.Errorf("failed to initialize mysql data dir and start mysqld: %v", err) + } + deprecatedDurationByPhase.Set("InitMySQLd", int64(time.Since(initMysqldAt).Seconds())) + // Shut down mysqld when we're done. + defer func() { + // Be careful not to use the original context, because we don't want to + // skip shutdown just because we timed out waiting for other things. 
+ mysqlShutdownCtx, mysqlShutdownCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer mysqlShutdownCancel() + if err := mysqld.Shutdown(mysqlShutdownCtx, mycnf, false); err != nil { + log.Errorf("failed to shutdown mysqld: %v", err) + } + }() + + extraEnv := map[string]string{ + "TABLET_ALIAS": topoproto.TabletAliasString(tabletAlias), + } + dbName := initDbNameOverride + if dbName == "" { + dbName = fmt.Sprintf("vt_%s", initKeyspace) + } + + backupParams := mysqlctl.BackupParams{ + Cnf: mycnf, + Mysqld: mysqld, + Logger: logutil.NewConsoleLogger(), + Concurrency: concurrency, + IncrementalFromPos: incrementalFromPos, + HookExtraEnv: extraEnv, + TopoServer: topoServer, + Keyspace: initKeyspace, + Shard: initShard, + TabletAlias: topoproto.TabletAliasString(tabletAlias), + Stats: backupstats.BackupStats(), + UpgradeSafe: upgradeSafe, + } + // In initial_backup mode, just take a backup of this empty database. + if initialBackup { + // Take a backup of this empty DB without restoring anything. + // First, initialize it the way InitShardPrimary would, so this backup + // produces a result that can be used to skip InitShardPrimary entirely. + // This involves resetting replication (to erase any history) and then + // creating the main database and some Vitess system tables. + if err := mysqld.ResetReplication(ctx); err != nil { + return fmt.Errorf("can't reset replication: %v", err) + } + // We need to switch off super_read_only before we create the database. 
+ resetFunc, err := mysqld.SetSuperReadOnly(false) + if err != nil { + return fmt.Errorf("failed to disable super_read_only during backup: %v", err) + } + if resetFunc != nil { + defer func() { + err := resetFunc() + if err != nil { + log.Error("Failed to set super_read_only back to its original value during backup") + } + }() + } + cmd := mysqlctl.GenerateInitialBinlogEntry() + if err := mysqld.ExecuteSuperQueryList(ctx, []string{cmd}); err != nil { + return err + } + + backupParams.BackupTime = time.Now() + // Now we're ready to take the backup. + phase.Set(phaseNameInitialBackup, int64(1)) + defer phase.Set(phaseNameInitialBackup, int64(0)) + if err := mysqlctl.Backup(ctx, backupParams); err != nil { + return fmt.Errorf("backup failed: %v", err) + } + deprecatedDurationByPhase.Set("InitialBackup", int64(time.Since(backupParams.BackupTime).Seconds())) + log.Info("Initial backup successful.") + phase.Set(phaseNameInitialBackup, int64(0)) + return nil + } + + phase.Set(phaseNameRestoreLastBackup, int64(1)) + defer phase.Set(phaseNameRestoreLastBackup, int64(0)) + backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard) + log.Infof("Restoring latest backup from directory %v", backupDir) + restoreAt := time.Now() + params := mysqlctl.RestoreParams{ + Cnf: mycnf, + Mysqld: mysqld, + Logger: logutil.NewConsoleLogger(), + Concurrency: concurrency, + HookExtraEnv: extraEnv, + DeleteBeforeRestore: true, + DbName: dbName, + Keyspace: initKeyspace, + Shard: initShard, + Stats: backupstats.RestoreStats(), + } + backupManifest, err := mysqlctl.Restore(ctx, params) + var restorePos replication.Position + switch err { + case nil: + // if err is nil, we expect backupManifest to be non-nil + restorePos = backupManifest.Position + log.Infof("Successfully restored from backup at replication position %v", restorePos) + case mysqlctl.ErrNoBackup: + // There is no backup found, but we may be taking the initial backup of a shard + if !allowFirstBackup { + return fmt.Errorf("no 
backup found; not starting up empty since --initial_backup flag was not enabled") + } + restorePos = replication.Position{} + default: + return fmt.Errorf("can't restore from backup: %v", err) + } + deprecatedDurationByPhase.Set("RestoreLastBackup", int64(time.Since(restoreAt).Seconds())) + phase.Set(phaseNameRestoreLastBackup, int64(0)) + + // As of MySQL 8.0.21, you can disable redo logging using the ALTER INSTANCE + // DISABLE INNODB REDO_LOG statement. This functionality is intended for + // loading data into a new MySQL instance. Disabling redo logging speeds up + // data loading by avoiding redo log writes and doublewrite buffering. + disabledRedoLog := false + if disableRedoLog { + if err := mysqld.DisableRedoLog(ctx); err != nil { + log.Warningf("Error disabling redo logging: %v", err) + } else { + disabledRedoLog = true + } + } + + // We have restored a backup. Now start replication. + if err := resetReplication(ctx, restorePos, mysqld); err != nil { + return fmt.Errorf("error resetting replication: %v", err) + } + if err := startReplication(ctx, mysqld, topoServer); err != nil { + return fmt.Errorf("error starting replication: %v", err) + } + + log.Info("get the current primary replication position, and wait until we catch up") + // Get the current primary replication position, and wait until we catch up + // to that point. We do this instead of looking at ReplicationLag + // because that value can + // sometimes lie and tell you there's 0 lag when actually replication is + // stopped. Also, if replication is making progress but is too slow to ever + // catch up to live changes, we'd rather take a backup of something rather + // than timing out. + tmc := tmclient.NewTabletManagerClient() + // Keep retrying if we can't contact the primary. The primary might be + // changing, moving, or down temporarily. 
+ var primaryPos replication.Position + err = retryOnError(ctx, func() error { + // Add a per-operation timeout so we re-read topo if the primary is unreachable. + opCtx, optCancel := context.WithTimeout(ctx, operationTimeout) + defer optCancel() + pos, err := getPrimaryPosition(opCtx, tmc, topoServer) + if err != nil { + return fmt.Errorf("can't get the primary replication position: %v", err) + } + primaryPos = pos + return nil + }) + if err != nil { + return fmt.Errorf("can't get the primary replication position after all retries: %v", err) + } + + log.Infof("takeBackup: primary position is: %s", primaryPos.String()) + + // Remember the time when we fetched the primary position, not when we caught + // up to it, so the timestamp on our backup is honest (assuming we make it + // to the goal position). + backupParams.BackupTime = time.Now() + + // Wait for replication to catch up. + phase.Set(phaseNameCatchupReplication, int64(1)) + defer phase.Set(phaseNameCatchupReplication, int64(0)) + + var ( + lastStatus replication.ReplicationStatus + status replication.ReplicationStatus + statusErr error + + waitStartTime = time.Now() + ) + for { + select { + case <-ctx.Done(): + return fmt.Errorf("error in replication catch up: %v", ctx.Err()) + case <-time.After(time.Second): + } + + lastStatus = status + status, statusErr = mysqld.ReplicationStatus() + if statusErr != nil { + log.Warningf("Error getting replication status: %v", statusErr) + continue + } + if status.Position.AtLeast(primaryPos) { + // We're caught up on replication to at least the point the primary + // was at when this vtbackup run started. 
+ log.Infof("Replication caught up to %v after %v", status.Position, time.Since(waitStartTime)) + deprecatedDurationByPhase.Set("CatchUpReplication", int64(time.Since(waitStartTime).Seconds())) + break + } + if !lastStatus.Position.IsZero() { + if status.Position.Equal(lastStatus.Position) { + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStalled}, 1) + } else { + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStalled}, 0) + } + } + if !status.Healthy() { + log.Warning("Replication has stopped before backup could be taken. Trying to restart replication.") + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStopped}, 1) + if err := startReplication(ctx, mysqld, topoServer); err != nil { + log.Warningf("Failed to restart replication: %v", err) + } + } else { + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStopped}, 0) + } + } + phase.Set(phaseNameCatchupReplication, int64(0)) + + // Stop replication and see where we are. + if err := mysqld.StopReplication(nil); err != nil { + return fmt.Errorf("can't stop replication: %v", err) + } + + // Did we make any progress? + status, statusErr = mysqld.ReplicationStatus() + if statusErr != nil { + return fmt.Errorf("can't get replication status: %v", err) + } + log.Infof("Replication caught up to %v", status.Position) + if !status.Position.AtLeast(primaryPos) && status.Position.Equal(restorePos) { + return fmt.Errorf("not taking backup: replication did not make any progress from restore point: %v", restorePos) + } + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStalled}, 0) + phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStopped}, 0) + + // Re-enable redo logging. 
+ if disabledRedoLog { + if err := mysqld.EnableRedoLog(ctx); err != nil { + return fmt.Errorf("failed to re-enable redo log: %v", err) + } + } + + if restartBeforeBackup { + restartAt := time.Now() + log.Info("Proceeding with clean MySQL shutdown and startup to flush all buffers.") + // Prep for full/clean shutdown (not typically the default) + if err := mysqld.ExecuteSuperQuery(ctx, "SET GLOBAL innodb_fast_shutdown=0"); err != nil { + return fmt.Errorf("Could not prep for full shutdown: %v", err) + } + // Shutdown, waiting for it to finish + if err := mysqld.Shutdown(ctx, mycnf, true); err != nil { + return fmt.Errorf("Something went wrong during full MySQL shutdown: %v", err) + } + // Start MySQL, waiting for it to come up + if err := mysqld.Start(ctx, mycnf); err != nil { + return fmt.Errorf("Could not start MySQL after full shutdown: %v", err) + } + deprecatedDurationByPhase.Set("RestartBeforeBackup", int64(time.Since(restartAt).Seconds())) + } + + // Now we can take a new backup. + backupAt := time.Now() + phase.Set(phaseNameTakeNewBackup, int64(1)) + defer phase.Set(phaseNameTakeNewBackup, int64(0)) + if err := mysqlctl.Backup(ctx, backupParams); err != nil { + return fmt.Errorf("error taking backup: %v", err) + } + deprecatedDurationByPhase.Set("TakeNewBackup", int64(time.Since(backupAt).Seconds())) + phase.Set(phaseNameTakeNewBackup, int64(0)) + + // Return a non-zero exit code if we didn't meet the replication position + // goal, even though we took a backup that pushes the high-water mark up. 
+ if !status.Position.AtLeast(primaryPos) { + return fmt.Errorf("replication caught up to %v but didn't make it to the goal of %v; a backup was taken anyway to save partial progress, but the operation should still be retried since not all expected data is backed up", status.Position, primaryPos) + } + log.Info("Backup successful.") + return nil +} + +func resetReplication(ctx context.Context, pos replication.Position, mysqld mysqlctl.MysqlDaemon) error { + cmds := []string{ + "STOP SLAVE", + "RESET SLAVE ALL", // "ALL" makes it forget replication source host:port. + } + if err := mysqld.ExecuteSuperQueryList(ctx, cmds); err != nil { + return vterrors.Wrap(err, "failed to reset replication") + } + + // Check if we have a position to resume from, if not reset to the beginning of time + if !pos.IsZero() { + // Set the position at which to resume from the replication source. + if err := mysqld.SetReplicationPosition(ctx, pos); err != nil { + return vterrors.Wrap(err, "failed to set replica position") + } + } else { + if err := mysqld.ResetReplication(ctx); err != nil { + return vterrors.Wrap(err, "failed to reset replication") + } + } + return nil +} + +func startReplication(ctx context.Context, mysqld mysqlctl.MysqlDaemon, topoServer *topo.Server) error { + si, err := topoServer.GetShard(ctx, initKeyspace, initShard) + if err != nil { + return vterrors.Wrap(err, "can't read shard") + } + if topoproto.TabletAliasIsZero(si.PrimaryAlias) { + // Normal tablets will sit around waiting to be reparented in this case. + // Since vtbackup is a batch job, we just have to fail. + return fmt.Errorf("can't start replication after restore: shard %v/%v has no primary", initKeyspace, initShard) + } + // TODO(enisoc): Support replicating from another replica, preferably in the + // same cell, preferably rdonly, to reduce load on the primary. 
+ ti, err := topoServer.GetTablet(ctx, si.PrimaryAlias) + if err != nil { + return vterrors.Wrapf(err, "Cannot read primary tablet %v", si.PrimaryAlias) + } + + // Stop replication (in case we're restarting), set replication source, and start replication. + if err := mysqld.SetReplicationSource(ctx, ti.Tablet.MysqlHostname, ti.Tablet.MysqlPort, true /* stopReplicationBefore */, true /* startReplicationAfter */); err != nil { + return vterrors.Wrap(err, "MysqlDaemon.SetReplicationSource failed") + } + return nil +} + +func getPrimaryPosition(ctx context.Context, tmc tmclient.TabletManagerClient, ts *topo.Server) (replication.Position, error) { + si, err := ts.GetShard(ctx, initKeyspace, initShard) + if err != nil { + return replication.Position{}, vterrors.Wrap(err, "can't read shard") + } + if topoproto.TabletAliasIsZero(si.PrimaryAlias) { + // Normal tablets will sit around waiting to be reparented in this case. + // Since vtbackup is a batch job, we just have to fail. + return replication.Position{}, fmt.Errorf("shard %v/%v has no primary", initKeyspace, initShard) + } + ti, err := ts.GetTablet(ctx, si.PrimaryAlias) + if err != nil { + return replication.Position{}, fmt.Errorf("can't get primary tablet record %v: %v", topoproto.TabletAliasString(si.PrimaryAlias), err) + } + posStr, err := tmc.PrimaryPosition(ctx, ti.Tablet) + if err != nil { + return replication.Position{}, fmt.Errorf("can't get primary replication position: %v", err) + } + pos, err := replication.DecodePosition(posStr) + if err != nil { + return replication.Position{}, fmt.Errorf("can't decode primary replication position %q: %v", posStr, err) + } + return pos, nil +} + +// retryOnError keeps calling the given function until it succeeds, or the given +// Context is done. It waits an exponentially increasing amount of time between +// retries to avoid hot-looping. The only time this returns an error is if the +// Context is cancelled. 
+func retryOnError(ctx context.Context, fn func() error) error { + waitTime := 1 * time.Second + + for { + err := fn() + if err == nil { + return nil + } + log.Errorf("Waiting %v to retry after error: %v", waitTime, err) + + select { + case <-ctx.Done(): + log.Errorf("Not retrying after error: %v", ctx.Err()) + return ctx.Err() + case <-time.After(waitTime): + waitTime *= 2 + } + } +} + +func pruneBackups(ctx context.Context, backupStorage backupstorage.BackupStorage, backupDir string) error { + if minRetentionTime == 0 { + log.Info("Pruning of old backups is disabled.") + return nil + } + backups, err := backupStorage.ListBackups(ctx, backupDir) + if err != nil { + return fmt.Errorf("can't list backups: %v", err) + } + numBackups := len(backups) + if numBackups <= minRetentionCount { + log.Infof("Found %v backups. Not pruning any since this is within the min_retention_count of %v.", numBackups, minRetentionCount) + return nil + } + // We have more than the minimum retention count, so we could afford to + // prune some. See if any are beyond the minimum retention time. + // ListBackups returns them sorted by oldest first. + for _, backup := range backups { + backupTime, err := parseBackupTime(backup.Name()) + if err != nil { + return err + } + if time.Since(backupTime) < minRetentionTime { + // The oldest remaining backup is not old enough to prune. + log.Infof("Oldest backup taken at %v has not reached min_retention_time of %v. Nothing left to prune.", backupTime, minRetentionTime) + break + } + // Remove the backup. + log.Infof("Removing old backup %v from %v, since it's older than min_retention_time of %v", backup.Name(), backupDir, minRetentionTime) + if err := backupStorage.RemoveBackup(ctx, backupDir, backup.Name()); err != nil { + return fmt.Errorf("couldn't remove backup %v from %v: %v", backup.Name(), backupDir, err) + } + // We successfully removed one backup. Can we afford to prune any more? 
+ numBackups-- + if numBackups == minRetentionCount { + log.Infof("Successfully pruned backup count to min_retention_count of %v.", minRetentionCount) + break + } + } + return nil +} + +func parseBackupTime(name string) (time.Time, error) { + // Backup names are formatted as "date.time.tablet-alias". + parts := strings.Split(name, ".") + if len(parts) != 3 { + return time.Time{}, fmt.Errorf("backup name not in expected format (date.time.tablet-alias): %v", name) + } + backupTime, err := time.Parse(mysqlctl.BackupTimestampFormat, fmt.Sprintf("%s.%s", parts[0], parts[1])) + if err != nil { + return time.Time{}, fmt.Errorf("can't parse timestamp from backup %q: %v", name, err) + } + return backupTime, nil +} + +func shouldBackup(ctx context.Context, topoServer *topo.Server, backupStorage backupstorage.BackupStorage, backupDir string) (bool, error) { + // Look for the most recent, complete backup. + backups, err := backupStorage.ListBackups(ctx, backupDir) + if err != nil { + return false, fmt.Errorf("can't list backups: %v", err) + } + lastBackup := lastCompleteBackup(ctx, backups) + + // Check preconditions for initial_backup mode. + if initialBackup { + // Check if any backups for the shard already exist in this backup storage location. + if lastBackup != nil { + log.Infof("At least one complete backup already exists, so there's no need to seed an empty backup. Doing nothing.") + return false, nil + } + + // Check whether the shard exists. + _, shardErr := topoServer.GetShard(ctx, initKeyspace, initShard) + switch { + case shardErr == nil: + // If the shard exists, we should make sure none of the tablets are + // already in a serving state, because then they might have data + // that conflicts with the initial backup we're about to take. + tablets, err := topoServer.GetTabletMapForShard(ctx, initKeyspace, initShard) + if err != nil { + // We don't know for sure whether any tablets are serving, + // so it's not safe to continue. 
+ return false, fmt.Errorf("failed to check whether shard %v/%v has serving tablets before doing initial backup: %v", initKeyspace, initShard, err) + } + for tabletAlias, tablet := range tablets { + // Check if any tablet has its type set to one of the serving types. + // If so, it's too late to do an initial backup. + if tablet.IsInServingGraph() { + return false, fmt.Errorf("refusing to upload initial backup of empty database: the shard %v/%v already has at least one tablet that may be serving (%v); you must take a backup from a live tablet instead", initKeyspace, initShard, tabletAlias) + } + } + log.Infof("Shard %v/%v exists but has no serving tablets.", initKeyspace, initShard) + case topo.IsErrType(shardErr, topo.NoNode): + // The shard doesn't exist, so we know no tablets are running. + log.Infof("Shard %v/%v doesn't exist; assuming it has no serving tablets.", initKeyspace, initShard) + default: + // If we encounter any other error, we don't know for sure whether + // the shard exists, so it's not safe to continue. + return false, fmt.Errorf("failed to check whether shard %v/%v exists before doing initial backup: %v", initKeyspace, initShard, err) + } + + log.Infof("Shard %v/%v has no existing backups. Creating initial backup.", initKeyspace, initShard) + return true, nil + } + + // We need at least one backup so we can restore first, unless the user explicitly says we don't + if len(backups) == 0 && !allowFirstBackup { + return false, fmt.Errorf("no existing backups to restore from; backup is not possible since --initial_backup flag was not enabled") + } + if lastBackup == nil { + if allowFirstBackup { + // There's no complete backup, but we were told to take one from scratch anyway. + return true, nil + } + return false, fmt.Errorf("no complete backups to restore from; backup is not possible since --initial_backup flag was not enabled") + } + + // Has it been long enough since the last complete backup to need a new one? 
+ if minBackupInterval == 0 { + // No minimum interval is set, so always backup. + return true, nil + } + lastBackupTime, err := parseBackupTime(lastBackup.Name()) + if err != nil { + return false, fmt.Errorf("can't check last backup time: %v", err) + } + if elapsedTime := time.Since(lastBackupTime); elapsedTime < minBackupInterval { + // It hasn't been long enough yet. + log.Infof("Skipping backup since only %v has elapsed since the last backup at %v, which is less than the min_backup_interval of %v.", elapsedTime, lastBackupTime, minBackupInterval) + return false, nil + } + // It has been long enough. + log.Infof("The last backup was taken at %v, which is older than the min_backup_interval of %v.", lastBackupTime, minBackupInterval) + return true, nil +} + +func lastCompleteBackup(ctx context.Context, backups []backupstorage.BackupHandle) backupstorage.BackupHandle { + if len(backups) == 0 { + return nil + } + + // Backups are sorted in ascending order by start time. Start at the end. + for i := len(backups) - 1; i >= 0; i-- { + // Check if this backup is complete by looking for the MANIFEST file, + // which is written at the end after all files are uploaded. + backup := backups[i] + if err := checkBackupComplete(ctx, backup); err != nil { + log.Warningf("Ignoring backup %v because it's incomplete: %v", backup.Name(), err) + continue + } + return backup + } + + return nil +} + +func checkBackupComplete(ctx context.Context, backup backupstorage.BackupHandle) error { + manifest, err := mysqlctl.GetBackupManifest(ctx, backup) + if err != nil { + return fmt.Errorf("can't get backup MANIFEST: %v", err) + } + + log.Infof("Found complete backup %v taken at position %v", backup.Name(), manifest.Position.String()) + return nil +} diff --git a/go/cmd/vtbackup/docgen/main.go b/go/cmd/vtbackup/docgen/main.go new file mode 100644 index 00000000000..90aa90ffa98 --- /dev/null +++ b/go/cmd/vtbackup/docgen/main.go @@ -0,0 +1,37 @@ +/* +Copyright 2023 The Vitess Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "github.com/spf13/cobra" + + "vitess.io/vitess/go/cmd/internal/docgen" + "vitess.io/vitess/go/cmd/vtbackup/cli" +) + +func main() { + var dir string + cmd := cobra.Command{ + Use: "docgen [-d ]", + RunE: func(cmd *cobra.Command, args []string) error { + return docgen.GenerateMarkdownTree(cli.Main, dir) + }, + } + + cmd.Flags().StringVarP(&dir, "dir", "d", "doc", "output directory to write documentation") + _ = cmd.Execute() +} diff --git a/go/cmd/vtbackup/vtbackup.go b/go/cmd/vtbackup/vtbackup.go index ebf83526cad..37dcadc9b19 100644 --- a/go/cmd/vtbackup/vtbackup.go +++ b/go/cmd/vtbackup/vtbackup.go @@ -14,851 +14,19 @@ See the License for the specific language governing permissions and limitations under the License. */ -/* -vtbackup is a batch command to perform a single pass of backup maintenance for a shard. - -When run periodically for each shard, vtbackup can ensure these configurable policies: -* There is always a recent backup for the shard. -* Old backups for the shard are removed. - -Whatever system launches vtbackup is responsible for the following: - - Running vtbackup with similar flags that would be used for a vttablet and - mysqlctld in the target shard to be backed up. - - Provisioning as much disk space for vtbackup as would be given to vttablet. - The data directory MUST be empty at startup. Do NOT reuse a persistent disk. 
- - Running vtbackup periodically for each shard, for each backup storage location. - - Ensuring that at most one instance runs at a time for a given pair of shard - and backup storage location. - - Retrying vtbackup if it fails. - - Alerting human operators if the failure is persistent. - -The process vtbackup follows to take a new backup is as follows: - 1. Restore from the most recent backup. - 2. Start a mysqld instance (but no vttablet) from the restored data. - 3. Instruct mysqld to connect to the current shard primary and replicate any - transactions that are new since the last backup. - 4. Ask the primary for its current replication position and set that as the goal - for catching up on replication before taking the backup, so the goalposts - don't move. - 5. Wait until replication is caught up to the goal position or beyond. - 6. Stop mysqld and take a new backup. - -Aside from additional replication load while vtbackup's mysqld catches up on -new transactions, the shard should be otherwise unaffected. Existing tablets -will continue to serve, and no new tablets will appear in topology, meaning no -query traffic will ever be routed to vtbackup's mysqld. This silent operation -mode helps make backups minimally disruptive to serving capacity and orthogonal -to the handling of the query path. - -The command-line parameters to vtbackup specify a policy for when a new backup -is needed, and when old backups should be removed. If the existing backups -already satisfy the policy, then vtbackup will do nothing and return success -immediately. 
-*/ package main import ( - "context" - "crypto/rand" - "fmt" - "math" - "math/big" - "os" - "strings" - "syscall" - "time" - - "github.com/spf13/pflag" - - "vitess.io/vitess/go/mysql/replication" - - "vitess.io/vitess/go/acl" - "vitess.io/vitess/go/cmd" + "vitess.io/vitess/go/cmd/vtbackup/cli" "vitess.io/vitess/go/exit" - "vitess.io/vitess/go/stats" - "vitess.io/vitess/go/vt/dbconfigs" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/logutil" - "vitess.io/vitess/go/vt/mysqlctl" - "vitess.io/vitess/go/vt/mysqlctl/backupstats" - "vitess.io/vitess/go/vt/mysqlctl/backupstorage" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/topo/topoproto" - "vitess.io/vitess/go/vt/vterrors" - _ "vitess.io/vitess/go/vt/vttablet/grpctmclient" - "vitess.io/vitess/go/vt/vttablet/tmclient" -) - -const ( - // operationTimeout is the timeout for individual operations like fetching - // the primary position. This does not impose an overall timeout on - // long-running processes like taking the backup. It only applies to - // steps along the way that should complete quickly. This ensures we don't - // place a hard cap on the overall time for a backup, while also not waiting - // forever for things that should be quick. 
- operationTimeout = 1 * time.Minute - - phaseNameCatchupReplication = "CatchupReplication" - phaseNameInitialBackup = "InitialBackup" - phaseNameRestoreLastBackup = "RestoreLastBackup" - phaseNameTakeNewBackup = "TakeNewBackup" - phaseStatusCatchupReplicationStalled = "Stalled" - phaseStatusCatchupReplicationStopped = "Stopped" -) - -var ( - minBackupInterval time.Duration - minRetentionTime time.Duration - minRetentionCount = 1 - initialBackup bool - allowFirstBackup bool - restartBeforeBackup bool - upgradeSafe bool - // vttablet-like flags - initDbNameOverride string - initKeyspace string - initShard string - concurrency = 4 - incrementalFromPos string - // mysqlctld-like flags - mysqlPort = 3306 - mysqlSocket string - mysqlTimeout = 5 * time.Minute - initDBSQLFile string - detachedMode bool - keepAliveTimeout = 0 * time.Second - disableRedoLog = false - - // Deprecated, use "Phase" instead. - deprecatedDurationByPhase = stats.NewGaugesWithSingleLabel( - "DurationByPhaseSeconds", - "[DEPRECATED] How long it took vtbackup to perform each phase (in seconds).", - "phase", - ) - - // This gauge is updated 3*N times during the course of a vtbackup run, - // where N is the number of different phases vtbackup transitions through. - // Once to initialize to 0, another time to set the phase to active (1), - // and another to deactivate the phase (back to 0). - // - // At most a single phase is active at a given time. - // - // The sync gauge immediately reports changes to push-backed backends. - // The benefit of the sync gauge is that it makes verifying stats in - // integration tests a lot more tractable. 
- phase = stats.NewSyncGaugesWithSingleLabel( - "Phase", - "Active phase.", - "phase", - ) - phaseNames = []string{ - phaseNameCatchupReplication, - phaseNameInitialBackup, - phaseNameRestoreLastBackup, - phaseNameTakeNewBackup, - } - phaseStatus = stats.NewGaugesWithMultiLabels( - "PhaseStatus", - "Internal state of vtbackup phase.", - []string{"phase", "status"}, - ) - phaseStatuses = map[string][]string{ - phaseNameCatchupReplication: { - phaseStatusCatchupReplicationStalled, - phaseStatusCatchupReplicationStopped, - }, - } ) -func registerFlags(fs *pflag.FlagSet) { - fs.DurationVar(&minBackupInterval, "min_backup_interval", minBackupInterval, "Only take a new backup if it's been at least this long since the most recent backup.") - fs.DurationVar(&minRetentionTime, "min_retention_time", minRetentionTime, "Keep each old backup for at least this long before removing it. Set to 0 to disable pruning of old backups.") - fs.IntVar(&minRetentionCount, "min_retention_count", minRetentionCount, "Always keep at least this many of the most recent backups in this backup storage location, even if some are older than the min_retention_time. This must be at least 1 since a backup must always exist to allow new backups to be made") - fs.BoolVar(&initialBackup, "initial_backup", initialBackup, "Instead of restoring from backup, initialize an empty database with the provided init_db_sql_file and upload a backup of that for the shard, if the shard has no backups yet. This can be used to seed a brand new shard with an initial, empty backup. If any backups already exist for the shard, this will be considered a successful no-op. This can only be done before the shard exists in topology (i.e. 
before any tablets are deployed).") - fs.BoolVar(&allowFirstBackup, "allow_first_backup", allowFirstBackup, "Allow this job to take the first backup of an existing shard.") - fs.BoolVar(&restartBeforeBackup, "restart_before_backup", restartBeforeBackup, "Perform a mysqld clean/full restart after applying binlogs, but before taking the backup. Only makes sense to work around xtrabackup bugs.") - fs.BoolVar(&upgradeSafe, "upgrade-safe", upgradeSafe, "Whether to use innodb_fast_shutdown=0 for the backup so it is safe to use for MySQL upgrades.") - // vttablet-like flags - fs.StringVar(&initDbNameOverride, "init_db_name_override", initDbNameOverride, "(init parameter) override the name of the db used by vttablet") - fs.StringVar(&initKeyspace, "init_keyspace", initKeyspace, "(init parameter) keyspace to use for this tablet") - fs.StringVar(&initShard, "init_shard", initShard, "(init parameter) shard to use for this tablet") - fs.IntVar(&concurrency, "concurrency", concurrency, "(init restore parameter) how many concurrent files to restore at once") - fs.StringVar(&incrementalFromPos, "incremental_from_pos", incrementalFromPos, "Position of previous backup. Default: empty. If given, then this backup becomes an incremental backup from given position. 
If value is 'auto', backup taken from last successful backup position") - // mysqlctld-like flags - fs.IntVar(&mysqlPort, "mysql_port", mysqlPort, "mysql port") - fs.StringVar(&mysqlSocket, "mysql_socket", mysqlSocket, "path to the mysql socket") - fs.DurationVar(&mysqlTimeout, "mysql_timeout", mysqlTimeout, "how long to wait for mysqld startup") - fs.StringVar(&initDBSQLFile, "init_db_sql_file", initDBSQLFile, "path to .sql file to run after mysql_install_db") - fs.BoolVar(&detachedMode, "detach", detachedMode, "detached mode - run backups detached from the terminal") - fs.DurationVar(&keepAliveTimeout, "keep-alive-timeout", keepAliveTimeout, "Wait until timeout elapses after a successful backup before shutting down.") - fs.BoolVar(&disableRedoLog, "disable-redo-log", disableRedoLog, "Disable InnoDB redo log during replication-from-primary phase of backup.") - - acl.RegisterFlags(fs) -} - -func init() { - servenv.RegisterDefaultFlags() - dbconfigs.RegisterFlags(dbconfigs.All...) - mysqlctl.RegisterFlags() - servenv.OnParse(registerFlags) -} - func main() { defer exit.Recover() - servenv.ParseFlags("vtbackup") - servenv.Init() - ctx, cancel := context.WithCancel(context.Background()) - servenv.OnClose(func() { - cancel() - }) - - defer func() { - servenv.ExitChan <- syscall.SIGTERM - <-ctx.Done() - }() - - go servenv.RunDefault() - - if detachedMode { - // this method will call os.Exit and kill this process - cmd.DetachFromTerminalAndExit() - } - - defer logutil.Flush() - - if minRetentionCount < 1 { - log.Errorf("min_retention_count must be at least 1 to allow restores to succeed") + if err := cli.Main.Execute(); err != nil { + log.Error(err) exit.Return(1) } - - // Open connection backup storage. - backupStorage, err := backupstorage.GetBackupStorage() - if err != nil { - log.Errorf("Can't get backup storage: %v", err) - exit.Return(1) - } - defer backupStorage.Close() - // Open connection to topology server. 
- topoServer := topo.Open() - defer topoServer.Close() - - // Initialize stats. - for _, phaseName := range phaseNames { - phase.Set(phaseName, int64(0)) - } - for phaseName, statuses := range phaseStatuses { - for _, status := range statuses { - phaseStatus.Set([]string{phaseName, status}, 0) - } - } - - // Try to take a backup, if it's been long enough since the last one. - // Skip pruning if backup wasn't fully successful. We don't want to be - // deleting things if the backup process is not healthy. - backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard) - doBackup, err := shouldBackup(ctx, topoServer, backupStorage, backupDir) - if err != nil { - log.Errorf("Can't take backup: %v", err) - exit.Return(1) - } - if doBackup { - if err := takeBackup(ctx, topoServer, backupStorage); err != nil { - log.Errorf("Failed to take backup: %v", err) - exit.Return(1) - } - } - - // Prune old backups. - if err := pruneBackups(ctx, backupStorage, backupDir); err != nil { - log.Errorf("Couldn't prune old backups: %v", err) - exit.Return(1) - } - - if keepAliveTimeout > 0 { - log.Infof("Backup was successful, waiting %s before exiting (or until context expires).", keepAliveTimeout) - select { - case <-time.After(keepAliveTimeout): - case <-ctx.Done(): - } - } - log.Info("Exiting.") -} - -func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage backupstorage.BackupStorage) error { - // This is an imaginary tablet alias. The value doesn't matter for anything, - // except that we generate a random UID to ensure the target backup - // directory is unique if multiple vtbackup instances are launched for the - // same shard, at exactly the same second, pointed at the same backup - // storage location. 
- bigN, err := rand.Int(rand.Reader, big.NewInt(math.MaxUint32)) - if err != nil { - return fmt.Errorf("can't generate random tablet UID: %v", err) - } - tabletAlias := &topodatapb.TabletAlias{ - Cell: "vtbackup", - Uid: uint32(bigN.Uint64()), - } - - // Clean up our temporary data dir if we exit for any reason, to make sure - // every invocation of vtbackup starts with a clean slate, and it does not - // accumulate garbage (and run out of disk space) if it's restarted. - tabletDir := mysqlctl.TabletDir(tabletAlias.Uid) - defer func() { - log.Infof("Removing temporary tablet directory: %v", tabletDir) - if err := os.RemoveAll(tabletDir); err != nil { - log.Warningf("Failed to remove temporary tablet directory: %v", err) - } - }() - - // Start up mysqld as if we are mysqlctld provisioning a fresh tablet. - mysqld, mycnf, err := mysqlctl.CreateMysqldAndMycnf(tabletAlias.Uid, mysqlSocket, mysqlPort) - if err != nil { - return fmt.Errorf("failed to initialize mysql config: %v", err) - } - initCtx, initCancel := context.WithTimeout(ctx, mysqlTimeout) - defer initCancel() - initMysqldAt := time.Now() - if err := mysqld.Init(initCtx, mycnf, initDBSQLFile); err != nil { - return fmt.Errorf("failed to initialize mysql data dir and start mysqld: %v", err) - } - deprecatedDurationByPhase.Set("InitMySQLd", int64(time.Since(initMysqldAt).Seconds())) - // Shut down mysqld when we're done. - defer func() { - // Be careful not to use the original context, because we don't want to - // skip shutdown just because we timed out waiting for other things. 
- mysqlShutdownCtx, mysqlShutdownCancel := context.WithTimeout(context.Background(), 30*time.Second) - defer mysqlShutdownCancel() - if err := mysqld.Shutdown(mysqlShutdownCtx, mycnf, false); err != nil { - log.Errorf("failed to shutdown mysqld: %v", err) - } - }() - - extraEnv := map[string]string{ - "TABLET_ALIAS": topoproto.TabletAliasString(tabletAlias), - } - dbName := initDbNameOverride - if dbName == "" { - dbName = fmt.Sprintf("vt_%s", initKeyspace) - } - - backupParams := mysqlctl.BackupParams{ - Cnf: mycnf, - Mysqld: mysqld, - Logger: logutil.NewConsoleLogger(), - Concurrency: concurrency, - IncrementalFromPos: incrementalFromPos, - HookExtraEnv: extraEnv, - TopoServer: topoServer, - Keyspace: initKeyspace, - Shard: initShard, - TabletAlias: topoproto.TabletAliasString(tabletAlias), - Stats: backupstats.BackupStats(), - UpgradeSafe: upgradeSafe, - } - // In initial_backup mode, just take a backup of this empty database. - if initialBackup { - // Take a backup of this empty DB without restoring anything. - // First, initialize it the way InitShardPrimary would, so this backup - // produces a result that can be used to skip InitShardPrimary entirely. - // This involves resetting replication (to erase any history) and then - // creating the main database and some Vitess system tables. - if err := mysqld.ResetReplication(ctx); err != nil { - return fmt.Errorf("can't reset replication: %v", err) - } - // We need to switch off super_read_only before we create the database. 
- resetFunc, err := mysqld.SetSuperReadOnly(false) - if err != nil { - return fmt.Errorf("failed to disable super_read_only during backup: %v", err) - } - if resetFunc != nil { - defer func() { - err := resetFunc() - if err != nil { - log.Error("Failed to set super_read_only back to its original value during backup") - } - }() - } - cmd := mysqlctl.GenerateInitialBinlogEntry() - if err := mysqld.ExecuteSuperQueryList(ctx, []string{cmd}); err != nil { - return err - } - - backupParams.BackupTime = time.Now() - // Now we're ready to take the backup. - phase.Set(phaseNameInitialBackup, int64(1)) - defer phase.Set(phaseNameInitialBackup, int64(0)) - if err := mysqlctl.Backup(ctx, backupParams); err != nil { - return fmt.Errorf("backup failed: %v", err) - } - deprecatedDurationByPhase.Set("InitialBackup", int64(time.Since(backupParams.BackupTime).Seconds())) - log.Info("Initial backup successful.") - phase.Set(phaseNameInitialBackup, int64(0)) - return nil - } - - phase.Set(phaseNameRestoreLastBackup, int64(1)) - defer phase.Set(phaseNameRestoreLastBackup, int64(0)) - backupDir := mysqlctl.GetBackupDir(initKeyspace, initShard) - log.Infof("Restoring latest backup from directory %v", backupDir) - restoreAt := time.Now() - params := mysqlctl.RestoreParams{ - Cnf: mycnf, - Mysqld: mysqld, - Logger: logutil.NewConsoleLogger(), - Concurrency: concurrency, - HookExtraEnv: extraEnv, - DeleteBeforeRestore: true, - DbName: dbName, - Keyspace: initKeyspace, - Shard: initShard, - Stats: backupstats.RestoreStats(), - } - backupManifest, err := mysqlctl.Restore(ctx, params) - var restorePos replication.Position - switch err { - case nil: - // if err is nil, we expect backupManifest to be non-nil - restorePos = backupManifest.Position - log.Infof("Successfully restored from backup at replication position %v", restorePos) - case mysqlctl.ErrNoBackup: - // There is no backup found, but we may be taking the initial backup of a shard - if !allowFirstBackup { - return fmt.Errorf("no 
backup found; not starting up empty since --initial_backup flag was not enabled") - } - restorePos = replication.Position{} - default: - return fmt.Errorf("can't restore from backup: %v", err) - } - deprecatedDurationByPhase.Set("RestoreLastBackup", int64(time.Since(restoreAt).Seconds())) - phase.Set(phaseNameRestoreLastBackup, int64(0)) - - // As of MySQL 8.0.21, you can disable redo logging using the ALTER INSTANCE - // DISABLE INNODB REDO_LOG statement. This functionality is intended for - // loading data into a new MySQL instance. Disabling redo logging speeds up - // data loading by avoiding redo log writes and doublewrite buffering. - disabledRedoLog := false - if disableRedoLog { - if err := mysqld.DisableRedoLog(ctx); err != nil { - log.Warningf("Error disabling redo logging: %v", err) - } else { - disabledRedoLog = true - } - } - - // We have restored a backup. Now start replication. - if err := resetReplication(ctx, restorePos, mysqld); err != nil { - return fmt.Errorf("error resetting replication: %v", err) - } - if err := startReplication(ctx, mysqld, topoServer); err != nil { - return fmt.Errorf("error starting replication: %v", err) - } - - log.Info("get the current primary replication position, and wait until we catch up") - // Get the current primary replication position, and wait until we catch up - // to that point. We do this instead of looking at ReplicationLag - // because that value can - // sometimes lie and tell you there's 0 lag when actually replication is - // stopped. Also, if replication is making progress but is too slow to ever - // catch up to live changes, we'd rather take a backup of something rather - // than timing out. - tmc := tmclient.NewTabletManagerClient() - // Keep retrying if we can't contact the primary. The primary might be - // changing, moving, or down temporarily. 
- var primaryPos replication.Position - err = retryOnError(ctx, func() error { - // Add a per-operation timeout so we re-read topo if the primary is unreachable. - opCtx, optCancel := context.WithTimeout(ctx, operationTimeout) - defer optCancel() - pos, err := getPrimaryPosition(opCtx, tmc, topoServer) - if err != nil { - return fmt.Errorf("can't get the primary replication position: %v", err) - } - primaryPos = pos - return nil - }) - if err != nil { - return fmt.Errorf("can't get the primary replication position after all retries: %v", err) - } - - log.Infof("takeBackup: primary position is: %s", primaryPos.String()) - - // Remember the time when we fetched the primary position, not when we caught - // up to it, so the timestamp on our backup is honest (assuming we make it - // to the goal position). - backupParams.BackupTime = time.Now() - - // Wait for replication to catch up. - phase.Set(phaseNameCatchupReplication, int64(1)) - defer phase.Set(phaseNameCatchupReplication, int64(0)) - - var ( - lastStatus replication.ReplicationStatus - status replication.ReplicationStatus - statusErr error - - waitStartTime = time.Now() - ) - for { - select { - case <-ctx.Done(): - return fmt.Errorf("error in replication catch up: %v", ctx.Err()) - case <-time.After(time.Second): - } - - lastStatus = status - status, statusErr = mysqld.ReplicationStatus() - if statusErr != nil { - log.Warningf("Error getting replication status: %v", statusErr) - continue - } - if status.Position.AtLeast(primaryPos) { - // We're caught up on replication to at least the point the primary - // was at when this vtbackup run started. 
- log.Infof("Replication caught up to %v after %v", status.Position, time.Since(waitStartTime)) - deprecatedDurationByPhase.Set("CatchUpReplication", int64(time.Since(waitStartTime).Seconds())) - break - } - if !lastStatus.Position.IsZero() { - if status.Position.Equal(lastStatus.Position) { - phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStalled}, 1) - } else { - phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStalled}, 0) - } - } - if !status.Healthy() { - log.Warning("Replication has stopped before backup could be taken. Trying to restart replication.") - phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStopped}, 1) - if err := startReplication(ctx, mysqld, topoServer); err != nil { - log.Warningf("Failed to restart replication: %v", err) - } - } else { - phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStopped}, 0) - } - } - phase.Set(phaseNameCatchupReplication, int64(0)) - - // Stop replication and see where we are. - if err := mysqld.StopReplication(nil); err != nil { - return fmt.Errorf("can't stop replication: %v", err) - } - - // Did we make any progress? - status, statusErr = mysqld.ReplicationStatus() - if statusErr != nil { - return fmt.Errorf("can't get replication status: %v", err) - } - log.Infof("Replication caught up to %v", status.Position) - if !status.Position.AtLeast(primaryPos) && status.Position.Equal(restorePos) { - return fmt.Errorf("not taking backup: replication did not make any progress from restore point: %v", restorePos) - } - phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStalled}, 0) - phaseStatus.Set([]string{phaseNameCatchupReplication, phaseStatusCatchupReplicationStopped}, 0) - - // Re-enable redo logging. 
- if disabledRedoLog { - if err := mysqld.EnableRedoLog(ctx); err != nil { - return fmt.Errorf("failed to re-enable redo log: %v", err) - } - } - - if restartBeforeBackup { - restartAt := time.Now() - log.Info("Proceeding with clean MySQL shutdown and startup to flush all buffers.") - // Prep for full/clean shutdown (not typically the default) - if err := mysqld.ExecuteSuperQuery(ctx, "SET GLOBAL innodb_fast_shutdown=0"); err != nil { - return fmt.Errorf("Could not prep for full shutdown: %v", err) - } - // Shutdown, waiting for it to finish - if err := mysqld.Shutdown(ctx, mycnf, true); err != nil { - return fmt.Errorf("Something went wrong during full MySQL shutdown: %v", err) - } - // Start MySQL, waiting for it to come up - if err := mysqld.Start(ctx, mycnf); err != nil { - return fmt.Errorf("Could not start MySQL after full shutdown: %v", err) - } - deprecatedDurationByPhase.Set("RestartBeforeBackup", int64(time.Since(restartAt).Seconds())) - } - - // Now we can take a new backup. - backupAt := time.Now() - phase.Set(phaseNameTakeNewBackup, int64(1)) - defer phase.Set(phaseNameTakeNewBackup, int64(0)) - if err := mysqlctl.Backup(ctx, backupParams); err != nil { - return fmt.Errorf("error taking backup: %v", err) - } - deprecatedDurationByPhase.Set("TakeNewBackup", int64(time.Since(backupAt).Seconds())) - phase.Set(phaseNameTakeNewBackup, int64(0)) - - // Return a non-zero exit code if we didn't meet the replication position - // goal, even though we took a backup that pushes the high-water mark up. 
- if !status.Position.AtLeast(primaryPos) { - return fmt.Errorf("replication caught up to %v but didn't make it to the goal of %v; a backup was taken anyway to save partial progress, but the operation should still be retried since not all expected data is backed up", status.Position, primaryPos) - } - log.Info("Backup successful.") - return nil -} - -func resetReplication(ctx context.Context, pos replication.Position, mysqld mysqlctl.MysqlDaemon) error { - cmds := []string{ - "STOP SLAVE", - "RESET SLAVE ALL", // "ALL" makes it forget replication source host:port. - } - if err := mysqld.ExecuteSuperQueryList(ctx, cmds); err != nil { - return vterrors.Wrap(err, "failed to reset replication") - } - - // Check if we have a position to resume from, if not reset to the beginning of time - if !pos.IsZero() { - // Set the position at which to resume from the replication source. - if err := mysqld.SetReplicationPosition(ctx, pos); err != nil { - return vterrors.Wrap(err, "failed to set replica position") - } - } else { - if err := mysqld.ResetReplication(ctx); err != nil { - return vterrors.Wrap(err, "failed to reset replication") - } - } - return nil -} - -func startReplication(ctx context.Context, mysqld mysqlctl.MysqlDaemon, topoServer *topo.Server) error { - si, err := topoServer.GetShard(ctx, initKeyspace, initShard) - if err != nil { - return vterrors.Wrap(err, "can't read shard") - } - if topoproto.TabletAliasIsZero(si.PrimaryAlias) { - // Normal tablets will sit around waiting to be reparented in this case. - // Since vtbackup is a batch job, we just have to fail. - return fmt.Errorf("can't start replication after restore: shard %v/%v has no primary", initKeyspace, initShard) - } - // TODO(enisoc): Support replicating from another replica, preferably in the - // same cell, preferably rdonly, to reduce load on the primary. 
- ti, err := topoServer.GetTablet(ctx, si.PrimaryAlias) - if err != nil { - return vterrors.Wrapf(err, "Cannot read primary tablet %v", si.PrimaryAlias) - } - - // Stop replication (in case we're restarting), set replication source, and start replication. - if err := mysqld.SetReplicationSource(ctx, ti.Tablet.MysqlHostname, ti.Tablet.MysqlPort, true /* stopReplicationBefore */, true /* startReplicationAfter */); err != nil { - return vterrors.Wrap(err, "MysqlDaemon.SetReplicationSource failed") - } - return nil -} - -func getPrimaryPosition(ctx context.Context, tmc tmclient.TabletManagerClient, ts *topo.Server) (replication.Position, error) { - si, err := ts.GetShard(ctx, initKeyspace, initShard) - if err != nil { - return replication.Position{}, vterrors.Wrap(err, "can't read shard") - } - if topoproto.TabletAliasIsZero(si.PrimaryAlias) { - // Normal tablets will sit around waiting to be reparented in this case. - // Since vtbackup is a batch job, we just have to fail. - return replication.Position{}, fmt.Errorf("shard %v/%v has no primary", initKeyspace, initShard) - } - ti, err := ts.GetTablet(ctx, si.PrimaryAlias) - if err != nil { - return replication.Position{}, fmt.Errorf("can't get primary tablet record %v: %v", topoproto.TabletAliasString(si.PrimaryAlias), err) - } - posStr, err := tmc.PrimaryPosition(ctx, ti.Tablet) - if err != nil { - return replication.Position{}, fmt.Errorf("can't get primary replication position: %v", err) - } - pos, err := replication.DecodePosition(posStr) - if err != nil { - return replication.Position{}, fmt.Errorf("can't decode primary replication position %q: %v", posStr, err) - } - return pos, nil -} - -// retryOnError keeps calling the given function until it succeeds, or the given -// Context is done. It waits an exponentially increasing amount of time between -// retries to avoid hot-looping. The only time this returns an error is if the -// Context is cancelled. 
-func retryOnError(ctx context.Context, fn func() error) error { - waitTime := 1 * time.Second - - for { - err := fn() - if err == nil { - return nil - } - log.Errorf("Waiting %v to retry after error: %v", waitTime, err) - - select { - case <-ctx.Done(): - log.Errorf("Not retrying after error: %v", ctx.Err()) - return ctx.Err() - case <-time.After(waitTime): - waitTime *= 2 - } - } -} - -func pruneBackups(ctx context.Context, backupStorage backupstorage.BackupStorage, backupDir string) error { - if minRetentionTime == 0 { - log.Info("Pruning of old backups is disabled.") - return nil - } - backups, err := backupStorage.ListBackups(ctx, backupDir) - if err != nil { - return fmt.Errorf("can't list backups: %v", err) - } - numBackups := len(backups) - if numBackups <= minRetentionCount { - log.Infof("Found %v backups. Not pruning any since this is within the min_retention_count of %v.", numBackups, minRetentionCount) - return nil - } - // We have more than the minimum retention count, so we could afford to - // prune some. See if any are beyond the minimum retention time. - // ListBackups returns them sorted by oldest first. - for _, backup := range backups { - backupTime, err := parseBackupTime(backup.Name()) - if err != nil { - return err - } - if time.Since(backupTime) < minRetentionTime { - // The oldest remaining backup is not old enough to prune. - log.Infof("Oldest backup taken at %v has not reached min_retention_time of %v. Nothing left to prune.", backupTime, minRetentionTime) - break - } - // Remove the backup. - log.Infof("Removing old backup %v from %v, since it's older than min_retention_time of %v", backup.Name(), backupDir, minRetentionTime) - if err := backupStorage.RemoveBackup(ctx, backupDir, backup.Name()); err != nil { - return fmt.Errorf("couldn't remove backup %v from %v: %v", backup.Name(), backupDir, err) - } - // We successfully removed one backup. Can we afford to prune any more? 
- numBackups-- - if numBackups == minRetentionCount { - log.Infof("Successfully pruned backup count to min_retention_count of %v.", minRetentionCount) - break - } - } - return nil -} - -func parseBackupTime(name string) (time.Time, error) { - // Backup names are formatted as "date.time.tablet-alias". - parts := strings.Split(name, ".") - if len(parts) != 3 { - return time.Time{}, fmt.Errorf("backup name not in expected format (date.time.tablet-alias): %v", name) - } - backupTime, err := time.Parse(mysqlctl.BackupTimestampFormat, fmt.Sprintf("%s.%s", parts[0], parts[1])) - if err != nil { - return time.Time{}, fmt.Errorf("can't parse timestamp from backup %q: %v", name, err) - } - return backupTime, nil -} - -func shouldBackup(ctx context.Context, topoServer *topo.Server, backupStorage backupstorage.BackupStorage, backupDir string) (bool, error) { - // Look for the most recent, complete backup. - backups, err := backupStorage.ListBackups(ctx, backupDir) - if err != nil { - return false, fmt.Errorf("can't list backups: %v", err) - } - lastBackup := lastCompleteBackup(ctx, backups) - - // Check preconditions for initial_backup mode. - if initialBackup { - // Check if any backups for the shard already exist in this backup storage location. - if lastBackup != nil { - log.Infof("At least one complete backup already exists, so there's no need to seed an empty backup. Doing nothing.") - return false, nil - } - - // Check whether the shard exists. - _, shardErr := topoServer.GetShard(ctx, initKeyspace, initShard) - switch { - case shardErr == nil: - // If the shard exists, we should make sure none of the tablets are - // already in a serving state, because then they might have data - // that conflicts with the initial backup we're about to take. - tablets, err := topoServer.GetTabletMapForShard(ctx, initKeyspace, initShard) - if err != nil { - // We don't know for sure whether any tablets are serving, - // so it's not safe to continue. 
- return false, fmt.Errorf("failed to check whether shard %v/%v has serving tablets before doing initial backup: %v", initKeyspace, initShard, err) - } - for tabletAlias, tablet := range tablets { - // Check if any tablet has its type set to one of the serving types. - // If so, it's too late to do an initial backup. - if tablet.IsInServingGraph() { - return false, fmt.Errorf("refusing to upload initial backup of empty database: the shard %v/%v already has at least one tablet that may be serving (%v); you must take a backup from a live tablet instead", initKeyspace, initShard, tabletAlias) - } - } - log.Infof("Shard %v/%v exists but has no serving tablets.", initKeyspace, initShard) - case topo.IsErrType(shardErr, topo.NoNode): - // The shard doesn't exist, so we know no tablets are running. - log.Infof("Shard %v/%v doesn't exist; assuming it has no serving tablets.", initKeyspace, initShard) - default: - // If we encounter any other error, we don't know for sure whether - // the shard exists, so it's not safe to continue. - return false, fmt.Errorf("failed to check whether shard %v/%v exists before doing initial backup: %v", initKeyspace, initShard, err) - } - - log.Infof("Shard %v/%v has no existing backups. Creating initial backup.", initKeyspace, initShard) - return true, nil - } - - // We need at least one backup so we can restore first, unless the user explicitly says we don't - if len(backups) == 0 && !allowFirstBackup { - return false, fmt.Errorf("no existing backups to restore from; backup is not possible since --initial_backup flag was not enabled") - } - if lastBackup == nil { - if allowFirstBackup { - // There's no complete backup, but we were told to take one from scratch anyway. - return true, nil - } - return false, fmt.Errorf("no complete backups to restore from; backup is not possible since --initial_backup flag was not enabled") - } - - // Has it been long enough since the last complete backup to need a new one? 
- if minBackupInterval == 0 { - // No minimum interval is set, so always backup. - return true, nil - } - lastBackupTime, err := parseBackupTime(lastBackup.Name()) - if err != nil { - return false, fmt.Errorf("can't check last backup time: %v", err) - } - if elapsedTime := time.Since(lastBackupTime); elapsedTime < minBackupInterval { - // It hasn't been long enough yet. - log.Infof("Skipping backup since only %v has elapsed since the last backup at %v, which is less than the min_backup_interval of %v.", elapsedTime, lastBackupTime, minBackupInterval) - return false, nil - } - // It has been long enough. - log.Infof("The last backup was taken at %v, which is older than the min_backup_interval of %v.", lastBackupTime, minBackupInterval) - return true, nil -} - -func lastCompleteBackup(ctx context.Context, backups []backupstorage.BackupHandle) backupstorage.BackupHandle { - if len(backups) == 0 { - return nil - } - - // Backups are sorted in ascending order by start time. Start at the end. - for i := len(backups) - 1; i >= 0; i-- { - // Check if this backup is complete by looking for the MANIFEST file, - // which is written at the end after all files are uploaded. 
- backup := backups[i] - if err := checkBackupComplete(ctx, backup); err != nil { - log.Warningf("Ignoring backup %v because it's incomplete: %v", backup.Name(), err) - continue - } - return backup - } - - return nil -} - -func checkBackupComplete(ctx context.Context, backup backupstorage.BackupHandle) error { - manifest, err := mysqlctl.GetBackupManifest(ctx, backup) - if err != nil { - return fmt.Errorf("can't get backup MANIFEST: %v", err) - } - - log.Infof("Found complete backup %v taken at position %v", backup.Name(), manifest.Position.String()) - return nil } diff --git a/go/flags/endtoend/vtbackup.txt b/go/flags/endtoend/vtbackup.txt index 46e4efea301..bc62dd55530 100644 --- a/go/flags/endtoend/vtbackup.txt +++ b/go/flags/endtoend/vtbackup.txt @@ -1,4 +1,47 @@ -Usage of vtbackup: +vtbackup is a batch command to perform a single pass of backup maintenance for a shard. + +When run periodically for each shard, vtbackup can ensure these configurable policies: + * There is always a recent backup for the shard. + * Old backups for the shard are removed. + +Whatever system launches vtbackup is responsible for the following: + - Running vtbackup with similar flags that would be used for a vttablet and + mysqlctld in the target shard to be backed up. + - Provisioning as much disk space for vtbackup as would be given to vttablet. + The data directory MUST be empty at startup. Do NOT reuse a persistent disk. + - Running vtbackup periodically for each shard, for each backup storage location. + - Ensuring that at most one instance runs at a time for a given pair of shard + and backup storage location. + - Retrying vtbackup if it fails. + - Alerting human operators if the failure is persistent. + +The process vtbackup follows to take a new backup is as follows: + 1. Restore from the most recent backup. + 2. Start a mysqld instance (but no vttablet) from the restored data. + 3. 
Instruct mysqld to connect to the current shard primary and replicate any + transactions that are new since the last backup. + 4. Ask the primary for its current replication position and set that as the goal + for catching up on replication before taking the backup, so the goalposts + don't move. + 5. Wait until replication is caught up to the goal position or beyond. + 6. Stop mysqld and take a new backup. + +Aside from additional replication load while vtbackup's mysqld catches up on +new transactions, the shard should be otherwise unaffected. Existing tablets +will continue to serve, and no new tablets will appear in topology, meaning no +query traffic will ever be routed to vtbackup's mysqld. This silent operation +mode helps make backups minimally disruptive to serving capacity and orthogonal +to the handling of the query path. + +The command-line parameters to vtbackup specify a policy for when a new backup +is needed, and when old backups should be removed. If the existing backups +already satisfy the policy, then vtbackup will do nothing and return success +immediately. + +Usage: + vtbackup [flags] + +Flags: --allow_first_backup Allow this job to take the first backup of an existing shard. --alsologtostderr log to standard error as well as files --azblob_backup_account_key_file string Path to a file containing the Azure Storage account key; if this flag is unset, the environment variable VT_AZBLOB_ACCOUNT_KEY will be used as the key itself (NOT a file path). @@ -92,7 +135,7 @@ Usage of vtbackup: --grpc_keepalive_timeout duration After having pinged for keepalive check, the client waits for a duration of Timeout and if no activity is seen even after that the connection is closed. (default 10s) --grpc_max_message_size int Maximum allowed RPC message size. Larger messages will be rejected by gRPC with the error 'exceeding the max size'. (default 16777216) --grpc_prometheus Enable gRPC monitoring with Prometheus. 
- -h, --help display usage and exit + -h, --help help for vtbackup --incremental_from_pos string Position of previous backup. Default: empty. If given, then this backup becomes an incremental backup from given position. If value is 'auto', backup taken from last successful backup position --init_db_name_override string (init parameter) override the name of the db used by vttablet --init_db_sql_file string path to .sql file to run after mysql_install_db @@ -140,8 +183,6 @@ Usage of vtbackup: --purge_logs_interval duration how often try to remove old logs (default 1h0m0s) --remote_operation_timeout duration time to wait for a remote operation (default 15s) --restart_before_backup Perform a mysqld clean/full restart after applying binlogs, but before taking the backup. Only makes sense to work around xtrabackup bugs. - --s2a_enable_appengine_dialer If true, opportunistically use AppEngine-specific dialer to call S2A. - --s2a_timeout duration Timeout enforced on the connection to the S2A service for handshake. (default 3s) --s3_backup_aws_endpoint string endpoint of the S3 backend (region must be provided). --s3_backup_aws_region string AWS region to use. (default "us-east-1") --s3_backup_aws_retries int AWS request retries. 
(default -1) From d5c7d47bffe737d9ef4a6d165c922e4e11105d1d Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sat, 16 Sep 2023 15:24:12 -0400 Subject: [PATCH 03/12] remove double-registered vtbench flags and add testdata Signed-off-by: Andrew Mason --- go/cmd/vtbench/vtbench.go | 2 -- go/flags/endtoend/vtbench.txt | 55 +++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 go/flags/endtoend/vtbench.txt diff --git a/go/cmd/vtbench/vtbench.go b/go/cmd/vtbench/vtbench.go index 19044aae4ed..44b8c75a3fc 100644 --- a/go/cmd/vtbench/vtbench.go +++ b/go/cmd/vtbench/vtbench.go @@ -105,8 +105,6 @@ func initFlags(fs *pflag.FlagSet) { fs.IntVar(&count, "count", count, "Number of queries per thread") grpccommon.RegisterFlags(fs) - log.RegisterFlags(fs) - logutil.RegisterFlags(fs) acl.RegisterFlags(fs) servenv.RegisterMySQLServerFlags(fs) } diff --git a/go/flags/endtoend/vtbench.txt b/go/flags/endtoend/vtbench.txt new file mode 100644 index 00000000000..7d5fd0b88d8 --- /dev/null +++ b/go/flags/endtoend/vtbench.txt @@ -0,0 +1,55 @@ +Usage of vtbench: + --alsologtostderr log to standard error as well as files + --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. + --config-file-not-found-handling ConfigFileNotFoundHandling Behavior when a config file is not found. (Options: error, exit, ignore, warn) (default warn) + --config-name string Name of the config file (without extension) to search for. (default "vtconfig") + --config-path strings Paths to search for config files in. (default [{{ .Workdir }}]) + --config-persistence-min-interval duration minimum interval between persisting dynamic config changes back to disk (if no change has occurred, nothing is done). (default 1s) + --config-type string Config file type (omit to infer config type from file extension). 
+ --count int Number of queries per thread (default 1000) + --db string Database name to use when connecting / running the queries (e.g. @replica, keyspace, keyspace/shard etc) + --deadline duration Maximum duration for the test run (default 5 minutes) (default 5m0s) + --grpc_auth_static_client_creds string When using grpc_static_auth in the server, this file provides the credentials to use to authenticate with server. + --grpc_compression string Which protocol to use for compressing gRPC. Default: nothing. Supported: snappy + --grpc_enable_tracing Enable gRPC tracing. + --grpc_initial_conn_window_size int gRPC initial connection window size + --grpc_initial_window_size int gRPC initial window size + --grpc_keepalive_time duration After a duration of this time, if the client doesn't see any activity, it pings the server to see if the transport is still alive. (default 10s) + --grpc_keepalive_timeout duration After having pinged for keepalive check, the client waits for a duration of Timeout and if no activity is seen even after that the connection is closed. (default 10s) + --grpc_max_message_size int Maximum allowed RPC message size. Larger messages will be rejected by gRPC with the error 'exceeding the max size'. (default 16777216) + --grpc_prometheus Enable gRPC monitoring with Prometheus. + -h, --help display usage and exit + --host string VTGate host(s) in the form 'host1,host2,...' 
+ --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) + --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) + --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) + --log_dir string If non-empty, write log files in this directory + --log_err_stacks log stack traces for errors + --log_rotate_max_size uint size in bytes at which logs are rotated (glog.MaxSize) (default 1887436800) + --logtostderr log to standard error instead of files + --mysql_server_version string MySQL server version to advertise. (default "8.0.30-Vitess") + --port int VTGate port + --pprof strings enable profiling + --protocol string Client protocol, either mysql (default), grpc-vtgate, or grpc-vttablet (default "mysql") + --purge_logs_interval duration how often try to remove old logs (default 1h0m0s) + --security_policy string the name of a registered security policy to use for controlling access to URLs - empty means allow all for anyone (built-in policies: deny-all, read-only) + --sql string SQL statement to execute + --sql-max-length-errors int truncate queries in error logs to the given length (default unlimited) + --sql-max-length-ui int truncate queries in debug UIs to the given length (default 512) (default 512) + --stderrthreshold severity logs at or above this threshold go to stderr (default 1) + --tablet_grpc_ca string the server ca to use to validate servers when connecting + --tablet_grpc_cert string the cert to use to connect + --tablet_grpc_crl string the server crl to use to validate server certificates when connecting + --tablet_grpc_key string the key to use to connect + --tablet_grpc_server_name string the server name to use to validate server certificate + --threads int Number of parallel threads to run (default 2) + --unix_socket string VTGate unix socket + --user string Username to connect using mysql (password comes from the db-credentials-file) + --v Level log 
level for V logs + -v, --version print binary version + --vmodule moduleSpec comma-separated list of pattern=N settings for file-filtered logging + --vtgate_grpc_ca string the server ca to use to validate servers when connecting + --vtgate_grpc_cert string the cert to use to connect + --vtgate_grpc_crl string the server crl to use to validate server certificates when connecting + --vtgate_grpc_key string the key to use to connect + --vtgate_grpc_server_name string the server name to use to validate server certificate From f43093270265356b41d0e8e317719bd427a65907 Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sat, 16 Sep 2023 15:28:54 -0400 Subject: [PATCH 04/12] migrate vtbench to cobra Signed-off-by: Andrew Mason --- go/cmd/vtbench/cli/vtbench.go | 247 ++++++++++++++++++++++++++++++++++ go/cmd/vtbench/docgen/main.go | 37 +++++ go/cmd/vtbench/vtbench.go | 183 +------------------------ go/flags/endtoend/vtbench.txt | 46 ++++++- 4 files changed, 331 insertions(+), 182 deletions(-) create mode 100644 go/cmd/vtbench/cli/vtbench.go create mode 100644 go/cmd/vtbench/docgen/main.go diff --git a/go/cmd/vtbench/cli/vtbench.go b/go/cmd/vtbench/cli/vtbench.go new file mode 100644 index 00000000000..3c197dfc905 --- /dev/null +++ b/go/cmd/vtbench/cli/vtbench.go @@ -0,0 +1,247 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cli + +import ( + "context" + "errors" + "fmt" + "strings" + "time" + + "github.com/spf13/cobra" + + "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/vt/dbconfigs" + "vitess.io/vitess/go/vt/grpccommon" + "vitess.io/vitess/go/vt/logutil" + "vitess.io/vitess/go/vt/servenv" + "vitess.io/vitess/go/vtbench" + + // Import and register the gRPC vtgateconn client + _ "vitess.io/vitess/go/vt/vtgate/grpcvtgateconn" + // Import and register the gRPC tabletconn client + _ "vitess.io/vitess/go/vt/vttablet/grpctabletconn" +) + +/* + + Vtbench is a simple load testing client to compare workloads in + Vitess across the various client/server protocols. + + There are a number of command line options to control the behavior, + but as a basic example, the three supported client protocols are: + + Mysql protocol to vtgate: + vtbench \ + --protocol mysql \ + --host vtgate-host.my.domain \ + --port 15306 \ + --user db_username \ + --db-credentials-file ./vtbench_db_creds.json \ + --db @replica \ + --sql "select * from loadtest_table where id=123456789" \ + --threads 10 \ + --count 10 + + GRPC to vtgate: + vtbench \ + --protocol grpc-vtgate \ + --host vtgate-host.my.domain \ + --port 15999 \ + --db @replica \ + $VTTABLET_GRPC_ARGS \ + --sql "select * from loadtest_table where id=123456789" \ + --threads 10 \ + --count 10 + + GRPC to vttablet: + vtbench \ + --protocol grpc-vttablet \ + --host tablet-loadtest-00-80.my.domain \ + --port 15999 \ + --db loadtest/00-80@replica \ + --sql "select * from loadtest_table where id=123456789" \ + --threads 10 \ + --count 10 + +*/ + +var ( + host, unixSocket, user, db, sql string + port int + protocol = "mysql" + deadline = 5 * time.Minute + threads = 2 + count = 1000 + + Main = &cobra.Command{ + Use: "vtbench", + Short: "vtbench is a simple load testing client to compare workloads in Vitess across the various client/server protocols.", + Example: `There are a number of command line options to control the behavior, +but as a basic 
example, the three supported client protocols are: + +Mysql protocol to vtgate: +vtbench \ + --protocol mysql \ + --host vtgate-host.my.domain \ + --port 15306 \ + --user db_username \ + --db-credentials-file ./vtbench_db_creds.json \ + --db @replica \ + --sql "select * from loadtest_table where id=123456789" \ + --threads 10 \ + --count 10 + +GRPC to vtgate: +vtbench \ + --protocol grpc-vtgate \ + --host vtgate-host.my.domain \ + --port 15999 \ + --db @replica \ + $VTTABLET_GRPC_ARGS \ + --sql "select * from loadtest_table where id=123456789" \ + --threads 10 \ + --count 10 + +GRPC to vttablet: +vtbench \ + --protocol grpc-vttablet \ + --host tablet-loadtest-00-80.my.domain \ + --port 15999 \ + --db loadtest/00-80@replica \ + --sql "select * from loadtest_table where id=123456789" \ + --threads 10 \ + --count 10`, + Args: cobra.NoArgs, + Version: servenv.AppVersion.String(), + PreRunE: servenv.CobraPreRunE, + RunE: run, + } +) + +func init() { + servenv.MoveFlagsToCobraCommand(Main) + + Main.Flags().StringVar(&host, "host", host, "VTGate host(s) in the form 'host1,host2,...'") + Main.Flags().IntVar(&port, "port", port, "VTGate port") + Main.Flags().StringVar(&unixSocket, "unix_socket", unixSocket, "VTGate unix socket") + Main.Flags().StringVar(&protocol, "protocol", protocol, "Client protocol, either mysql (default), grpc-vtgate, or grpc-vttablet") + Main.Flags().StringVar(&user, "user", user, "Username to connect using mysql (password comes from the db-credentials-file)") + Main.Flags().StringVar(&db, "db", db, "Database name to use when connecting / running the queries (e.g. 
@replica, keyspace, keyspace/shard etc)") + + Main.Flags().DurationVar(&deadline, "deadline", deadline, "Maximum duration for the test run (default 5 minutes)") + Main.Flags().StringVar(&sql, "sql", sql, "SQL statement to execute") + Main.Flags().IntVar(&threads, "threads", threads, "Number of parallel threads to run") + Main.Flags().IntVar(&count, "count", count, "Number of queries per thread") + + Main.MarkFlagRequired("sql") + + grpccommon.RegisterFlags(Main.Flags()) + acl.RegisterFlags(Main.Flags()) + servenv.RegisterMySQLServerFlags(Main.Flags()) +} + +func run(cmd *cobra.Command, args []string) error { + logger := logutil.NewConsoleLogger() + cmd.SetOutput(logutil.NewLoggerWriter(logger)) + _ = cmd.Flags().Set("logtostderr", "true") + + servenv.Init() + defer servenv.Close() + + var clientProto vtbench.ClientProtocol + switch protocol { + case "", "mysql": + clientProto = vtbench.MySQL + case "grpc-vtgate": + clientProto = vtbench.GRPCVtgate + case "grpc-vttablet": + clientProto = vtbench.GRPCVttablet + default: + return fmt.Errorf("invalid client protocol %s", protocol) + } + + if (host != "" || port != 0) && unixSocket != "" { + return errors.New("can't specify both host:port and unix_socket") + } + + if host != "" && port == 0 { + return errors.New("must specify port when using host") + } + + if host == "" && port != 0 { + return errors.New("must specify host when using port") + } + + if host == "" && port == 0 && unixSocket == "" { + return errors.New("vtbench requires either host/port or unix_socket") + } + + var password string + if clientProto == vtbench.MySQL { + var err error + _, password, err = dbconfigs.GetCredentialsServer().GetUserAndPassword(user) + if err != nil { + return fmt.Errorf("error reading password for user %v from file: %w", user, err) + } + } + + connParams := vtbench.ConnParams{ + Hosts: strings.Split(host, ","), + Port: port, + UnixSocket: unixSocket, + Protocol: clientProto, + DB: db, + Username: user, + Password: password, + } + 
+ b := vtbench.NewBench(threads, count, connParams, sql) + + ctx, cancel := context.WithTimeout(context.Background(), deadline) + defer cancel() + + fmt.Printf("Initializing test with %s protocol / %d threads / %d iterations\n", + b.ConnParams.Protocol.String(), b.Threads, b.Count) + err := b.Run(ctx) + if err != nil { + return fmt.Errorf("error in test: %w", err) + } + + fmt.Printf("Average Rows Returned: %d\n", b.Rows.Get()/int64(b.Threads*b.Count)) + fmt.Printf("Average Query Time: %v\n", time.Duration(b.Timings.Time()/b.Timings.Count())) + fmt.Printf("Total Test Time: %v\n", b.TotalTime) + fmt.Printf("QPS (Per Thread): %v\n", float64(b.Count)/b.TotalTime.Seconds()) + fmt.Printf("QPS (Total): %v\n", float64(b.Count*b.Threads)/b.TotalTime.Seconds()) + + last := int64(0) + + histograms := b.Timings.Histograms() + h := histograms["query"] + buckets := h.Buckets() + fmt.Printf("Query Timings:\n") + for i, bucket := range h.Cutoffs() { + count := buckets[i] + if count != 0 { + fmt.Printf("%v-%v: %v\n", time.Duration(last), time.Duration(bucket), count) + } + last = bucket + } + + return nil +} diff --git a/go/cmd/vtbench/docgen/main.go b/go/cmd/vtbench/docgen/main.go new file mode 100644 index 00000000000..5efe9e899a8 --- /dev/null +++ b/go/cmd/vtbench/docgen/main.go @@ -0,0 +1,37 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "github.com/spf13/cobra" + + "vitess.io/vitess/go/cmd/internal/docgen" + "vitess.io/vitess/go/cmd/vtbench/cli" +) + +func main() { + var dir string + cmd := cobra.Command{ + Use: "docgen [-d ]", + RunE: func(cmd *cobra.Command, args []string) error { + return docgen.GenerateMarkdownTree(cli.Main, dir) + }, + } + + cmd.Flags().StringVarP(&dir, "dir", "d", "doc", "output directory to write documentation") + _ = cmd.Execute() +} diff --git a/go/cmd/vtbench/vtbench.go b/go/cmd/vtbench/vtbench.go index 44b8c75a3fc..0d8bb85b536 100644 --- a/go/cmd/vtbench/vtbench.go +++ b/go/cmd/vtbench/vtbench.go @@ -17,192 +17,15 @@ limitations under the License. package main import ( - "context" - "fmt" - "strings" - "time" - - "github.com/spf13/pflag" - - "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/cmd/vtbench/cli" "vitess.io/vitess/go/exit" - "vitess.io/vitess/go/vt/dbconfigs" - "vitess.io/vitess/go/vt/grpccommon" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/logutil" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vtbench" - - // Import and register the gRPC vtgateconn client - _ "vitess.io/vitess/go/vt/vtgate/grpcvtgateconn" - // Import and register the gRPC tabletconn client - _ "vitess.io/vitess/go/vt/vttablet/grpctabletconn" -) - -/* - - Vtbench is a simple load testing client to compare workloads in - Vitess across the various client/server protocols. 
- - There are a number of command line options to control the behavior, - but as a basic example, the three supported client protocols are: - - Mysql protocol to vtgate: - vtbench \ - --protocol mysql \ - --host vtgate-host.my.domain \ - --port 15306 \ - --user db_username \ - --db-credentials-file ./vtbench_db_creds.json \ - --db @replica \ - --sql "select * from loadtest_table where id=123456789" \ - --threads 10 \ - --count 10 - - GRPC to vtgate: - vtbench \ - --protocol grpc-vtgate \ - --host vtgate-host.my.domain \ - --port 15999 \ - --db @replica \ - $VTTABLET_GRPC_ARGS \ - --sql "select * from loadtest_table where id=123456789" \ - --threads 10 \ - --count 10 - - GRPC to vttablet: - vtbench \ - --protocol grpc-vttablet \ - --host tablet-loadtest-00-80.my.domain \ - --port 15999 \ - --db loadtest/00-80@replica \ - --sql "select * from loadtest_table where id=123456789" \ - --threads 10 \ - --count 10 - -*/ - -var ( - host, unixSocket, user, db, sql string - port int - protocol = "mysql" - deadline = 5 * time.Minute - threads = 2 - count = 1000 ) -func initFlags(fs *pflag.FlagSet) { - fs.StringVar(&host, "host", host, "VTGate host(s) in the form 'host1,host2,...'") - fs.IntVar(&port, "port", port, "VTGate port") - fs.StringVar(&unixSocket, "unix_socket", unixSocket, "VTGate unix socket") - fs.StringVar(&protocol, "protocol", protocol, "Client protocol, either mysql (default), grpc-vtgate, or grpc-vttablet") - fs.StringVar(&user, "user", user, "Username to connect using mysql (password comes from the db-credentials-file)") - fs.StringVar(&db, "db", db, "Database name to use when connecting / running the queries (e.g. 
@replica, keyspace, keyspace/shard etc)") - - fs.DurationVar(&deadline, "deadline", deadline, "Maximum duration for the test run (default 5 minutes)") - fs.StringVar(&sql, "sql", sql, "SQL statement to execute") - fs.IntVar(&threads, "threads", threads, "Number of parallel threads to run") - fs.IntVar(&count, "count", count, "Number of queries per thread") - - grpccommon.RegisterFlags(fs) - acl.RegisterFlags(fs) - servenv.RegisterMySQLServerFlags(fs) -} - func main() { - servenv.OnParseFor("vtbench", func(fs *pflag.FlagSet) { - logger := logutil.NewConsoleLogger() - fs.SetOutput(logutil.NewLoggerWriter(logger)) - - initFlags(fs) - _ = fs.Set("logtostderr", "true") - }) - - servenv.ParseFlags("vtbench") - servenv.Init() - defer exit.Recover() - clientProto := vtbench.MySQL - switch protocol { - case "", "mysql": - clientProto = vtbench.MySQL - case "grpc-vtgate": - clientProto = vtbench.GRPCVtgate - case "grpc-vttablet": - clientProto = vtbench.GRPCVttablet - default: - log.Exitf("invalid client protocol %s", protocol) - } - - if (host != "" || port != 0) && unixSocket != "" { - log.Exitf("can't specify both host:port and unix_socket") - } - - if host != "" && port == 0 { - log.Exitf("must specify port when using host") - } - - if host == "" && port != 0 { - log.Exitf("must specify host when using port") - } - - if host == "" && port == 0 && unixSocket == "" { - log.Exitf("vtbench requires either host/port or unix_socket") - } - - if sql == "" { - log.Exitf("must specify sql") - } - - var password string - if clientProto == vtbench.MySQL { - var err error - _, password, err = dbconfigs.GetCredentialsServer().GetUserAndPassword(user) - if err != nil { - log.Exitf("error reading password for user %v from file: %v", user, err) - } - } - - connParams := vtbench.ConnParams{ - Hosts: strings.Split(host, ","), - Port: port, - UnixSocket: unixSocket, - Protocol: clientProto, - DB: db, - Username: user, - Password: password, - } - - b := vtbench.NewBench(threads, count, 
connParams, sql) - - ctx, cancel := context.WithTimeout(context.Background(), deadline) - defer cancel() - - fmt.Printf("Initializing test with %s protocol / %d threads / %d iterations\n", - b.ConnParams.Protocol.String(), b.Threads, b.Count) - err := b.Run(ctx) - if err != nil { - log.Exitf("error in test: %v", err) - } - - fmt.Printf("Average Rows Returned: %d\n", b.Rows.Get()/int64(b.Threads*b.Count)) - fmt.Printf("Average Query Time: %v\n", time.Duration(b.Timings.Time()/b.Timings.Count())) - fmt.Printf("Total Test Time: %v\n", b.TotalTime) - fmt.Printf("QPS (Per Thread): %v\n", float64(b.Count)/b.TotalTime.Seconds()) - fmt.Printf("QPS (Total): %v\n", float64(b.Count*b.Threads)/b.TotalTime.Seconds()) - - last := int64(0) - - histograms := b.Timings.Histograms() - h := histograms["query"] - buckets := h.Buckets() - fmt.Printf("Query Timings:\n") - for i, bucket := range h.Cutoffs() { - count := buckets[i] - if count != 0 { - fmt.Printf("%v-%v: %v\n", time.Duration(last), time.Duration(bucket), count) - } - last = bucket + if err := cli.Main.Execute(); err != nil { + log.Exit(err) } } diff --git a/go/flags/endtoend/vtbench.txt b/go/flags/endtoend/vtbench.txt index 7d5fd0b88d8..d74dc13ebc8 100644 --- a/go/flags/endtoend/vtbench.txt +++ b/go/flags/endtoend/vtbench.txt @@ -1,4 +1,46 @@ -Usage of vtbench: +vtbench is a simple load testing client to compare workloads in Vitess across the various client/server protocols. 
+ +Usage: + vtbench [flags] + +Examples: +There are a number of command line options to control the behavior, +but as a basic example, the three supported client protocols are: + +Mysql protocol to vtgate: +vtbench \ + --protocol mysql \ + --host vtgate-host.my.domain \ + --port 15306 \ + --user db_username \ + --db-credentials-file ./vtbench_db_creds.json \ + --db @replica \ + --sql "select * from loadtest_table where id=123456789" \ + --threads 10 \ + --count 10 + +GRPC to vtgate: +vtbench \ + --protocol grpc-vtgate \ + --host vtgate-host.my.domain \ + --port 15999 \ + --db @replica \ + $VTTABLET_GRPC_ARGS \ + --sql "select * from loadtest_table where id=123456789" \ + --threads 10 \ + --count 10 + +GRPC to vttablet: +vtbench \ + --protocol grpc-vttablet \ + --host tablet-loadtest-00-80.my.domain \ + --port 15999 \ + --db loadtest/00-80@replica \ + --sql "select * from loadtest_table where id=123456789" \ + --threads 10 \ + --count 10 + +Flags: --alsologtostderr log to standard error as well as files --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. --config-file-not-found-handling ConfigFileNotFoundHandling Behavior when a config file is not found. (Options: error, exit, ignore, warn) (default warn) @@ -18,7 +60,7 @@ Usage of vtbench: --grpc_keepalive_timeout duration After having pinged for keepalive check, the client waits for a duration of Timeout and if no activity is seen even after that the connection is closed. (default 10s) --grpc_max_message_size int Maximum allowed RPC message size. Larger messages will be rejected by gRPC with the error 'exceeding the max size'. (default 16777216) --grpc_prometheus Enable gRPC monitoring with Prometheus. - -h, --help display usage and exit + -h, --help help for vtbench --host string VTGate host(s) in the form 'host1,host2,...' 
--keep_logs duration keep logs for this long (using ctime) (zero to keep forever) --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) From 2092ab7d32e24b85767960118d3ee47993c5eba2 Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sat, 16 Sep 2023 15:32:29 -0400 Subject: [PATCH 05/12] add vtclient flag testdata Signed-off-by: Andrew Mason --- go/flags/endtoend/vtclient.txt | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 go/flags/endtoend/vtclient.txt diff --git a/go/flags/endtoend/vtclient.txt b/go/flags/endtoend/vtclient.txt new file mode 100644 index 00000000000..53fedcbb36b --- /dev/null +++ b/go/flags/endtoend/vtclient.txt @@ -0,0 +1,30 @@ +Usage of vtclient: + --alsologtostderr log to standard error as well as files + --bind_variables float bind variables as a json list (default null) + --count int DMLs only: Number of times each thread executes the query. Useful for simple, sustained load testing. (default 1) + --grpc_enable_tracing Enable gRPC tracing. + --grpc_max_message_size int Maximum allowed RPC message size. Larger messages will be rejected by gRPC with the error 'exceeding the max size'. (default 16777216) + --grpc_prometheus Enable gRPC monitoring with Prometheus. + -h, --help display usage and exit + --json Output JSON instead of human-readable table + --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) + --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) + --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) + --log_dir string If non-empty, write log files in this directory + --log_rotate_max_size uint size in bytes at which logs are rotated (glog.MaxSize) (default 1887436800) + --logtostderr log to standard error instead of files + --max_sequence_id int max sequence ID. + --min_sequence_id int min sequence ID to generate. 
When max_sequence_id > min_sequence_id, for each query, a number is generated in [min_sequence_id, max_sequence_id) and attached to the end of the bind variables. + --mysql_server_version string MySQL server version to advertise. (default "8.0.30-Vitess") + --parallel int DMLs only: Number of threads executing the same query in parallel. Useful for simple load testing. (default 1) + --purge_logs_interval duration how often try to remove old logs (default 1h0m0s) + --qps int queries per second to throttle each thread at. + --security_policy string the name of a registered security policy to use for controlling access to URLs - empty means allow all for anyone (built-in policies: deny-all, read-only) + --server string vtgate server to connect to + --stderrthreshold severity logs at or above this threshold go to stderr (default 1) + --streaming use a streaming query + --target string keyspace:shard@tablet_type + --timeout duration timeout for queries (default 30s) + --use_random_sequence use random sequence for generating [min_sequence_id, max_sequence_id) + --v Level log level for V logs + --vmodule moduleSpec comma-separated list of pattern=N settings for file-filtered logging From 9357d38dcba851c9f383c95c0ae917cb20cce5c6 Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sat, 16 Sep 2023 17:04:46 -0400 Subject: [PATCH 06/12] migrate vtclient to cobra and rewire tests Signed-off-by: Andrew Mason --- .../vtclient/{ => cli}/plugin_opentracing.go | 3 +- go/cmd/vtclient/cli/vtclient.go | 433 ++++++++++++++++++ go/cmd/vtclient/{ => cli}/vtclient_test.go | 17 +- go/cmd/vtclient/vtclient.go | 431 +---------------- go/flags/endtoend/vtclient.txt | 84 ++-- 5 files changed, 499 insertions(+), 469 deletions(-) rename go/cmd/vtclient/{ => cli}/plugin_opentracing.go (98%) create mode 100644 go/cmd/vtclient/cli/vtclient.go rename go/cmd/vtclient/{ => cli}/vtclient_test.go (90%) diff --git a/go/cmd/vtclient/plugin_opentracing.go b/go/cmd/vtclient/cli/plugin_opentracing.go 
similarity index 98% rename from go/cmd/vtclient/plugin_opentracing.go rename to go/cmd/vtclient/cli/plugin_opentracing.go index b48334531a3..a3466ca8c73 100644 --- a/go/cmd/vtclient/plugin_opentracing.go +++ b/go/cmd/vtclient/cli/plugin_opentracing.go @@ -14,11 +14,10 @@ See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli import ( "vitess.io/vitess/go/trace" - "vitess.io/vitess/go/vt/servenv" ) diff --git a/go/cmd/vtclient/cli/vtclient.go b/go/cmd/vtclient/cli/vtclient.go new file mode 100644 index 00000000000..f7d399aa834 --- /dev/null +++ b/go/cmd/vtclient/cli/vtclient.go @@ -0,0 +1,433 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cli + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "io" + "math/rand" + "os" + "sort" + "sync" + "time" + + "github.com/olekukonko/tablewriter" + "github.com/spf13/cobra" + "github.com/spf13/pflag" + + "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/vt/concurrency" + "vitess.io/vitess/go/vt/grpccommon" + "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/logutil" + "vitess.io/vitess/go/vt/servenv" + "vitess.io/vitess/go/vt/sqlparser" + "vitess.io/vitess/go/vt/vitessdriver" + "vitess.io/vitess/go/vt/vterrors" + "vitess.io/vitess/go/vt/vtgate/vtgateconn" + + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" + // Include deprecation warnings for soon-to-be-unsupported flag invocations. 
+) + +var ( + server string + streaming bool + targetString string + jsonOutput bool + useRandom bool + bindVariables *bindvars + + timeout = 30 * time.Second + parallel = 1 + count = 1 + minSeqID int + maxSeqID int + qps int + + Main = &cobra.Command{ + Use: "vtclient ", + Short: "vtclient connects to a vtgate server using the standard go driver API.", + Long: `vtclient connects to a vtgate server using the standard go driver API. + +Version 3 of the API is used, we do not send any hint to the server. + +For query bound variables, we assume place-holders in the query string +in the form of :v1, :v2, etc.`, + Example: `vtclient --server vtgate:15991 "SELECT * FROM messages" + +vtclient --server vtgate:15991 --target '@primary' --bind_variables '[ 12345, 1, "msg 12345" ]' "INSERT INTO messages (page,time_created_ns,message) VALUES (:v1, :v2, :v3)"`, + Args: cobra.ExactArgs(1), + Version: servenv.AppVersion.String(), + RunE: run, + } +) + +var ( + seqChan = make(chan int, 10) +) + +func init() { + servenv.MoveFlagsToCobraCommand(Main) + + Main.Flags().StringVar(&server, "server", server, "vtgate server to connect to") + Main.Flags().DurationVar(&timeout, "timeout", timeout, "timeout for queries") + Main.Flags().BoolVar(&streaming, "streaming", streaming, "use a streaming query") + Main.Flags().StringVar(&targetString, "target", targetString, "keyspace:shard@tablet_type") + Main.Flags().BoolVar(&jsonOutput, "json", jsonOutput, "Output JSON instead of human-readable table") + Main.Flags().IntVar(¶llel, "parallel", parallel, "DMLs only: Number of threads executing the same query in parallel. Useful for simple load testing.") + Main.Flags().IntVar(&count, "count", count, "DMLs only: Number of times each thread executes the query. Useful for simple, sustained load testing.") + Main.Flags().IntVar(&minSeqID, "min_sequence_id", minSeqID, "min sequence ID to generate. 
When max_sequence_id > min_sequence_id, for each query, a number is generated in [min_sequence_id, max_sequence_id) and attached to the end of the bind variables.") + Main.Flags().IntVar(&maxSeqID, "max_sequence_id", maxSeqID, "max sequence ID.") + Main.Flags().BoolVar(&useRandom, "use_random_sequence", useRandom, "use random sequence for generating [min_sequence_id, max_sequence_id)") + Main.Flags().IntVar(&qps, "qps", qps, "queries per second to throttle each thread at.") + + acl.RegisterFlags(Main.Flags()) + grpccommon.RegisterFlags(Main.Flags()) + servenv.RegisterMySQLServerFlags(Main.Flags()) + + bindVariables = newBindvars(Main.Flags(), "bind_variables", "bind variables as a json list") +} + +type bindvars []any + +func (bv *bindvars) String() string { + b, err := json.Marshal(bv) + if err != nil { + return err.Error() + } + return string(b) +} + +func (bv *bindvars) Set(s string) (err error) { + err = json.Unmarshal([]byte(s), &bv) + if err != nil { + return err + } + // json reads all numbers as float64 + // So, we just ditch floats for bindvars + for i, v := range *bv { + if f, ok := v.(float64); ok { + if f > 0 { + (*bv)[i] = uint64(f) + } else { + (*bv)[i] = int64(f) + } + } + } + + return nil +} + +// For internal flag compatibility +func (bv *bindvars) Get() any { + return bv +} + +// Type is part of the pflag.Value interface. bindvars.Set() expects all numbers as float64. 
+func (bv *bindvars) Type() string { + return "float64" +} + +func newBindvars(fs *pflag.FlagSet, name, usage string) *bindvars { + var bv bindvars + fs.Var(&bv, name, usage) + return &bv +} + +func run(cmd *cobra.Command, args []string) error { + defer logutil.Flush() + + qr, err := _run(cmd, args) + if jsonOutput && qr != nil { + data, err := json.MarshalIndent(qr, "", " ") + if err != nil { + return fmt.Errorf("cannot marshal data: %w", err) + } + fmt.Fprint(cmd.OutOrStdout(), string(data)) + return nil + } + + qr.print(cmd.OutOrStdout()) + return err +} + +func _run(cmd *cobra.Command, args []string) (*results, error) { + logutil.PurgeLogs() + + if maxSeqID > minSeqID { + go func() { + if useRandom { + for { + seqChan <- rand.Intn(maxSeqID-minSeqID) + minSeqID + } + } else { + for i := minSeqID; i < maxSeqID; i++ { + seqChan <- i + } + } + }() + } + + c := vitessdriver.Configuration{ + Protocol: vtgateconn.GetVTGateProtocol(), + Address: server, + Target: targetString, + Streaming: streaming, + } + db, err := vitessdriver.OpenWithConfiguration(c) + if err != nil { + return nil, fmt.Errorf("client error: %w", err) + } + + log.Infof("Sending the query...") + + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + return execMulti(ctx, db, cmd.Flags().Arg(0)) +} + +func prepareBindVariables() []any { + bv := make([]any, 0, len(*bindVariables)+1) + bv = append(bv, (*bindVariables)...) 
+ if maxSeqID > minSeqID { + bv = append(bv, <-seqChan) + } + return bv +} + +func execMulti(ctx context.Context, db *sql.DB, sql string) (*results, error) { + all := newResults() + ec := concurrency.FirstErrorRecorder{} + wg := sync.WaitGroup{} + isDML := sqlparser.IsDML(sql) + + isThrottled := qps > 0 + + start := time.Now() + for i := 0; i < parallel; i++ { + wg.Add(1) + + go func() { + defer wg.Done() + + var ticker *time.Ticker + if isThrottled { + tickDuration := time.Second / time.Duration(qps) + ticker = time.NewTicker(tickDuration) + } + + for j := 0; j < count; j++ { + var qr *results + var err error + if isDML { + qr, err = execDml(ctx, db, sql) + } else { + qr, err = execNonDml(ctx, db, sql) + } + if count == 1 && parallel == 1 { + all = qr + } else { + all.merge(qr) + if err != nil { + all.recordError(err) + } + } + if err != nil { + ec.RecordError(err) + // We keep going and do not return early purpose. + } + + if ticker != nil { + <-ticker.C + } + } + }() + } + wg.Wait() + if all != nil { + all.duration = time.Since(start) + } + + return all, ec.Error() +} + +func execDml(ctx context.Context, db *sql.DB, sql string) (*results, error) { + start := time.Now() + tx, err := db.Begin() + if err != nil { + return nil, vterrors.Wrap(err, "BEGIN failed") + } + + result, err := tx.ExecContext(ctx, sql, []any(prepareBindVariables())...) + if err != nil { + return nil, vterrors.Wrap(err, "failed to execute DML") + } + + err = tx.Commit() + if err != nil { + return nil, vterrors.Wrap(err, "COMMIT failed") + } + + rowsAffected, _ := result.RowsAffected() + lastInsertID, _ := result.LastInsertId() + return &results{ + rowsAffected: rowsAffected, + lastInsertID: lastInsertID, + duration: time.Since(start), + }, nil +} + +func execNonDml(ctx context.Context, db *sql.DB, sql string) (*results, error) { + start := time.Now() + rows, err := db.QueryContext(ctx, sql, []any(prepareBindVariables())...) 
+ if err != nil { + return nil, vterrors.Wrap(err, "client error") + } + defer rows.Close() + + // get the headers + var qr results + cols, err := rows.Columns() + if err != nil { + return nil, vterrors.Wrap(err, "client error") + } + qr.Fields = cols + + // get the rows + for rows.Next() { + row := make([]any, len(cols)) + for i := range row { + var col string + row[i] = &col + } + if err := rows.Scan(row...); err != nil { + return nil, vterrors.Wrap(err, "client error") + } + + // unpack []*string into []string + vals := make([]string, 0, len(row)) + for _, value := range row { + vals = append(vals, *(value.(*string))) + } + qr.Rows = append(qr.Rows, vals) + } + qr.rowsAffected = int64(len(qr.Rows)) + + if err := rows.Err(); err != nil { + return nil, vterrors.Wrap(err, "Vitess returned an error") + } + + qr.duration = time.Since(start) + return &qr, nil +} + +type results struct { + mu sync.Mutex + Fields []string `json:"fields"` + Rows [][]string `json:"rows"` + rowsAffected int64 + lastInsertID int64 + duration time.Duration + cumulativeDuration time.Duration + + // Multi DML mode: Track total error count, error count per code and the first error. + totalErrorCount int + errorCount map[vtrpcpb.Code]int + firstError map[vtrpcpb.Code]error +} + +func newResults() *results { + return &results{ + errorCount: make(map[vtrpcpb.Code]int), + firstError: make(map[vtrpcpb.Code]error), + } +} + +// merge aggregates "other" into "r". +// This is only used for executing DMLs concurrently and repeatedly. +// Therefore, "Fields" and "Rows" are not merged. 
+func (r *results) merge(other *results) { + if other == nil { + return + } + + r.mu.Lock() + defer r.mu.Unlock() + + r.rowsAffected += other.rowsAffected + if other.lastInsertID > r.lastInsertID { + r.lastInsertID = other.lastInsertID + } + r.cumulativeDuration += other.duration +} + +func (r *results) recordError(err error) { + r.mu.Lock() + defer r.mu.Unlock() + + r.totalErrorCount++ + code := vterrors.Code(err) + r.errorCount[code]++ + + if r.errorCount[code] == 1 { + r.firstError[code] = err + } +} + +func (r *results) print(w io.Writer) { + if r == nil { + return + } + + table := tablewriter.NewWriter(os.Stdout) + table.SetHeader(r.Fields) + table.SetAutoFormatHeaders(false) + table.AppendBulk(r.Rows) + table.Render() + fmt.Fprintf(w, "%v row(s) affected (%v, cum: %v)\n", r.rowsAffected, r.duration, r.cumulativeDuration) + if r.lastInsertID != 0 { + fmt.Fprintf(w, "Last insert ID: %v\n", r.lastInsertID) + } + + if r.totalErrorCount == 0 { + return + } + + fmt.Printf("%d error(s) were returned. Number of errors by error code:\n\n", r.totalErrorCount) + // Sort different error codes by count (descending). 
+ type errorCounts struct { + code vtrpcpb.Code + count int + } + var counts []errorCounts + for code, count := range r.errorCount { + counts = append(counts, errorCounts{code, count}) + } + sort.Slice(counts, func(i, j int) bool { return counts[i].count >= counts[j].count }) + for _, c := range counts { + fmt.Fprintf(w, "%- 30v= % 5d\n", c.code, c.count) + } + + fmt.Fprintf(w, "\nFirst error per code:\n\n") + for code, err := range r.firstError { + fmt.Fprintf(w, "Code: %v\nError: %v\n\n", code, err) + } +} diff --git a/go/cmd/vtclient/vtclient_test.go b/go/cmd/vtclient/cli/vtclient_test.go similarity index 90% rename from go/cmd/vtclient/vtclient_test.go rename to go/cmd/vtclient/cli/vtclient_test.go index 4711b1e0127..a5ee571cd0b 100644 --- a/go/cmd/vtclient/vtclient_test.go +++ b/go/cmd/vtclient/cli/vtclient_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli import ( "fmt" @@ -22,7 +22,7 @@ import ( "strings" "testing" - "github.com/spf13/pflag" + "github.com/stretchr/testify/require" "vitess.io/vitess/go/vt/vttest" @@ -120,15 +120,16 @@ func TestVtclient(t *testing.T) { }, } - // Change ErrorHandling from ExitOnError to panicking. - pflag.CommandLine.Init("vtclient_test.go", pflag.PanicOnError) for _, q := range queries { // Run main function directly and not as external process. To achieve this, // overwrite os.Args which is used by pflag.Parse(). - os.Args = []string{"vtclient_test.go", "--server", vtgateAddr} - os.Args = append(os.Args, q.args...) + args := []string{"--server", vtgateAddr} + args = append(args, q.args...) 
- results, err := run() + err := Main.ParseFlags(args) + require.NoError(t, err) + + results, err := _run(Main, args) if q.errMsg != "" { if got, want := err.Error(), q.errMsg; !strings.Contains(got, want) { t.Fatalf("vtclient %v returned wrong error: got = %v, want contains = %v", os.Args[1:], got, want) @@ -137,7 +138,7 @@ func TestVtclient(t *testing.T) { } if err != nil { - t.Fatalf("vtclient %v failed: %v", os.Args[1:], err) + t.Fatalf("vtclient %v failed: %v", args[1:], err) } if got, want := results.rowsAffected, q.rowsAffected; got != want { t.Fatalf("wrong rows affected for query: %v got = %v, want = %v", os.Args[1:], got, want) diff --git a/go/cmd/vtclient/vtclient.go b/go/cmd/vtclient/vtclient.go index 26c8cfd4806..4201d25c882 100644 --- a/go/cmd/vtclient/vtclient.go +++ b/go/cmd/vtclient/vtclient.go @@ -17,439 +17,12 @@ limitations under the License. package main import ( - "context" - "database/sql" - "encoding/json" - "errors" - "flag" - "fmt" - "io" - "math/rand" - "os" - "sort" - "sync" - "time" - - "github.com/olekukonko/tablewriter" - "github.com/spf13/pflag" - - "vitess.io/vitess/go/acl" - "vitess.io/vitess/go/vt/concurrency" - "vitess.io/vitess/go/vt/grpccommon" + "vitess.io/vitess/go/cmd/vtclient/cli" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/logutil" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/sqlparser" - "vitess.io/vitess/go/vt/vitessdriver" - "vitess.io/vitess/go/vt/vterrors" - "vitess.io/vitess/go/vt/vtgate/vtgateconn" - - vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" - - // Include deprecation warnings for soon-to-be-unsupported flag invocations. - _flag "vitess.io/vitess/go/internal/flag" ) -var ( - usage = ` -vtclient connects to a vtgate server using the standard go driver API. -Version 3 of the API is used, we do not send any hint to the server. - -For query bound variables, we assume place-holders in the query string -in the form of :v1, :v2, etc. 
- -Examples: - - $ vtclient --server vtgate:15991 "SELECT * FROM messages" - - $ vtclient --server vtgate:15991 --target '@primary' --bind_variables '[ 12345, 1, "msg 12345" ]' "INSERT INTO messages (page,time_created_ns,message) VALUES (:v1, :v2, :v3)" - -` - server string - streaming bool - targetString string - jsonOutput bool - useRandom bool - bindVariables *bindvars - - timeout = 30 * time.Second - parallel = 1 - count = 1 - minSeqID = 0 - maxSeqID = 0 - qps = 0 -) - -var ( - seqChan = make(chan int, 10) -) - -func init() { - _flag.SetUsage(flag.CommandLine, _flag.UsageOptions{ - Epilogue: func(w io.Writer) { fmt.Fprint(w, usage) }, - }) -} - -func registerFlags(fs *pflag.FlagSet) { - fs.StringVar(&server, "server", server, "vtgate server to connect to") - fs.DurationVar(&timeout, "timeout", timeout, "timeout for queries") - fs.BoolVar(&streaming, "streaming", streaming, "use a streaming query") - fs.StringVar(&targetString, "target", targetString, "keyspace:shard@tablet_type") - fs.BoolVar(&jsonOutput, "json", jsonOutput, "Output JSON instead of human-readable table") - fs.IntVar(¶llel, "parallel", parallel, "DMLs only: Number of threads executing the same query in parallel. Useful for simple load testing.") - fs.IntVar(&count, "count", count, "DMLs only: Number of times each thread executes the query. Useful for simple, sustained load testing.") - fs.IntVar(&minSeqID, "min_sequence_id", minSeqID, "min sequence ID to generate. 
When max_sequence_id > min_sequence_id, for each query, a number is generated in [min_sequence_id, max_sequence_id) and attached to the end of the bind variables.") - fs.IntVar(&maxSeqID, "max_sequence_id", maxSeqID, "max sequence ID.") - fs.BoolVar(&useRandom, "use_random_sequence", useRandom, "use random sequence for generating [min_sequence_id, max_sequence_id)") - fs.IntVar(&qps, "qps", qps, "queries per second to throttle each thread at.") - - acl.RegisterFlags(fs) - - bindVariables = newBindvars(fs, "bind_variables", "bind variables as a json list") -} - -type bindvars []any - -func (bv *bindvars) String() string { - b, err := json.Marshal(bv) - if err != nil { - return err.Error() - } - return string(b) -} - -func (bv *bindvars) Set(s string) (err error) { - err = json.Unmarshal([]byte(s), &bv) - if err != nil { - return err - } - // json reads all numbers as float64 - // So, we just ditch floats for bindvars - for i, v := range *bv { - if f, ok := v.(float64); ok { - if f > 0 { - (*bv)[i] = uint64(f) - } else { - (*bv)[i] = int64(f) - } - } - } - - return nil -} - -// For internal flag compatibility -func (bv *bindvars) Get() any { - return bv -} - -// Type is part of the pflag.Value interface. bindvars.Set() expects all numbers as float64. 
-func (bv *bindvars) Type() string { - return "float64" -} - -func newBindvars(fs *pflag.FlagSet, name, usage string) *bindvars { - var bv bindvars - fs.Var(&bv, name, usage) - return &bv -} - func main() { - defer logutil.Flush() - - qr, err := run() - if jsonOutput && qr != nil { - data, err := json.MarshalIndent(qr, "", " ") - if err != nil { - log.Exitf("cannot marshal data: %v", err) - } - fmt.Print(string(data)) - return - } - - qr.print() - - if err != nil { + if err := cli.Main.Execute(); err != nil { log.Exit(err) } } - -func run() (*results, error) { - fs := pflag.NewFlagSet("vtclient", pflag.ExitOnError) - grpccommon.RegisterFlags(fs) - log.RegisterFlags(fs) - logutil.RegisterFlags(fs) - servenv.RegisterMySQLServerFlags(fs) - registerFlags(fs) - _flag.Parse(fs) - args := _flag.Args() - - logutil.PurgeLogs() - - if len(args) == 0 { - pflag.Usage() - return nil, errors.New("no arguments provided. See usage above") - } - if len(args) > 1 { - return nil, errors.New("no additional arguments after the query allowed") - } - - if maxSeqID > minSeqID { - go func() { - if useRandom { - for { - seqChan <- rand.Intn(maxSeqID-minSeqID) + minSeqID - } - } else { - for i := minSeqID; i < maxSeqID; i++ { - seqChan <- i - } - } - }() - } - - c := vitessdriver.Configuration{ - Protocol: vtgateconn.GetVTGateProtocol(), - Address: server, - Target: targetString, - Streaming: streaming, - } - db, err := vitessdriver.OpenWithConfiguration(c) - if err != nil { - return nil, fmt.Errorf("client error: %v", err) - } - - log.Infof("Sending the query...") - - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - return execMulti(ctx, db, args[0]) -} - -func prepareBindVariables() []any { - bv := make([]any, 0, len(*bindVariables)+1) - bv = append(bv, (*bindVariables)...) 
- if maxSeqID > minSeqID { - bv = append(bv, <-seqChan) - } - return bv -} - -func execMulti(ctx context.Context, db *sql.DB, sql string) (*results, error) { - all := newResults() - ec := concurrency.FirstErrorRecorder{} - wg := sync.WaitGroup{} - isDML := sqlparser.IsDML(sql) - - isThrottled := qps > 0 - - start := time.Now() - for i := 0; i < parallel; i++ { - wg.Add(1) - - go func() { - defer wg.Done() - - var ticker *time.Ticker - if isThrottled { - tickDuration := time.Second / time.Duration(qps) - ticker = time.NewTicker(tickDuration) - } - - for j := 0; j < count; j++ { - var qr *results - var err error - if isDML { - qr, err = execDml(ctx, db, sql) - } else { - qr, err = execNonDml(ctx, db, sql) - } - if count == 1 && parallel == 1 { - all = qr - } else { - all.merge(qr) - if err != nil { - all.recordError(err) - } - } - if err != nil { - ec.RecordError(err) - // We keep going and do not return early purpose. - } - - if ticker != nil { - <-ticker.C - } - } - }() - } - wg.Wait() - if all != nil { - all.duration = time.Since(start) - } - - return all, ec.Error() -} - -func execDml(ctx context.Context, db *sql.DB, sql string) (*results, error) { - start := time.Now() - tx, err := db.Begin() - if err != nil { - return nil, vterrors.Wrap(err, "BEGIN failed") - } - - result, err := tx.ExecContext(ctx, sql, []any(prepareBindVariables())...) - if err != nil { - return nil, vterrors.Wrap(err, "failed to execute DML") - } - - err = tx.Commit() - if err != nil { - return nil, vterrors.Wrap(err, "COMMIT failed") - } - - rowsAffected, _ := result.RowsAffected() - lastInsertID, _ := result.LastInsertId() - return &results{ - rowsAffected: rowsAffected, - lastInsertID: lastInsertID, - duration: time.Since(start), - }, nil -} - -func execNonDml(ctx context.Context, db *sql.DB, sql string) (*results, error) { - start := time.Now() - rows, err := db.QueryContext(ctx, sql, []any(prepareBindVariables())...) 
- if err != nil { - return nil, vterrors.Wrap(err, "client error") - } - defer rows.Close() - - // get the headers - var qr results - cols, err := rows.Columns() - if err != nil { - return nil, vterrors.Wrap(err, "client error") - } - qr.Fields = cols - - // get the rows - for rows.Next() { - row := make([]any, len(cols)) - for i := range row { - var col string - row[i] = &col - } - if err := rows.Scan(row...); err != nil { - return nil, vterrors.Wrap(err, "client error") - } - - // unpack []*string into []string - vals := make([]string, 0, len(row)) - for _, value := range row { - vals = append(vals, *(value.(*string))) - } - qr.Rows = append(qr.Rows, vals) - } - qr.rowsAffected = int64(len(qr.Rows)) - - if err := rows.Err(); err != nil { - return nil, vterrors.Wrap(err, "Vitess returned an error") - } - - qr.duration = time.Since(start) - return &qr, nil -} - -type results struct { - mu sync.Mutex - Fields []string `json:"fields"` - Rows [][]string `json:"rows"` - rowsAffected int64 - lastInsertID int64 - duration time.Duration - cumulativeDuration time.Duration - - // Multi DML mode: Track total error count, error count per code and the first error. - totalErrorCount int - errorCount map[vtrpcpb.Code]int - firstError map[vtrpcpb.Code]error -} - -func newResults() *results { - return &results{ - errorCount: make(map[vtrpcpb.Code]int), - firstError: make(map[vtrpcpb.Code]error), - } -} - -// merge aggregates "other" into "r". -// This is only used for executing DMLs concurrently and repeatedly. -// Therefore, "Fields" and "Rows" are not merged. 
-func (r *results) merge(other *results) { - if other == nil { - return - } - - r.mu.Lock() - defer r.mu.Unlock() - - r.rowsAffected += other.rowsAffected - if other.lastInsertID > r.lastInsertID { - r.lastInsertID = other.lastInsertID - } - r.cumulativeDuration += other.duration -} - -func (r *results) recordError(err error) { - r.mu.Lock() - defer r.mu.Unlock() - - r.totalErrorCount++ - code := vterrors.Code(err) - r.errorCount[code]++ - - if r.errorCount[code] == 1 { - r.firstError[code] = err - } -} - -func (r *results) print() { - if r == nil { - return - } - - table := tablewriter.NewWriter(os.Stdout) - table.SetHeader(r.Fields) - table.SetAutoFormatHeaders(false) - table.AppendBulk(r.Rows) - table.Render() - fmt.Printf("%v row(s) affected (%v, cum: %v)\n", r.rowsAffected, r.duration, r.cumulativeDuration) - if r.lastInsertID != 0 { - fmt.Printf("Last insert ID: %v\n", r.lastInsertID) - } - - if r.totalErrorCount == 0 { - return - } - - fmt.Printf("%d error(s) were returned. Number of errors by error code:\n\n", r.totalErrorCount) - // Sort different error codes by count (descending). 
- type errorCounts struct { - code vtrpcpb.Code - count int - } - var counts []errorCounts - for code, count := range r.errorCount { - counts = append(counts, errorCounts{code, count}) - } - sort.Slice(counts, func(i, j int) bool { return counts[i].count >= counts[j].count }) - for _, c := range counts { - fmt.Printf("%- 30v= % 5d\n", c.code, c.count) - } - - fmt.Printf("\nFirst error per code:\n\n") - for code, err := range r.firstError { - fmt.Printf("Code: %v\nError: %v\n\n", code, err) - } -} diff --git a/go/flags/endtoend/vtclient.txt b/go/flags/endtoend/vtclient.txt index 53fedcbb36b..45a5d27b26e 100644 --- a/go/flags/endtoend/vtclient.txt +++ b/go/flags/endtoend/vtclient.txt @@ -1,30 +1,54 @@ -Usage of vtclient: - --alsologtostderr log to standard error as well as files - --bind_variables float bind variables as a json list (default null) - --count int DMLs only: Number of times each thread executes the query. Useful for simple, sustained load testing. (default 1) - --grpc_enable_tracing Enable gRPC tracing. - --grpc_max_message_size int Maximum allowed RPC message size. Larger messages will be rejected by gRPC with the error 'exceeding the max size'. (default 16777216) - --grpc_prometheus Enable gRPC monitoring with Prometheus. - -h, --help display usage and exit - --json Output JSON instead of human-readable table - --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) - --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) - --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) - --log_dir string If non-empty, write log files in this directory - --log_rotate_max_size uint size in bytes at which logs are rotated (glog.MaxSize) (default 1887436800) - --logtostderr log to standard error instead of files - --max_sequence_id int max sequence ID. - --min_sequence_id int min sequence ID to generate. 
When max_sequence_id > min_sequence_id, for each query, a number is generated in [min_sequence_id, max_sequence_id) and attached to the end of the bind variables. - --mysql_server_version string MySQL server version to advertise. (default "8.0.30-Vitess") - --parallel int DMLs only: Number of threads executing the same query in parallel. Useful for simple load testing. (default 1) - --purge_logs_interval duration how often try to remove old logs (default 1h0m0s) - --qps int queries per second to throttle each thread at. - --security_policy string the name of a registered security policy to use for controlling access to URLs - empty means allow all for anyone (built-in policies: deny-all, read-only) - --server string vtgate server to connect to - --stderrthreshold severity logs at or above this threshold go to stderr (default 1) - --streaming use a streaming query - --target string keyspace:shard@tablet_type - --timeout duration timeout for queries (default 30s) - --use_random_sequence use random sequence for generating [min_sequence_id, max_sequence_id) - --v Level log level for V logs - --vmodule moduleSpec comma-separated list of pattern=N settings for file-filtered logging +vtclient connects to a vtgate server using the standard go driver API. + +Version 3 of the API is used, we do not send any hint to the server. + +For query bound variables, we assume place-holders in the query string +in the form of :v1, :v2, etc. + +Usage: + vtclient [flags] + +Examples: +vtclient --server vtgate:15991 "SELECT * FROM messages" + +vtclient --server vtgate:15991 --target '@primary' --bind_variables '[ 12345, 1, "msg 12345" ]' "INSERT INTO messages (page,time_created_ns,message) VALUES (:v1, :v2, :v3)" + +Flags: + --alsologtostderr log to standard error as well as files + --bind_variables float bind variables as a json list (default null) + --config-file string Full path of the config file (with extension) to use. 
If set, --config-path, --config-type, and --config-name are ignored. + --config-file-not-found-handling ConfigFileNotFoundHandling Behavior when a config file is not found. (Options: error, exit, ignore, warn) (default warn) + --config-name string Name of the config file (without extension) to search for. (default "vtconfig") + --config-path strings Paths to search for config files in. (default [{{ .Workdir }}]) + --config-persistence-min-interval duration minimum interval between persisting dynamic config changes back to disk (if no change has occurred, nothing is done). (default 1s) + --config-type string Config file type (omit to infer config type from file extension). + --count int DMLs only: Number of times each thread executes the query. Useful for simple, sustained load testing. (default 1) + --grpc_enable_tracing Enable gRPC tracing. + --grpc_max_message_size int Maximum allowed RPC message size. Larger messages will be rejected by gRPC with the error 'exceeding the max size'. (default 16777216) + --grpc_prometheus Enable gRPC monitoring with Prometheus. + -h, --help help for vtclient + --json Output JSON instead of human-readable table + --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) + --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) + --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) + --log_dir string If non-empty, write log files in this directory + --log_err_stacks log stack traces for errors + --log_rotate_max_size uint size in bytes at which logs are rotated (glog.MaxSize) (default 1887436800) + --logtostderr log to standard error instead of files + --max_sequence_id int max sequence ID. + --min_sequence_id int min sequence ID to generate. When max_sequence_id > min_sequence_id, for each query, a number is generated in [min_sequence_id, max_sequence_id) and attached to the end of the bind variables. 
+ --mysql_server_version string MySQL server version to advertise. (default "8.0.30-Vitess") + --parallel int DMLs only: Number of threads executing the same query in parallel. Useful for simple load testing. (default 1) + --pprof strings enable profiling + --purge_logs_interval duration how often try to remove old logs (default 1h0m0s) + --qps int queries per second to throttle each thread at. + --security_policy string the name of a registered security policy to use for controlling access to URLs - empty means allow all for anyone (built-in policies: deny-all, read-only) + --server string vtgate server to connect to + --stderrthreshold severity logs at or above this threshold go to stderr (default 1) + --streaming use a streaming query + --target string keyspace:shard@tablet_type + --timeout duration timeout for queries (default 30s) + --use_random_sequence use random sequence for generating [min_sequence_id, max_sequence_id) + --v Level log level for V logs + -v, --version print binary version + --vmodule moduleSpec comma-separated list of pattern=N settings for file-filtered logging From f536903d7f11e0d50712791993a91b5688074e95 Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sat, 16 Sep 2023 19:49:25 -0400 Subject: [PATCH 07/12] add vtcombo flag testdata Signed-off-by: Andrew Mason --- go/flags/endtoend/vtcombo.txt | 418 ++++++++++++++++++++++++++++++++++ 1 file changed, 418 insertions(+) create mode 100644 go/flags/endtoend/vtcombo.txt diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt new file mode 100644 index 00000000000..ec42626dec8 --- /dev/null +++ b/go/flags/endtoend/vtcombo.txt @@ -0,0 +1,418 @@ +Usage of vtcombo: + --action_timeout duration time to wait for an action before resorting to force (default 1m0s) + --allow-kill-statement Allows the execution of kill statement + --allowed_tablet_types strings Specifies the tablet types this vtgate is allowed to route queries to. 
Should be provided as a comma-separated set of tablet types. + --alsologtostderr log to standard error as well as files + --app_idle_timeout duration Idle timeout for app connections (default 1m0s) + --app_pool_size int Size of the connection pool for app connections (default 40) + --backup_engine_implementation string Specifies which implementation to use for creating new backups (builtin or xtrabackup). Restores will always be done with whichever engine created a given backup. (default "builtin") + --backup_storage_block_size int if backup_storage_compress is true, backup_storage_block_size sets the byte size for each block while compressing (default is 250000). (default 250000) + --backup_storage_compress if set, the backup files will be compressed. (default true) + --backup_storage_number_blocks int if backup_storage_compress is true, backup_storage_number_blocks sets the number of blocks that can be processed, in parallel, before the writer blocks, during compression (default is 2). It should be equal to the number of CPUs available for compression. (default 2) + --binlog_host string PITR restore parameter: hostname/IP of binlog server. + --binlog_password string PITR restore parameter: password of binlog server. + --binlog_player_protocol string the protocol to download binlogs from a vttablet (default "grpc") + --binlog_port int PITR restore parameter: port of binlog server. + --binlog_ssl_ca string PITR restore parameter: Filename containing TLS CA certificate to verify binlog server TLS certificate against. + --binlog_ssl_cert string PITR restore parameter: Filename containing mTLS client certificate to present to binlog server as authentication. + --binlog_ssl_key string PITR restore parameter: Filename containing mTLS client private key for use in binlog server authentication. 
+ --binlog_ssl_server_name string PITR restore parameter: TLS server name (common name) to verify against for the binlog server we are connecting to (If not set: use the hostname or IP supplied in --binlog_host). + --binlog_user string PITR restore parameter: username of binlog server. + --buffer_drain_concurrency int Maximum number of requests retried simultaneously. More concurrency will increase the load on the PRIMARY vttablet when draining the buffer. (default 1) + --buffer_keyspace_shards string If not empty, limit buffering to these entries (comma separated). Entry format: keyspace or keyspace/shard. Requires --enable_buffer=true. + --buffer_max_failover_duration duration Stop buffering completely if a failover takes longer than this duration. (default 20s) + --buffer_min_time_between_failovers duration Minimum time between the end of a failover and the start of the next one (tracked per shard). Faster consecutive failovers will not trigger buffering. (default 1m0s) + --buffer_size int Maximum number of buffered requests in flight (across all ongoing failovers). (default 1000) + --buffer_window duration Duration for how long a request should be buffered at most. (default 10s) + --builtinbackup-file-read-buffer-size uint read files using an IO buffer of this many bytes. Golang defaults are used when set to 0. + --builtinbackup-file-write-buffer-size uint write files using an IO buffer of this many bytes. Golang defaults are used when set to 0. (default 2097152) + --builtinbackup_mysqld_timeout duration how long to wait for mysqld to shutdown at the start of the backup. (default 10m0s) + --builtinbackup_progress duration how often to send progress updates when backing up large files. (default 5s) + --catch-sigpipe catch and ignore SIGPIPE on stdout and stderr if specified + --cell string cell to use + --compression-engine-name string compressor engine used for compression. (default "pargzip") + --compression-level int what level to pass to the compressor. 
(default 1) + --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. + --config-file-not-found-handling ConfigFileNotFoundHandling Behavior when a config file is not found. (Options: error, exit, ignore, warn) (default warn) + --config-name string Name of the config file (without extension) to search for. (default "vtconfig") + --config-path strings Paths to search for config files in. (default [{{ .Workdir }}]) + --config-persistence-min-interval duration minimum interval between persisting dynamic config changes back to disk (if no change has occurred, nothing is done). (default 1s) + --config-type string Config file type (omit to infer config type from file extension). + --consolidator-stream-query-size int Configure the stream consolidator query size in bytes. Setting to 0 disables the stream consolidator. (default 2097152) + --consolidator-stream-total-size int Configure the stream consolidator total size in bytes. Setting to 0 disables the stream consolidator. (default 134217728) + --consul_auth_static_file string JSON File to read the topos/tokens from. + --datadog-agent-host string host to send spans to. if empty, no tracing will be done + --datadog-agent-port string port to send spans to. 
if empty, no tracing will be done + --db-credentials-file string db credentials file; send SIGHUP to reload this file + --db-credentials-server string db credentials server type ('file' - file implementation; 'vault' - HashiCorp Vault implementation) (default "file") + --db-credentials-vault-addr string URL to Vault server + --db-credentials-vault-path string Vault path to credentials JSON blob, e.g.: secret/data/prod/dbcreds + --db-credentials-vault-role-mountpoint string Vault AppRole mountpoint; can also be passed using VAULT_MOUNTPOINT environment variable (default "approle") + --db-credentials-vault-role-secretidfile string Path to file containing Vault AppRole secret_id; can also be passed using VAULT_SECRETID environment variable + --db-credentials-vault-roleid string Vault AppRole id; can also be passed using VAULT_ROLEID environment variable + --db-credentials-vault-timeout duration Timeout for vault API operations (default 10s) + --db-credentials-vault-tls-ca string Path to CA PEM for validating Vault server certificate + --db-credentials-vault-tokenfile string Path to file containing Vault auth token; token can also be passed using VAULT_TOKEN environment variable + --db-credentials-vault-ttl duration How long to cache DB credentials from the Vault server (default 30m0s) + --db_allprivs_password string db allprivs password + --db_allprivs_use_ssl Set this flag to false to make the allprivs connection to not use ssl (default true) + --db_allprivs_user string db allprivs user userKey (default "vt_allprivs") + --db_app_password string db app password + --db_app_use_ssl Set this flag to false to make the app connection to not use ssl (default true) + --db_app_user string db app user userKey (default "vt_app") + --db_appdebug_password string db appdebug password + --db_appdebug_use_ssl Set this flag to false to make the appdebug connection to not use ssl (default true) + --db_appdebug_user string db appdebug user userKey (default "vt_appdebug") + --db_charset 
string Character set used for this tablet. (default "utf8mb4") + --db_conn_query_info enable parsing and processing of QUERY_OK info fields + --db_connect_timeout_ms int connection timeout to mysqld in milliseconds (0 for no timeout) + --db_dba_password string db dba password + --db_dba_use_ssl Set this flag to false to make the dba connection to not use ssl (default true) + --db_dba_user string db dba user userKey (default "vt_dba") + --db_erepl_password string db erepl password + --db_erepl_use_ssl Set this flag to false to make the erepl connection to not use ssl (default true) + --db_erepl_user string db erepl user userKey (default "vt_erepl") + --db_filtered_password string db filtered password + --db_filtered_use_ssl Set this flag to false to make the filtered connection to not use ssl (default true) + --db_filtered_user string db filtered user userKey (default "vt_filtered") + --db_flags uint Flag values as defined by MySQL. + --db_flavor string Flavor overrid. Valid value is FilePos. + --db_host string The host name for the tcp connection. + --db_port int tcp port + --db_repl_password string db repl password + --db_repl_use_ssl Set this flag to false to make the repl connection to not use ssl (default true) + --db_repl_user string db repl user userKey (default "vt_repl") + --db_server_name string server name of the DB we are connecting to. + --db_socket string The unix socket to connect on. If this is specified, host and port will not be used. + --db_ssl_ca string connection ssl ca + --db_ssl_ca_path string connection ssl ca path + --db_ssl_cert string connection ssl certificate + --db_ssl_key string connection ssl key + --db_ssl_mode SslMode SSL mode to connect with. One of disabled, preferred, required, verify_ca & verify_identity. + --db_tls_min_version string Configures the minimal TLS version negotiated when SSL is enabled. Defaults to TLSv1.2. Options: TLSv1.0, TLSv1.1, TLSv1.2, TLSv1.3. 
+ --dba_idle_timeout duration Idle timeout for dba connections (default 1m0s) + --dba_pool_size int Size of the connection pool for dba connections (default 20) + --dbddl_plugin string controls how to handle CREATE/DROP DATABASE. use it if you are using your own database provisioning service (default "fail") + --ddl_strategy string Set default strategy for DDL statements. Override with @@ddl_strategy session variable (default "direct") + --default_tablet_type topodatapb.TabletType The default tablet type to set for queries, when one is not explicitly selected. (default PRIMARY) + --degraded_threshold duration replication lag after which a replica is considered degraded (default 30s) + --disable_active_reparents if set, do not allow active reparents. Use this to protect a cluster using external reparents. + --emit_stats If set, emit stats to push-based monitoring and stats backends + --enable-consolidator Synonym to -enable_consolidator (default true) + --enable-consolidator-replicas Synonym to -enable_consolidator_replicas + --enable-partial-keyspace-migration (Experimental) Follow shard routing rules: enable only while migrating a keyspace shard by shard. See documentation on Partial MoveTables for more. (default false) + --enable-per-workload-table-metrics If true, query counts and query error metrics include a label that identifies the workload + --enable-tx-throttler Synonym to -enable_tx_throttler + --enable-views Enable views support in vtgate. + --enable_buffer Enable buffering (stalling) of primary traffic during failovers. + --enable_buffer_dry_run Detect and log failover events, but do not actually buffer requests. + --enable_consolidator This option enables the query consolidator. (default true) + --enable_consolidator_replicas This option enables the query consolidator only on replicas. 
+ --enable_direct_ddl Allow users to submit direct DDL statements (default true) + --enable_hot_row_protection If true, incoming transactions for the same row (range) will be queued and cannot consume all txpool slots. + --enable_hot_row_protection_dry_run If true, hot row protection is not enforced but logs if transactions would have been queued. + --enable_online_ddl Allow users to submit, review and control Online DDL (default true) + --enable_replication_reporter Use polling to track replication lag. + --enable_set_var This will enable the use of MySQL's SET_VAR query hint for certain system variables instead of using reserved connections (default true) + --enable_system_settings This will enable the system settings to be changed per session at the database connection level (default true) + --enable_transaction_limit If true, limit on number of transactions open at the same time will be enforced for all users. User trying to open a new transaction after exhausting their limit will receive an error immediately, regardless of whether there are available slots or not. + --enable_transaction_limit_dry_run If true, limit on number of transactions open at the same time will be tracked for all users, but not enforced. + --enable_tx_throttler If true replication-lag-based throttling on transactions will be enabled. + --enforce_strict_trans_tables If true, vttablet requires MySQL to run with STRICT_TRANS_TABLES or STRICT_ALL_TABLES on. It is recommended to not turn this flag off. Otherwise MySQL may alter your supplied values before saving them to the database. (default true) + --external-compressor string command with arguments to use when compressing a backup. + --external-compressor-extension string extension to use when using an external compressor. + --external-decompressor string command with arguments to use when decompressing a backup. + --external_topo_server Should vtcombo use an external topology server instead of starting its own in-memory topology server. 
If true, vtcombo will use the flags defined in topo/server.go to open topo server + --foreign_key_mode string This is to provide how to handle foreign key constraint in create/alter table. Valid values are: allow, disallow (default "allow") + --gate_query_cache_memory int gate server query cache size in bytes, maximum amount of memory to be cached. vtgate analyzes every incoming query and generate a query plan, these plans are being cached in a lru cache. This config controls the capacity of the lru cache. (default 33554432) + --gc_check_interval duration Interval between garbage collection checks (default 1h0m0s) + --gc_purge_check_interval duration Interval between purge discovery checks (default 1m0s) + --gh-ost-path string override default gh-ost binary full path + --grpc_auth_mode string Which auth plugin implementation to use (eg: static) + --grpc_auth_mtls_allowed_substrings string List of substrings of at least one of the client certificate names (separated by colon). + --grpc_auth_static_password_file string JSON File to read the users/passwords from. + --grpc_ca string server CA to use for gRPC connections, requires TLS, and enforces client certificate check + --grpc_cert string server certificate to use for gRPC connections, requires grpc_key, enables TLS + --grpc_crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake + --grpc_enable_optional_tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port + --grpc_enable_tracing Enable gRPC tracing. + --grpc_key string server private key to use for gRPC connections, requires grpc_cert, enables TLS + --grpc_max_connection_age duration Maximum age of a client connection before GoAway is sent. (default 2562047h47m16.854775807s) + --grpc_max_connection_age_grace duration Additional grace period after grpc_max_connection_age, after which connections are forcibly closed. 
(default 2562047h47m16.854775807s) + --grpc_max_message_size int Maximum allowed RPC message size. Larger messages will be rejected by gRPC with the error 'exceeding the max size'. (default 16777216) + --grpc_port int Port to listen on for gRPC calls. If zero, do not listen. + --grpc_prometheus Enable gRPC monitoring with Prometheus. + --grpc_server_ca string path to server CA in PEM format, which will be combine with server cert, return full certificate chain to clients + --grpc_server_initial_conn_window_size int gRPC server initial connection window size + --grpc_server_initial_window_size int gRPC server initial window size + --grpc_server_keepalive_enforcement_policy_min_time duration gRPC server minimum keepalive time (default 10s) + --grpc_server_keepalive_enforcement_policy_permit_without_stream gRPC server permit client keepalive pings even when there are no active streams (RPCs) + --health_check_interval duration Interval between health checks (default 20s) + --healthcheck_retry_delay duration health check retry delay (default 2ms) + --healthcheck_timeout duration the health check timeout period (default 1m0s) + --heartbeat_enable If true, vttablet records (if master) or checks (if replica) the current time of a replication heartbeat in the sidecar database's heartbeat table. The result is used to inform the serving state of the vttablet via healthchecks. + --heartbeat_interval duration How frequently to read and write replication heartbeat. (default 1s) + --heartbeat_on_demand_duration duration If non-zero, heartbeats are only written upon consumer request, and only run for up to given duration following the request. Frequent requests can keep the heartbeat running consistently; when requests are infrequent heartbeat may completely stop between requests + -h, --help display usage and exit + --hot_row_protection_concurrent_transactions int Number of concurrent transactions let through to the txpool/MySQL for the same hot row. 
Should be > 1 to have enough 'ready' transactions in MySQL and benefit from a pipelining effect. (default 5) + --hot_row_protection_max_global_queue_size int Global queue limit across all row (ranges). Useful to prevent that the queue can grow unbounded. (default 1000) + --hot_row_protection_max_queue_size int Maximum number of BeginExecute RPCs which will be queued for the same row (range). (default 20) + --init_db_name_override string (init parameter) override the name of the db used by vttablet. Without this flag, the db name defaults to vt_ + --init_keyspace string (init parameter) keyspace to use for this tablet + --init_shard string (init parameter) shard to use for this tablet + --init_tablet_type string (init parameter) the tablet type to use for this tablet. + --init_tags StringMap (init parameter) comma separated list of key:value pairs used to tag the tablet + --init_timeout duration (init parameter) timeout to use for the init phase. (default 1m0s) + --jaeger-agent-host string host and port to send spans to. if empty, no tracing will be done + --json_topo vttest.TopoData vttest proto definition of the topology, encoded in json format. See vttest.proto for more information. + --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) + --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) + --keyspaces_to_watch strings Specifies which keyspaces this vtgate should have access to while routing queries or accessing the vschema. + --lameduck-period duration keep running at least this long after SIGTERM before stopping (default 50ms) + --lock-timeout duration Maximum time for which a shard/keyspace lock can be acquired for (default 45s) + --lock_heartbeat_time duration If there is lock function used. 
This will keep the lock connection active by using this heartbeat (default 5s) + --lock_tables_timeout duration How long to keep the table locked before timing out (default 1m0s) + --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) + --log_dir string If non-empty, write log files in this directory + --log_err_stacks log stack traces for errors + --log_queries_to_file string Enable query logging to the specified file + --log_rotate_max_size uint size in bytes at which logs are rotated (glog.MaxSize) (default 1887436800) + --logtostderr log to standard error instead of files + --manifest-external-decompressor string command with arguments to store in the backup manifest when compressing a backup with an external compression engine. + --max-stack-size int configure the maximum stack size in bytes (default 67108864) + --max_concurrent_online_ddl int Maximum number of online DDL changes that may run concurrently (default 256) + --max_memory_rows int Maximum number of rows that will be held in memory for intermediate results as well as the final result. (default 300000) + --max_payload_size int The threshold for query payloads in bytes. A payload greater than this threshold will result in a failure to handle the query. + --message_stream_grace_period duration the amount of time to give for a vttablet to resume if it ends a message stream, usually because of a reparent. 
(default 30s) + --migration_check_interval duration Interval between migration checks (default 1m0s) + --mycnf-file string path to my.cnf, if reading all config params from there + --mycnf_bin_log_path string mysql binlog path + --mycnf_data_dir string data directory for mysql + --mycnf_error_log_path string mysql error log path + --mycnf_general_log_path string mysql general log path + --mycnf_innodb_data_home_dir string Innodb data home directory + --mycnf_innodb_log_group_home_dir string Innodb log group home directory + --mycnf_master_info_file string mysql master.info file + --mycnf_mysql_port int port mysql is listening on + --mycnf_pid_file string mysql pid file + --mycnf_relay_log_index_path string mysql relay log index path + --mycnf_relay_log_info_path string mysql relay log info path + --mycnf_relay_log_path string mysql relay log path + --mycnf_secure_file_priv string mysql path for loading secure files + --mycnf_server_id int mysql server id of the server (if specified, mycnf-file will be ignored) + --mycnf_slow_log_path string mysql slow query log path + --mycnf_socket_file string mysql socket file + --mycnf_tmp_dir string mysql tmp directory + --mysql-server-keepalive-period duration TCP period between keep-alives + --mysql-server-pool-conn-read-buffers If set, the server will pool incoming connection read buffers + --mysql_allow_clear_text_without_tls If set, the server will allow the use of a clear text password over non-SSL connections. + --mysql_auth_server_impl string Which auth server implementation to use. Options: none, ldap, clientcert, static, vault. (default "static") + --mysql_default_workload string Default session workload (OLTP, OLAP, DBA) (default "OLTP") + --mysql_port int mysql port (default 3306) + --mysql_server_bind_address string Binds on this address when listening to MySQL binary protocol. Useful to restrict listening to 'localhost' only for instance. 
+ --mysql_server_port int If set, also listen for MySQL binary protocol connections on this port. (default -1) + --mysql_server_query_timeout duration mysql query timeout + --mysql_server_read_timeout duration connection read timeout + --mysql_server_require_secure_transport Reject insecure connections but only if mysql_server_ssl_cert and mysql_server_ssl_key are provided + --mysql_server_socket_path string This option specifies the Unix socket file to use when listening for local connections. By default it will be empty and it won't listen to a unix socket + --mysql_server_ssl_ca string Path to ssl CA for mysql server plugin SSL. If specified, server will require and validate client certs. + --mysql_server_ssl_cert string Path to the ssl cert for mysql server plugin SSL + --mysql_server_ssl_crl string Path to ssl CRL for mysql server plugin SSL + --mysql_server_ssl_key string Path to ssl key for mysql server plugin SSL + --mysql_server_ssl_server_ca string path to server CA in PEM format, which will be combine with server cert, return full certificate chain to clients + --mysql_server_tls_min_version string Configures the minimal TLS version negotiated when SSL is enabled. Defaults to TLSv1.2. Options: TLSv1.0, TLSv1.1, TLSv1.2, TLSv1.3. + --mysql_server_version string MySQL server version to advertise. (default "8.0.30-Vitess") + --mysql_server_write_timeout duration connection write timeout + --mysql_slow_connect_warn_threshold duration Warn if it takes more than the given threshold for a mysql connection to establish + --mysql_tcp_version string Select tcp, tcp4, or tcp6 to control the socket type. 
(default "tcp") + --mysqlctl_mycnf_template string template file to use for generating the my.cnf file during server init + --mysqlctl_socket string socket file to use for remote mysqlctl actions (empty for local actions) + --no_scatter when set to true, the planner will fail instead of producing a plan that includes scatter queries + --normalize_queries Rewrite queries with bind vars. Turn this off if the app itself sends normalized queries with bind vars. (default true) + --onclose_timeout duration wait no more than this for OnClose handlers before stopping (default 10s) + --onterm_timeout duration wait no more than this for OnTermSync handlers before stopping (default 10s) + --pid_file string If set, the process will write its pid to the named file, and delete it on graceful shutdown. + --pitr_gtid_lookup_timeout duration PITR restore parameter: timeout for fetching gtid from timestamp. (default 1m0s) + --planner-version string Sets the default planner to use when the session has not changed it. Valid values are: Gen4, Gen4Greedy, Gen4Left2Right + --pool_hostname_resolve_interval duration if set force an update to all hostnames and reconnect if changed, defaults to 0 (disabled) + --port int port for the server + --pprof strings enable profiling + --proto_topo vttest.TopoData vttest proto definition of the topology, encoded in compact text format. See vttest.proto for more information. 
+ --proxy_protocol Enable HAProxy PROXY protocol on MySQL listener socket + --proxy_tablets Setting this true will make vtctld proxy the tablet status instead of redirecting to them + --pt-osc-path string override default pt-online-schema-change binary full path + --publish_retry_interval duration how long vttablet waits to retry publishing the tablet record (default 30s) + --purge_logs_interval duration how often try to remove old logs (default 1h0m0s) + --query-log-stream-handler string URL handler for streaming queries log (default "/debug/querylog") + --query-timeout int Sets the default query timeout (in ms). Can be overridden by session variable (query_timeout) or comment directive (QUERY_TIMEOUT_MS) + --querylog-buffer-size int Maximum number of buffered query logs before throttling log output (default 10) + --querylog-filter-tag string string that must be present in the query for it to be logged; if using a value as the tag, you need to disable query normalization + --querylog-format string format for query logs ("text" or "json") (default "text") + --querylog-row-threshold uint Number of rows a query has to return or affect before being logged; not useful for streaming queries. 0 means all queries will be logged. + --queryserver-config-acl-exempt-acl string an acl that exempt from table acl checking (this acl is free to access any vitess tables). + --queryserver-config-annotate-queries prefix queries to MySQL backend with comment indicating vtgate principal (user) and target tablet type + --queryserver-config-enable-table-acl-dry-run If this flag is enabled, tabletserver will emit monitoring metrics and let the request pass regardless of table acl check results + --queryserver-config-idle-timeout duration query server idle timeout (in seconds), vttablet manages various mysql connection pools. This config means if a connection has not been used in given idle timeout, this connection will be removed from pool. 
This effectively manages number of connection objects and optimize the pool performance. (default 30m0s) + --queryserver-config-max-result-size int query server max result size, maximum number of rows allowed to return from vttablet for non-streaming queries. (default 10000) + --queryserver-config-message-postpone-cap int query server message postpone cap is the maximum number of messages that can be postponed at any given time. Set this number to substantially lower than transaction cap, so that the transaction pool isn't exhausted by the message subsystem. (default 4) + --queryserver-config-olap-transaction-timeout duration query server transaction timeout (in seconds), after which a transaction in an OLAP session will be killed (default 30s) + --queryserver-config-passthrough-dmls query server pass through all dml statements without rewriting + --queryserver-config-pool-conn-max-lifetime duration query server connection max lifetime (in seconds), vttablet manages various mysql connection pools. This config means if a connection has lived at least this long, it connection will be removed from pool upon the next time it is returned to the pool. (default 0s) + --queryserver-config-pool-size int query server read pool size, connection pool is used by regular queries (non streaming, not in a transaction) (default 16) + --queryserver-config-query-cache-memory int query server query cache size in bytes, maximum amount of memory to be used for caching. vttablet analyzes every incoming query and generate a query plan, these plans are being cached in a lru cache. This config controls the capacity of the lru cache. (default 33554432) + --queryserver-config-query-pool-timeout duration query server query pool timeout (in seconds), it is how long vttablet waits for a connection from the query pool. If set to 0 (default) then the overall query timeout is used instead. 
(default 0s) + --queryserver-config-query-pool-waiter-cap int query server query pool waiter limit, this is the maximum number of queries that can be queued waiting to get a connection (default 5000) + --queryserver-config-query-timeout duration query server query timeout (in seconds), this is the query timeout in vttablet side. If a query takes more than this timeout, it will be killed. (default 30s) + --queryserver-config-schema-change-signal query server schema signal, will signal connected vtgates that schema has changed whenever this is detected. VTGates will need to have -schema_change_signal enabled for this to work (default true) + --queryserver-config-schema-reload-time duration query server schema reload time, how often vttablet reloads schemas from underlying MySQL instance in seconds. vttablet keeps table schemas in its own memory and periodically refreshes it from MySQL. This config controls the reload time. (default 30m0s) + --queryserver-config-stream-buffer-size int query server stream buffer size, the maximum number of bytes sent from vttablet for each stream call. It's recommended to keep this value in sync with vtgate's stream_buffer_size. (default 32768) + --queryserver-config-stream-pool-size int query server stream connection pool size, stream pool is used by stream queries: queries that return results to client in a streaming fashion (default 200) + --queryserver-config-stream-pool-timeout duration query server stream pool timeout (in seconds), it is how long vttablet waits for a connection from the stream pool. If set to 0 (default) then there is no timeout. 
(default 0s) + --queryserver-config-stream-pool-waiter-cap int query server stream pool waiter limit, this is the maximum number of streaming queries that can be queued waiting to get a connection + --queryserver-config-strict-table-acl only allow queries that pass table acl checks + --queryserver-config-terse-errors prevent bind vars from escaping in client error messages + --queryserver-config-transaction-cap int query server transaction cap is the maximum number of transactions allowed to happen at any given point of a time for a single vttablet. E.g. by setting transaction cap to 100, there are at most 100 transactions will be processed by a vttablet and the 101th transaction will be blocked (and fail if it cannot get connection within specified timeout) (default 20) + --queryserver-config-transaction-timeout duration query server transaction timeout (in seconds), a transaction will be killed if it takes longer than this value (default 30s) + --queryserver-config-truncate-error-len int truncate errors sent to client if they are longer than this value (0 means do not truncate) + --queryserver-config-txpool-timeout duration query server transaction pool timeout, it is how long vttablet waits if tx pool is full (default 1s) + --queryserver-config-txpool-waiter-cap int query server transaction pool waiter limit, this is the maximum number of transactions that can be queued waiting to get a connection (default 5000) + --queryserver-config-warn-result-size int query server result size warning threshold, warn if number of rows returned from vttablet for non-streaming queries exceeds this + --queryserver-enable-settings-pool Enable pooling of connections with modified system settings (default true) + --queryserver-enable-views Enable views support in vttablet. + --queryserver_enable_online_ddl Enable online DDL. 
(default true) + --redact-debug-ui-queries redact full queries and bind variables from debug UI + --relay_log_max_items int Maximum number of rows for VReplication target buffering. (default 5000) + --relay_log_max_size int Maximum buffer size (in bytes) for VReplication target buffering. If single rows are larger than this, a single row is buffered at a time. (default 250000) + --remote_operation_timeout duration time to wait for a remote operation (default 15s) + --replication_connect_retry duration how long to wait in between replica reconnect attempts. Only precise to the second. (default 10s) + --restore_concurrency int (init restore parameter) how many concurrent files to restore at once (default 4) + --restore_from_backup (init restore parameter) will check BackupStorage for a recent backup at startup and start there + --restore_from_backup_ts string (init restore parameter) if set, restore the latest backup taken at or before this timestamp. Example: '2021-04-29.133050' + --retain_online_ddl_tables duration How long should vttablet keep an old migrated table before purging it (default 24h0m0s) + --sanitize_log_messages Remove potentially sensitive information in tablet INFO, WARNING, and ERROR log messages such as query parameters. + --schema-change-reload-timeout duration query server schema change reload timeout, this is how long to wait for the signaled schema reload operation to complete before giving up (default 30s) + --schema-version-max-age-seconds int max age of schema version records to be kept in memory by the vreplication historian + --schema_change_signal Enable the schema tracker; requires queryserver-config-schema-change-signal to be enabled on the underlying vttablets for this to work (default true) + --schema_dir string Schema base directory. Should contain one directory per keyspace, with a vschema.json file if necessary.
+ --security_policy string the name of a registered security policy to use for controlling access to URLs - empty means allow all for anyone (built-in policies: deny-all, read-only) + --service_map strings comma separated list of services to enable (or disable if prefixed with '-') Example: grpc-queryservice + --serving_state_grace_period duration how long to pause after broadcasting health to vtgate, before enforcing a new serving state + --shard_sync_retry_delay duration delay between retries of updates to keep the tablet and its shard record in sync (default 30s) + --shutdown_grace_period duration how long to wait (in seconds) for queries and transactions to complete during graceful shutdown. (default 0s) + --sql-max-length-errors int truncate queries in error logs to the given length (default unlimited) + --sql-max-length-ui int truncate queries in debug UIs to the given length (default 512) (default 512) + --srv_topo_cache_refresh duration how frequently to refresh the topology for cached entries (default 1s) + --srv_topo_cache_ttl duration how long to use cached entries for topology (default 1s) + --srv_topo_timeout duration topo server timeout (default 5s) + --start_mysql Should vtcombo also start mysql + --stats_backend string The name of the registered push-based monitoring/stats backend to use + --stats_combine_dimensions string List of dimensions to be combined into a single "all" value in exported stats vars + --stats_common_tags strings Comma-separated list of common tags for the stats backend. It provides both label and values. Example: label1:value1,label2:value2 + --stats_drop_variables string Variables to be dropped from the list of exported variables. + --stats_emit_period duration Interval between emitting stats to all registered backends (default 1m0s) + --stderrthreshold severity logs at or above this threshold go to stderr (default 1) + --stream_buffer_size int the number of bytes sent from vtgate for each stream call. 
It's recommended to keep this value in sync with vttablet's queryserver-config-stream-buffer-size. (default 32768) + --stream_health_buffer_size uint max streaming health entries to buffer per streaming health client (default 20) + --table-refresh-interval int interval in milliseconds to refresh tables in status page with refreshRequired class + --table_gc_lifecycle string States for a DROP TABLE garbage collection cycle. Default is 'hold,purge,evac,drop', use any subset ('drop' implicitly always included) (default "hold,purge,evac,drop") + --tablet_dir string The directory within the vtdataroot to store vttablet/mysql files. Defaults to being generated by the tablet uid. + --tablet_filters strings Specifies a comma-separated list of 'keyspace|shard_name or keyrange' values to filter the tablets to watch. + --tablet_health_keep_alive duration close streaming tablet health connection if there are no requests for this long (default 5m0s) + --tablet_hostname string if not empty, this hostname will be assumed instead of trying to resolve it + --tablet_manager_grpc_ca string the server ca to use to validate servers when connecting + --tablet_manager_grpc_cert string the cert to use to connect + --tablet_manager_grpc_concurrency int concurrency to use to talk to a vttablet server for performance-sensitive RPCs (like ExecuteFetchAs{Dba,AllPrivs,App}) (default 8) + --tablet_manager_grpc_connpool_size int number of tablets to keep tmclient connections open to (default 100) + --tablet_manager_grpc_crl string the server crl to use to validate server certificates when connecting + --tablet_manager_grpc_key string the key to use to connect + --tablet_manager_grpc_server_name string the server name to use to validate server certificate + --tablet_manager_protocol string Protocol to use to make tabletmanager RPCs to vttablets. (default "grpc") + --tablet_refresh_interval duration Tablet refresh interval.
(default 1m0s) + --tablet_refresh_known_tablets Whether to reload the tablet's address/port map from topo in case they change. (default true) + --tablet_url_template string Format string describing debug tablet url formatting. See getTabletDebugURL() for how to customize this. (default "http://{{ "{{.GetTabletHostPort}}" }}") + --throttle_tablet_types string Comma separated VTTablet types to be considered by the throttler. default: 'replica'. example: 'replica,rdonly'. 'replica' always implicitly included (default "replica") + --topo_consul_lock_delay duration LockDelay for consul session. (default 15s) + --topo_consul_lock_session_checks string List of checks for consul session. (default "serfHealth") + --topo_consul_lock_session_ttl string TTL for consul session. + --topo_consul_watch_poll_duration duration time of the long poll for watch queries. (default 30s) + --topo_etcd_lease_ttl int Lease TTL for locks and leader election. The client will use KeepAlive to keep the lease going. (default 30) + --topo_etcd_tls_ca string path to the ca to use to validate the server cert when connecting to the etcd topo server + --topo_etcd_tls_cert string path to the client cert to use to connect to the etcd topo server, requires topo_etcd_tls_key, enables TLS + --topo_etcd_tls_key string path to the client key to use to connect to the etcd topo server, enables TLS + --topo_global_root string the path of the global topology data in the global topology server + --topo_global_server_address string the address of the global topology server + --topo_implementation string the topology implementation to use + --topo_read_concurrency int Concurrency of topo reads. (default 32) + --topo_zk_auth_file string auth to use when connecting to the zk topo server, file contents should be :, e.g., digest:user:pass + --topo_zk_base_timeout duration zk base timeout (see zk.Connect) (default 30s) + --topo_zk_max_concurrency int maximum number of pending requests to send to a Zookeeper server.
(default 64) + --topo_zk_tls_ca string the server ca to use to validate servers when connecting to the zk topo server + --topo_zk_tls_cert string the cert to use to connect to the zk topo server, requires topo_zk_tls_key, enables TLS + --topo_zk_tls_key string the key to use to connect to the zk topo server, enables TLS + --tracer string tracing service to use (default "noop") + --tracing-enable-logging whether to enable logging in the tracing service + --tracing-sampling-rate float sampling rate for the probabilistic jaeger sampler (default 0.1) + --tracing-sampling-type string sampling strategy to use for jaeger. possible values are 'const', 'probabilistic', 'rateLimiting', or 'remote' (default "const") + --track_schema_versions When enabled, vttablet will store versions of schemas at each position that a DDL is applied and allow retrieval of the schema corresponding to a position + --transaction-log-stream-handler string URL handler for streaming transactions log (default "/debug/txlog") + --transaction_limit_by_component Include CallerID.component when considering who the user is for the purpose of transaction limit. + --transaction_limit_by_principal Include CallerID.principal when considering who the user is for the purpose of transaction limit. (default true) + --transaction_limit_by_subcomponent Include CallerID.subcomponent when considering who the user is for the purpose of transaction limit. + --transaction_limit_by_username Include VTGateCallerID.username when considering who the user is for the purpose of transaction limit. (default true) + --transaction_limit_per_user float Maximum number of transactions a single user is allowed to use at any time, represented as fraction of -transaction_cap. 
(default 0.4) + --transaction_mode string SINGLE: disallow multi-db transactions, MULTI: allow multi-db transactions with best effort commit, TWOPC: allow multi-db transactions with 2pc commit (default "MULTI") + --truncate-error-len int truncate errors sent to client if they are longer than this value (0 means do not truncate) + --twopc_abandon_age float time in seconds. Any unresolved transaction older than this time will be sent to the coordinator to be resolved. + --twopc_coordinator_address string address of the (VTGate) process(es) that will be used to notify of abandoned transactions. + --twopc_enable if the flag is on, 2pc is enabled. Other 2pc flags must be supplied. + --tx-throttler-config string Synonym to -tx_throttler_config (default "target_replication_lag_sec:2 max_replication_lag_sec:10 initial_rate:100 max_increase:1 emergency_decrease:0.5 min_duration_between_increases_sec:40 max_duration_between_increases_sec:62 min_duration_between_decreases_sec:20 spread_backlog_across_sec:20 age_bad_rate_after_sec:180 bad_rate_increase:0.1 max_rate_approach_threshold:0.9") + --tx-throttler-default-priority int Default priority assigned to queries that lack priority information (default 100) + --tx-throttler-dry-run If present, the transaction throttler only records metrics about requests received and throttled, but does not actually throttle any requests. + --tx-throttler-healthcheck-cells strings Synonym to -tx_throttler_healthcheck_cells + --tx-throttler-tablet-types strings A comma-separated list of tablet types. Only tablets of this type are monitored for replication lag by the transaction throttler. Supported types are replica and/or rdonly. (default replica) + --tx-throttler-topo-refresh-interval duration The rate that the transaction throttler will refresh the topology to find cells. (default 5m0s) + --tx_throttler_config string The configuration of the transaction throttler as a text-formatted throttlerdata.Configuration protocol buffer message. 
(default "target_replication_lag_sec:2 max_replication_lag_sec:10 initial_rate:100 max_increase:1 emergency_decrease:0.5 min_duration_between_increases_sec:40 max_duration_between_increases_sec:62 min_duration_between_decreases_sec:20 spread_backlog_across_sec:20 age_bad_rate_after_sec:180 bad_rate_increase:0.1 max_rate_approach_threshold:0.9") + --tx_throttler_healthcheck_cells strings A comma-separated list of cells. Only tabletservers running in these cells will be monitored for replication lag by the transaction throttler. + --unhealthy_threshold duration replication lag after which a replica is considered unhealthy (default 2h0m0s) + --v Level log level for V logs + -v, --version print binary version + --vmodule moduleSpec comma-separated list of pattern=N settings for file-filtered logging + --vreplication-parallel-insert-workers int Number of parallel insertion workers to use during copy phase. Set <= 1 to disable parallelism, or > 1 to enable concurrent insertion during copy phase. (default 1) + --vreplication_copy_phase_duration duration Duration for each copy phase loop (before running the next catchup: default 1h) (default 1h0m0s) + --vreplication_copy_phase_max_innodb_history_list_length int The maximum InnoDB transaction history that can exist on a vstreamer (source) before starting another round of copying rows. This helps to limit the impact on the source tablet. (default 1000000) + --vreplication_copy_phase_max_mysql_replication_lag int The maximum MySQL replication lag (in seconds) that can exist on a vstreamer (source) before starting another round of copying rows. This helps to limit the impact on the source tablet. 
(default 43200) + --vreplication_healthcheck_retry_delay duration healthcheck retry delay (default 5s) + --vreplication_healthcheck_timeout duration healthcheck retry delay (default 1m0s) + --vreplication_healthcheck_topology_refresh duration refresh interval for re-reading the topology (default 30s) + --vreplication_heartbeat_update_interval int Frequency (in seconds, default 1, max 60) at which the time_updated column of a vreplication stream when idling (default 1) + --vreplication_max_time_to_retry_on_error duration stop automatically retrying when we've had consecutive failures with the same error for this long after the first occurrence + --vreplication_replica_lag_tolerance duration Replica lag threshold duration: once lag is below this we switch from copy phase to the replication (streaming) phase (default 1m0s) + --vreplication_retry_delay duration delay before retrying a failed workflow event in the replication phase (default 5s) + --vreplication_store_compressed_gtid Store compressed gtids in the pos column of the sidecar database's vreplication table + --vreplication_tablet_type string comma separated list of tablet types used as a source (default "in_order:REPLICA,PRIMARY") + --vschema-persistence-dir string If set, per-keyspace vschema will be persisted in this directory and reloaded into the in-memory topology server across restarts. Bookkeeping is performed using a simple watcher goroutine. This is useful when running vtcombo as an application development container (e.g. vttestserver) where you want to keep the same vschema even if developer's machine reboots. This works in tandem with vttestserver's --persistent_mode flag. Needless to say, this is neither a perfect nor a production solution for vschema persistence. Consider using the --external_topo_server flag if you require a more complete solution. This flag is ignored if --external_topo_server is set. 
+ --vschema_ddl_authorized_users string List of users authorized to execute vschema ddl operations, or '%' to allow all users. + --vstream-binlog-rotation-threshold int Byte size at which a VStreamer will attempt to rotate the source's open binary log before starting a GTID snapshot based stream (e.g. a ResultStreamer or RowStreamer) (default 67108864) + --vstream_dynamic_packet_size Enable dynamic packet sizing for VReplication. This will adjust the packet size during replication to improve performance. (default true) + --vstream_packet_size int Suggested packet size for VReplication streamer. This is used only as a recommendation. The actual packet size may be more or less than this amount. (default 250000) + --vtctld_sanitize_log_messages When true, vtctld sanitizes logging. + --vtgate-config-terse-errors prevent bind vars from escaping in returned errors + --vtgate_grpc_ca string the server ca to use to validate servers when connecting + --vtgate_grpc_cert string the cert to use to connect + --vtgate_grpc_crl string the server crl to use to validate server certificates when connecting + --vtgate_grpc_key string the key to use to connect + --vtgate_grpc_server_name string the server name to use to validate server certificate + --vttablet_skip_buildinfo_tags string comma-separated list of buildinfo tags to skip from merging with --init_tags. each tag is either an exact match or a regular expression of the form '/regexp/'. (default "/.*/") + --wait_for_backup_interval duration (init restore parameter) if this is greater than 0, instead of starting up empty when no backups are found, keep checking at this interval for a backup to appear + --warn_memory_rows int Warning threshold for in-memory results. A row count higher than this amount will cause the VtGateWarnings.ResultsExceeded counter to be incremented. (default 30000) + --warn_payload_size int The warning threshold for query payloads in bytes. 
A payload greater than this threshold will cause the VtGateWarnings.WarnPayloadSizeExceeded counter to be incremented. + --warn_sharded_only If any features that are only available in unsharded mode are used, query execution warnings will be added to the session + --watch_replication_stream When enabled, vttablet will stream the MySQL replication stream from the local server, and use it to update schema when it sees a DDL. + --xbstream_restore_flags string Flags to pass to xbstream command during restore. These should be space separated and will be added to the end of the command. These need to match the ones used for backup e.g. --compress / --decompress, --encrypt / --decrypt + --xtrabackup_backup_flags string Flags to pass to backup command. These should be space separated and will be added to the end of the command + --xtrabackup_prepare_flags string Flags to pass to prepare command. These should be space separated and will be added to the end of the command + --xtrabackup_root_path string Directory location of the xtrabackup and xbstream executables, e.g., /usr/bin + --xtrabackup_stream_mode string Which mode to use if streaming, valid values are tar and xbstream. Please note that tar is not supported in XtraBackup 8.0 (default "tar") + --xtrabackup_stripe_block_size uint Size in bytes of each block that gets sent to a given stripe before rotating to the next stripe (default 102400) + --xtrabackup_stripes uint If greater than 0, use data striping across this many destination files to parallelize data transfer and decompression + --xtrabackup_user string User that xtrabackup will use to connect to the database server. This user must have all necessary privileges. For details, please refer to xtrabackup documentation. 
From 921d39bff45862569d085c5c9edc8c35e43a5d06 Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sat, 16 Sep 2023 19:46:07 -0400 Subject: [PATCH 08/12] migrate vtcombo to cobra Signed-off-by: Andrew Mason --- go/cmd/vtcombo/cli/main.go | 358 ++++++++++++++++++ go/cmd/vtcombo/{ => cli}/plugin_dbddl.go | 2 +- .../{ => cli}/plugin_grpcvtctldserver.go | 2 +- .../{ => cli}/plugin_grpcvtctlserver.go | 2 +- .../{ => cli}/plugin_grpcvtgateservice.go | 2 +- .../vtcombo/{ => cli}/plugin_opentracing.go | 2 +- go/cmd/vtcombo/{ => cli}/status.go | 2 +- go/cmd/vtcombo/{ => cli}/vschema_watcher.go | 11 +- go/cmd/vtcombo/docgen/main.go | 37 ++ go/cmd/vtcombo/main.go | 330 +--------------- go/flags/endtoend/vtcombo.txt | 18 +- 11 files changed, 426 insertions(+), 340 deletions(-) create mode 100644 go/cmd/vtcombo/cli/main.go rename go/cmd/vtcombo/{ => cli}/plugin_dbddl.go (99%) rename go/cmd/vtcombo/{ => cli}/plugin_grpcvtctldserver.go (98%) rename go/cmd/vtcombo/{ => cli}/plugin_grpcvtctlserver.go (98%) rename go/cmd/vtcombo/{ => cli}/plugin_grpcvtgateservice.go (98%) rename go/cmd/vtcombo/{ => cli}/plugin_opentracing.go (98%) rename go/cmd/vtcombo/{ => cli}/status.go (99%) rename go/cmd/vtcombo/{ => cli}/vschema_watcher.go (95%) create mode 100644 go/cmd/vtcombo/docgen/main.go diff --git a/go/cmd/vtcombo/cli/main.go b/go/cmd/vtcombo/cli/main.go new file mode 100644 index 00000000000..34fab26717f --- /dev/null +++ b/go/cmd/vtcombo/cli/main.go @@ -0,0 +1,358 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +// vtcombo: a single binary that contains: +// - a ZK topology server based on an in-memory map. +// - one vtgate instance. +// - many vttablet instances. +// - a vtctld instance so it's easy to see the topology. +package cli + +import ( + "context" + "fmt" + "os" + "strings" + "time" + + "github.com/spf13/cobra" + + "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/exit" + "vitess.io/vitess/go/mysql/replication" + "vitess.io/vitess/go/vt/dbconfigs" + "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/logutil" + "vitess.io/vitess/go/vt/mysqlctl" + "vitess.io/vitess/go/vt/servenv" + "vitess.io/vitess/go/vt/srvtopo" + "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/topo/memorytopo" + "vitess.io/vitess/go/vt/topotools" + "vitess.io/vitess/go/vt/vtcombo" + "vitess.io/vitess/go/vt/vtctld" + "vitess.io/vitess/go/vt/vtgate" + "vitess.io/vitess/go/vt/vtgate/planbuilder/plancontext" + "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" + "vitess.io/vitess/go/vt/vttest" + "vitess.io/vitess/go/vt/wrangler" + + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + vttestpb "vitess.io/vitess/go/vt/proto/vttest" +) + +var ( + Main = &cobra.Command{ + Use: "vtcombo", + Short: "vtcombo is a single binary containing several vitess components.", + Long: `vtcombo is a single binary containing several vitess components. + +In particular, it contains: +- A ZK topology server based on an in-memory map. +- One vtgate instance. +- Many vttablet instances. 
+- A vtctld instance so it's easy to see the topology.`, + Args: cobra.NoArgs, + Version: servenv.AppVersion.String(), + PreRunE: servenv.CobraPreRunE, + RunE: run, + } + schemaDir string + startMysql bool + mysqlPort = 3306 + externalTopoServer bool + plannerName string + vschemaPersistenceDir string + + tpb vttestpb.VTTestTopology + ts *topo.Server + resilientServer *srvtopo.ResilientServer +) + +func init() { + servenv.RegisterDefaultFlags() + servenv.RegisterFlags() + servenv.RegisterGRPCServerFlags() + servenv.RegisterGRPCServerAuthFlags() + servenv.RegisterServiceMapFlag() + + dbconfigs.RegisterFlags(dbconfigs.All...) + mysqlctl.RegisterFlags() + + servenv.MoveFlagsToCobraCommand(Main) + + acl.RegisterFlags(Main.Flags()) + + Main.Flags().StringVar(&schemaDir, "schema_dir", schemaDir, "Schema base directory. Should contain one directory per keyspace, with a vschema.json file if necessary.") + Main.Flags().BoolVar(&startMysql, "start_mysql", startMysql, "Should vtcombo also start mysql") + Main.Flags().IntVar(&mysqlPort, "mysql_port", mysqlPort, "mysql port") + Main.Flags().BoolVar(&externalTopoServer, "external_topo_server", externalTopoServer, "Should vtcombo use an external topology server instead of starting its own in-memory topology server. "+ + "If true, vtcombo will use the flags defined in topo/server.go to open topo server") + Main.Flags().StringVar(&plannerName, "planner-version", plannerName, "Sets the default planner to use when the session has not changed it. Valid values are: Gen4, Gen4Greedy, Gen4Left2Right") + Main.Flags().StringVar(&vschemaPersistenceDir, "vschema-persistence-dir", vschemaPersistenceDir, "If set, per-keyspace vschema will be persisted in this directory "+ + "and reloaded into the in-memory topology server across restarts. Bookkeeping is performed using a simple watcher goroutine. "+ + "This is useful when running vtcombo as an application development container (e.g. 
vttestserver) where you want to keep the same "+ + "vschema even if developer's machine reboots. This works in tandem with vttestserver's --persistent_mode flag. Needless to say, "+ + "this is neither a perfect nor a production solution for vschema persistence. Consider using the --external_topo_server flag if "+ + "you require a more complete solution. This flag is ignored if --external_topo_server is set.") + + Main.Flags().Var(vttest.TextTopoData(&tpb), "proto_topo", "vttest proto definition of the topology, encoded in compact text format. See vttest.proto for more information.") + Main.Flags().Var(vttest.JSONTopoData(&tpb), "json_topo", "vttest proto definition of the topology, encoded in json format. See vttest.proto for more information.") + + // We're going to force the value later, so don't even bother letting the + // user know about this flag. + Main.Flags().MarkHidden("tablet_protocol") +} + +func startMysqld(uid uint32) (mysqld *mysqlctl.Mysqld, cnf *mysqlctl.Mycnf, err error) { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + mycnfFile := mysqlctl.MycnfFile(uid) + + if _, statErr := os.Stat(mycnfFile); os.IsNotExist(statErr) { + mysqld, cnf, err = mysqlctl.CreateMysqldAndMycnf(uid, "", mysqlPort) + if err != nil { + return nil, nil, fmt.Errorf("failed to initialize mysql config :%w", err) + } + if err := mysqld.Init(ctx, cnf, ""); err != nil { + return nil, nil, fmt.Errorf("failed to initialize mysql :%w", err) + } + } else { + mysqld, cnf, err = mysqlctl.OpenMysqldAndMycnf(uid) + if err != nil { + return nil, nil, fmt.Errorf("failed to find mysql config: %w", err) + } + err = mysqld.RefreshConfig(ctx, cnf) + if err != nil { + return nil, nil, fmt.Errorf("failed to refresh config: %w", err) + } + if err := mysqld.Start(ctx, cnf); err != nil { + return nil, nil, fmt.Errorf("Failed to start mysqld: %w", err) + } + } + + return mysqld, cnf, nil +} + +func run(cmd *cobra.Command, args []string) (err error) { + 
// Stash away a copy of the topology that vtcombo was started with. + // + // We will use this to determine the shard structure when keyspaces + // get recreated. + originalTopology := (&tpb).CloneVT() + + // default cell to "test" if unspecified + if len(tpb.Cells) == 0 { + tpb.Cells = append(tpb.Cells, "test") + } + + cmd.Flags().Set("cells_to_watch", strings.Join(tpb.Cells, ",")) + + // vtctld UI requires the cell flag + cmd.Flags().Set("cell", tpb.Cells[0]) + if cmd.Flags().Lookup("log_dir") == nil { + cmd.Flags().Set("log_dir", "$VTDATAROOT/tmp") + } + + if externalTopoServer { + // Open topo server based on the command line flags defined at topo/server.go + // do not create cell info as it should be done by whoever sets up the external topo server + ts = topo.Open() + } else { + // Create topo server. We use a 'memorytopo' implementation. + ts = memorytopo.NewServer(context.Background(), tpb.Cells...) + } + + // attempt to load any routing rules specified by tpb + if err := vtcombo.InitRoutingRules(context.Background(), ts, tpb.GetRoutingRules()); err != nil { + return fmt.Errorf("Failed to load routing rules: %w", err) + } + + servenv.Init() + defer servenv.Close() + + tabletenv.Init() + + var ( + mysqld = &vtcomboMysqld{} + cnf *mysqlctl.Mycnf + ) + + if startMysql { + mysqld.Mysqld, cnf, err = startMysqld(1) + servenv.OnClose(func() { + mysqld.Shutdown(context.TODO(), cnf, true) + }) + // We want to ensure we can write to this database + mysqld.SetReadOnly(false) + + } else { + dbconfigs.GlobalDBConfigs.InitWithSocket("") + mysqld.Mysqld = mysqlctl.NewMysqld(&dbconfigs.GlobalDBConfigs) + servenv.OnClose(mysqld.Close) + } + + // Tablet configuration and init. + // Send mycnf as nil because vtcombo won't do backups and restores. + // + // Also force the `--tablet_manager_protocol` and `--tablet_protocol` flags + // to be the "internal" protocol that InitTabletMap registers. 
+ cmd.Flags().Set("tablet_manager_protocol", "internal") + cmd.Flags().Set("tablet_protocol", "internal") + uid, err := vtcombo.InitTabletMap(ts, &tpb, mysqld, &dbconfigs.GlobalDBConfigs, schemaDir, startMysql) + if err != nil { + // ensure we stop mysql in the event we fail here + if startMysql { + mysqld.Shutdown(context.TODO(), cnf, true) + } + + return fmt.Errorf("initTabletMapProto failed: %w", err) + } + + globalCreateDb = func(ctx context.Context, ks *vttestpb.Keyspace) error { + // Check if we're recreating a keyspace that was previously deleted by looking + // at the original topology definition. + // + // If we find a matching keyspace, we create it with the same sharding + // configuration. This ensures that dropping and recreating a keyspace + // will end up with the same number of shards. + for _, originalKs := range originalTopology.Keyspaces { + if originalKs.Name == ks.Name { + ks = originalKs.CloneVT() + } + } + + wr := wrangler.New(logutil.NewConsoleLogger(), ts, nil) + newUID, err := vtcombo.CreateKs(ctx, ts, &tpb, mysqld, &dbconfigs.GlobalDBConfigs, schemaDir, ks, true, uid, wr) + if err != nil { + return err + } + uid = newUID + tpb.Keyspaces = append(tpb.Keyspaces, ks) + return nil + } + + globalDropDb = func(ctx context.Context, ksName string) error { + if err := vtcombo.DeleteKs(ctx, ts, ksName, mysqld, &tpb); err != nil { + return err + } + + // Rebuild the SrvVSchema object + if err := ts.RebuildSrvVSchema(ctx, tpb.Cells); err != nil { + return err + } + + return nil + } + + // Now that we have fully initialized the tablets, rebuild the keyspace graph. + for _, ks := range tpb.Keyspaces { + err := topotools.RebuildKeyspace(context.Background(), logutil.NewConsoleLogger(), ts, ks.GetName(), tpb.Cells, false) + if err != nil { + if startMysql { + mysqld.Shutdown(context.TODO(), cnf, true) + } + + return fmt.Errorf("Couldn't build srv keyspace for (%v: %v). 
Got error: %w", ks, tpb.Cells, err) + } + } + + // vtgate configuration and init + resilientServer = srvtopo.NewResilientServer(context.Background(), ts, "ResilientSrvTopoServer") + tabletTypesToWait := []topodatapb.TabletType{ + topodatapb.TabletType_PRIMARY, + topodatapb.TabletType_REPLICA, + topodatapb.TabletType_RDONLY, + } + plannerVersion, _ := plancontext.PlannerNameToVersion(plannerName) + + vtgate.QueryLogHandler = "/debug/vtgate/querylog" + vtgate.QueryLogzHandler = "/debug/vtgate/querylogz" + vtgate.QueryzHandler = "/debug/vtgate/queryz" + // pass nil for healthcheck, it will get created + vtg := vtgate.Init(context.Background(), nil, resilientServer, tpb.Cells[0], tabletTypesToWait, plannerVersion) + + // vtctld configuration and init + err = vtctld.InitVtctld(ts) + if err != nil { + exit.Return(1) + } + + if vschemaPersistenceDir != "" && !externalTopoServer { + startVschemaWatcher(vschemaPersistenceDir, tpb.Keyspaces, ts) + } + + servenv.OnRun(func() { + addStatusParts(vtg) + }) + + servenv.OnTerm(func() { + log.Error("Terminating") + // FIXME(alainjobart): stop vtgate + }) + servenv.OnClose(func() { + // We will still use the topo server during lameduck period + // to update our state, so closing it in OnClose() + ts.Close() + }) + servenv.RunDefault() + + return nil +} + +// vtcomboMysqld is a wrapper on top of mysqlctl.Mysqld. +// We need this wrapper because vtcombo runs with a single MySQL instance +// which all the tablets connect to. (replica, primary, all). This means that we shouldn't +// be trying to run any replication related commands on it, otherwise they fail. 
+type vtcomboMysqld struct { + *mysqlctl.Mysqld +} + +// SetReplicationSource implements the MysqlDaemon interface +func (mysqld *vtcomboMysqld) SetReplicationSource(ctx context.Context, host string, port int32, stopReplicationBefore bool, startReplicationAfter bool) error { + return nil +} + +// StartReplication implements the MysqlDaemon interface +func (mysqld *vtcomboMysqld) StartReplication(hookExtraEnv map[string]string) error { + return nil +} + +// RestartReplication implements the MysqlDaemon interface +func (mysqld *vtcomboMysqld) RestartReplication(hookExtraEnv map[string]string) error { + return nil +} + +// StartReplicationUntilAfter implements the MysqlDaemon interface +func (mysqld *vtcomboMysqld) StartReplicationUntilAfter(ctx context.Context, pos replication.Position) error { + return nil +} + +// StopReplication implements the MysqlDaemon interface +func (mysqld *vtcomboMysqld) StopReplication(hookExtraEnv map[string]string) error { + return nil +} + +// SetSemiSyncEnabled implements the MysqlDaemon interface +func (mysqld *vtcomboMysqld) SetSemiSyncEnabled(source, replica bool) error { + return nil +} + +// SemiSyncExtensionLoaded implements the MysqlDaemon interface +func (mysqld *vtcomboMysqld) SemiSyncExtensionLoaded() (bool, error) { + return true, nil +} diff --git a/go/cmd/vtcombo/plugin_dbddl.go b/go/cmd/vtcombo/cli/plugin_dbddl.go similarity index 99% rename from go/cmd/vtcombo/plugin_dbddl.go rename to go/cmd/vtcombo/cli/plugin_dbddl.go index 1a95e073308..b04af91af5c 100644 --- a/go/cmd/vtcombo/plugin_dbddl.go +++ b/go/cmd/vtcombo/cli/plugin_dbddl.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package main +package cli import ( "context" diff --git a/go/cmd/vtcombo/plugin_grpcvtctldserver.go b/go/cmd/vtcombo/cli/plugin_grpcvtctldserver.go similarity index 98% rename from go/cmd/vtcombo/plugin_grpcvtctldserver.go rename to go/cmd/vtcombo/cli/plugin_grpcvtctldserver.go index e5bba399072..2cf8eed8368 100644 --- a/go/cmd/vtcombo/plugin_grpcvtctldserver.go +++ b/go/cmd/vtcombo/cli/plugin_grpcvtctldserver.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli import ( "vitess.io/vitess/go/vt/servenv" diff --git a/go/cmd/vtcombo/plugin_grpcvtctlserver.go b/go/cmd/vtcombo/cli/plugin_grpcvtctlserver.go similarity index 98% rename from go/cmd/vtcombo/plugin_grpcvtctlserver.go rename to go/cmd/vtcombo/cli/plugin_grpcvtctlserver.go index 4ec5323b075..8b7f918bc58 100644 --- a/go/cmd/vtcombo/plugin_grpcvtctlserver.go +++ b/go/cmd/vtcombo/cli/plugin_grpcvtctlserver.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli import ( "vitess.io/vitess/go/vt/servenv" diff --git a/go/cmd/vtcombo/plugin_grpcvtgateservice.go b/go/cmd/vtcombo/cli/plugin_grpcvtgateservice.go similarity index 98% rename from go/cmd/vtcombo/plugin_grpcvtgateservice.go rename to go/cmd/vtcombo/cli/plugin_grpcvtgateservice.go index ff58dff616a..a980f063577 100644 --- a/go/cmd/vtcombo/plugin_grpcvtgateservice.go +++ b/go/cmd/vtcombo/cli/plugin_grpcvtgateservice.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package main +package cli // Imports and register the gRPC vtgateservice server diff --git a/go/cmd/vtcombo/plugin_opentracing.go b/go/cmd/vtcombo/cli/plugin_opentracing.go similarity index 98% rename from go/cmd/vtcombo/plugin_opentracing.go rename to go/cmd/vtcombo/cli/plugin_opentracing.go index c2ea8325e6a..0b9274b498d 100644 --- a/go/cmd/vtcombo/plugin_opentracing.go +++ b/go/cmd/vtcombo/cli/plugin_opentracing.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli import ( "vitess.io/vitess/go/trace" diff --git a/go/cmd/vtcombo/status.go b/go/cmd/vtcombo/cli/status.go similarity index 99% rename from go/cmd/vtcombo/status.go rename to go/cmd/vtcombo/cli/status.go index 2b5e2696391..e52c9ccead8 100644 --- a/go/cmd/vtcombo/status.go +++ b/go/cmd/vtcombo/cli/status.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package main +package cli import ( "vitess.io/vitess/go/vt/discovery" diff --git a/go/cmd/vtcombo/vschema_watcher.go b/go/cmd/vtcombo/cli/vschema_watcher.go similarity index 95% rename from go/cmd/vtcombo/vschema_watcher.go rename to go/cmd/vtcombo/cli/vschema_watcher.go index 948ed67bea7..c1c9f120b96 100644 --- a/go/cmd/vtcombo/vschema_watcher.go +++ b/go/cmd/vtcombo/cli/vschema_watcher.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package main +package cli import ( "context" @@ -23,10 +23,11 @@ import ( "path" "vitess.io/vitess/go/vt/log" - vschemapb "vitess.io/vitess/go/vt/proto/vschema" - vttestpb "vitess.io/vitess/go/vt/proto/vttest" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/vtgate/vindexes" + + vschemapb "vitess.io/vitess/go/vt/proto/vschema" + vttestpb "vitess.io/vitess/go/vt/proto/vttest" ) func startVschemaWatcher(vschemaPersistenceDir string, keyspaces []*vttestpb.Keyspace, ts *topo.Server) { @@ -100,11 +101,11 @@ func persistNewSrvVSchema(srvVSchema *vschemapb.SrvVSchema) { continue } - err = os.WriteFile(path.Join(*vschemaPersistenceDir, ksName+".json"), jsonBytes, 0644) + err = os.WriteFile(path.Join(vschemaPersistenceDir, ksName+".json"), jsonBytes, 0644) if err != nil { log.Errorf("Error writing keyspace file: %v", err) } - log.Infof("Persisted keyspace %v to %v", ksName, *vschemaPersistenceDir) + log.Infof("Persisted keyspace %v to %v", ksName, vschemaPersistenceDir) } } diff --git a/go/cmd/vtcombo/docgen/main.go b/go/cmd/vtcombo/docgen/main.go new file mode 100644 index 00000000000..31304296b3e --- /dev/null +++ b/go/cmd/vtcombo/docgen/main.go @@ -0,0 +1,37 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "github.com/spf13/cobra" + + "vitess.io/vitess/go/cmd/internal/docgen" + "vitess.io/vitess/go/cmd/vtcombo/cli" +) + +func main() { + var dir string + cmd := cobra.Command{ + Use: "docgen [-d <dir>]", + RunE: func(cmd *cobra.Command, args []string) error { + return docgen.GenerateMarkdownTree(cli.Main, dir) + }, + } + + cmd.Flags().StringVarP(&dir, "dir", "d", "doc", "output directory to write documentation") + _ = cmd.Execute() +} diff --git a/go/cmd/vtcombo/main.go b/go/cmd/vtcombo/main.go index 24994f06c6e..f5de215b617 100644 --- a/go/cmd/vtcombo/main.go +++ b/go/cmd/vtcombo/main.go @@ -22,340 +22,16 @@ limitations under the License. package main import ( - "context" - "os" - "strings" - "time" - - "github.com/spf13/pflag" - - "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/cmd/vtcombo/cli" "vitess.io/vitess/go/exit" - "vitess.io/vitess/go/mysql/replication" - "vitess.io/vitess/go/vt/dbconfigs" "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/logutil" - "vitess.io/vitess/go/vt/mysqlctl" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/srvtopo" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/topo/memorytopo" - "vitess.io/vitess/go/vt/topotools" - "vitess.io/vitess/go/vt/vtcombo" - "vitess.io/vitess/go/vt/vtctld" - "vitess.io/vitess/go/vt/vtgate" - "vitess.io/vitess/go/vt/vtgate/planbuilder/plancontext" - "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" - "vitess.io/vitess/go/vt/vttest" - "vitess.io/vitess/go/vt/wrangler" - - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - vttestpb "vitess.io/vitess/go/vt/proto/vttest" ) -var ( - flags = pflag.NewFlagSet("vtcombo", pflag.ContinueOnError) - schemaDir = flags.String("schema_dir", "", "Schema base directory. 
Should contain one directory per keyspace, with a vschema.json file if necessary.") - startMysql = flags.Bool("start_mysql", false, "Should vtcombo also start mysql") - mysqlPort = flags.Int("mysql_port", 3306, "mysql port") - externalTopoServer = flags.Bool("external_topo_server", false, "Should vtcombo use an external topology server instead of starting its own in-memory topology server. "+ - "If true, vtcombo will use the flags defined in topo/server.go to open topo server") - plannerName = flags.String("planner-version", "", "Sets the default planner to use when the session has not changed it. Valid values are: Gen4, Gen4Greedy, Gen4Left2Right") - vschemaPersistenceDir = flags.String("vschema-persistence-dir", "", "If set, per-keyspace vschema will be persisted in this directory "+ - "and reloaded into the in-memory topology server across restarts. Bookkeeping is performed using a simple watcher goroutine. "+ - "This is useful when running vtcombo as an application development container (e.g. vttestserver) where you want to keep the same "+ - "vschema even if developer's machine reboots. This works in tandem with vttestserver's --persistent_mode flag. Needless to say, "+ - "this is neither a perfect nor a production solution for vschema persistence. Consider using the --external_topo_server flag if "+ - "you require a more complete solution. This flag is ignored if --external_topo_server is set.") - - tpb vttestpb.VTTestTopology - ts *topo.Server - resilientServer *srvtopo.ResilientServer -) - -func init() { - flags.Var(vttest.TextTopoData(&tpb), "proto_topo", "vttest proto definition of the topology, encoded in compact text format. See vttest.proto for more information.") - flags.Var(vttest.JSONTopoData(&tpb), "json_topo", "vttest proto definition of the topology, encoded in json format. 
See vttest.proto for more information.") - - servenv.RegisterDefaultFlags() - servenv.RegisterFlags() - servenv.RegisterGRPCServerFlags() - servenv.RegisterGRPCServerAuthFlags() - servenv.RegisterServiceMapFlag() -} - -func startMysqld(uid uint32) (*mysqlctl.Mysqld, *mysqlctl.Mycnf) { - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - mycnfFile := mysqlctl.MycnfFile(uid) - - var mysqld *mysqlctl.Mysqld - var cnf *mysqlctl.Mycnf - var err error - - if _, statErr := os.Stat(mycnfFile); os.IsNotExist(statErr) { - mysqld, cnf, err = mysqlctl.CreateMysqldAndMycnf(uid, "", *mysqlPort) - if err != nil { - log.Errorf("failed to initialize mysql config :%v", err) - exit.Return(1) - } - if err := mysqld.Init(ctx, cnf, ""); err != nil { - log.Errorf("failed to initialize mysql :%v", err) - exit.Return(1) - } - } else { - mysqld, cnf, err = mysqlctl.OpenMysqldAndMycnf(uid) - if err != nil { - log.Errorf("failed to find mysql config: %v", err) - exit.Return(1) - } - err = mysqld.RefreshConfig(ctx, cnf) - if err != nil { - log.Errorf("failed to refresh config: %v", err) - exit.Return(1) - } - if err := mysqld.Start(ctx, cnf); err != nil { - log.Errorf("Failed to start mysqld: %v", err) - exit.Return(1) - } - } - cancel() - return mysqld, cnf -} - func main() { defer exit.Recover() - // flag parsing - var globalFlags *pflag.FlagSet - dbconfigs.RegisterFlags(dbconfigs.All...) - mysqlctl.RegisterFlags() - servenv.OnParseFor("vtcombo", func(fs *pflag.FlagSet) { - // We're going to force the value later, so don't even bother letting - // the user know about this flag. - fs.MarkHidden("tablet_protocol") - - // Add the vtcombo flags declared above in var/init sections to the - // global flags. - fs.AddFlagSet(flags) - // Save for later -- see comment directly after ParseFlags for why. 
- globalFlags = fs - - acl.RegisterFlags(fs) - }) - - servenv.ParseFlags("vtcombo") - - // At this point, servenv.ParseFlags has invoked _flag.Parse, which has - // combined all the flags everywhere into the globalFlags variable we - // stashed a reference to earlier in our OnParseFor callback function. - // - // We now take those flags and make them available to our `flags` instance, - // which we call `Set` on various flags to force their values further down - // in main(). - // - // N.B.: we could just as easily call Set on globalFlags on everything - // (including our local flags), but we need to save a reference either way, - // and that in particular (globalFlags.Set on a local flag) feels more - // potentially confusing than its inverse (flags.Set on a global flag), so - // we go this way. - flags.AddFlagSet(globalFlags) - - // Stash away a copy of the topology that vtcombo was started with. - // - // We will use this to determine the shard structure when keyspaces - // get recreated. - originalTopology := (&tpb).CloneVT() - - // default cell to "test" if unspecified - if len(tpb.Cells) == 0 { - tpb.Cells = append(tpb.Cells, "test") - } - - flags.Set("cells_to_watch", strings.Join(tpb.Cells, ",")) - - // vtctld UI requires the cell flag - flags.Set("cell", tpb.Cells[0]) - if flags.Lookup("log_dir") == nil { - flags.Set("log_dir", "$VTDATAROOT/tmp") - } - - if *externalTopoServer { - // Open topo server based on the command line flags defined at topo/server.go - // do not create cell info as it should be done by whoever sets up the external topo server - ts = topo.Open() - } else { - // Create topo server. We use a 'memorytopo' implementation. - ts = memorytopo.NewServer(context.Background(), tpb.Cells...) 
- } - - // attempt to load any routing rules specified by tpb - if err := vtcombo.InitRoutingRules(context.Background(), ts, tpb.GetRoutingRules()); err != nil { - log.Errorf("Failed to load routing rules: %v", err) - exit.Return(1) - } - - servenv.Init() - tabletenv.Init() - - mysqld := &vtcomboMysqld{} - var cnf *mysqlctl.Mycnf - if *startMysql { - mysqld.Mysqld, cnf = startMysqld(1) - servenv.OnClose(func() { - mysqld.Shutdown(context.TODO(), cnf, true) - }) - // We want to ensure we can write to this database - mysqld.SetReadOnly(false) - - } else { - dbconfigs.GlobalDBConfigs.InitWithSocket("") - mysqld.Mysqld = mysqlctl.NewMysqld(&dbconfigs.GlobalDBConfigs) - servenv.OnClose(mysqld.Close) - } - - // Tablet configuration and init. - // Send mycnf as nil because vtcombo won't do backups and restores. - // - // Also force the `--tablet_manager_protocol` and `--tablet_protocol` flags - // to be the "internal" protocol that InitTabletMap registers. - flags.Set("tablet_manager_protocol", "internal") - flags.Set("tablet_protocol", "internal") - uid, err := vtcombo.InitTabletMap(ts, &tpb, mysqld, &dbconfigs.GlobalDBConfigs, *schemaDir, *startMysql) - if err != nil { - log.Errorf("initTabletMapProto failed: %v", err) - // ensure we start mysql in the event we fail here - if *startMysql { - mysqld.Shutdown(context.TODO(), cnf, true) - } - exit.Return(1) - } - - globalCreateDb = func(ctx context.Context, ks *vttestpb.Keyspace) error { - // Check if we're recreating a keyspace that was previously deleted by looking - // at the original topology definition. - // - // If we find a matching keyspace, we create it with the same sharding - // configuration. This ensures that dropping and recreating a keyspace - // will end up with the same number of shards. 
- for _, originalKs := range originalTopology.Keyspaces { - if originalKs.Name == ks.Name { - ks = originalKs.CloneVT() - } - } - - wr := wrangler.New(logutil.NewConsoleLogger(), ts, nil) - newUID, err := vtcombo.CreateKs(ctx, ts, &tpb, mysqld, &dbconfigs.GlobalDBConfigs, *schemaDir, ks, true, uid, wr) - if err != nil { - return err - } - uid = newUID - tpb.Keyspaces = append(tpb.Keyspaces, ks) - return nil - } - - globalDropDb = func(ctx context.Context, ksName string) error { - if err := vtcombo.DeleteKs(ctx, ts, ksName, mysqld, &tpb); err != nil { - return err - } - - // Rebuild the SrvVSchema object - if err := ts.RebuildSrvVSchema(ctx, tpb.Cells); err != nil { - return err - } - - return nil - } - // Now that we have fully initialized the tablets, rebuild the keyspace graph. - for _, ks := range tpb.Keyspaces { - err := topotools.RebuildKeyspace(context.Background(), logutil.NewConsoleLogger(), ts, ks.GetName(), tpb.Cells, false) - if err != nil { - if *startMysql { - mysqld.Shutdown(context.TODO(), cnf, true) - } - log.Fatalf("Couldn't build srv keyspace for (%v: %v). 
Got error: %v", ks, tpb.Cells, err) - } - } - - // vtgate configuration and init - resilientServer = srvtopo.NewResilientServer(context.Background(), ts, "ResilientSrvTopoServer") - tabletTypesToWait := []topodatapb.TabletType{ - topodatapb.TabletType_PRIMARY, - topodatapb.TabletType_REPLICA, - topodatapb.TabletType_RDONLY, - } - plannerVersion, _ := plancontext.PlannerNameToVersion(*plannerName) - - vtgate.QueryLogHandler = "/debug/vtgate/querylog" - vtgate.QueryLogzHandler = "/debug/vtgate/querylogz" - vtgate.QueryzHandler = "/debug/vtgate/queryz" - // pass nil for healthcheck, it will get created - vtg := vtgate.Init(context.Background(), nil, resilientServer, tpb.Cells[0], tabletTypesToWait, plannerVersion) - - // vtctld configuration and init - err = vtctld.InitVtctld(ts) - if err != nil { + if err := cli.Main.Execute(); err != nil { + log.Error(err) exit.Return(1) } - - if *vschemaPersistenceDir != "" && !*externalTopoServer { - startVschemaWatcher(*vschemaPersistenceDir, tpb.Keyspaces, ts) - } - - servenv.OnRun(func() { - addStatusParts(vtg) - }) - - servenv.OnTerm(func() { - log.Error("Terminating") - // FIXME(alainjobart): stop vtgate - }) - servenv.OnClose(func() { - // We will still use the topo server during lameduck period - // to update our state, so closing it in OnClose() - ts.Close() - }) - servenv.RunDefault() -} - -// vtcomboMysqld is a wrapper on top of mysqlctl.Mysqld. -// We need this wrapper because vtcombo runs with a single MySQL instance -// which all the tablets connect to. (replica, primary, all). This means that we shouldn't -// be trying to run any replication related commands on it, otherwise they fail. 
-type vtcomboMysqld struct { - *mysqlctl.Mysqld -} - -// SetReplicationSource implements the MysqlDaemon interface -func (mysqld *vtcomboMysqld) SetReplicationSource(ctx context.Context, host string, port int32, stopReplicationBefore bool, startReplicationAfter bool) error { - return nil -} - -// StartReplication implements the MysqlDaemon interface -func (mysqld *vtcomboMysqld) StartReplication(hookExtraEnv map[string]string) error { - return nil -} - -// RestartReplication implements the MysqlDaemon interface -func (mysqld *vtcomboMysqld) RestartReplication(hookExtraEnv map[string]string) error { - return nil -} - -// StartReplicationUntilAfter implements the MysqlDaemon interface -func (mysqld *vtcomboMysqld) StartReplicationUntilAfter(ctx context.Context, pos replication.Position) error { - return nil -} - -// StopReplication implements the MysqlDaemon interface -func (mysqld *vtcomboMysqld) StopReplication(hookExtraEnv map[string]string) error { - return nil -} - -// SetSemiSyncEnabled implements the MysqlDaemon interface -func (mysqld *vtcomboMysqld) SetSemiSyncEnabled(source, replica bool) error { - return nil -} - -// SemiSyncExtensionLoaded implements the MysqlDaemon interface -func (mysqld *vtcomboMysqld) SemiSyncExtensionLoaded() (bool, error) { - return true, nil } diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index ec42626dec8..67ae9a4613e 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -1,4 +1,15 @@ -Usage of vtcombo: +vtcombo is a single binary containing several vitess components. + +In particular, it contains: +- A ZK topology server based on an in-memory map. +- One vtgate instance. +- Many vttablet instances. +- A vtctld instance so it's easy to see the topology. 
+ +Usage: + vtcombo [flags] + +Flags: --action_timeout duration time to wait for an action before resorting to force (default 1m0s) --allow-kill-statement Allows the execution of kill statement --allowed_tablet_types strings Specifies the tablet types this vtgate is allowed to route queries to. Should be provided as a comma-separated set of tablet types. @@ -128,6 +139,8 @@ Usage of vtcombo: --gc_check_interval duration Interval between garbage collection checks (default 1h0m0s) --gc_purge_check_interval duration Interval between purge discovery checks (default 1m0s) --gh-ost-path string override default gh-ost binary full path + --grpc-use-effective-groups If set, and SSL is not used, will set the immediate caller's security groups from the effective caller id's groups. + --grpc-use-static-authentication-callerid If set, will set the immediate caller id to the username authenticated by the static auth plugin. --grpc_auth_mode string Which auth plugin implementation to use (eg: static) --grpc_auth_mtls_allowed_substrings string List of substrings of at least one of the client certificate names (separated by colon). --grpc_auth_static_password_file string JSON File to read the users/passwords from. @@ -147,13 +160,14 @@ Usage of vtcombo: --grpc_server_initial_window_size int gRPC server initial window size --grpc_server_keepalive_enforcement_policy_min_time duration gRPC server minimum keepalive time (default 10s) --grpc_server_keepalive_enforcement_policy_permit_without_stream gRPC server permit client keepalive pings even when there are no active streams (RPCs) + --grpc_use_effective_callerid If set, and SSL is not used, will set the immediate caller id from the effective caller id's principal. 
--health_check_interval duration Interval between health checks (default 20s) --healthcheck_retry_delay duration health check retry delay (default 2ms) --healthcheck_timeout duration the health check timeout period (default 1m0s) --heartbeat_enable If true, vttablet records (if master) or checks (if replica) the current time of a replication heartbeat in the sidecar database's heartbeat table. The result is used to inform the serving state of the vttablet via healthchecks. --heartbeat_interval duration How frequently to read and write replication heartbeat. (default 1s) --heartbeat_on_demand_duration duration If non-zero, heartbeats are only written upon consumer request, and only run for up to given duration following the request. Frequent requests can keep the heartbeat running consistently; when requests are infrequent heartbeat may completely stop between requests - -h, --help display usage and exit + -h, --help help for vtcombo --hot_row_protection_concurrent_transactions int Number of concurrent transactions let through to the txpool/MySQL for the same hot row. Should be > 1 to have enough 'ready' transactions in MySQL and benefit from a pipelining effect. (default 5) --hot_row_protection_max_global_queue_size int Global queue limit across all row (ranges). Useful to prevent that the queue can grow unbounded. (default 1000) --hot_row_protection_max_queue_size int Maximum number of BeginExecute RPCs which will be queued for the same row (range). 
(default 20) From 623f86c873c2e8d46c65324ccc74fe7d0249fc8e Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sun, 17 Sep 2023 06:22:15 -0400 Subject: [PATCH 09/12] remove added defer servenv close Signed-off-by: Andrew Mason --- go/cmd/vtcombo/cli/main.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/go/cmd/vtcombo/cli/main.go b/go/cmd/vtcombo/cli/main.go index 34fab26717f..6a6642eab67 100644 --- a/go/cmd/vtcombo/cli/main.go +++ b/go/cmd/vtcombo/cli/main.go @@ -183,8 +183,6 @@ func run(cmd *cobra.Command, args []string) (err error) { } servenv.Init() - defer servenv.Close() - tabletenv.Init() var ( From 5099f4f6217b6a48d659183a5ab500448dedb338 Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sun, 17 Sep 2023 06:22:31 -0400 Subject: [PATCH 10/12] bugfixes for error handling Signed-off-by: Andrew Mason --- go/cmd/vtcombo/cli/main.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/go/cmd/vtcombo/cli/main.go b/go/cmd/vtcombo/cli/main.go index 6a6642eab67..d2c05887fc5 100644 --- a/go/cmd/vtcombo/cli/main.go +++ b/go/cmd/vtcombo/cli/main.go @@ -31,7 +31,6 @@ import ( "github.com/spf13/cobra" "vitess.io/vitess/go/acl" - "vitess.io/vitess/go/exit" "vitess.io/vitess/go/mysql/replication" "vitess.io/vitess/go/vt/dbconfigs" "vitess.io/vitess/go/vt/log" @@ -192,6 +191,9 @@ func run(cmd *cobra.Command, args []string) (err error) { if startMysql { mysqld.Mysqld, cnf, err = startMysqld(1) + if err != nil { + return err + } servenv.OnClose(func() { mysqld.Shutdown(context.TODO(), cnf, true) }) @@ -287,7 +289,7 @@ func run(cmd *cobra.Command, args []string) (err error) { // vtctld configuration and init err = vtctld.InitVtctld(ts) if err != nil { - exit.Return(1) + return err } if vschemaPersistenceDir != "" && !externalTopoServer { From ed83601d558ba09886d37c69debe807e3fb76876 Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Sun, 17 Sep 2023 07:47:45 -0400 Subject: [PATCH 11/12] whoops Signed-off-by: Andrew Mason --- go/cmd/vtclient/docgen/main.go | 
37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 go/cmd/vtclient/docgen/main.go diff --git a/go/cmd/vtclient/docgen/main.go b/go/cmd/vtclient/docgen/main.go new file mode 100644 index 00000000000..b740cbd67a7 --- /dev/null +++ b/go/cmd/vtclient/docgen/main.go @@ -0,0 +1,37 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "github.com/spf13/cobra" + + "vitess.io/vitess/go/cmd/internal/docgen" + "vitess.io/vitess/go/cmd/vtclient/cli" +) + +func main() { + var dir string + cmd := cobra.Command{ + Use: "docgen [-d ]", + RunE: func(cmd *cobra.Command, args []string) error { + return docgen.GenerateMarkdownTree(cli.Main, dir) + }, + } + + cmd.Flags().StringVarP(&dir, "dir", "d", "doc", "output directory to write documentation") + _ = cmd.Execute() +} From 15ae53d1307405117f6e33b44072f79356b0aa99 Mon Sep 17 00:00:00 2001 From: Andrew Mason Date: Wed, 20 Sep 2023 18:31:07 -0400 Subject: [PATCH 12/12] update with PR feedback and regenerate Signed-off-by: Andrew Mason --- go/cmd/vtbackup/cli/vtbackup.go | 2 +- go/cmd/vtclient/cli/vtclient.go | 2 -- go/cmd/vtcombo/cli/main.go | 2 +- go/flags/endtoend/vtbackup.txt | 2 +- go/flags/endtoend/vtclient.txt | 2 -- go/flags/endtoend/vtcombo.txt | 2 +- 6 files changed, 4 insertions(+), 8 deletions(-) diff --git a/go/cmd/vtbackup/cli/vtbackup.go b/go/cmd/vtbackup/cli/vtbackup.go index 20931b7db1c..8e70e3c5c1d 100644 --- 
a/go/cmd/vtbackup/cli/vtbackup.go +++ b/go/cmd/vtbackup/cli/vtbackup.go @@ -152,7 +152,7 @@ Whatever system launches vtbackup is responsible for the following: - Retrying vtbackup if it fails. - Alerting human operators if the failure is persistent. -The process vtbackup follows to take a new backup is as follows: +The process vtbackup follows to take a new backup has the following steps: 1. Restore from the most recent backup. 2. Start a mysqld instance (but no vttablet) from the restored data. 3. Instruct mysqld to connect to the current shard primary and replicate any diff --git a/go/cmd/vtclient/cli/vtclient.go b/go/cmd/vtclient/cli/vtclient.go index f7d399aa834..949af851ab4 100644 --- a/go/cmd/vtclient/cli/vtclient.go +++ b/go/cmd/vtclient/cli/vtclient.go @@ -67,8 +67,6 @@ var ( Short: "vtclient connects to a vtgate server using the standard go driver API.", Long: `vtclient connects to a vtgate server using the standard go driver API. -Version 3 of the API is used, we do not send any hint to the server. - For query bound variables, we assume place-holders in the query string in the form of :v1, :v2, etc.`, Example: `vtclient --server vtgate:15991 "SELECT * FROM messages" diff --git a/go/cmd/vtcombo/cli/main.go b/go/cmd/vtcombo/cli/main.go index d2c05887fc5..bfc0ad894fe 100644 --- a/go/cmd/vtcombo/cli/main.go +++ b/go/cmd/vtcombo/cli/main.go @@ -60,7 +60,7 @@ var ( Long: `vtcombo is a single binary containing several vitess components. In particular, it contains: -- A ZK topology server based on an in-memory map. +- A topology server based on an in-memory map. - One vtgate instance. - Many vttablet instances. 
- A vtctld instance so it's easy to see the topology.`, diff --git a/go/flags/endtoend/vtbackup.txt b/go/flags/endtoend/vtbackup.txt index bc62dd55530..720e37dfeda 100644 --- a/go/flags/endtoend/vtbackup.txt +++ b/go/flags/endtoend/vtbackup.txt @@ -15,7 +15,7 @@ Whatever system launches vtbackup is responsible for the following: - Retrying vtbackup if it fails. - Alerting human operators if the failure is persistent. -The process vtbackup follows to take a new backup is as follows: +The process vtbackup follows to take a new backup has the following steps: 1. Restore from the most recent backup. 2. Start a mysqld instance (but no vttablet) from the restored data. 3. Instruct mysqld to connect to the current shard primary and replicate any diff --git a/go/flags/endtoend/vtclient.txt b/go/flags/endtoend/vtclient.txt index 45a5d27b26e..3d17734168c 100644 --- a/go/flags/endtoend/vtclient.txt +++ b/go/flags/endtoend/vtclient.txt @@ -1,7 +1,5 @@ vtclient connects to a vtgate server using the standard go driver API. -Version 3 of the API is used, we do not send any hint to the server. - For query bound variables, we assume place-holders in the query string in the form of :v1, :v2, etc. diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index 67ae9a4613e..89d972f2f6b 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -1,7 +1,7 @@ vtcombo is a single binary containing several vitess components. In particular, it contains: -- A ZK topology server based on an in-memory map. +- A topology server based on an in-memory map. - One vtgate instance. - Many vttablet instances. - A vtctld instance so it's easy to see the topology.