From 314de9d01bfef0be4b0fb831047058afadf7367a Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 3 Oct 2023 18:10:35 -0600 Subject: [PATCH 1/6] wip major version upgrade Signed-off-by: Florent Poinsard --- Makefile | 2 +- .../vitessshardreplication/reconcile_drain.go | 163 +++++++++++++++++- .../reconcile_drain_test.go | 150 ++++++++++++++++ .../vitessshardreplication_controller.go | 2 +- pkg/operator/vttablet/constants.go | 2 +- pkg/operator/vttablet/pod.go | 2 +- 6 files changed, 315 insertions(+), 6 deletions(-) create mode 100644 pkg/controller/vitessshardreplication/reconcile_drain_test.go diff --git a/Makefile b/Makefile index 29cc2164..76f7d37a 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ IMAGE_REGISTRY:=docker.io IMAGE_TAG:=latest -IMAGE:=planetscale/vitess-operator +IMAGE:=frouioui/vitess-operator IMAGE_NAME:=$(IMAGE_REGISTRY)/$(IMAGE) diff --git a/pkg/controller/vitessshardreplication/reconcile_drain.go b/pkg/controller/vitessshardreplication/reconcile_drain.go index ca110c4e..08e33b7d 100644 --- a/pkg/controller/vitessshardreplication/reconcile_drain.go +++ b/pkg/controller/vitessshardreplication/reconcile_drain.go @@ -19,9 +19,15 @@ package vitessshardreplication import ( "context" "fmt" + "regexp" + "slices" + "strconv" + "strings" "time" + "github.com/sirupsen/logrus" "vitess.io/vitess/go/mysql/replication" + "vitess.io/vitess/go/vt/proto/tabletmanagerdata" topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/topoproto" @@ -94,7 +100,7 @@ drainer delete something at a bad time. However, by deleting only one tablet at a time we still ensure that for shards with three or more tablets we still have redundancy during the decommissioning. Maybe later we can do better. */ -func (r *ReconcileVitessShard) reconcileDrain(ctx context.Context, vts *planetscalev2.VitessShard, wr *wrangler.Wrangler) (reconcile.Result, error) { +func (r *ReconcileVitessShard) reconcileDrain(ctx context.Context, vts *planetscalev2.VitessShard, wr *wrangler.Wrangler, log *logrus.Entry) (reconcile.Result, error) { clusterName := vts.Labels[planetscalev2.ClusterLabel] keyspaceName := vts.Labels[planetscalev2.KeyspaceLabel] resultBuilder := &results.Builder{} @@ -255,8 +261,16 @@ func (r *ReconcileVitessShard) reconcileDrain(ctx context.Context, vts *planetsc } } + // 4. Check if we need to any operations like disabling fast shutdown + // for upgrades here. + if err := r.disableFastShutdown(ctx, wr, pods, tablets, vts.Spec.Images.Mysqld.Image(), log); err != nil { + r.recorder.Eventf(vts, corev1.EventTypeWarning, + "MysqldSafeUpgradeFailed", "failed to disable fast shutdown: %v", err) + return resultBuilder.Error(err) + } + // - // 4. Reparent draining primarys only if marked/will be marked as "Finished". + // 5. Reparent draining primarys only if marked/will be marked as "Finished". // // If we have acknowledged a drain and haven't already marked the primary as @@ -458,6 +472,151 @@ func candidatePrimary(ctx context.Context, wr *wrangler.Wrangler, shard *topo.Sh return bestCandidate } +func (r *ReconcileVitessShard) disableFastShutdown( + ctx context.Context, + wr *wrangler.Wrangler, + pods map[string]*corev1.Pod, + tablets map[string]*topo.TabletInfo, + desiredImage string, + log *logrus.Entry, +) error { + const disableFastShutdown = "set @@global.innodb_fast_shutdown = 0" + + fetchReq := &tabletmanagerdata.ExecuteFetchAsDbaRequest{ + Query: []byte(disableFastShutdown), + DbName: "_vt", + MaxRows: 0, + DisableBinlogs: false, + ReloadSchema: false, + } + + tmc := wr.TabletManagerClient() + + for tabletAlias, pod := range pods { + tablet, ok := tablets[tabletAlias] + if !ok { + continue + } + + var current string + for _, container := range pod.Spec.Containers { + if container.Name == vttablet.MysqldContainerName { + current = container.Image + break + } + } + + needsSafe, err := safeMysqldUpgrade(current, desiredImage) + if err != nil { + return err + } + + if !needsSafe { + continue + } + _, err = tmc.ExecuteFetchAsDba(ctx, tablet.Tablet, true /*usePool*/, fetchReq) + if err != nil { + return fmt.Errorf("failed to disable fast shutdown for tablet %v: %w", tabletAlias, err) + } + r.recorder.Eventf(pod, corev1.EventTypeNormal, + "MySQL_Upgrade", "innodb_fast_shutdown = 0 to prepare MySQL upgrade") + log.Infof("innodb_fast_shutdown = 0 to prepare MySQL upgrade on pod %s", pod.Name) + } + return nil +} + +var mysqlImageVersion = regexp.MustCompile(`^(\d+)\.(\d+)\.(\d+)`) + +func safeMysqldUpgrade(currentImage, desiredImage string) (bool, error) { + if currentImage == "" || desiredImage == "" { + // No action if we have unknown versions. + return false, nil + } + + // Quick check so no regexp matching is needed for the most common + // case where nothing changes. + if desiredImage == currentImage { + return false, nil + } + + currentParts := strings.SplitN(currentImage, ":", 2) + if len(currentParts) != 2 { + return false, nil + } + + desiredParts := strings.SplitN(desiredImage, ":", 2) + if len(desiredParts) != 2 { + return false, nil + } + + current := currentParts[1] + desired := desiredParts[1] + + curStrParts := mysqlImageVersion.FindStringSubmatch(current) + if len(curStrParts) != 4 { + // Invalid version, assume that we need to do a safe upgrade. + return true, nil + } + dstStrParts := mysqlImageVersion.FindStringSubmatch(desired) + if len(dstStrParts) != 4 { + // Invalid version, assume that we need to do a safe upgrade. + return true, nil + } + if slices.Equal(curStrParts, dstStrParts) { + return false, nil + } + dstParts := make([]int, len(dstStrParts)-1) + curParts := make([]int, len(curStrParts)-1) + for i, part := range dstStrParts[1:] { + // We already matched with `\d_` so there's no + // way this can trigger an error. + dstParts[i], _ = strconv.Atoi(part) + } + + for i, part := range curStrParts[1:] { + // We already matched with `\d_` so there's no + // way this can trigger an error. + curParts[i], _ = strconv.Atoi(part) + } + + if dstParts[0] < curParts[0] { + return false, fmt.Errorf("cannot downgrade major version from %s to %s", current, desired) + } + if dstParts[0] == curParts[1] && dstParts[1] < curParts[1] { + return false, fmt.Errorf("cannot downgrade minor version from %s to %s", current, desired) + } + + // Alright, here it gets more tricky. MySQL has had a complicated release history. For the 8.0 series, + // up to 8.0.34 at least (known at this point), it was not supported to downgrade patch releases + // as patch release could also include on-disk data format changes. This happened a number of times + // in practice as well, so this concern is real. + // + // MySQL though as announced a new release strategy, see: + // https://dev.mysql.com/blog-archive/introducing-mysql-innovation-and-long-term-support-lts-versions/ + // + // With that release strategy, it will become possible that patch releases will be safe to downgrade + // as well and since the data format doesn't change on-disk anymore, it's also safe to upgrade with + // fast shutdown enabled. + // Specifically, it calls out that "MySQL 8.0.34+ will become bugfix only release (red)". This means + // that we use that version as a cut-off point here for when we need to disable fast shutdown or not. + if dstParts[0] == 8 && dstParts[1] == 0 && curParts[0] == 8 && curParts[1] == 0 { + // Our upgrade process stays within the 8.0.x version range. + if dstParts[2] >= 34 && curParts[2] >= 34 { + // No need for safe upgrade if both versions are 8.0.34 or higher. + return false, nil + } + // We can't downgrade within the 8.0.x series before 8.0.34. + if dstParts[2] < curParts[2] { + return false, fmt.Errorf("cannot downgrade patch version from %s to %s", current, desired) + } + // Always need safe upgrade if we change the patch release for 8.0.x before 8.0.34. + return dstParts[2] != curParts[2], nil + } + + // For any major or minor version change we always need safe upgrade. + return dstParts[0] != curParts[0] || dstParts[1] != curParts[1], nil +} + type candidateInfo struct { tablet *topo.TabletInfo position replication.Position diff --git a/pkg/controller/vitessshardreplication/reconcile_drain_test.go b/pkg/controller/vitessshardreplication/reconcile_drain_test.go new file mode 100644 index 00000000..f06b44fd --- /dev/null +++ b/pkg/controller/vitessshardreplication/reconcile_drain_test.go @@ -0,0 +1,150 @@ +/* +Copyright 2023 PlanetScale Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vitessshardreplication + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestSafeMysqldUpgrade(t *testing.T) { + tests := []struct { + name string + current string + desired string + needsSafe bool + err string + }{ + { + name: "no desired and current", + needsSafe: false, + }, + { + name: "no desired", + desired: "docker.io/vitess/mysql:8.0.23", + needsSafe: false, + }, + { + name: "no current", + current: "docker.io/vitess/mysql:8.0.23", + needsSafe: false, + }, + { + name: "equal version with same registry", + current: "docker.io/vitess/mysql:8.0.23", + desired: "docker.io/vitess/mysql:8.0.23", + needsSafe: false, + }, + { + name: "equal version with different registry", + current: "docker.io/vitess/mysql:8.0.23", + desired: "docker.io/vitess/mysql:8.0.23", + needsSafe: false, + }, + { + name: "no explicit current version", + current: "docker.io/vitess/mysql:latest", + desired: "docker.io/vitess/mysql:8.9.23", + needsSafe: true, + }, + { + name: "no explicit desired version", + current: "docker.io/vitess/mysql:8.0.23", + desired: "docker.io/vitess/mysql:latest", + needsSafe: true, + }, + { + name: "downgrade version", + current: "docker.io/vitess/mysql:8.0.23", + desired: "docker.io/vitess/mysql:8.0.22", + err: "cannot downgrade patch version from 8.0.23 to 8.0.22", + }, + { + name: "newer version", + current: "docker.io/vitess/mysql:8.0.23", + desired: "docker.io/vitess/mysql:8.0.24", + needsSafe: true, + }, + { + name: "newer version until 8.0.34", + current: "docker.io/vitess/mysql:8.0.33", + desired: "docker.io/vitess/mysql:8.0.34", + needsSafe: true, + }, + { + name: "downgrade with 8.0.34", + current: "docker.io/vitess/mysql:8.0.34", + desired: "docker.io/vitess/mysql:8.0.33", + err: "cannot downgrade patch version from 8.0.34 to 8.0.33", + }, + { + name: "newer version skipping 8.0.34", + current: "docker.io/vitess/mysql:8.0.33", + desired: "docker.io/vitess/mysql:8.0.35", + needsSafe: true, + }, + { + name: "downgrade skipping 8.0.34", + current: "docker.io/vitess/mysql:8.0.35", + desired: "docker.io/vitess/mysql:8.0.33", + err: "cannot downgrade patch version from 8.0.35 to 8.0.33", + }, + { + name: "newer version after 8.0.34", + current: "docker.io/vitess/mysql:8.0.34", + desired: "docker.io/vitess/mysql:8.0.35", + needsSafe: false, + }, + { + name: "older version after 8.0.35", + current: "docker.io/vitess/mysql:8.0.35", + desired: "docker.io/vitess/mysql:8.0.34", + needsSafe: false, + }, + { + name: "early major upgrade", + current: "docker.io/vitess/mysql:5.7.35", + desired: "docker.io/vitess/mysql:8.0.34", + needsSafe: true, + }, + { + name: "late major upgrade", + current: "docker.io/vitess/mysql:8.2.35", + desired: "docker.io/vitess/mysql:9.0.21", + needsSafe: true, + }, + { + name: "minor upgrade", + current: "docker.io/vitess/mysql:8.2.25", + desired: "docker.io/vitess/mysql:8.4.12", + needsSafe: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + needsSafe, err := safeMysqldUpgrade(tt.current, tt.desired) + if tt.err != "" { + assert.EqualError(t, err, tt.err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.needsSafe, needsSafe) + } + }) + } +} diff --git a/pkg/controller/vitessshardreplication/vitessshardreplication_controller.go b/pkg/controller/vitessshardreplication/vitessshardreplication_controller.go index a2a1b572..f2924350 100644 --- a/pkg/controller/vitessshardreplication/vitessshardreplication_controller.go +++ b/pkg/controller/vitessshardreplication/vitessshardreplication_controller.go @@ -204,7 +204,7 @@ func (r *ReconcileVitessShard) Reconcile(cctx context.Context, request reconcile resultBuilder.Merge(initReplicationResult, err) // Check if we've been asked to do a planned reparent. - drainResult, err := r.reconcileDrain(ctx, vts, wr) + drainResult, err := r.reconcileDrain(ctx, vts, wr, log) resultBuilder.Merge(drainResult, err) // Request a periodic resync for the shard so we can recheck replication diff --git a/pkg/operator/vttablet/constants.go b/pkg/operator/vttablet/constants.go index 2a84aa84..94a388f7 100644 --- a/pkg/operator/vttablet/constants.go +++ b/pkg/operator/vttablet/constants.go @@ -27,7 +27,7 @@ const ( vtbackupContainerName = "vtbackup" vtbackupCommand = "/vt/bin/vtbackup" - mysqldContainerName = "mysqld" + MysqldContainerName = "mysqld" mysqldCommand = "/vt/bin/mysqlctld" mysqldExporterContainerName = "mysqld-exporter" diff --git a/pkg/operator/vttablet/pod.go b/pkg/operator/vttablet/pod.go index b17d354e..da2341c1 100644 --- a/pkg/operator/vttablet/pod.go +++ b/pkg/operator/vttablet/pod.go @@ -166,7 +166,7 @@ func UpdatePod(obj *corev1.Pod, spec *Spec) { if spec.Mysqld != nil { mysqldContainer = &corev1.Container{ - Name: mysqldContainerName, + Name: MysqldContainerName, Image: spec.Images.Mysqld.Image(), ImagePullPolicy: spec.ImagePullPolicies.Mysqld, Command: []string{mysqldCommand}, From b5dd7a06d0144f3b0098fc346236efd0a4f7e588 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 3 Oct 2023 18:20:13 -0600 Subject: [PATCH 2/6] remove debug Signed-off-by: Florent Poinsard --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 76f7d37a..29cc2164 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ IMAGE_REGISTRY:=docker.io IMAGE_TAG:=latest -IMAGE:=frouioui/vitess-operator +IMAGE:=planetscale/vitess-operator IMAGE_NAME:=$(IMAGE_REGISTRY)/$(IMAGE) From 5e30ea7702a1ca4952599ca468badae73e30e83a Mon Sep 17 00:00:00 2001 From: Harshit Gangal Date: Tue, 3 Oct 2023 20:35:17 +0530 Subject: [PATCH 3/6] update to new vitess snapshot release Signed-off-by: Harshit Gangal Signed-off-by: Florent Poinsard --- test/endtoend/backup_restore_test.sh | 2 +- test/endtoend/operator/101_initial_cluster.yaml | 12 ++++++------ test/endtoend/operator/operator.yaml | 2 +- test/endtoend/upgrade_test.sh | 4 ++-- test/endtoend/vtorc_vtadmin_test.sh | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/test/endtoend/backup_restore_test.sh b/test/endtoend/backup_restore_test.sh index 2ffe437a..f30cfbde 100755 --- a/test/endtoend/backup_restore_test.sh +++ b/test/endtoend/backup_restore_test.sh @@ -103,7 +103,7 @@ killall kubectl setupKubectlAccessForCI get_started "operator-latest.yaml" "101_initial_cluster_backup.yaml" -verifyVtGateVersion "18.0.0" +verifyVtGateVersion "19.0.0" checkSemiSyncSetup takeBackup "commerce/-" verifyListBackupsOutput diff --git a/test/endtoend/operator/101_initial_cluster.yaml b/test/endtoend/operator/101_initial_cluster.yaml index 567bf973..de037be5 100644 --- a/test/endtoend/operator/101_initial_cluster.yaml +++ b/test/endtoend/operator/101_initial_cluster.yaml @@ -8,13 +8,13 @@ metadata: name: example spec: images: - vtctld: vitess/lite:v17.0.2 - vtgate: vitess/lite:v17.0.2 - vttablet: vitess/lite:v17.0.2 - vtorc: vitess/lite:v17.0.2 - vtbackup: vitess/lite:v17.0.2 + vtctld: vitess/lite:v18.0.0-rc1 + vtgate: vitess/lite:v18.0.0-rc1 + vttablet: vitess/lite:v18.0.0-rc1 + vtorc: vitess/lite:v18.0.0-rc1 + vtbackup: vitess/lite:v18.0.0-rc1 mysqld: - mysql80Compatible: vitess/lite:v17.0.2 + mysql80Compatible: vitess/lite:v18.0.0-rc1 mysqldExporter: prom/mysqld-exporter:v0.11.0 cells: - name: zone1 diff --git a/test/endtoend/operator/operator.yaml b/test/endtoend/operator/operator.yaml index d47fb8c6..9bb19200 100644 --- a/test/endtoend/operator/operator.yaml +++ b/test/endtoend/operator/operator.yaml @@ -6357,7 +6357,7 @@ spec: fieldPath: metadata.name - name: OPERATOR_NAME value: vitess-operator - image: planetscale/vitess-operator:v2.10.2 + image: planetscale/vitess-operator:v2.11.0-rc1 name: vitess-operator resources: limits: diff --git a/test/endtoend/upgrade_test.sh b/test/endtoend/upgrade_test.sh index 0a6e7a90..b8054dde 100755 --- a/test/endtoend/upgrade_test.sh +++ b/test/endtoend/upgrade_test.sh @@ -245,12 +245,12 @@ killall kubectl setupKubectlAccessForCI get_started "operator.yaml" "101_initial_cluster.yaml" -verifyVtGateVersion "17.0.2" +verifyVtGateVersion "18.0.0-rc1" checkSemiSyncSetup # Initially too durability policy should be specified verifyDurabilityPolicy "commerce" "semi_sync" upgradeToLatest -verifyVtGateVersion "18.0.0" +verifyVtGateVersion "19.0.0" checkSemiSyncSetup # After upgrading, we verify that the durability policy is still semi_sync verifyDurabilityPolicy "commerce" "semi_sync" diff --git a/test/endtoend/vtorc_vtadmin_test.sh b/test/endtoend/vtorc_vtadmin_test.sh index 944bdfcd..df82ef85 100755 --- a/test/endtoend/vtorc_vtadmin_test.sh +++ b/test/endtoend/vtorc_vtadmin_test.sh @@ -242,7 +242,7 @@ killall kubectl setupKubectlAccessForCI get_started_vtorc_vtadmin -verifyVtGateVersion "18.0.0" +verifyVtGateVersion "19.0.0" checkSemiSyncSetup # Check Vtadmin is setup From dc7cdc0752fd7ad6a5c0aa175b5fce9a12a5d395 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Thu, 5 Oct 2023 12:05:50 -0600 Subject: [PATCH 4/6] apply review suggestions Signed-off-by: Florent Poinsard --- .../vitessshardreplication/reconcile_drain_test.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pkg/controller/vitessshardreplication/reconcile_drain_test.go b/pkg/controller/vitessshardreplication/reconcile_drain_test.go index f06b44fd..734a4e89 100644 --- a/pkg/controller/vitessshardreplication/reconcile_drain_test.go +++ b/pkg/controller/vitessshardreplication/reconcile_drain_test.go @@ -50,12 +50,6 @@ func TestSafeMysqldUpgrade(t *testing.T) { desired: "docker.io/vitess/mysql:8.0.23", needsSafe: false, }, - { - name: "equal version with different registry", - current: "docker.io/vitess/mysql:8.0.23", - desired: "docker.io/vitess/mysql:8.0.23", - needsSafe: false, - }, { name: "no explicit current version", current: "docker.io/vitess/mysql:latest", From 011801f5cf735f84f57ae7ac3abf74242b25d82e Mon Sep 17 00:00:00 2001 From: Florent Poinsard <35779988+frouioui@users.noreply.github.com> Date: Thu, 5 Oct 2023 17:29:59 -0500 Subject: [PATCH 5/6] Update pkg/controller/vitessshardreplication/reconcile_drain.go Co-authored-by: Deepthi Sigireddi Signed-off-by: Florent Poinsard --- pkg/controller/vitessshardreplication/reconcile_drain.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller/vitessshardreplication/reconcile_drain.go b/pkg/controller/vitessshardreplication/reconcile_drain.go index 08e33b7d..b601fa65 100644 --- a/pkg/controller/vitessshardreplication/reconcile_drain.go +++ b/pkg/controller/vitessshardreplication/reconcile_drain.go @@ -261,7 +261,7 @@ func (r *ReconcileVitessShard) reconcileDrain(ctx context.Context, vts *planetsc } } - // 4. Check if we need to any operations like disabling fast shutdown + // 4. Check if we need to perform any operations like disabling fast shutdown // for upgrades here. if err := r.disableFastShutdown(ctx, wr, pods, tablets, vts.Spec.Images.Mysqld.Image(), log); err != nil { r.recorder.Eventf(vts, corev1.EventTypeWarning, From af29fbf820782fbe362c58568225e9101e390af8 Mon Sep 17 00:00:00 2001 From: Florent Poinsard <35779988+frouioui@users.noreply.github.com> Date: Thu, 5 Oct 2023 17:30:05 -0500 Subject: [PATCH 6/6] Update pkg/controller/vitessshardreplication/reconcile_drain.go Co-authored-by: Deepthi Sigireddi Signed-off-by: Florent Poinsard --- pkg/controller/vitessshardreplication/reconcile_drain.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller/vitessshardreplication/reconcile_drain.go b/pkg/controller/vitessshardreplication/reconcile_drain.go index b601fa65..9a4851c7 100644 --- a/pkg/controller/vitessshardreplication/reconcile_drain.go +++ b/pkg/controller/vitessshardreplication/reconcile_drain.go @@ -591,7 +591,7 @@ func safeMysqldUpgrade(currentImage, desiredImage string) (bool, error) { // as patch release could also include on-disk data format changes. This happened a number of times // in practice as well, so this concern is real. // - // MySQL though as announced a new release strategy, see: + // MySQL though has announced a new release strategy, see: // https://dev.mysql.com/blog-archive/introducing-mysql-innovation-and-long-term-support-lts-versions/ // // With that release strategy, it will become possible that patch releases will be safe to downgrade