diff --git a/go/vt/vtgate/buffer/buffer.go b/go/vt/vtgate/buffer/buffer.go index 622bb03b082..26854b54ea2 100644 --- a/go/vt/vtgate/buffer/buffer.go +++ b/go/vt/vtgate/buffer/buffer.go @@ -67,7 +67,7 @@ const ( type RetryDoneFunc context.CancelFunc const ( - ClusterEventReshardingInProgress = "current keyspace is being resharded" + ClusterEventReshardingInProgress = "current keyspace is potentially being resharded" ClusterEventReparentInProgress = "primary is not serving, there may be a reparent operation in progress" ClusterEventMoveTables = "disallowed due to rule" ) diff --git a/go/vt/vtgate/tabletgateway.go b/go/vt/vtgate/tabletgateway.go index de63da87907..52ef51182be 100644 --- a/go/vt/vtgate/tabletgateway.go +++ b/go/vt/vtgate/tabletgateway.go @@ -283,22 +283,24 @@ func (gw *TabletGateway) withRetry(ctx context.Context, target *querypb.Target, // if we have a keyspace event watcher, check if the reason why our primary is not available is that it's currently being resharded // or if a reparent operation is in progress. 
if kev := gw.kev; kev != nil { - if kev.TargetIsBeingResharded(ctx, target) { - log.V(2).Infof("current keyspace is being resharded, retrying: %s: %s", target.Keyspace, debug.Stack()) - err = vterrors.Errorf(vtrpcpb.Code_CLUSTER_EVENT, buffer.ClusterEventReshardingInProgress) - continue - } primary, notServing := kev.PrimaryIsNotServing(ctx, target) if notServing { err = vterrors.Errorf(vtrpcpb.Code_CLUSTER_EVENT, buffer.ClusterEventReparentInProgress) continue } + // if primary is serving, but we initially found no tablet, we're in an inconsistent state // we then retry the entire loop if primary != nil { err = vterrors.Errorf(vtrpcpb.Code_UNAVAILABLE, "inconsistent state detected, primary is serving but initially found no available tablet") continue } + + if kev.TargetIsBeingResharded(ctx, target) { + log.V(2).Infof("current keyspace is potentially being resharded, retrying: %s: %s", target.Keyspace, debug.Stack()) + err = vterrors.Errorf(vtrpcpb.Code_CLUSTER_EVENT, buffer.ClusterEventReshardingInProgress) + continue + } } // fail fast if there is no tablet