From 4684265cc364ae51895430f7d484d7e44658a12e Mon Sep 17 00:00:00 2001 From: Brendan Dougherty Date: Fri, 29 Mar 2024 20:51:13 +0000 Subject: [PATCH] discovery: Fix tablets removed from healthcheck on topo error Signed-off-by: Brendan Dougherty --- go/vt/discovery/topology_watcher.go | 24 +++++++--- go/vt/discovery/topology_watcher_test.go | 60 ++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/go/vt/discovery/topology_watcher.go b/go/vt/discovery/topology_watcher.go index 8a4dcb3213e..56b27515ef0 100644 --- a/go/vt/discovery/topology_watcher.go +++ b/go/vt/discovery/topology_watcher.go @@ -213,6 +213,8 @@ func (tw *TopologyWatcher) loadTablets() { wg.Wait() tw.mu.Lock() + partialResult := len(tabletAliases) != len(newTablets) + for alias, newVal := range newTablets { if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(newVal.tablet) { continue @@ -236,14 +238,24 @@ func (tw *TopologyWatcher) loadTablets() { } } - for _, val := range tw.tablets { - if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(val.tablet) { - continue + if partialResult { + for _, val := range tw.tablets { + if _, ok := newTablets[val.alias]; !ok { + // We don't know if the tablet was removed or if we simply failed to fetch it. + // We'll assume it was not removed and ensure it remains in the tablet list + newTablets[val.alias] = val + } } + } else { + for _, val := range tw.tablets { + if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(val.tablet) { + continue + } - if _, ok := newTablets[val.alias]; !ok { - tw.healthcheck.RemoveTablet(val.tablet) - topologyWatcherOperations.Add(topologyWatcherOpRemoveTablet, 1) + if _, ok := newTablets[val.alias]; !ok { + tw.healthcheck.RemoveTablet(val.tablet) + topologyWatcherOperations.Add(topologyWatcherOpRemoveTablet, 1) + } } } tw.tablets = newTablets diff --git a/go/vt/discovery/topology_watcher_test.go b/go/vt/discovery/topology_watcher_test.go index dff8ba720c7..cdb454486f8 100644 --- a/go/vt/discovery/topology_watcher_test.go +++ b/go/vt/discovery/topology_watcher_test.go @@ -29,6 +29,7 @@ import ( "vitess.io/vitess/go/vt/logutil" topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/topo/faketopo" "vitess.io/vitess/go/vt/topo/memorytopo" ) @@ -614,3 +615,62 @@ func TestFilterByKeypsaceSkipsIgnoredTablets(t *testing.T) { tw.Stop() } + +func TestGetTabletErrorDoesNotRemoveFromHealthcheck(t *testing.T) { + factory := faketopo.NewFakeTopoFactory() + // add cell to the factory. This returns a fake connection which we will use to set the get and update errors as we require. + fakeConn := factory.AddCell("aa") + + ts := faketopo.NewFakeTopoServer(factory) + if err := ts.CreateCellInfo(context.Background(), "aa", &topodatapb.CellInfo{}); err != nil { + t.Fatalf("CreateCellInfo failed: %v", err) + } + + fhc := NewFakeHealthCheck(nil) + topologyWatcherOperations.ZeroAll() + counts := topologyWatcherOperations.Counts() + tw := NewCellTabletsWatcher(context.Background(), ts, fhc, nil, "aa", 10*time.Minute, true, 5) + + counts = checkOpCounts(t, counts, map[string]int64{}) + checkChecksum(t, tw, 0) + + // Add a tablet to the topology. + tablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "aa", + Uid: 0, + }, + Hostname: "host1", + PortMap: map[string]int32{ + "vt": 123, + }, + Keyspace: "keyspace", + Shard: "shard", + } + if err := ts.CreateTablet(context.Background(), tablet); err != nil { + t.Fatalf("CreateTablet failed: %v", err) + } + tw.loadTablets() + counts = checkOpCounts(t, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1, "AddTablet": 1}) + checkChecksum(t, tw, 3238442862) + + // Check the tablet is returned by GetAllTablets(). + allTablets := fhc.GetAllTablets() + key := TabletToMapKey(tablet) + if _, ok := allTablets[key]; !ok || len(allTablets) != 1 || !proto.Equal(allTablets[key], tablet) { + t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, tablet) + } + + // Force the next topo Get call to return an error (the ListDir call for the tablet aliases will still succeed) + fakeConn.AddGetError(true) + + tw.loadTablets() + checkOpCounts(t, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1}) + checkChecksum(t, tw, 3238442862) + + // Check the tablet is still returned by GetAllTablets(). + allTablets2 := fhc.GetAllTablets() + if _, ok := allTablets2[key]; !ok || len(allTablets2) != 1 || !proto.Equal(allTablets2[key], tablet) { + t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets2, tablet) + } +}