Skip to content

Commit

Permalink
discovery: Fix tablets removed from healthcheck on topo error
Browse files Browse the repository at this point in the history
Signed-off-by: Brendan Dougherty <[email protected]>
  • Loading branch information
brendar committed Mar 29, 2024
1 parent e1151b9 commit 4684265
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 6 deletions.
24 changes: 18 additions & 6 deletions go/vt/discovery/topology_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ func (tw *TopologyWatcher) loadTablets() {
wg.Wait()
tw.mu.Lock()

partialResult := len(tabletAliases) != len(newTablets)

for alias, newVal := range newTablets {
if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(newVal.tablet) {
continue
Expand All @@ -236,14 +238,24 @@ func (tw *TopologyWatcher) loadTablets() {
}
}

for _, val := range tw.tablets {
if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(val.tablet) {
continue
if partialResult {
for _, val := range tw.tablets {
if _, ok := newTablets[val.alias]; !ok {
// We don't know if the tablet was removed or if we simply failed to fetch it.
// We'll assume it was not removed and ensure it remains in the tablet list
newTablets[val.alias] = val
}
}
} else {
for _, val := range tw.tablets {
if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(val.tablet) {
continue
}

if _, ok := newTablets[val.alias]; !ok {
tw.healthcheck.RemoveTablet(val.tablet)
topologyWatcherOperations.Add(topologyWatcherOpRemoveTablet, 1)
if _, ok := newTablets[val.alias]; !ok {
tw.healthcheck.RemoveTablet(val.tablet)
topologyWatcherOperations.Add(topologyWatcherOpRemoveTablet, 1)
}
}
}
tw.tablets = newTablets
Expand Down
60 changes: 60 additions & 0 deletions go/vt/discovery/topology_watcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"vitess.io/vitess/go/vt/logutil"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/faketopo"
"vitess.io/vitess/go/vt/topo/memorytopo"
)

Expand Down Expand Up @@ -614,3 +615,62 @@ func TestFilterByKeypsaceSkipsIgnoredTablets(t *testing.T) {

tw.Stop()
}

// TestGetTabletErrorDoesNotRemoveFromHealthcheck verifies that a tablet already
// registered with the healthcheck stays registered when a later loadTablets
// call hits a topo Get error while fetching that tablet.
func TestGetTabletErrorDoesNotRemoveFromHealthcheck(t *testing.T) {
	factory := faketopo.NewFakeTopoFactory()
	// Adding the cell returns a fake connection; it is used further down to
	// inject the Get error this test is about.
	fakeConn := factory.AddCell("aa")

	ts := faketopo.NewFakeTopoServer(factory)
	if err := ts.CreateCellInfo(context.Background(), "aa", &topodatapb.CellInfo{}); err != nil {
		t.Fatalf("CreateCellInfo failed: %v", err)
	}

	fhc := NewFakeHealthCheck(nil)
	topologyWatcherOperations.ZeroAll()
	counts := topologyWatcherOperations.Counts()
	tw := NewCellTabletsWatcher(context.Background(), ts, fhc, nil, "aa", 10*time.Minute, true, 5)
	// Stop the watcher on every exit path (including the t.Fatalf below), the
	// same cleanup the other watcher tests in this file perform.
	defer tw.Stop()

	// Nothing has been loaded yet: no operations counted, empty checksum.
	counts = checkOpCounts(t, counts, map[string]int64{})
	checkChecksum(t, tw, 0)

	// Add a tablet to the topology.
	tablet := &topodatapb.Tablet{
		Alias: &topodatapb.TabletAlias{
			Cell: "aa",
			Uid:  0,
		},
		Hostname: "host1",
		PortMap: map[string]int32{
			"vt": 123,
		},
		Keyspace: "keyspace",
		Shard:    "shard",
	}
	if err := ts.CreateTablet(context.Background(), tablet); err != nil {
		t.Fatalf("CreateTablet failed: %v", err)
	}

	key := TabletToMapKey(tablet)
	// assertOnlyTablet checks that the healthcheck knows exactly one tablet and
	// that it is equal to the tablet created above.
	assertOnlyTablet := func() {
		t.Helper()
		allTablets := fhc.GetAllTablets()
		if got, ok := allTablets[key]; !ok || len(allTablets) != 1 || !proto.Equal(got, tablet) {
			t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, tablet)
		}
	}

	// First load: the tablet is listed, fetched, and added to the healthcheck.
	tw.loadTablets()
	counts = checkOpCounts(t, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1, "AddTablet": 1})
	checkChecksum(t, tw, 3238442862)
	assertOnlyTablet()

	// Force the next topo Get call to return an error (the ListDir call for the tablet aliases will still succeed)
	fakeConn.AddGetError(true)

	// Second load: the list and get are attempted again, but no add or remove
	// operation may be recorded and the checksum must not change.
	tw.loadTablets()
	checkOpCounts(t, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1})
	checkChecksum(t, tw, 3238442862)

	// The tablet must still be returned by GetAllTablets().
	assertOnlyTablet()
}

0 comments on commit 4684265

Please sign in to comment.