From abac489577846db631d7968e1ef97b90d44f7321 Mon Sep 17 00:00:00 2001 From: Kyle Wong <37189875+kyle-a-wong@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:36:18 -0400 Subject: [PATCH] server: Fix NPE in spanStatsFanOut a NPE was surfaced when doing a span stats request on an unfinalized (mixed version) cluster. To fix, defensive checks are added to defend against potential nil responses and references to non-existant map entries for a request span stat Fixes: #132130 Release note (bug fix): Fixes a bug where a span stats request on a mixed version cluster resulted in an NPE --- pkg/server/BUILD.bazel | 1 + pkg/server/span_stats_server.go | 68 +++++--- pkg/server/span_stats_server_test.go | 239 +++++++++++++++++++++++++++ 3 files changed, 282 insertions(+), 26 deletions(-) diff --git a/pkg/server/BUILD.bazel b/pkg/server/BUILD.bazel index dea17db291a..896d70d6758 100644 --- a/pkg/server/BUILD.bazel +++ b/pkg/server/BUILD.bazel @@ -548,6 +548,7 @@ go_test( "//pkg/sql/tablemetadatacache/util", "//pkg/storage", "//pkg/storage/disk", + "//pkg/storage/enginepb", "//pkg/storage/fs", "//pkg/testutils", "//pkg/testutils/datapathutils", diff --git a/pkg/server/span_stats_server.go b/pkg/server/span_stats_server.go index 852caf0ddd2..d66b27b21e1 100644 --- a/pkg/server/span_stats_server.go +++ b/pkg/server/span_stats_server.go @@ -77,8 +77,6 @@ func (s *systemStatusServer) spanStatsFanOut( res.SpanToStats[sp.String()] = &roachpb.SpanStats{} } - responses := make(map[string]struct{}) - spansPerNode, err := s.getSpansPerNode(ctx, req) if err != nil { return nil, err @@ -127,7 +125,37 @@ func (s *systemStatusServer) spanStatsFanOut( return resp, err } - responseFn := func(nodeID roachpb.NodeID, resp interface{}) { + errorFn := func(nodeID roachpb.NodeID, err error) { + log.Errorf(ctx, nodeErrorMsgPlaceholder, nodeID, err) + errorMessage := fmt.Sprintf("%v", err) + res.Errors = append(res.Errors, errorMessage) + } + + timeout := SpanStatsNodeTimeout.Get(&s.st.SV) + if err := iterateNodes( + ctx, + s.serverIterator, + s.stopper, + "iterating nodes for span stats", + timeout, + smartDial, + nodeFn, + collectSpanStatsResponses(ctx, res), + errorFn, + ); err != nil { + return nil, err + } + + return res, nil +} + +// collectSpanStatsResponses takes a *roachpb.SpanStatsResponse and creates a closure around it +// to provide as a callback to fanned out SpanStats requests. +func collectSpanStatsResponses( + ctx context.Context, res *roachpb.SpanStatsResponse, +) func(nodeID roachpb.NodeID, resp interface{}) { + responses := make(map[string]struct{}) + return func(nodeID roachpb.NodeID, resp interface{}) { // Noop if nil response (returned from skipped node). if resp == nil { return @@ -143,6 +171,17 @@ func (s *systemStatusServer) spanStatsFanOut( // MVCC stats from the leaseholder, MVCC stats are taken from the node that responded first. // See #108779. for spanStr, spanStats := range nodeResponse.SpanToStats { + if spanStats == nil { + log.Errorf(ctx, "Span stats for %s from node response is nil", spanStr) + continue + } + + _, ok := res.SpanToStats[spanStr] + if !ok { + log.Warningf(ctx, "Received Span not in original request: %s", spanStr) + res.SpanToStats[spanStr] = &roachpb.SpanStats{} + } + // Accumulate physical values across all replicas: res.SpanToStats[spanStr].ApproximateTotalStats.Add(spanStats.TotalStats) res.SpanToStats[spanStr].ApproximateDiskBytes += spanStats.ApproximateDiskBytes @@ -160,29 +199,6 @@ func (s *systemStatusServer) spanStatsFanOut( } } } - - errorFn := func(nodeID roachpb.NodeID, err error) { - log.Errorf(ctx, nodeErrorMsgPlaceholder, nodeID, err) - errorMessage := fmt.Sprintf("%v", err) - res.Errors = append(res.Errors, errorMessage) - } - - timeout := SpanStatsNodeTimeout.Get(&s.st.SV) - if err := iterateNodes( - ctx, - s.serverIterator, - s.stopper, - "iterating nodes for span stats", - timeout, - smartDial, - nodeFn, - responseFn, - errorFn, - ); err != nil { - return nil, err - } - - return res, nil } func (s *systemStatusServer) getLocalStats( diff --git a/pkg/server/span_stats_server_test.go b/pkg/server/span_stats_server_test.go index ac6efcbbcfb..8e5479e220e 100644 --- a/pkg/server/span_stats_server_test.go +++ b/pkg/server/span_stats_server_test.go @@ -11,6 +11,7 @@ import ( "testing" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/stretchr/testify/require" @@ -102,3 +103,241 @@ func TestSpanStatsBatching(t *testing.T) { } } + +func TestCollectSpanStatsResponses(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + type nodeResponse struct { + nodeID roachpb.NodeID + resp *roachpb.SpanStatsResponse + } + type testCase struct { + actualRes *roachpb.SpanStatsResponse + nodeResponses []nodeResponse + expectedRes *roachpb.SpanStatsResponse + } + + var testCases []testCase + + // test case 1 + tc1 := testCase{ + actualRes: createSpanStatsResponse("span1", "span2"), + nodeResponses: []nodeResponse{ + {nodeID: 1, resp: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span1": {RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{1}, + ReplicaCount: 1, + }, + }, + }}, + {nodeID: 2, resp: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span2": {RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{2}, + ReplicaCount: 1, + }, + }, + }}, + }, + expectedRes: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span1": {RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{1}, + ReplicaCount: 1, + }, + "span2": {RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{2}, + ReplicaCount: 1, + }, + }, + }, + } + testCases = append(testCases, tc1) + + // test case 2 - span is replicated in node 1 and 2 + totalStats1 := enginepb.MVCCStats{LiveBytes: 1} + totalStats2 := enginepb.MVCCStats{LiveBytes: 2} + expectedApproxTotalStats := enginepb.MVCCStats{} + expectedApproxTotalStats.Add(totalStats1) + expectedApproxTotalStats.Add(totalStats2) + tc2 := testCase{ + actualRes: createSpanStatsResponse("span1", "span2"), + nodeResponses: []nodeResponse{ + {nodeID: 1, resp: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span1": { + TotalStats: totalStats1, + RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{1, 2}, + ReplicaCount: 1, + }, + }, + }}, + {nodeID: 2, resp: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span1": { + TotalStats: totalStats2, + RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{2}, + ReplicaCount: 1, + }, + }, + }}, + }, + expectedRes: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span1": { + TotalStats: totalStats1, + ApproximateTotalStats: expectedApproxTotalStats, + RangeCount: 1, + ApproximateDiskBytes: 2, + RemoteFileBytes: 2, + ExternalFileBytes: 2, + StoreIDs: []roachpb.StoreID{1, 2}, + ReplicaCount: 1, + }, + }, + }, + } + testCases = append(testCases, tc2) + + // test case 3 - node response spanToStats value is nil for span2 + tc3 := testCase{ + actualRes: createSpanStatsResponse("span1", "span2"), + nodeResponses: []nodeResponse{ + {nodeID: 1, resp: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span1": { + RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{1}, + ReplicaCount: 1, + }, + }, + }}, + {nodeID: 2, resp: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span2": nil, + }, + }}, + }, + expectedRes: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span1": { + RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{1}, + ReplicaCount: 1, + }, + "span2": {}, + }, + }, + } + testCases = append(testCases, tc3) + + // test case 4 - node response contains span not in original response struct + tc4 := testCase{ + actualRes: createSpanStatsResponse("span1"), + nodeResponses: []nodeResponse{ + {nodeID: 1, resp: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span1": { + RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{1}, + ReplicaCount: 1, + }, + }, + }}, + {nodeID: 2, resp: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span2": { + RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{2}, + ReplicaCount: 1, + }, + }, + }}, + }, + expectedRes: &roachpb.SpanStatsResponse{ + SpanToStats: map[string]*roachpb.SpanStats{ + "span1": { + RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{1}, + ReplicaCount: 1, + }, + "span2": { + RangeCount: 1, + ApproximateDiskBytes: 1, + RemoteFileBytes: 1, + ExternalFileBytes: 1, + StoreIDs: []roachpb.StoreID{2}, + ReplicaCount: 1, + }, + }, + }, + } + testCases = append(testCases, tc4) + + ctx := context.Background() + for _, tc := range testCases { + cb := collectSpanStatsResponses(ctx, tc.actualRes) + for _, nodeResp := range tc.nodeResponses { + cb(nodeResp.nodeID, nodeResp.resp) + } + + for spanStr, spanStats := range tc.expectedRes.SpanToStats { + actualSpanStat, ok := tc.actualRes.SpanToStats[spanStr] + require.True(t, ok) + require.NotNil(t, actualSpanStat) + require.Equal(t, spanStats.TotalStats, actualSpanStat.TotalStats) + require.Equal(t, spanStats.RangeCount, actualSpanStat.RangeCount) + require.Equal(t, spanStats.ApproximateDiskBytes, actualSpanStat.ApproximateDiskBytes) + require.Equal(t, spanStats.RemoteFileBytes, actualSpanStat.RemoteFileBytes) + require.Equal(t, spanStats.ExternalFileBytes, actualSpanStat.ExternalFileBytes) + require.Equal(t, spanStats.StoreIDs, actualSpanStat.StoreIDs) + require.Equal(t, spanStats.ReplicaCount, actualSpanStat.ReplicaCount) + } + } +} + +func createSpanStatsResponse(spanStrs ...string) *roachpb.SpanStatsResponse { + resp := &roachpb.SpanStatsResponse{} + resp.SpanToStats = make(map[string]*roachpb.SpanStats) + for _, str := range spanStrs { + resp.SpanToStats[str] = &roachpb.SpanStats{} + } + return resp +}