Skip to content

Commit

Permalink
Merge #135186
Browse files Browse the repository at this point in the history
135186: server: decrease nodes_ui response size r=kyle-a-wong a=kyle-a-wong

The /_status/nodes_ui grpc API is used by many db-console pages to show node data relevant information. This API is extremely heavy and includes all node and node store related metrics. To give some perspective, the current drt-scale cluster's nodes_ui API call has a payload of size of ~8.4MB. As a result, this request is taking ~2.75s to complete in db-console.

As a partial remedy to this, this patch will filter down the node and node store metrics to only return metrics needed by db-console.

This list of metrics was determined by the `MetricsConstants` variable defined here:
https://github.com/cockroachdb/cockroach/blob/d5f328ea6f3efd8fbe631c97d59f7b74307d22f9/pkg/ui/workspaces/db-console/src/util/proto.ts#L55

This patch does not include any changes to the underlying data in KV, meaning the full NodeStatus objects (which includes the metrics) are still fetched from KV and unmarshalled. That being said, this patch reduces the cost of the full metrics payload back into a
serverpb.NodeResponse protobuf, sending it over the wire, and decoding it into json.

Testing locally with a demo tpcc cluster with 20 nodes, the payload of nodes_ui on a new cluster was around 530kb before this change, and 8kb after.

Epic: None
Release note (performance improvement): the /_status/nodes_ui API no longer returns unnecessary metrics in its response. This decreases the payload size of the API and improves the load time of various db-console pages and components.

Co-authored-by: Kyle Wong <[email protected]>
  • Loading branch information
craig[bot] and kyle-a-wong committed Nov 14, 2024
2 parents b1a3408 + f927757 commit 358b42f
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 2 deletions.
58 changes: 56 additions & 2 deletions pkg/server/nodes_response.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,47 @@ import (
"github.com/cockroachdb/cockroach/pkg/util"
)

// uiNodeMetrics contains all the metrics required for the db-console frontend.
// These will be the only node metrics returned in the serverpb.NodeResponse
// metrics.
var uiNodeMetrics = []string{
"sys.cpu.user.percent",
"sys.cpu.sys.percent",
"sys.go.allocbytes",
"sql.conns",
"sys.rss",
}

// uiStoreMetrics contains all the metrics required for the db-console frontend.
// These will be the only node store metrics returned in the
// serverpb.NodeResponse store_status metrics.
var uiStoreMetrics = []string{
"replicas",
"replicas.leaders",
"replicas.leaseholders",
"ranges",
"ranges.unavailable",
"ranges.underreplicated",
"livebytes",
"keybytes",
"valbytes",
"rangekeybytes",
"rangevalbytes",
"totalbytes",
"intentbytes",
"livecount",
"keycount",
"valcount",
"intentcount",
"intentage",
"gcbytesage",
"capacity",
"capacity.available",
"capacity.used",
"sysbytes",
"syscount",
}

func nodeStatusToResp(n *statuspb.NodeStatus, hasViewClusterMetadata bool) serverpb.NodeResponse {
tiers := make([]serverpb.Tier, len(n.Desc.Locality.Tiers))
for j, t := range n.Desc.Locality.Tiers {
Expand Down Expand Up @@ -52,6 +93,12 @@ func nodeStatusToResp(n *statuspb.NodeStatus, hasViewClusterMetadata bool) serve

statuses := make([]serverpb.StoreStatus, len(n.StoreStatuses))
for i, ss := range n.StoreStatuses {
storeMetrics := make(map[string]float64, len(uiStoreMetrics))
for _, m := range uiStoreMetrics {
if d, ok := ss.Metrics[m]; ok {
storeMetrics[m] = d
}
}
statuses[i] = serverpb.StoreStatus{
Desc: serverpb.StoreDescriptor{
StoreID: ss.Desc.StoreID,
Expand All @@ -64,7 +111,7 @@ func nodeStatusToResp(n *statuspb.NodeStatus, hasViewClusterMetadata bool) serve
Encrypted: ss.Desc.Properties.Encrypted,
},
},
Metrics: ss.Metrics,
Metrics: storeMetrics,
}
if fsprops := ss.Desc.Properties.FileStoreProperties; fsprops != nil {
sfsprops := &roachpb.FileStoreProperties{
Expand All @@ -80,12 +127,19 @@ func nodeStatusToResp(n *statuspb.NodeStatus, hasViewClusterMetadata bool) serve
}
}

metrics := make(map[string]float64, len(uiNodeMetrics))
for _, m := range uiNodeMetrics {
if d, ok := n.Metrics[m]; ok {
metrics[m] = d
}
}

resp := serverpb.NodeResponse{
Desc: nodeDescriptor,
BuildInfo: n.BuildInfo,
StartedAt: n.StartedAt,
UpdatedAt: n.UpdatedAt,
Metrics: n.Metrics,
Metrics: metrics,
StoreStatuses: statuses,
Args: nil,
Env: nil,
Expand Down
28 changes: 28 additions & 0 deletions pkg/server/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -543,3 +543,31 @@ WHERE id = $1 AND claim_instance_id IS NOT NULL`, jobs.UpdateTableMetadataCacheJ
require.Containsf(t, runningStatus, "Job completed at", "running_status not updated: %s", runningStatus)
})
}

// TestNodesUiMetrics tests that the metrics fields of NodesUI
// rpcs only returns the subset of metrics needed in the UI
func TestNodesUiMetrics(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

ts := serverutils.StartServerOnly(t, base.TestServerArgs{})

ctx := context.Background()
defer ts.Stopper().Stop(ctx)

s := ts.StatusServer().(*systemStatusServer)
resp, err := s.NodesUI(ctx, &serverpb.NodesRequest{})
require.NoError(t, err)
require.Len(t, resp.Nodes, 1)
for _, node := range resp.Nodes {
for _, m := range uiNodeMetrics {
require.Contains(t, node.Metrics, m)
}
require.Greater(t, len(node.StoreStatuses), 0)
for _, storeStatus := range node.StoreStatuses {
for _, m := range uiStoreMetrics {
require.Contains(t, storeStatus.Metrics, m)
}
}
}
}

0 comments on commit 358b42f

Please sign in to comment.