From f6f3d69c00c4bd00b5146ff48c2e5ffaf754255a Mon Sep 17 00:00:00 2001 From: Sumit Bansal Date: Wed, 24 Jul 2024 17:36:12 +0530 Subject: [PATCH] add logs in coordinator::publish flow --- .../cluster/coordination/Coordinator.java | 31 +++++++++++++++++++ .../routing/allocation/AllocationService.java | 31 ++++++++++++------- .../service/ClusterApplierService.java | 2 ++ 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/server/src/main/java/org/opensearch/cluster/coordination/Coordinator.java b/server/src/main/java/org/opensearch/cluster/coordination/Coordinator.java index 87f02c6891be6..93278053a40eb 100644 --- a/server/src/main/java/org/opensearch/cluster/coordination/Coordinator.java +++ b/server/src/main/java/org/opensearch/cluster/coordination/Coordinator.java @@ -1292,6 +1292,7 @@ public void publish( ) { try { synchronized (mutex) { + long overallStartTimeNS = System.nanoTime(); if (mode != Mode.LEADER || getCurrentTerm() != clusterChangedEvent.state().term()) { logger.debug( () -> new ParameterizedMessage( @@ -1349,11 +1350,41 @@ assert getLocalNode().equals(clusterState.getNodes().get(getLocalNode().getId()) currentPublication = Optional.of(publication); final DiscoveryNodes publishNodes = publishRequest.getAcceptedState().nodes(); + long startTimeNS = System.nanoTime(); leaderChecker.setCurrentNodes(publishNodes); + logger.info( + "[Custom Log] Coordinator, leaderChecker.setCurrentNodes latency: {} ms", + TimeValue.nsecToMSec(System.nanoTime() - startTimeNS) + ); + startTimeNS = System.nanoTime(); followersChecker.setCurrentNodes(publishNodes); + logger.info( + "[Custom Log] Coordinator, followersChecker.setCurrentNodes latency: {} ms", + TimeValue.nsecToMSec(System.nanoTime() - startTimeNS) + ); + startTimeNS = System.nanoTime(); lagDetector.setTrackedNodes(publishNodes); + logger.info( + "[Custom Log] Coordinator, lagDetector.setTrackedNodes latency: {} ms", + TimeValue.nsecToMSec(System.nanoTime() - startTimeNS) + ); + startTimeNS = System.nanoTime(); coordinationState.get().handlePrePublish(clusterState); + logger.info( + "[Custom Log] Coordinator, coordinationState.get().handlePrePublish latency: {} ms", + TimeValue.nsecToMSec(System.nanoTime() - startTimeNS) + ); + startTimeNS = System.nanoTime(); publication.start(followersChecker.getFaultyNodes()); + logger.info( + "[Custom Log] Coordinator, publication.start(followersChecker.getFaultyNodes) latency: {} ms", + TimeValue.nsecToMSec(System.nanoTime() - startTimeNS) + ); + + logger.info( + "[Custom Log] Coordinator::publish latency: {} ms", + TimeValue.nsecToMSec(System.nanoTime() - overallStartTimeNS) + ); } } catch (Exception e) { logger.debug(() -> new ParameterizedMessage("[{}] publishing failed", clusterChangedEvent.source()), e); diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/AllocationService.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/AllocationService.java index 2184e40011e04..263dc0431def6 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/AllocationService.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/AllocationService.java @@ -563,11 +563,29 @@ private void reroute(RoutingAllocation allocation) { : "auto-expand replicas out of sync with number of nodes in the cluster"; assert assertInitialized(); long rerouteStartTimeNS = System.nanoTime(); + long startTimeNS = System.nanoTime(); removeDelayMarkers(allocation); - + logger.info( + "[Custom Log] AllocationService, removeDelayMarkers latency: {} ms", + TimeValue.nsecToMSec(System.nanoTime() - startTimeNS) + ); + startTimeNS = System.nanoTime(); allocateExistingUnassignedShards(allocation); // try to allocate existing shard copies first + logger.info( + "[Custom Log] AllocationService, allocateExistingUnassignedShards latency: {} ms", + TimeValue.nsecToMSec(System.nanoTime() - startTimeNS) + ); + startTimeNS = System.nanoTime(); shardsAllocator.allocate(allocation); - logger.info("[Custom Log] AllocationService, reroute latency: {} ms", TimeValue.nsecToMSec(System.nanoTime() - rerouteStartTimeNS)); + logger.info( + "[Custom Log] AllocationService, shardsAllocator.allocate latency: {} ms", + TimeValue.nsecToMSec(System.nanoTime() - startTimeNS) + ); + + logger.info( + "[Custom Log] AllocationService, Overall reroute latency: {} ms", + TimeValue.nsecToMSec(System.nanoTime() - rerouteStartTimeNS) + ); clusterManagerMetrics.recordLatency( clusterManagerMetrics.rerouteHistogram, (double) Math.max(0, TimeValue.nsecToMSec(System.nanoTime() - rerouteStartTimeNS)) @@ -576,7 +594,6 @@ private void reroute(RoutingAllocation allocation) { } private void allocateExistingUnassignedShards(RoutingAllocation allocation) { - long latencyStartTimeInNs = System.nanoTime(); allocation.routingNodes().unassigned().sort(PriorityComparator.getAllocationComparator(allocation)); // sort for priority ordering for (final ExistingShardsAllocator existingShardsAllocator : existingShardsAllocators.values()) { @@ -592,10 +609,6 @@ private void allocateExistingUnassignedShards(RoutingAllocation allocation) { Currently AllocationService will not run any custom Allocator that implements allocateAllUnassignedShards */ allocateAllUnassignedShards(allocation); - logger.info( - "[Custom Log] AllocationService, allocateExistingUnassignedShards latency: {} ms", - TimeValue.nsecToMSec(System.nanoTime() - latencyStartTimeInNs) - ); return; } logger.warn("Falling back to single shard assignment since batch mode disable or multiple custom allocators set"); @@ -619,10 +632,6 @@ private void allocateExistingUnassignedShards(RoutingAllocation allocation) { getAllocatorForShard(shardRouting, allocation).allocateUnassigned(shardRouting, allocation, replicaIterator); } } - logger.info( - "[Custom Log] AllocationService, allocateExistingUnassignedShards latency: {} ms", - TimeValue.nsecToMSec(System.nanoTime() - latencyStartTimeInNs) - ); } private void allocateAllUnassignedShards(RoutingAllocation allocation) { diff --git a/server/src/main/java/org/opensearch/cluster/service/ClusterApplierService.java b/server/src/main/java/org/opensearch/cluster/service/ClusterApplierService.java index 21b2ebd680e6b..d3ef5faa41f10 100644 --- a/server/src/main/java/org/opensearch/cluster/service/ClusterApplierService.java +++ b/server/src/main/java/org/opensearch/cluster/service/ClusterApplierService.java @@ -201,7 +201,9 @@ public ClusterState apply(ClusterState clusterState) { @Override public void run() { + long startTimeInNs = System.nanoTime(); runTask(this); + logger.info("[Custom Log] ClusterApplierService, run latency: {} ms", TimeValue.nsecToMSec(System.nanoTime() - startTimeInNs)); } }