From 3d0655fb425db07a60221e37e9282ba8ab9a36f2 Mon Sep 17 00:00:00 2001 From: David Turner Date: Mon, 31 Jan 2022 10:54:10 +0000 Subject: [PATCH] Add detail to slow cluster state warning message (#83221) Today if writing the cluster state is slow then we report how many index metadata documents we wrote but we don't mention how many documents were deleted. Deleting docs might trigger a merge and explain why the update is slower than expected. This commit adds info about the number of removed indices and also distinguishes new indices from updates to existing indices, since they also have different behaviour in terms of deletes. --- docs/changelog/83221.yaml | 5 ++ .../gateway/PersistedClusterStateService.java | 75 +++++++++++-------- .../PersistedClusterStateServiceTests.java | 18 +++-- 3 files changed, 60 insertions(+), 38 deletions(-) create mode 100644 docs/changelog/83221.yaml diff --git a/docs/changelog/83221.yaml b/docs/changelog/83221.yaml new file mode 100644 index 000000000000..957bf0020499 --- /dev/null +++ b/docs/changelog/83221.yaml @@ -0,0 +1,5 @@ +pr: 83221 +summary: Add detail to slow cluster state warning message +area: Cluster Coordination +type: enhancement +issues: [] diff --git a/server/src/main/java/org/elasticsearch/gateway/PersistedClusterStateService.java b/server/src/main/java/org/elasticsearch/gateway/PersistedClusterStateService.java index b075135e82d4..61ee1b170611 100644 --- a/server/src/main/java/org/elasticsearch/gateway/PersistedClusterStateService.java +++ b/server/src/main/java/org/elasticsearch/gateway/PersistedClusterStateService.java @@ -84,6 +84,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; @@ -770,18 +771,13 @@ public void writeFullStateAndCommit(long currentTerm, ClusterState clusterState) final TimeValue finalSlowWriteLoggingThreshold = slowWriteLoggingThresholdSupplier.get(); if (durationMillis >= finalSlowWriteLoggingThreshold.getMillis()) { logger.warn( - "writing cluster state took [{}ms] which is above the warn threshold of [{}]; " - + "wrote full state with [{}] indices", + "writing full cluster state took [{}ms] which is above the warn threshold of [{}]; {}", durationMillis, finalSlowWriteLoggingThreshold, - stats.numIndicesUpdated + stats ); } else { - logger.debug( - "writing cluster state took [{}ms]; " + "wrote full state with [{}] indices", - durationMillis, - stats.numIndicesUpdated - ); + logger.debug("writing full cluster state took [{}ms]; {}", durationMillis, stats); } } finally { closeIfAnyIndexWriterHasTragedyOrIsClosed(); @@ -809,23 +805,13 @@ void writeIncrementalStateAndCommit(long currentTerm, ClusterState previousClust final TimeValue finalSlowWriteLoggingThreshold = slowWriteLoggingThresholdSupplier.get(); if (durationMillis >= finalSlowWriteLoggingThreshold.getMillis()) { logger.warn( - "writing cluster state took [{}ms] which is above the warn threshold of [{}]; " - + "wrote global metadata [{}] and metadata for [{}] indices and skipped [{}] unchanged indices", + "writing cluster state took [{}ms] which is above the warn threshold of [{}]; {}", durationMillis, finalSlowWriteLoggingThreshold, - stats.globalMetaUpdated, - stats.numIndicesUpdated, - stats.numIndicesUnchanged + stats ); } else { - logger.debug( - "writing cluster state took [{}ms]; " - + "wrote global metadata [{}] and metadata for [{}] indices and skipped [{}] unchanged indices", - durationMillis, - stats.globalMetaUpdated, - stats.numIndicesUpdated, - stats.numIndicesUnchanged - ); + logger.debug("writing cluster state took [{}ms]; {}", durationMillis, stats); } } finally { closeIfAnyIndexWriterHasTragedyOrIsClosed(); @@ -864,7 +850,9 @@ private WriterStats updateMetadata(Metadata previouslyWrittenMetadata, Metadata assert previousValue == null : indexMetadata.getIndexUUID() + " already mapped to " + previousValue; } + int numIndicesAdded = 0; int numIndicesUpdated = 0; + int numIndicesRemoved = 0; int numIndicesUnchanged = 0; for (IndexMetadata indexMetadata : metadata.indices().values()) { final Long previousVersion = indexMetadataVersionByUUID.get(indexMetadata.getIndexUUID()); @@ -875,7 +863,11 @@ private WriterStats updateMetadata(Metadata previouslyWrittenMetadata, Metadata previousVersion, indexMetadata.getVersion() ); - numIndicesUpdated++; + if (previousVersion == null) { + numIndicesAdded++; + } else { + numIndicesUpdated++; + } for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) { metadataIndexWriter.deleteIndexMetadata(indexMetadata.getIndexUUID()); @@ -891,6 +883,7 @@ private WriterStats updateMetadata(Metadata previouslyWrittenMetadata, Metadata for (String removedIndexUUID : indexMetadataVersionByUUID.keySet()) { for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) { + numIndicesRemoved++; metadataIndexWriter.deleteIndexMetadata(removedIndexUUID); } } @@ -901,7 +894,7 @@ private WriterStats updateMetadata(Metadata previouslyWrittenMetadata, Metadata metadataIndexWriter.flush(); } - return new WriterStats(updateGlobalMeta, numIndicesUpdated, numIndicesUnchanged); + return new WriterStats(false, updateGlobalMeta, numIndicesUnchanged, numIndicesAdded, numIndicesUpdated, numIndicesRemoved); } private static int lastPageValue(boolean isLastPage) { @@ -977,7 +970,7 @@ private WriterStats addMetadata(Metadata metadata) throws IOException { metadataIndexWriter.flush(); } - return new WriterStats(true, metadata.indices().size(), 0); + return new WriterStats(true, true, 0, 0, metadata.indices().size(), 0); } public void writeIncrementalTermUpdateAndCommit(long currentTerm, long lastAcceptedVersion, Version oldestIndexVersion) @@ -1056,15 +1049,31 @@ public void close() throws IOException { } } - static class WriterStats { - final boolean globalMetaUpdated; - final long numIndicesUpdated; - final long numIndicesUnchanged; - - WriterStats(boolean globalMetaUpdated, long numIndicesUpdated, long numIndicesUnchanged) { - this.globalMetaUpdated = globalMetaUpdated; - this.numIndicesUpdated = numIndicesUpdated; - this.numIndicesUnchanged = numIndicesUnchanged; + private record WriterStats( + boolean isFullWrite, + boolean globalMetaUpdated, + int numIndicesUnchanged, + int numIndicesAdded, + int numIndicesUpdated, + int numIndicesRemoved + ) { + @Override + public String toString() { + if (isFullWrite) { + return String.format(Locale.ROOT, "wrote global metadata and metadata for [%d] indices", numIndicesUpdated); + } else { + return String.format( + Locale.ROOT, + """ + [%s] global metadata, wrote metadata for [%d] new indices and [%d] existing indices, \ + removed metadata for [%d] indices and skipped [%d] unchanged indices""", + globalMetaUpdated ? "wrote" : "skipped writing", + numIndicesAdded, + numIndicesUpdated, + numIndicesRemoved, + numIndicesUnchanged + ); + } } } } diff --git a/server/src/test/java/org/elasticsearch/gateway/PersistedClusterStateServiceTests.java b/server/src/test/java/org/elasticsearch/gateway/PersistedClusterStateServiceTests.java index 75e4ac19af15..8c967aec47df 100644 --- a/server/src/test/java/org/elasticsearch/gateway/PersistedClusterStateServiceTests.java +++ b/server/src/test/java/org/elasticsearch/gateway/PersistedClusterStateServiceTests.java @@ -1192,7 +1192,9 @@ public void testSlowLogging() throws IOException, IllegalAccessException { "should see warning at threshold", PersistedClusterStateService.class.getCanonicalName(), Level.WARN, - "writing cluster state took [*] which is above the warn threshold of [*]; " + "wrote full state with [0] indices" + """ + writing full cluster state took [*] which is above the warn threshold of [*]; \ + wrote global metadata and metadata for [0] indices""" ) ); @@ -1206,7 +1208,9 @@ public void testSlowLogging() throws IOException, IllegalAccessException { "should see warning above threshold", PersistedClusterStateService.class.getCanonicalName(), Level.WARN, - "writing cluster state took [*] which is above the warn threshold of [*]; " + "wrote full state with [0] indices" + """ + writing full cluster state took [*] which is above the warn threshold of [*]; \ + wrote global metadata and metadata for [0] indices""" ) ); @@ -1238,7 +1242,9 @@ public void testSlowLogging() throws IOException, IllegalAccessException { "should see warning at reduced threshold", PersistedClusterStateService.class.getCanonicalName(), Level.WARN, - "writing cluster state took [*] which is above the warn threshold of [*]; " + "wrote full state with [0] indices" + """ + writing full cluster state took [*] which is above the warn threshold of [*]; \ + wrote global metadata and metadata for [0] indices""" ) ); @@ -1269,8 +1275,10 @@ public void testSlowLogging() throws IOException, IllegalAccessException { "should see warning at threshold", PersistedClusterStateService.class.getCanonicalName(), Level.WARN, - "writing cluster state took [*] which is above the warn threshold of [*]; " - + "wrote global metadata [false] and metadata for [1] indices and skipped [0] unchanged indices" + """ + writing cluster state took [*] which is above the warn threshold of [*]; [skipped writing] global metadata, \ + wrote metadata for [1] new indices and [0] existing indices, removed metadata for [0] indices and \ + skipped [0] unchanged indices""" ) );