Skip to content

Commit

Permalink
Add detail to slow cluster state warning message (elastic#83221)
Browse files Browse the repository at this point in the history
Today if writing the cluster state is slow then we report how many index
metadata documents we wrote but we don't mention how many documents were
deleted. Deleting docs might trigger a merge and explain why the update
is slower than expected.

This commit adds info about the number of removed indices and also
distinguishes new indices from updates to existing indices, since they
also have different behaviour in terms of deletes.
  • Loading branch information
DaveCTurner authored Jan 31, 2022
1 parent e51bf31 commit 3d0655f
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 38 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/83221.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 83221
summary: Add detail to slow cluster state warning message
area: Cluster Coordination
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
Expand Down Expand Up @@ -770,18 +771,13 @@ public void writeFullStateAndCommit(long currentTerm, ClusterState clusterState)
final TimeValue finalSlowWriteLoggingThreshold = slowWriteLoggingThresholdSupplier.get();
if (durationMillis >= finalSlowWriteLoggingThreshold.getMillis()) {
logger.warn(
"writing cluster state took [{}ms] which is above the warn threshold of [{}]; "
+ "wrote full state with [{}] indices",
"writing full cluster state took [{}ms] which is above the warn threshold of [{}]; {}",
durationMillis,
finalSlowWriteLoggingThreshold,
stats.numIndicesUpdated
stats
);
} else {
logger.debug(
"writing cluster state took [{}ms]; " + "wrote full state with [{}] indices",
durationMillis,
stats.numIndicesUpdated
);
logger.debug("writing full cluster state took [{}ms]; {}", durationMillis, stats);
}
} finally {
closeIfAnyIndexWriterHasTragedyOrIsClosed();
Expand Down Expand Up @@ -809,23 +805,13 @@ void writeIncrementalStateAndCommit(long currentTerm, ClusterState previousClust
final TimeValue finalSlowWriteLoggingThreshold = slowWriteLoggingThresholdSupplier.get();
if (durationMillis >= finalSlowWriteLoggingThreshold.getMillis()) {
logger.warn(
"writing cluster state took [{}ms] which is above the warn threshold of [{}]; "
+ "wrote global metadata [{}] and metadata for [{}] indices and skipped [{}] unchanged indices",
"writing cluster state took [{}ms] which is above the warn threshold of [{}]; {}",
durationMillis,
finalSlowWriteLoggingThreshold,
stats.globalMetaUpdated,
stats.numIndicesUpdated,
stats.numIndicesUnchanged
stats
);
} else {
logger.debug(
"writing cluster state took [{}ms]; "
+ "wrote global metadata [{}] and metadata for [{}] indices and skipped [{}] unchanged indices",
durationMillis,
stats.globalMetaUpdated,
stats.numIndicesUpdated,
stats.numIndicesUnchanged
);
logger.debug("writing cluster state took [{}ms]; {}", durationMillis, stats);
}
} finally {
closeIfAnyIndexWriterHasTragedyOrIsClosed();
Expand Down Expand Up @@ -864,7 +850,9 @@ private WriterStats updateMetadata(Metadata previouslyWrittenMetadata, Metadata
assert previousValue == null : indexMetadata.getIndexUUID() + " already mapped to " + previousValue;
}

int numIndicesAdded = 0;
int numIndicesUpdated = 0;
int numIndicesRemoved = 0;
int numIndicesUnchanged = 0;
for (IndexMetadata indexMetadata : metadata.indices().values()) {
final Long previousVersion = indexMetadataVersionByUUID.get(indexMetadata.getIndexUUID());
Expand All @@ -875,7 +863,11 @@ private WriterStats updateMetadata(Metadata previouslyWrittenMetadata, Metadata
previousVersion,
indexMetadata.getVersion()
);
numIndicesUpdated++;
if (previousVersion == null) {
numIndicesAdded++;
} else {
numIndicesUpdated++;
}

for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.deleteIndexMetadata(indexMetadata.getIndexUUID());
Expand All @@ -891,6 +883,7 @@ private WriterStats updateMetadata(Metadata previouslyWrittenMetadata, Metadata

for (String removedIndexUUID : indexMetadataVersionByUUID.keySet()) {
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
numIndicesRemoved++;
metadataIndexWriter.deleteIndexMetadata(removedIndexUUID);
}
}
Expand All @@ -901,7 +894,7 @@ private WriterStats updateMetadata(Metadata previouslyWrittenMetadata, Metadata
metadataIndexWriter.flush();
}

return new WriterStats(updateGlobalMeta, numIndicesUpdated, numIndicesUnchanged);
return new WriterStats(false, updateGlobalMeta, numIndicesUnchanged, numIndicesAdded, numIndicesUpdated, numIndicesRemoved);
}

private static int lastPageValue(boolean isLastPage) {
Expand Down Expand Up @@ -977,7 +970,7 @@ private WriterStats addMetadata(Metadata metadata) throws IOException {
metadataIndexWriter.flush();
}

return new WriterStats(true, metadata.indices().size(), 0);
return new WriterStats(true, true, 0, 0, metadata.indices().size(), 0);
}

public void writeIncrementalTermUpdateAndCommit(long currentTerm, long lastAcceptedVersion, Version oldestIndexVersion)
Expand Down Expand Up @@ -1056,15 +1049,31 @@ public void close() throws IOException {
}
}

static class WriterStats {
final boolean globalMetaUpdated;
final long numIndicesUpdated;
final long numIndicesUnchanged;

WriterStats(boolean globalMetaUpdated, long numIndicesUpdated, long numIndicesUnchanged) {
this.globalMetaUpdated = globalMetaUpdated;
this.numIndicesUpdated = numIndicesUpdated;
this.numIndicesUnchanged = numIndicesUnchanged;
private record WriterStats(
boolean isFullWrite,
boolean globalMetaUpdated,
int numIndicesUnchanged,
int numIndicesAdded,
int numIndicesUpdated,
int numIndicesRemoved
) {
@Override
public String toString() {
if (isFullWrite) {
return String.format(Locale.ROOT, "wrote global metadata and metadata for [%d] indices", numIndicesUpdated);
} else {
return String.format(
Locale.ROOT,
"""
[%s] global metadata, wrote metadata for [%d] new indices and [%d] existing indices, \
removed metadata for [%d] indices and skipped [%d] unchanged indices""",
globalMetaUpdated ? "wrote" : "skipped writing",
numIndicesAdded,
numIndicesUpdated,
numIndicesRemoved,
numIndicesUnchanged
);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1192,7 +1192,9 @@ public void testSlowLogging() throws IOException, IllegalAccessException {
"should see warning at threshold",
PersistedClusterStateService.class.getCanonicalName(),
Level.WARN,
"writing cluster state took [*] which is above the warn threshold of [*]; " + "wrote full state with [0] indices"
"""
writing full cluster state took [*] which is above the warn threshold of [*]; \
wrote global metadata and metadata for [0] indices"""
)
);

Expand All @@ -1206,7 +1208,9 @@ public void testSlowLogging() throws IOException, IllegalAccessException {
"should see warning above threshold",
PersistedClusterStateService.class.getCanonicalName(),
Level.WARN,
"writing cluster state took [*] which is above the warn threshold of [*]; " + "wrote full state with [0] indices"
"""
writing full cluster state took [*] which is above the warn threshold of [*]; \
wrote global metadata and metadata for [0] indices"""
)
);

Expand Down Expand Up @@ -1238,7 +1242,9 @@ public void testSlowLogging() throws IOException, IllegalAccessException {
"should see warning at reduced threshold",
PersistedClusterStateService.class.getCanonicalName(),
Level.WARN,
"writing cluster state took [*] which is above the warn threshold of [*]; " + "wrote full state with [0] indices"
"""
writing full cluster state took [*] which is above the warn threshold of [*]; \
wrote global metadata and metadata for [0] indices"""
)
);

Expand Down Expand Up @@ -1269,8 +1275,10 @@ public void testSlowLogging() throws IOException, IllegalAccessException {
"should see warning at threshold",
PersistedClusterStateService.class.getCanonicalName(),
Level.WARN,
"writing cluster state took [*] which is above the warn threshold of [*]; "
+ "wrote global metadata [false] and metadata for [1] indices and skipped [0] unchanged indices"
"""
writing cluster state took [*] which is above the warn threshold of [*]; [skipped writing] global metadata, \
wrote metadata for [1] new indices and [0] existing indices, removed metadata for [0] indices and \
skipped [0] unchanged indices"""
)
);

Expand Down

0 comments on commit 3d0655f

Please sign in to comment.