diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cf866733c5ef..4622126868b36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add vertical scaling and SoftReference for snapshot repository data cache ([#16489](https://github.com/opensearch-project/OpenSearch/pull/16489)) - Support prefix list for remote repository attributes([#16271](https://github.com/opensearch-project/OpenSearch/pull/16271)) - Add new configuration setting `synonym_analyzer`, to the `synonym` and `synonym_graph` filters, enabling the specification of a custom analyzer for reading the synonym file ([#16488](https://github.com/opensearch-project/OpenSearch/pull/16488)). +- Add stats for remote publication failure and move download failure stats to remote methods([#16682](https://github.com/opensearch-project/OpenSearch/pull/16682/)) ### Dependencies - Bump `com.google.cloud:google-cloud-core-http` from 2.23.0 to 2.47.0 ([#16504](https://github.com/opensearch-project/OpenSearch/pull/16504)) @@ -36,6 +37,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Bump `Netty` from 4.1.114.Final to 4.1.115.Final ([#16661](https://github.com/opensearch-project/OpenSearch/pull/16661)) - Bump `org.xerial.snappy:snappy-java` from 1.1.10.6 to 1.1.10.7 ([#16665](https://github.com/opensearch-project/OpenSearch/pull/16665)) - Bump `codecov/codecov-action` from 4 to 5 ([#16667](https://github.com/opensearch-project/OpenSearch/pull/16667)) +- Bump `org.apache.logging.log4j:log4j-core` from 2.24.1 to 2.24.2 ([#16718](https://github.com/opensearch-project/OpenSearch/pull/16718)) ### Changed @@ -53,6 +55,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Fix permissions error on scripted query against remote snapshot ([#16544](https://github.com/opensearch-project/OpenSearch/pull/16544)) - Fix `doc_values` only (`index:false`) IP field searching for masks ([#16628](https://github.com/opensearch-project/OpenSearch/pull/16628)) - Fix stale cluster state custom file deletion ([#16670](https://github.com/opensearch-project/OpenSearch/pull/16670)) +- [Tiered Caching] Fix bug in cache stats API ([#16560](https://github.com/opensearch-project/OpenSearch/pull/16560)) - Ensure consistency of system flag on IndexMetadata after diff is applied ([#16644](https://github.com/opensearch-project/OpenSearch/pull/16644)) ### Security diff --git a/buildSrc/src/testKit/thirdPartyAudit/sample_jars/build.gradle b/buildSrc/src/testKit/thirdPartyAudit/sample_jars/build.gradle index 4d425964c77af..3db2a6e7c2733 100644 --- a/buildSrc/src/testKit/thirdPartyAudit/sample_jars/build.gradle +++ b/buildSrc/src/testKit/thirdPartyAudit/sample_jars/build.gradle @@ -17,7 +17,7 @@ repositories { } dependencies { - implementation "org.apache.logging.log4j:log4j-core:2.24.1" + implementation "org.apache.logging.log4j:log4j-core:2.24.2" } ["0.0.1", "0.0.2"].forEach { v -> diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 3bfe9cc6bd3c2..ec480eaeb61ef 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -11,7 +11,7 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.11-all.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.11.1-all.zip zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists -distributionSha256Sum=73d2d553933194d8eefed0a291acbe45392ca3572ba13834cbbf373da375276d +distributionSha256Sum=89d4e70e4e84e2d2dfbb63e4daa53e21b25017cc70c37e4eea31ee51fb15098a diff --git a/modules/cache-common/src/internalClusterTest/java/org/opensearch/cache/common/tier/TieredSpilloverCacheStatsIT.java b/modules/cache-common/src/internalClusterTest/java/org/opensearch/cache/common/tier/TieredSpilloverCacheStatsIT.java index fe6bd7050a8f3..a858e94ad1609 100644 --- a/modules/cache-common/src/internalClusterTest/java/org/opensearch/cache/common/tier/TieredSpilloverCacheStatsIT.java +++ b/modules/cache-common/src/internalClusterTest/java/org/opensearch/cache/common/tier/TieredSpilloverCacheStatsIT.java @@ -10,6 +10,7 @@ import org.opensearch.action.admin.cluster.node.stats.NodesStatsRequest; import org.opensearch.action.admin.cluster.node.stats.NodesStatsResponse; +import org.opensearch.action.admin.indices.delete.DeleteIndexRequest; import org.opensearch.action.admin.indices.forcemerge.ForceMergeResponse; import org.opensearch.action.admin.indices.stats.CommonStatsFlags; import org.opensearch.action.search.SearchResponse; @@ -40,6 +41,7 @@ import static org.opensearch.cache.common.tier.TieredSpilloverCacheStatsHolder.TIER_DIMENSION_NAME; import static org.opensearch.cache.common.tier.TieredSpilloverCacheStatsHolder.TIER_DIMENSION_VALUE_DISK; import static org.opensearch.cache.common.tier.TieredSpilloverCacheStatsHolder.TIER_DIMENSION_VALUE_ON_HEAP; +import static org.opensearch.indices.IndicesService.INDICES_CACHE_CLEAN_INTERVAL_SETTING; import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertSearchResponse; @@ -417,6 +419,55 @@ public void testStatsWithMultipleSegments() throws Exception { assertTrue(diskCacheStat.getEvictions() == 0); } + public void testClosingShard() throws Exception { + // Closing the shard should totally remove the stats associated with that shard. + internalCluster().startNodes( + 1, + Settings.builder() + .put(defaultSettings(HEAP_CACHE_SIZE_STRING, getNumberOfSegments())) + .put( + TieredSpilloverCacheSettings.TOOK_TIME_POLICY_CONCRETE_SETTINGS_MAP.get(CacheType.INDICES_REQUEST_CACHE).getKey(), + new TimeValue(0, TimeUnit.SECONDS) + ) + .put(INDICES_CACHE_CLEAN_INTERVAL_SETTING.getKey(), new TimeValue(1)) + .build() + ); + String index = "index"; + Client client = client(); + startIndex(client, index); + + // First search one time to see how big a single value will be + searchIndex(client, index, 0); + // get total stats + long singleSearchSize = getTotalStats(client).getSizeInBytes(); + // Select numbers so we get some values on both heap and disk + int itemsOnHeap = HEAP_CACHE_SIZE / (int) singleSearchSize; + int itemsOnDisk = 1 + randomInt(30); // The first one we search (to get the size) always goes to disk + int expectedEntries = itemsOnHeap + itemsOnDisk; + + for (int i = 1; i < expectedEntries; i++) { + // Cause misses + searchIndex(client, index, i); + } + int expectedMisses = itemsOnHeap + itemsOnDisk; + + // Cause some hits + int expectedHits = randomIntBetween(itemsOnHeap, expectedEntries); // Select it so some hits come from both tiers + for (int i = 0; i < expectedHits; i++) { + searchIndex(client, index, i); + } + + // Check the new stats API values are as expected + assertEquals( + new ImmutableCacheStats(expectedHits, expectedMisses, 0, expectedEntries * singleSearchSize, expectedEntries), + getTotalStats(client) + ); + + // Closing the index should close the shard + assertAcked(client().admin().indices().delete(new DeleteIndexRequest("index")).get()); + assertEquals(new ImmutableCacheStats(0, 0, 0, 0, 0), getTotalStats(client)); + } + private void startIndex(Client client, String indexName) throws InterruptedException { assertAcked( client.admin() diff --git a/modules/cache-common/src/main/java/org/opensearch/cache/common/tier/TieredSpilloverCache.java b/modules/cache-common/src/main/java/org/opensearch/cache/common/tier/TieredSpilloverCache.java index ab5335ca0ca66..38a6915ffd10e 100644 --- a/modules/cache-common/src/main/java/org/opensearch/cache/common/tier/TieredSpilloverCache.java +++ b/modules/cache-common/src/main/java/org/opensearch/cache/common/tier/TieredSpilloverCache.java @@ -373,12 +373,10 @@ private V compute(ICacheKey key, LoadAwareCacheLoader, V> loader @Override public void invalidate(ICacheKey key) { - for (Map.Entry, TierInfo> cacheEntry : caches.entrySet()) { - if (key.getDropStatsForDimensions()) { - List dimensionValues = statsHolder.getDimensionsWithTierValue(key.dimensions, cacheEntry.getValue().tierName); - statsHolder.removeDimensions(dimensionValues); - } - if (key.key != null) { + if (key.getDropStatsForDimensions()) { + statsHolder.removeDimensions(key.dimensions); + } else if (key.key != null) { + for (Map.Entry, TierInfo> cacheEntry : caches.entrySet()) { try (ReleasableLock ignore = writeLock.acquire()) { cacheEntry.getKey().invalidate(key); } diff --git a/modules/cache-common/src/main/java/org/opensearch/cache/common/tier/TieredSpilloverCacheStatsHolder.java b/modules/cache-common/src/main/java/org/opensearch/cache/common/tier/TieredSpilloverCacheStatsHolder.java index b40724430454b..7ea6d3504a52c 100644 --- a/modules/cache-common/src/main/java/org/opensearch/cache/common/tier/TieredSpilloverCacheStatsHolder.java +++ b/modules/cache-common/src/main/java/org/opensearch/cache/common/tier/TieredSpilloverCacheStatsHolder.java @@ -43,6 +43,8 @@ public class TieredSpilloverCacheStatsHolder extends DefaultCacheStatsHolder { /** Dimension value for on-disk cache, like EhcacheDiskCache. */ public static final String TIER_DIMENSION_VALUE_DISK = "disk"; + static final List TIER_VALUES = List.of(TIER_DIMENSION_VALUE_ON_HEAP, TIER_DIMENSION_VALUE_DISK); + /** * Constructor for the stats holder. * @param originalDimensionNames the original dimension names, not including TIER_DIMENSION_NAME @@ -167,4 +169,17 @@ public void decrementItems(List dimensionValues) { void setDiskCacheEnabled(boolean diskCacheEnabled) { this.diskCacheEnabled = diskCacheEnabled; } + + @Override + public void removeDimensions(List dimensionValues) { + assert dimensionValues.size() == dimensionNames.size() - 1 + : "Must specify a value for every dimension except tier when removing from StatsHolder"; + // As we are removing nodes from the tree, obtain the lock + lock.lock(); + try { + removeDimensionsHelper(dimensionValues, statsRoot, 0); + } finally { + lock.unlock(); + } + } } diff --git a/modules/cache-common/src/test/java/org/opensearch/cache/common/tier/TieredSpilloverCacheStatsHolderTests.java b/modules/cache-common/src/test/java/org/opensearch/cache/common/tier/TieredSpilloverCacheStatsHolderTests.java new file mode 100644 index 0000000000000..09273a0761663 --- /dev/null +++ b/modules/cache-common/src/test/java/org/opensearch/cache/common/tier/TieredSpilloverCacheStatsHolderTests.java @@ -0,0 +1,378 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.cache.common.tier; + +import org.opensearch.common.Randomness; +import org.opensearch.common.cache.stats.CacheStats; +import org.opensearch.common.cache.stats.DefaultCacheStatsHolder; +import org.opensearch.common.cache.stats.ImmutableCacheStats; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CountDownLatch; + +import static org.opensearch.cache.common.tier.TieredSpilloverCacheStatsHolder.TIER_DIMENSION_VALUE_DISK; +import static org.opensearch.cache.common.tier.TieredSpilloverCacheStatsHolder.TIER_DIMENSION_VALUE_ON_HEAP; +import static org.opensearch.cache.common.tier.TieredSpilloverCacheStatsHolder.TIER_VALUES; + +public class TieredSpilloverCacheStatsHolderTests extends OpenSearchTestCase { + // These are modified from DefaultCacheStatsHolderTests.java to account for the tiers. Because we can't add a dependency on server.test, + // we can't reuse the same code. + + public void testAddAndGet() throws Exception { + for (boolean diskTierEnabled : List.of(true, false)) { + List dimensionNames = List.of("dim1", "dim2", "dim3", "dim4"); + TieredSpilloverCacheStatsHolder cacheStatsHolder = new TieredSpilloverCacheStatsHolder(dimensionNames, diskTierEnabled); + Map> usedDimensionValues = getUsedDimensionValues(cacheStatsHolder, 10, diskTierEnabled); + Map, CacheStats> expected = populateStats(cacheStatsHolder, usedDimensionValues, 1000, 10, diskTierEnabled); + + // test the value in the map is as expected for each distinct combination of values (all leaf nodes) + for (List dimensionValues : expected.keySet()) { + CacheStats expectedCounter = expected.get(dimensionValues); + ImmutableCacheStats actualStatsHolder = getNode(dimensionValues, cacheStatsHolder.getStatsRoot()).getImmutableStats(); + ImmutableCacheStats actualCacheStats = getNode(dimensionValues, cacheStatsHolder.getStatsRoot()).getImmutableStats(); + assertEquals(expectedCounter.immutableSnapshot(), actualStatsHolder); + assertEquals(expectedCounter.immutableSnapshot(), actualCacheStats); + } + + // Check overall total matches + CacheStats expectedTotal = new CacheStats(); + for (List dims : expected.keySet()) { + CacheStats other = expected.get(dims); + boolean countMissesAndEvictionsTowardsTotal = dims.get(dims.size() - 1).equals(TIER_DIMENSION_VALUE_DISK) + || !diskTierEnabled; + add(expectedTotal, other, countMissesAndEvictionsTowardsTotal); + } + assertEquals(expectedTotal.immutableSnapshot(), cacheStatsHolder.getStatsRoot().getImmutableStats()); + } + } + + private void add(CacheStats original, CacheStats other, boolean countMissesAndEvictionsTowardsTotal) { + // Add other to original, accounting for whether other is from the heap or disk tier + long misses = 0; + long evictions = 0; + if (countMissesAndEvictionsTowardsTotal) { + misses = other.getMisses(); + evictions = other.getEvictions(); + } + CacheStats modifiedOther = new CacheStats(other.getHits(), misses, evictions, other.getSizeInBytes(), other.getItems()); + original.add(modifiedOther); + } + + public void testReset() throws Exception { + List dimensionNames = List.of("dim1", "dim2"); + TieredSpilloverCacheStatsHolder cacheStatsHolder = new TieredSpilloverCacheStatsHolder(dimensionNames, true); + Map> usedDimensionValues = getUsedDimensionValues(cacheStatsHolder, 10, true); + Map, CacheStats> expected = populateStats(cacheStatsHolder, usedDimensionValues, 100, 10, true); + + cacheStatsHolder.reset(); + for (List dimensionValues : expected.keySet()) { + CacheStats originalCounter = expected.get(dimensionValues); + ImmutableCacheStats expectedTotal = new ImmutableCacheStats( + originalCounter.getHits(), + originalCounter.getMisses(), + originalCounter.getEvictions(), + 0, + 0 + ); + + DefaultCacheStatsHolder.Node node = getNode(dimensionValues, cacheStatsHolder.getStatsRoot()); + ImmutableCacheStats actual = node.getImmutableStats(); + assertEquals(expectedTotal, actual); + } + } + + public void testDropStatsForDimensions() throws Exception { + List dimensionNames = List.of("dim1", "dim2"); + // Create stats for the following dimension sets + List> statsToPopulate = List.of(List.of("A1", "B1"), List.of("A2", "B2"), List.of("A2", "B3")); + for (boolean diskTierEnabled : List.of(true, false)) { + TieredSpilloverCacheStatsHolder cacheStatsHolder = new TieredSpilloverCacheStatsHolder(dimensionNames, diskTierEnabled); + setupRemovalTest(cacheStatsHolder, statsToPopulate, diskTierEnabled); + + // Check the resulting total is correct. + int numNodes = statsToPopulate.size(); // Number of distinct sets of dimensions (not including tiers) + // If disk tier is enabled, we expect hits to be 2 * numNodes (1 heap + 1 disk per combination of dims), otherwise 1 * numNodes. + // Misses and evictions should be 1 * numNodes in either case (if disk tier is present, count only the disk misses/evictions, if + // disk tier is absent, count the heap ones) + long originalHits = diskTierEnabled ? 2 * numNodes : numNodes; + ImmutableCacheStats expectedTotal = new ImmutableCacheStats(originalHits, numNodes, numNodes, 0, 0); + assertEquals(expectedTotal, cacheStatsHolder.getStatsRoot().getImmutableStats()); + + // When we invalidate A2, B2, we should lose the node for B2, but not B3 or A2. + cacheStatsHolder.removeDimensions(List.of("A2", "B2")); + + // We expect hits to go down by 2 (1 heap + 1 disk) if disk is enabled, and 1 otherwise. Evictions/misses should go down by 1 in + // either case. + long removedHitsPerRemovedNode = diskTierEnabled ? 2 : 1; + expectedTotal = new ImmutableCacheStats(originalHits - removedHitsPerRemovedNode, numNodes - 1, numNodes - 1, 0, 0); + assertEquals(expectedTotal, cacheStatsHolder.getStatsRoot().getImmutableStats()); + assertNull(getNode(List.of("A2", "B2", TIER_DIMENSION_VALUE_ON_HEAP), cacheStatsHolder.getStatsRoot())); + assertNull(getNode(List.of("A2", "B2", TIER_DIMENSION_VALUE_DISK), cacheStatsHolder.getStatsRoot())); + assertNull(getNode(List.of("A2", "B2"), cacheStatsHolder.getStatsRoot())); + assertNotNull(getNode(List.of("A2"), cacheStatsHolder.getStatsRoot())); + assertNotNull(getNode(List.of("A2", "B3", TIER_DIMENSION_VALUE_ON_HEAP), cacheStatsHolder.getStatsRoot())); + + // When we invalidate A1, B1, we should lose the nodes for B1 and also A1, as it has no more children. + cacheStatsHolder.removeDimensions(List.of("A1", "B1")); + expectedTotal = new ImmutableCacheStats(originalHits - 2 * removedHitsPerRemovedNode, numNodes - 2, numNodes - 2, 0, 0); + assertEquals(expectedTotal, cacheStatsHolder.getStatsRoot().getImmutableStats()); + assertNull(getNode(List.of("A1", "B1", TIER_DIMENSION_VALUE_ON_HEAP), cacheStatsHolder.getStatsRoot())); + assertNull(getNode(List.of("A1", "B1", TIER_DIMENSION_VALUE_DISK), cacheStatsHolder.getStatsRoot())); + assertNull(getNode(List.of("A1", "B1"), cacheStatsHolder.getStatsRoot())); + assertNull(getNode(List.of("A1"), cacheStatsHolder.getStatsRoot())); + + // When we invalidate the last node, all nodes should be deleted except the root node + cacheStatsHolder.removeDimensions(List.of("A2", "B3")); + assertEquals(new ImmutableCacheStats(0, 0, 0, 0, 0), cacheStatsHolder.getStatsRoot().getImmutableStats()); + // assertEquals(0, cacheStatsHolder.getStatsRoot().getChildren().size()); + } + } + + public void testCount() throws Exception { + List dimensionNames = List.of("dim1", "dim2"); + TieredSpilloverCacheStatsHolder cacheStatsHolder = new TieredSpilloverCacheStatsHolder(dimensionNames, true); + Map> usedDimensionValues = getUsedDimensionValues(cacheStatsHolder, 10, true); + Map, CacheStats> expected = populateStats(cacheStatsHolder, usedDimensionValues, 100, 10, true); + + long expectedCount = 0L; + for (CacheStats counter : expected.values()) { + expectedCount += counter.getItems(); + } + assertEquals(expectedCount, cacheStatsHolder.count()); + } + + public void testConcurrentRemoval() throws Exception { + List dimensionNames = List.of("A", "B"); + TieredSpilloverCacheStatsHolder cacheStatsHolder = new TieredSpilloverCacheStatsHolder(dimensionNames, true); + + // Create stats for the following dimension sets + List> statsToPopulate = new ArrayList<>(); + int numAValues = 10; + int numBValues = 2; + for (int indexA = 0; indexA < numAValues; indexA++) { + for (int indexB = 0; indexB < numBValues; indexB++) { + statsToPopulate.add(List.of("A" + indexA, "B" + indexB)); + } + } + setupRemovalTest(cacheStatsHolder, statsToPopulate, true); + + // Remove a subset of the dimensions concurrently. + // Remove both (A0, B0), and (A0, B1), so we expect the intermediate node for A0 to be null afterwards. + // For all the others, remove only the B0 value. Then we expect the intermediate nodes for A1 through A9 to be present + // and reflect only the stats for their B1 child. + + Thread[] threads = new Thread[numAValues + 1]; + for (int i = 0; i < numAValues; i++) { + int finalI = i; + threads[i] = new Thread(() -> { cacheStatsHolder.removeDimensions(List.of("A" + finalI, "B0")); }); + } + threads[numAValues] = new Thread(() -> { cacheStatsHolder.removeDimensions(List.of("A0", "B1")); }); + for (Thread thread : threads) { + thread.start(); + } + for (Thread thread : threads) { + thread.join(); + } + + // intermediate node for A0 should be null + assertNull(getNode(List.of("A0"), cacheStatsHolder.getStatsRoot())); + + // leaf nodes for all B0 values should be null since they were removed + for (int indexA = 0; indexA < numAValues; indexA++) { + assertNull(getNode(List.of("A" + indexA, "B0"), cacheStatsHolder.getStatsRoot())); + } + + // leaf nodes for all B1 values, except (A0, B1), should not be null as they weren't removed, + // and the intermediate nodes A1 through A9 shouldn't be null as they have remaining children + for (int indexA = 1; indexA < numAValues; indexA++) { + DefaultCacheStatsHolder.Node b1LeafNode = getNode(List.of("A" + indexA, "B1"), cacheStatsHolder.getStatsRoot()); + assertNotNull(b1LeafNode); + assertEquals(new ImmutableCacheStats(2, 1, 1, 0, 0), b1LeafNode.getImmutableStats()); + DefaultCacheStatsHolder.Node intermediateLevelNode = getNode(List.of("A" + indexA), cacheStatsHolder.getStatsRoot()); + assertNotNull(intermediateLevelNode); + assertEquals(b1LeafNode.getImmutableStats(), intermediateLevelNode.getImmutableStats()); + } + } + + static void setupRemovalTest( + TieredSpilloverCacheStatsHolder cacheStatsHolder, + List> statsToPopulate, + boolean diskTierEnabled + ) { + List tiers = diskTierEnabled ? TIER_VALUES : List.of(TIER_DIMENSION_VALUE_ON_HEAP); + for (List dims : statsToPopulate) { + // Increment hits, misses, and evictions for set of dimensions, for both heap and disk + for (String tier : tiers) { + List dimsWithDimension = cacheStatsHolder.getDimensionsWithTierValue(dims, tier); + cacheStatsHolder.incrementHits(dimsWithDimension); + cacheStatsHolder.incrementMisses(dimsWithDimension); + boolean includeInTotal = tier.equals(TIER_DIMENSION_VALUE_DISK) || !diskTierEnabled; + cacheStatsHolder.incrementEvictions(dimsWithDimension, includeInTotal); + } + } + } + + /** + * Returns the node found by following these dimension values down from the root node. + * Returns null if no such node exists. + */ + static DefaultCacheStatsHolder.Node getNode(List dimensionValues, DefaultCacheStatsHolder.Node root) { + DefaultCacheStatsHolder.Node current = root; + for (String dimensionValue : dimensionValues) { + current = current.getChildren().get(dimensionValue); + if (current == null) { + return null; + } + } + return current; + } + + static Map, CacheStats> populateStats( + TieredSpilloverCacheStatsHolder cacheStatsHolder, + Map> usedDimensionValues, + int numDistinctValuePairs, + int numRepetitionsPerValue, + boolean diskTierEnabled + ) throws InterruptedException { + return populateStats( + List.of(cacheStatsHolder), + usedDimensionValues, + numDistinctValuePairs, + numRepetitionsPerValue, + diskTierEnabled + ); + } + + static Map, CacheStats> populateStats( + List cacheStatsHolders, + Map> usedDimensionValues, + int numDistinctValuePairs, + int numRepetitionsPerValue, + boolean diskTierEnabled + ) throws InterruptedException { + for (TieredSpilloverCacheStatsHolder statsHolder : cacheStatsHolders) { + assertEquals(cacheStatsHolders.get(0).getDimensionNames(), statsHolder.getDimensionNames()); + } + Map, CacheStats> expected = new ConcurrentHashMap<>(); + Thread[] threads = new Thread[numDistinctValuePairs]; + CountDownLatch countDownLatch = new CountDownLatch(numDistinctValuePairs); + Random rand = Randomness.get(); + List> dimensionsForThreads = new ArrayList<>(); + for (int i = 0; i < numDistinctValuePairs; i++) { + dimensionsForThreads.add(getRandomDimList(cacheStatsHolders.get(0).getDimensionNames(), usedDimensionValues, true, rand)); + int finalI = i; + threads[i] = new Thread(() -> { + Random threadRand = Randomness.get(); + List dimensions = dimensionsForThreads.get(finalI); + expected.computeIfAbsent(dimensions, (key) -> new CacheStats()); + for (TieredSpilloverCacheStatsHolder cacheStatsHolder : cacheStatsHolders) { + for (int j = 0; j < numRepetitionsPerValue; j++) { + CacheStats statsToInc = new CacheStats( + threadRand.nextInt(10), + threadRand.nextInt(10), + threadRand.nextInt(10), + threadRand.nextInt(5000), + threadRand.nextInt(10) + ); + for (int iter = 0; iter < statsToInc.getHits(); iter++) { + expected.get(dimensions).incrementHits(); + } + for (int iter = 0; iter < statsToInc.getMisses(); iter++) { + expected.get(dimensions).incrementMisses(); + } + for (int iter = 0; iter < statsToInc.getEvictions(); iter++) { + expected.get(dimensions).incrementEvictions(); + } + expected.get(dimensions).incrementSizeInBytes(statsToInc.getSizeInBytes()); + for (int iter = 0; iter < statsToInc.getItems(); iter++) { + expected.get(dimensions).incrementItems(); + } + populateStatsHolderFromStatsValueMap(cacheStatsHolder, Map.of(dimensions, statsToInc), diskTierEnabled); + } + } + countDownLatch.countDown(); + }); + } + for (Thread thread : threads) { + thread.start(); + } + countDownLatch.await(); + return expected; + } + + private static List getRandomDimList( + List dimensionNames, + Map> usedDimensionValues, + boolean pickValueForAllDims, + Random rand + ) { + List result = new ArrayList<>(); + for (String dimName : dimensionNames) { + if (pickValueForAllDims || rand.nextBoolean()) { // if pickValueForAllDims, always pick a value for each dimension, otherwise do + // so 50% of the time + int index = between(0, usedDimensionValues.get(dimName).size() - 1); + result.add(usedDimensionValues.get(dimName).get(index)); + } + } + return result; + } + + static Map> getUsedDimensionValues( + TieredSpilloverCacheStatsHolder cacheStatsHolder, + int numValuesPerDim, + boolean diskTierEnabled + ) { + Map> usedDimensionValues = new HashMap<>(); + for (int i = 0; i < cacheStatsHolder.getDimensionNames().size() - 1; i++) { // Have to handle final tier dimension separately + List values = new ArrayList<>(); + for (int j = 0; j < numValuesPerDim; j++) { + values.add(UUID.randomUUID().toString()); + } + usedDimensionValues.put(cacheStatsHolder.getDimensionNames().get(i), values); + } + if (diskTierEnabled) { + usedDimensionValues.put(TieredSpilloverCacheStatsHolder.TIER_DIMENSION_NAME, TIER_VALUES); + } else { + usedDimensionValues.put(TieredSpilloverCacheStatsHolder.TIER_DIMENSION_NAME, List.of(TIER_DIMENSION_VALUE_ON_HEAP)); + } + return usedDimensionValues; + } + + public static void populateStatsHolderFromStatsValueMap( + TieredSpilloverCacheStatsHolder cacheStatsHolder, + Map, CacheStats> statsMap, + boolean diskTierEnabled + ) { + for (Map.Entry, CacheStats> entry : statsMap.entrySet()) { + CacheStats stats = entry.getValue(); + List dims = entry.getKey(); + for (int i = 0; i < stats.getHits(); i++) { + cacheStatsHolder.incrementHits(dims); + } + for (int i = 0; i < stats.getMisses(); i++) { + cacheStatsHolder.incrementMisses(dims); + } + for (int i = 0; i < stats.getEvictions(); i++) { + boolean includeInTotal = dims.get(dims.size() - 1).equals(TIER_DIMENSION_VALUE_DISK) || !diskTierEnabled; + cacheStatsHolder.incrementEvictions(dims, includeInTotal); + } + cacheStatsHolder.incrementSizeInBytes(dims, stats.getSizeInBytes()); + for (int i = 0; i < stats.getItems(); i++) { + cacheStatsHolder.incrementItems(dims); + } + } + } +} diff --git a/modules/cache-common/src/test/java/org/opensearch/cache/common/tier/TieredSpilloverCacheTests.java b/modules/cache-common/src/test/java/org/opensearch/cache/common/tier/TieredSpilloverCacheTests.java index 1215a2130ac2d..3bb1321f9faf2 100644 --- a/modules/cache-common/src/test/java/org/opensearch/cache/common/tier/TieredSpilloverCacheTests.java +++ b/modules/cache-common/src/test/java/org/opensearch/cache/common/tier/TieredSpilloverCacheTests.java @@ -2112,6 +2112,60 @@ public void testTieredCacheDefaultSegmentCount() { assertTrue(VALID_SEGMENT_COUNT_VALUES.contains(tieredSpilloverCache.getNumberOfSegments())); } + public void testDropStatsForDimensions() throws Exception { + int onHeapCacheSize = randomIntBetween(300, 600); + int diskCacheSize = randomIntBetween(700, 1200); + int numberOfSegments = getNumberOfSegments(); + int keyValueSize = 50; + MockCacheRemovalListener removalListener = new MockCacheRemovalListener<>(); + TieredSpilloverCache tieredSpilloverCache = initializeTieredSpilloverCache( + keyValueSize, + diskCacheSize, + removalListener, + Settings.builder() + .put( + TieredSpilloverCacheSettings.TIERED_SPILLOVER_ONHEAP_STORE_SIZE.getConcreteSettingForNamespace( + CacheType.INDICES_REQUEST_CACHE.getSettingPrefix() + ).getKey(), + onHeapCacheSize * keyValueSize + "b" + ) + .build(), + 0, + numberOfSegments + ); + + List> usedKeys = new ArrayList<>(); + // Fill the cache, getting some entries + evictions for both tiers + int minMisses = (diskCacheSize + onHeapCacheSize) / keyValueSize + 10; + int numMisses = onHeapCacheSize + diskCacheSize + randomIntBetween(minMisses, minMisses + 50); + for (int iter = 0; iter < numMisses; iter++) { + ICacheKey key = getICacheKey(UUID.randomUUID().toString()); + usedKeys.add(key); + LoadAwareCacheLoader, String> tieredCacheLoader = getLoadAwareCacheLoader(); + tieredSpilloverCache.computeIfAbsent(key, tieredCacheLoader); + } + // Also do some random hits + Random rand = Randomness.get(); + int approxNumHits = 30; + for (int i = 0; i < approxNumHits; i++) { + LoadAwareCacheLoader, String> tieredCacheLoader = getLoadAwareCacheLoader(); + ICacheKey key = usedKeys.get(rand.nextInt(usedKeys.size())); + tieredSpilloverCache.computeIfAbsent(key, tieredCacheLoader); + } + + ImmutableCacheStats totalStats = tieredSpilloverCache.stats().getTotalStats(); + assertTrue(totalStats.getHits() > 0); + assertTrue(totalStats.getMisses() > 0); + assertTrue(totalStats.getEvictions() > 0); + + // Since all the keys have the same dimension values, except tiers, we only need to remove that one, and we expect all stats values + // should be 0 after that. + ICacheKey dropDimensionsKey = new ICacheKey<>(null, getMockDimensions()); + dropDimensionsKey.setDropStatsForDimensions(true); + tieredSpilloverCache.invalidate(dropDimensionsKey); + assertEquals(new ImmutableCacheStats(0, 0, 0, 0, 0), tieredSpilloverCache.stats().getTotalStats()); + } + private List getMockDimensions() { List dims = new ArrayList<>(); for (String dimensionName : dimensionNames) { diff --git a/server/src/main/java/org/opensearch/cluster/coordination/PublicationTransportHandler.java b/server/src/main/java/org/opensearch/cluster/coordination/PublicationTransportHandler.java index c4cb484cda693..7275d72f2db9f 100644 --- a/server/src/main/java/org/opensearch/cluster/coordination/PublicationTransportHandler.java +++ b/server/src/main/java/org/opensearch/cluster/coordination/PublicationTransportHandler.java @@ -298,9 +298,9 @@ PublishWithJoinResponse handleIncomingRemotePublishRequest(RemotePublishRequest } } catch (Exception e) { if (applyFullState) { - remoteClusterStateService.fullDownloadFailed(); + remoteClusterStateService.fullIncomingPublicationFailed(); } else { - remoteClusterStateService.diffDownloadFailed(); + remoteClusterStateService.diffIncomingPublicationFailed(); } throw e; } diff --git a/server/src/main/java/org/opensearch/common/cache/stats/DefaultCacheStatsHolder.java b/server/src/main/java/org/opensearch/common/cache/stats/DefaultCacheStatsHolder.java index ea92c8e81b8f0..7434283ff6f41 100644 --- a/server/src/main/java/org/opensearch/common/cache/stats/DefaultCacheStatsHolder.java +++ b/server/src/main/java/org/opensearch/common/cache/stats/DefaultCacheStatsHolder.java @@ -37,10 +37,10 @@ public class DefaultCacheStatsHolder implements CacheStatsHolder { // Non-leaf nodes have stats matching the sum of their children. // We use a tree structure, rather than a map with concatenated keys, to save on memory usage. If there are many leaf // nodes that share a parent, that parent's dimension value will only be stored once, not many times. - private final Node statsRoot; + protected final Node statsRoot; // To avoid sync problems, obtain a lock before creating or removing nodes in the stats tree. // No lock is needed to edit stats on existing nodes. - private final Lock lock = new ReentrantLock(); + protected final Lock lock = new ReentrantLock(); // The name of the cache type using these stats private final String storeName; @@ -188,8 +188,10 @@ public void removeDimensions(List dimensionValues) { } // Returns a CacheStatsCounterSnapshot object for the stats to decrement if the removal happened, null otherwise. - private ImmutableCacheStats removeDimensionsHelper(List dimensionValues, Node node, int depth) { + protected ImmutableCacheStats removeDimensionsHelper(List dimensionValues, Node node, int depth) { if (depth == dimensionValues.size()) { + // Remove children, if present. + node.children.clear(); // Pass up a snapshot of the original stats to avoid issues when the original is decremented by other fn invocations return node.getImmutableStats(); } @@ -208,15 +210,14 @@ private ImmutableCacheStats removeDimensionsHelper(List dimensionValues, return statsToDecrement; } - // pkg-private for testing - Node getStatsRoot() { + public Node getStatsRoot() { return statsRoot; } /** * Nodes that make up the tree in the stats holder. */ - protected static class Node { + public static class Node { private final String dimensionValue; // Map from dimensionValue to the DimensionNode for that dimension value. final Map children; @@ -241,7 +242,7 @@ public String getDimensionValue() { return dimensionValue; } - protected Map getChildren() { + public Map getChildren() { // We can safely iterate over ConcurrentHashMap without worrying about thread issues. return children; } @@ -280,7 +281,7 @@ long getEntries() { return this.stats.getItems(); } - ImmutableCacheStats getImmutableStats() { + public ImmutableCacheStats getImmutableStats() { return this.stats.immutableSnapshot(); } diff --git a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java index e4f4bae9bef7c..c5fc6d5cae6a7 100644 --- a/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java +++ b/server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java @@ -1470,173 +1470,191 @@ public ClusterState getClusterStateForManifest( String localNodeId, boolean includeEphemeral ) throws IOException { - ClusterState stateFromCache = remoteClusterStateCache.getState(clusterName, manifest); - if (stateFromCache != null) { - return stateFromCache; - } + try { + ClusterState stateFromCache = remoteClusterStateCache.getState(clusterName, manifest); + if (stateFromCache != null) { + return stateFromCache; + } - final ClusterState clusterState; - final long startTimeNanos = relativeTimeNanosSupplier.getAsLong(); - if (manifest.onOrAfterCodecVersion(CODEC_V2)) { - clusterState = readClusterStateInParallel( - ClusterState.builder(new ClusterName(clusterName)).build(), - manifest, - manifest.getClusterUUID(), - localNodeId, - manifest.getIndices(), - manifest.getCustomMetadataMap(), - manifest.getCoordinationMetadata() != null, - manifest.getSettingsMetadata() != null, - includeEphemeral && manifest.getTransientSettingsMetadata() != null, - manifest.getTemplatesMetadata() != null, - includeEphemeral && manifest.getDiscoveryNodesMetadata() != null, - includeEphemeral && manifest.getClusterBlocksMetadata() != null, - includeEphemeral ? manifest.getIndicesRouting() : emptyList(), - includeEphemeral && manifest.getHashesOfConsistentSettings() != null, - includeEphemeral ? manifest.getClusterStateCustomMap() : emptyMap(), - false, - includeEphemeral - ); + final ClusterState clusterState; + final long startTimeNanos = relativeTimeNanosSupplier.getAsLong(); + if (manifest.onOrAfterCodecVersion(CODEC_V2)) { + clusterState = readClusterStateInParallel( + ClusterState.builder(new ClusterName(clusterName)).build(), + manifest, + manifest.getClusterUUID(), + localNodeId, + manifest.getIndices(), + manifest.getCustomMetadataMap(), + manifest.getCoordinationMetadata() != null, + manifest.getSettingsMetadata() != null, + includeEphemeral && manifest.getTransientSettingsMetadata() != null, + manifest.getTemplatesMetadata() != null, + includeEphemeral && manifest.getDiscoveryNodesMetadata() != null, + includeEphemeral && manifest.getClusterBlocksMetadata() != null, + includeEphemeral ? manifest.getIndicesRouting() : emptyList(), + includeEphemeral && manifest.getHashesOfConsistentSettings() != null, + includeEphemeral ? manifest.getClusterStateCustomMap() : emptyMap(), + false, + includeEphemeral + ); - if (includeEphemeral - && !remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) - && manifest.getClusterStateChecksum() != null) { - validateClusterStateFromChecksum(manifest, clusterState, clusterName, localNodeId, true); + if (includeEphemeral + && !remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) + && manifest.getClusterStateChecksum() != null) { + validateClusterStateFromChecksum(manifest, clusterState, clusterName, localNodeId, true); + } + } else { + ClusterState state = readClusterStateInParallel( + ClusterState.builder(new ClusterName(clusterName)).build(), + manifest, + manifest.getClusterUUID(), + localNodeId, + manifest.getIndices(), + // for manifest codec V1, we don't have the following objects to read, so not passing anything + emptyMap(), + false, + false, + false, + false, + false, + false, + emptyList(), + false, + emptyMap(), + false, + false + ); + Metadata.Builder mb = Metadata.builder(remoteGlobalMetadataManager.getGlobalMetadata(manifest.getClusterUUID(), manifest)); + mb.indices(state.metadata().indices()); + clusterState = ClusterState.builder(state).metadata(mb).build(); } - } else { - ClusterState state = readClusterStateInParallel( - ClusterState.builder(new ClusterName(clusterName)).build(), - manifest, - manifest.getClusterUUID(), - localNodeId, - manifest.getIndices(), - // for manifest codec V1, we don't have the following objects to read, so not passing anything - emptyMap(), - false, - false, - false, - false, - false, - false, - emptyList(), - false, - emptyMap(), - false, - false - ); - Metadata.Builder mb = Metadata.builder(remoteGlobalMetadataManager.getGlobalMetadata(manifest.getClusterUUID(), manifest)); - mb.indices(state.metadata().indices()); - clusterState = ClusterState.builder(state).metadata(mb).build(); - } - final long durationMillis = TimeValue.nsecToMSec(relativeTimeNanosSupplier.getAsLong() - startTimeNanos); - remoteStateStats.stateFullDownloadSucceeded(); - remoteStateStats.stateFullDownloadTook(durationMillis); - if (includeEphemeral) { - // cache only if the entire cluster-state is present - remoteClusterStateCache.putState(clusterState); + final long durationMillis = TimeValue.nsecToMSec(relativeTimeNanosSupplier.getAsLong() - startTimeNanos); + remoteStateStats.stateFullDownloadSucceeded(); + remoteStateStats.stateFullDownloadTook(durationMillis); + if (includeEphemeral) { + // cache only if the entire cluster-state is present + remoteClusterStateCache.putState(clusterState); + } + return clusterState; + } catch (Exception e) { + logger.error("Failure in downloading full cluster state. ", e); + remoteStateStats.stateFullDownloadFailed(); + throw e; } - return clusterState; } public ClusterState getClusterStateUsingDiff(ClusterMetadataManifest manifest, ClusterState previousState, String localNodeId) { - assert manifest.getDiffManifest() != null : "Diff manifest null which is required for downloading cluster state"; - final long startTimeNanos = relativeTimeNanosSupplier.getAsLong(); - ClusterStateDiffManifest diff = manifest.getDiffManifest(); - boolean includeEphemeral = true; - - List updatedIndices = diff.getIndicesUpdated().stream().map(idx -> { - Optional uploadedIndexMetadataOptional = manifest.getIndices() - .stream() - .filter(idx2 -> idx2.getIndexName().equals(idx)) - .findFirst(); - assert uploadedIndexMetadataOptional.isPresent() == true; - return uploadedIndexMetadataOptional.get(); - }).collect(Collectors.toList()); - - Map updatedCustomMetadata = new HashMap<>(); - if (diff.getCustomMetadataUpdated() != null) { - for (String customType : diff.getCustomMetadataUpdated()) { - updatedCustomMetadata.put(customType, manifest.getCustomMetadataMap().get(customType)); + try { + assert manifest.getDiffManifest() != null : "Diff manifest null which is required for downloading cluster state"; + final long startTimeNanos = relativeTimeNanosSupplier.getAsLong(); + ClusterStateDiffManifest diff = manifest.getDiffManifest(); + boolean includeEphemeral = true; + + List updatedIndices = diff.getIndicesUpdated().stream().map(idx -> { + Optional uploadedIndexMetadataOptional = manifest.getIndices() + .stream() + .filter(idx2 -> idx2.getIndexName().equals(idx)) + .findFirst(); + assert uploadedIndexMetadataOptional.isPresent() == true; + return uploadedIndexMetadataOptional.get(); + }).collect(Collectors.toList()); + + Map updatedCustomMetadata = new HashMap<>(); + if (diff.getCustomMetadataUpdated() != null) { + for (String customType : diff.getCustomMetadataUpdated()) { + updatedCustomMetadata.put(customType, manifest.getCustomMetadataMap().get(customType)); + } } - } - Map updatedClusterStateCustom = new HashMap<>(); - if (diff.getClusterStateCustomUpdated() != null) { - for (String customType : diff.getClusterStateCustomUpdated()) { - updatedClusterStateCustom.put(customType, manifest.getClusterStateCustomMap().get(customType)); + Map updatedClusterStateCustom = new HashMap<>(); + if (diff.getClusterStateCustomUpdated() != null) { + for (String customType : diff.getClusterStateCustomUpdated()) { + updatedClusterStateCustom.put(customType, manifest.getClusterStateCustomMap().get(customType)); + } + } + + List updatedIndexRouting = new ArrayList<>(); + if (manifest.getCodecVersion() == CODEC_V2 || manifest.getCodecVersion() == CODEC_V3) { + updatedIndexRouting.addAll( + remoteRoutingTableService.getUpdatedIndexRoutingTableMetadata( + diff.getIndicesRoutingUpdated(), + manifest.getIndicesRouting() + ) + ); } - } - List updatedIndexRouting = new ArrayList<>(); - if (manifest.getCodecVersion() == CODEC_V2 || manifest.getCodecVersion() == CODEC_V3) { - updatedIndexRouting.addAll( - remoteRoutingTableService.getUpdatedIndexRoutingTableMetadata(diff.getIndicesRoutingUpdated(), manifest.getIndicesRouting()) + ClusterState updatedClusterState = readClusterStateInParallel( + previousState, + manifest, + manifest.getClusterUUID(), + localNodeId, + updatedIndices, + updatedCustomMetadata, + diff.isCoordinationMetadataUpdated(), + diff.isSettingsMetadataUpdated(), + diff.isTransientSettingsMetadataUpdated(), + diff.isTemplatesMetadataUpdated(), + diff.isDiscoveryNodesUpdated(), + diff.isClusterBlocksUpdated(), + updatedIndexRouting, + diff.isHashesOfConsistentSettingsUpdated(), + updatedClusterStateCustom, + manifest.getDiffManifest() != null + && manifest.getDiffManifest().getIndicesRoutingDiffPath() != null + && !manifest.getDiffManifest().getIndicesRoutingDiffPath().isEmpty(), + includeEphemeral ); - } + ClusterState.Builder clusterStateBuilder = ClusterState.builder(updatedClusterState); + Metadata.Builder metadataBuilder = Metadata.builder(updatedClusterState.metadata()); + // remove the deleted indices from the metadata + for (String index : diff.getIndicesDeleted()) { + metadataBuilder.remove(index); + } + // remove the deleted metadata customs from the metadata + if (diff.getCustomMetadataDeleted() != null) { + for (String customType : diff.getCustomMetadataDeleted()) { + metadataBuilder.removeCustom(customType); + } + } - ClusterState updatedClusterState = readClusterStateInParallel( - previousState, - manifest, - manifest.getClusterUUID(), - localNodeId, - updatedIndices, - updatedCustomMetadata, - diff.isCoordinationMetadataUpdated(), - diff.isSettingsMetadataUpdated(), - diff.isTransientSettingsMetadataUpdated(), - diff.isTemplatesMetadataUpdated(), - diff.isDiscoveryNodesUpdated(), - diff.isClusterBlocksUpdated(), - updatedIndexRouting, - diff.isHashesOfConsistentSettingsUpdated(), - updatedClusterStateCustom, - manifest.getDiffManifest() != null - && manifest.getDiffManifest().getIndicesRoutingDiffPath() != null - && !manifest.getDiffManifest().getIndicesRoutingDiffPath().isEmpty(), - includeEphemeral - ); - ClusterState.Builder clusterStateBuilder = ClusterState.builder(updatedClusterState); - Metadata.Builder metadataBuilder = Metadata.builder(updatedClusterState.metadata()); - // remove the deleted indices from the metadata - for (String index : diff.getIndicesDeleted()) { - metadataBuilder.remove(index); - } - // remove the deleted metadata customs from the metadata - if (diff.getCustomMetadataDeleted() != null) { - for (String customType : diff.getCustomMetadataDeleted()) { - metadataBuilder.removeCustom(customType); + // remove the deleted cluster state customs from the metadata + if (diff.getClusterStateCustomDeleted() != null) { + for (String customType : diff.getClusterStateCustomDeleted()) { + clusterStateBuilder.removeCustom(customType); + } } - } - // remove the deleted cluster state customs from the metadata - if (diff.getClusterStateCustomDeleted() != null) { - for (String customType : diff.getClusterStateCustomDeleted()) { - clusterStateBuilder.removeCustom(customType); + HashMap indexRoutingTables = new HashMap<>( + updatedClusterState.getRoutingTable().getIndicesRouting() + ); + if (manifest.getCodecVersion() == CODEC_V2 || manifest.getCodecVersion() == CODEC_V3) { + for (String indexName : diff.getIndicesRoutingDeleted()) { + indexRoutingTables.remove(indexName); + } } - } - HashMap indexRoutingTables = new HashMap<>(updatedClusterState.getRoutingTable().getIndicesRouting()); - if (manifest.getCodecVersion() == CODEC_V2 || manifest.getCodecVersion() == CODEC_V3) { - for (String indexName : diff.getIndicesRoutingDeleted()) { - indexRoutingTables.remove(indexName); + ClusterState clusterState = clusterStateBuilder.stateUUID(manifest.getStateUUID()) + .version(manifest.getStateVersion()) + .metadata(metadataBuilder) + .routingTable(new RoutingTable(manifest.getRoutingTableVersion(), indexRoutingTables)) + .build(); + if (!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) + && manifest.getClusterStateChecksum() != null) { + validateClusterStateFromChecksum(manifest, clusterState, previousState.getClusterName().value(), localNodeId, false); } - } + final long durationMillis = TimeValue.nsecToMSec(relativeTimeNanosSupplier.getAsLong() - startTimeNanos); + remoteStateStats.stateDiffDownloadSucceeded(); + remoteStateStats.stateDiffDownloadTook(durationMillis); - ClusterState clusterState = clusterStateBuilder.stateUUID(manifest.getStateUUID()) - .version(manifest.getStateVersion()) - .metadata(metadataBuilder) - .routingTable(new RoutingTable(manifest.getRoutingTableVersion(), indexRoutingTables)) - .build(); - if (!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) && manifest.getClusterStateChecksum() != null) { - validateClusterStateFromChecksum(manifest, clusterState, previousState.getClusterName().value(), localNodeId, false); + assert includeEphemeral == true; + // newState includes all the fields of cluster-state (includeEphemeral=true always) + remoteClusterStateCache.putState(clusterState); + return clusterState; + } catch (Exception e) { + logger.error("Failure in downloading diff cluster state. ", e); + remoteStateStats.stateDiffDownloadFailed(); + throw e; } - final long durationMillis = TimeValue.nsecToMSec(relativeTimeNanosSupplier.getAsLong() - startTimeNanos); - remoteStateStats.stateDiffDownloadSucceeded(); - remoteStateStats.stateDiffDownloadTook(durationMillis); - - assert includeEphemeral == true; - // newState includes all the fields of cluster-state (includeEphemeral=true always) - remoteClusterStateCache.putState(clusterState); - return clusterState; } void validateClusterStateFromChecksum( @@ -2036,6 +2054,14 @@ public void diffDownloadFailed() { remoteStateStats.stateDiffDownloadFailed(); } + public void fullIncomingPublicationFailed() { + remoteStateStats.stateFullIncomingPublicationFailed(); + } + + public void diffIncomingPublicationFailed() { + remoteStateStats.stateDiffIncomingPublicationFailed(); + } + RemoteClusterStateCache getRemoteClusterStateCache() { return remoteClusterStateCache; } diff --git a/server/src/main/java/org/opensearch/gateway/remote/RemoteDownloadStats.java b/server/src/main/java/org/opensearch/gateway/remote/RemoteDownloadStats.java index a8f4b33a19c37..0f520babca48d 100644 --- a/server/src/main/java/org/opensearch/gateway/remote/RemoteDownloadStats.java +++ b/server/src/main/java/org/opensearch/gateway/remote/RemoteDownloadStats.java @@ -20,10 +20,13 @@ public class RemoteDownloadStats extends PersistedStateStats { static final String CHECKSUM_VALIDATION_FAILED_COUNT = "checksum_validation_failed_count"; private AtomicLong checksumValidationFailedCount = new AtomicLong(0); + public static final String INCOMING_PUBLICATION_FAILED_COUNT = "incoming_publication_failed_count"; + private AtomicLong incomingPublicationFailedCount = new AtomicLong(0); public RemoteDownloadStats(String statsName) { super(statsName); addToExtendedFields(CHECKSUM_VALIDATION_FAILED_COUNT, checksumValidationFailedCount); + addToExtendedFields(INCOMING_PUBLICATION_FAILED_COUNT, incomingPublicationFailedCount); } public void checksumValidationFailedCount() { @@ -33,4 +36,12 @@ public void checksumValidationFailedCount() { public long getChecksumValidationFailedCount() { return checksumValidationFailedCount.get(); } + + public void incomingPublicationFailedCount() { + incomingPublicationFailedCount.incrementAndGet(); + } + + public long getIncomingPublicationFailedCount() { + return incomingPublicationFailedCount.get(); + } } diff --git a/server/src/main/java/org/opensearch/gateway/remote/RemotePersistenceStats.java b/server/src/main/java/org/opensearch/gateway/remote/RemotePersistenceStats.java index 1a8e85f30527d..7a6f5f9b95224 100644 --- a/server/src/main/java/org/opensearch/gateway/remote/RemotePersistenceStats.java +++ b/server/src/main/java/org/opensearch/gateway/remote/RemotePersistenceStats.java @@ -106,6 +106,14 @@ public long getStateFullDownloadValidationFailed() { return remoteFullDownloadStats.getChecksumValidationFailedCount(); } + public void stateDiffIncomingPublicationFailed() { + remoteDiffDownloadStats.incomingPublicationFailedCount(); + } + + public void stateFullIncomingPublicationFailed() { + remoteFullDownloadStats.incomingPublicationFailedCount(); + } + public PersistedStateStats getUploadStats() { return remoteUploadStats; } diff --git a/server/src/test/java/org/opensearch/cluster/coordination/PublicationTransportHandlerTests.java b/server/src/test/java/org/opensearch/cluster/coordination/PublicationTransportHandlerTests.java index 616559e91536d..c51f85e30283a 100644 --- a/server/src/test/java/org/opensearch/cluster/coordination/PublicationTransportHandlerTests.java +++ b/server/src/test/java/org/opensearch/cluster/coordination/PublicationTransportHandlerTests.java @@ -52,6 +52,7 @@ import org.opensearch.gateway.remote.ClusterMetadataManifest; import org.opensearch.gateway.remote.ClusterStateDiffManifest; import org.opensearch.gateway.remote.RemoteClusterStateService; +import org.opensearch.gateway.remote.RemoteDownloadStats; import org.opensearch.node.Node; import org.opensearch.telemetry.tracing.noop.NoopTracer; import org.opensearch.test.OpenSearchTestCase; @@ -64,10 +65,12 @@ import java.util.Collections; import java.util.Map; import java.util.Optional; +import java.util.concurrent.atomic.AtomicLong; import java.util.function.Function; import org.mockito.Mockito; +import static org.opensearch.gateway.remote.RemoteDownloadStats.INCOMING_PUBLICATION_FAILED_COUNT; import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.REMOTE_STORE_CLUSTER_STATE_REPOSITORY_NAME_ATTRIBUTE_KEY; import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.REMOTE_STORE_ROUTING_TABLE_REPOSITORY_NAME_ATTRIBUTE_KEY; import static org.hamcrest.Matchers.containsString; @@ -180,8 +183,8 @@ public void testHandleIncomingRemotePublishRequestWhenNoCurrentPublishRequest() () -> handler.handleIncomingRemotePublishRequest(remotePublishRequest) ); assertThat(e.getMessage(), containsString("publication to self failed")); - verify(remoteClusterStateService, times(0)).fullDownloadFailed(); - verify(remoteClusterStateService, times(1)).diffDownloadFailed(); + verify(remoteClusterStateService, times(0)).fullIncomingPublicationFailed(); + verify(remoteClusterStateService, times(1)).diffIncomingPublicationFailed(); verifyNoMoreInteractions(remoteClusterStateService); } @@ -207,8 +210,8 @@ public void testHandleIncomingRemotePublishRequestWhenTermMismatch() { () -> handler.handleIncomingRemotePublishRequest(remotePublishRequest) ); assertThat(e.getMessage(), containsString("publication to self failed")); - verify(remoteClusterStateService, times(0)).fullDownloadFailed(); - verify(remoteClusterStateService, times(1)).diffDownloadFailed(); + verify(remoteClusterStateService, times(0)).fullIncomingPublicationFailed(); + verify(remoteClusterStateService, times(1)).diffIncomingPublicationFailed(); verifyNoMoreInteractions(remoteClusterStateService); } @@ -234,8 +237,8 @@ public void testHandleIncomingRemotePublishRequestWhenVersionMismatch() { () -> handler.handleIncomingRemotePublishRequest(remotePublishRequest) ); assertThat(e.getMessage(), containsString("publication to self failed")); - verify(remoteClusterStateService, times(1)).diffDownloadFailed(); - verify(remoteClusterStateService, times(0)).fullDownloadFailed(); + verify(remoteClusterStateService, times(1)).diffIncomingPublicationFailed(); + verify(remoteClusterStateService, times(0)).fullIncomingPublicationFailed(); verifyNoMoreInteractions(remoteClusterStateService); } @@ -263,20 +266,20 @@ public void testHandleIncomingRemotePublishRequestForLocalNode() throws IOExcept public void testDownloadRemotePersistedFullStateFailedStats() throws IOException { RemoteClusterStateService remoteClusterStateService = mock(RemoteClusterStateService.class); - PersistedStateStats remoteFullDownloadStats = new PersistedStateStats("dummy_full_stats"); - PersistedStateStats remoteDiffDownloadStats = new PersistedStateStats("dummy_diff_stats"); + PersistedStateStats remoteFullDownloadStats = new RemoteDownloadStats("dummy_full_stats"); + PersistedStateStats remoteDiffDownloadStats = new RemoteDownloadStats("dummy_diff_stats"); when(remoteClusterStateService.getFullDownloadStats()).thenReturn(remoteFullDownloadStats); when(remoteClusterStateService.getDiffDownloadStats()).thenReturn(remoteDiffDownloadStats); doAnswer((i) -> { - remoteFullDownloadStats.stateFailed(); + remoteFullDownloadStats.getExtendedFields().put(INCOMING_PUBLICATION_FAILED_COUNT, new AtomicLong(1)); return null; - }).when(remoteClusterStateService).fullDownloadFailed(); + }).when(remoteClusterStateService).fullIncomingPublicationFailed(); doAnswer((i) -> { - remoteDiffDownloadStats.stateFailed(); + remoteDiffDownloadStats.getExtendedFields().put(INCOMING_PUBLICATION_FAILED_COUNT, new AtomicLong(1)); return null; - }).when(remoteClusterStateService).diffDownloadFailed(); + }).when(remoteClusterStateService).diffIncomingPublicationFailed(); PublishWithJoinResponse expectedPublishResponse = new PublishWithJoinResponse(new PublishResponse(TERM, VERSION), Optional.empty()); Function handlePublishRequest = p -> expectedPublishResponse; @@ -294,8 +297,8 @@ public void testDownloadRemotePersistedFullStateFailedStats() throws IOException handler.setCurrentPublishRequestToSelf(publishRequest); assertThrows(IllegalStateException.class, () -> handler.handleIncomingRemotePublishRequest(remotePublishRequest)); - assertEquals(1, remoteClusterStateService.getDiffDownloadStats().getFailedCount()); - assertEquals(0, remoteClusterStateService.getFullDownloadStats().getFailedCount()); + assertEquals(1, remoteClusterStateService.getDiffDownloadStats().getExtendedFields().get(INCOMING_PUBLICATION_FAILED_COUNT).get()); + assertEquals(0, remoteClusterStateService.getFullDownloadStats().getExtendedFields().get(INCOMING_PUBLICATION_FAILED_COUNT).get()); } public void testDownloadRemotePersistedDiffStateFailedStats() throws IOException { @@ -309,9 +312,9 @@ public void testDownloadRemotePersistedDiffStateFailedStats() throws IOException when(remoteClusterStateService.getClusterMetadataManifestByFileName(any(), any())).thenReturn(metadataManifest); doAnswer((i) -> { - remoteDiffDownloadStats.stateFailed(); + remoteDiffDownloadStats.getExtendedFields().put(INCOMING_PUBLICATION_FAILED_COUNT, new AtomicLong(1)); return null; - }).when(remoteClusterStateService).diffDownloadFailed(); + }).when(remoteClusterStateService).diffIncomingPublicationFailed(); PublishWithJoinResponse expectedPublishResponse = new PublishWithJoinResponse(new PublishResponse(TERM, VERSION), Optional.empty()); Function handlePublishRequest = p -> expectedPublishResponse; @@ -333,7 +336,7 @@ public void testDownloadRemotePersistedDiffStateFailedStats() throws IOException handler.setCurrentPublishRequestToSelf(publishRequest); assertThrows(NullPointerException.class, () -> handler.handleIncomingRemotePublishRequest(remotePublishRequest)); - assertEquals(1, remoteClusterStateService.getDiffDownloadStats().getFailedCount()); + assertEquals(1, remoteClusterStateService.getDiffDownloadStats().getExtendedFields().get(INCOMING_PUBLICATION_FAILED_COUNT).get()); } diff --git a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java index 448b9cc9d78ac..be07aa0d05e9f 100644 --- a/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java +++ b/server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateServiceTests.java @@ -962,6 +962,9 @@ public void testGetClusterStateForManifest_ExcludeEphemeral() throws IOException eq(false) ); + assertNotNull(remoteClusterStateService.getFullDownloadStats()); + assertEquals(1, remoteClusterStateService.getFullDownloadStats().getSuccessCount()); + assertEquals(0, remoteClusterStateService.getFullDownloadStats().getFailedCount()); } public void testGetClusterStateFromManifest_CodecV1() throws IOException { @@ -1296,6 +1299,9 @@ public void testGetClusterStateUsingDiff() throws IOException { diffManifest.getClusterStateCustomDeleted().forEach(clusterStateCustomName -> { assertFalse(updatedClusterState.customs().containsKey(clusterStateCustomName)); }); + assertNotNull(remoteClusterStateService.getDiffDownloadStats()); + assertEquals(1, remoteClusterStateService.getDiffDownloadStats().getSuccessCount()); + assertEquals(0, remoteClusterStateService.getDiffDownloadStats().getFailedCount()); } public void testReadClusterStateInParallel_TimedOut() throws IOException { @@ -3421,6 +3427,9 @@ public void testGetClusterStateForManifestWithChecksumValidationEnabledWithMisma true ); assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateFullDownloadValidationFailed()); + assertNotNull(remoteClusterStateService.getFullDownloadStats()); + assertEquals(0, remoteClusterStateService.getFullDownloadStats().getSuccessCount()); + assertEquals(1, remoteClusterStateService.getFullDownloadStats().getFailedCount()); } public void testGetClusterStateForManifestWithChecksumValidationDebugWithMismatch() throws IOException { @@ -3717,6 +3726,9 @@ public void testGetClusterStateUsingDiffWithChecksumMismatch() throws IOExceptio eq(false) ); assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateDiffDownloadValidationFailed()); + assertNotNull(remoteClusterStateService.getDiffDownloadStats()); + assertEquals(0, remoteClusterStateService.getDiffDownloadStats().getSuccessCount()); + assertEquals(1, remoteClusterStateService.getDiffDownloadStats().getFailedCount()); } private void mockObjectsForGettingPreviousClusterUUID(Map clusterUUIDsPointers) throws IOException {