Introduce new setting search.concurrent.max_slice to control the slice computation for concurrent segment search #8847
File: org/opensearch/search/SearchBootstrapSettings.java (new file)

@@ -0,0 +1,38 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

package org.opensearch.search;

import org.opensearch.common.settings.Setting;
import org.opensearch.common.settings.Settings;

/**
 * Keeps track of all the search related node level settings which can be accessed via static methods
 */
public class SearchBootstrapSettings {
    // settings to configure maximum slice created per search request using OS custom slice computation mechanism. Default lucene
[review comment] Is there any question as to whether we will switch away from the static method to the dynamic method when the next release of Lucene is available? If not, I'd go ahead and create a GitHub issue to track it and link the issue in a comment in the code where appropriate.

[review comment] @sohami to this point, why is this setting static? As far as I can tell, it is used in a non-static context and could be implemented as a regular search-related setting.

[review comment] @reta The …

[review comment] That's interesting, do we usually update … But nonetheless we can keep this change as is and backport it to 2.x until 9.8 is officially released. Then add a follow-up commit to move it to a dynamic setting as part of #8870.
[review comment] I am trying to understand why you want to get this change in …

[review comment] @reta Let me try explaining :) I do get your solution and can use that, but would like to first try explaining why I am trying to merge it in main as well. As an example in this PR, we have this new class …

[review comment] This is inevitable in any case - we will be backporting all changes related to Lucene 9.8.0 (as we did for all previous Apache Lucene versions). The argument here is: keep main clean by using the new Apache Lucene snapshots (this is why we've always been doing that, giving the feature a ride before the release - bugs do happen), and do the backport as a best effort.

[review comment] I will close this PR and open a new one against 2.x, and will make the dynamic setting change a separate PR for the main branch. I will also create another tracking backport issue for merging the dynamic setting change to 2.x when Lucene 9.8 gets released. As part of that backport we can revert the commit for the static setting in 2.x and then apply the commit from main.
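The dynamic variant discussed in this thread (the follow-up referenced via #8870) might look roughly like the sketch below. This is an assumption about the follow-up, not code from this PR: adding Setting.Property.Dynamic makes the value updatable through the cluster settings API, and the setting would then be registered with the cluster settings and read per request rather than through the static SearchBootstrapSettings holder.

// Hedged sketch of the dynamic form of the same setting (illustrative only).
public static final Setting<Integer> CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_SETTING = Setting.intSetting(
    "search.concurrent.max_slice",
    -1,
    Setting.Property.NodeScope,
    Setting.Property.Dynamic
);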
    // mechanism will not be used if this setting is set with value > 0
    public static final String CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_KEY = "search.concurrent.max_slice";
    public static final int CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_DEFAULT_VALUE = -1;

    // value <= 0 means lucene slice computation will be used
    public static final Setting<Integer> CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_SETTING = Setting.intSetting(
        CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_KEY,
        CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_DEFAULT_VALUE,
        Setting.Property.NodeScope
    );
[review comment on lines +24 to +28] I think we should use the intSetting overload with a minValue (and maybe a maxValue); see OpenSearch/server/src/main/java/org/opensearch/common/settings/Setting.java, lines 1492 to 1494 in 0ea7210. For the max value, it is naturally bounded by the number of segments, which shouldn't grow unbounded due to Lucene merges, so I'm inclined to say that is not necessary.

[review comment] I don't see this as a must-have since there are no true bounds we are enforcing for now. Anything < 0 is treated as "use the Lucene mechanism" rather than the custom mechanism. And if a large positive value is set, it will be tuned down to the segment count in the slice computation logic.

[review comment] Thinking about this some more, I agree that this is not a must-have from a functional perspective since the < 0 case is handled in …

[review comment] This setting is used in two ways. If the value is > 0, the custom slice mechanism is used with the value as the target max slice count. If the value is <= 0, the Lucene slice mechanism is used. So the actual negative value or 0 is not relevant beyond meaning "fall back to Lucene behavior"; it enables or disables the feature (custom slicer vs. Lucene slicer). I would prefer a min/max range for settings where a clear range is defined, whereas here <= 0 is used as a boolean flag to choose one feature over the other. I also didn't want to add a new setting to control that, as I expect that, based on testing, we will default to one behavior.
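For reference, the overload the first comment points at takes a minimum value and rejects out-of-range input at parse time. A bounded version of this setting might look like the following sketch (the _BOUNDED name is illustrative, not code from the PR; -1 stays legal because it is the fall-back sentinel):

// Sketch of the reviewer's suggestion, using the intSetting overload with a minValue.
public static final Setting<Integer> CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_SETTING_BOUNDED = Setting.intSetting(
    CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_KEY,
    CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_DEFAULT_VALUE,
    -1,                                 // minValue: anything below the sentinel fails validation
    Setting.Property.NodeScope
);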
    private static Settings settings;

    public static void initialize(Settings openSearchSettings) {
        settings = openSearchSettings;
    }

    public static int getValueAsInt(String settingName, int defaultValue) {

[review comment] This is a bit weird because any setting can be passed in here, even if unrelated to search. I'd probably implement this as …

        return (settings != null) ? settings.getAsInt(settingName, defaultValue) : defaultValue;
    }
}
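The reviewer's suggested implementation is cut off above. One plausible reading, given the objection to arbitrary setting names, is a purpose-built accessor along these lines (a sketch with a hypothetical method name, not the reviewer's actual code):

// Hypothetical typed accessor: callers can no longer pass unrelated setting names.
public static int getTargetMaxSliceCount() {
    return (settings != null)
        ? CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_SETTING.get(settings)
        : CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_DEFAULT_VALUE;
}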
File: org/opensearch/search/internal/ContextIndexSearcher.java

@@ -32,6 +32,8 @@
 package org.opensearch.search.internal;

+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
@@ -66,6 +68,7 @@
 import org.opensearch.common.lucene.search.TopDocsAndMaxScore;
 import org.opensearch.common.lease.Releasable;
 import org.opensearch.search.DocValueFormat;
+import org.opensearch.search.SearchBootstrapSettings;
 import org.opensearch.search.SearchService;
 import org.opensearch.search.dfs.AggregatedDfs;
 import org.opensearch.search.profile.ContextualProfileBreakdown;
@@ -93,11 +96,13 @@
  * @opensearch.internal
  */
 public class ContextIndexSearcher extends IndexSearcher implements Releasable {

+    private static final Logger logger = LogManager.getLogger(ContextIndexSearcher.class);
     /**
      * The interval at which we check for search cancellation when we cannot use
      * a {@link CancellableBulkScorer}. See {@link #intersectScorerAndBitSet}.
      */
-    private static int CHECK_CANCELLED_SCORER_INTERVAL = 1 << 11;
+    private static final int CHECK_CANCELLED_SCORER_INTERVAL = 1 << 11;

     private AggregatedDfs aggregatedDfs;
     private QueryProfiler profiler;
@@ -439,6 +444,20 @@ public CollectionStatistics collectionStatistics(String field) throws IOException
         return collectionStatistics;
     }

+    /**
+     * Compute the leaf slices that will be used by concurrent segment search to spread work across threads
+     * @param leaves all the segments
+     * @return leafSlice group to be executed by different threads
+     */
+    @Override
+    public LeafSlice[] slices(List<LeafReaderContext> leaves) {
+        final int target_max_slices = SearchBootstrapSettings.getValueAsInt(
+            SearchBootstrapSettings.CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_KEY,
+            SearchBootstrapSettings.CONCURRENT_SEGMENT_SEARCH_TARGET_MAX_SLICE_COUNT_DEFAULT_VALUE
+        );
+        return slicesInternal(leaves, target_max_slices);
+    }
     public DirectoryReader getDirectoryReader() {
         final IndexReader reader = getIndexReader();
         assert reader instanceof DirectoryReader : "expected an instance of DirectoryReader, got " + reader.getClass();

@@ -518,4 +537,19 @@ private boolean shouldReverseLeafReaderContexts() {
         }
         return false;
     }
+    // package-private for testing
+    LeafSlice[] slicesInternal(List<LeafReaderContext> leaves, int target_max_slices) {
+        LeafSlice[] leafSlices;
+        if (target_max_slices <= 0) {
+            // use the default lucene slice calculation
+            leafSlices = super.slices(leaves);
+            logger.debug("Slice count using lucene default [{}]", leafSlices.length);
+        } else {
+            // use the custom slice calculation based on target_max_slices. It will sort

[review comment] This comment looks cut off.

+            leafSlices = MaxTargetSliceSupplier.getSlices(leaves, target_max_slices);
+            logger.debug("Slice count using max target slice supplier [{}]", leafSlices.length);
+        }
+        return leafSlices;
+    }
 }
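Putting the pieces together: slices() reads search.concurrent.max_slice through the static holder and delegates to slicesInternal, so a node started with, for example, search.concurrent.max_slice: 4 in opensearch.yml caps every concurrent search request at four slices, while the default of -1 (or any value <= 0) falls through to super.slices(), i.e. Lucene's default computation. Because the setting is node-scoped and read at bootstrap, changing it requires a node restart.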
File: org/opensearch/search/internal/MaxTargetSliceSupplier.java (new file)

@@ -0,0 +1,61 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

package org.opensearch.search.internal;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

/**
 * Supplier to compute leaf slices based on passed in leaves and max target slice count to limit the number of computed slices. It sorts
 * all the leaves based on document count and then assigns each leaf in round-robin fashion to the target slice count slices. Based on
 * experiment results as shared in <a href=https://github.com/opensearch-project/OpenSearch/issues/7358>issue-7358</a>
 * we can see this mechanism helps to achieve better tail/median latency over default lucene slice computation.
 */
public class MaxTargetSliceSupplier {

    public static IndexSearcher.LeafSlice[] getSlices(List<LeafReaderContext> leaves, int target_max_slice) {
        if (target_max_slice <= 0) {
            throw new IllegalArgumentException("MaxTargetSliceSupplier called with unexpected slice count of " + target_max_slice);
        }
        // slice count should not exceed the segment count
        int target_slice_count = Math.min(target_max_slice, leaves.size());

        // Make a copy so we can sort:
        List<LeafReaderContext> sortedLeaves = new ArrayList<>(leaves);

        // Sort by maxDoc, descending:
        sortedLeaves.sort(Collections.reverseOrder(Comparator.comparingInt(l -> l.reader().maxDoc())));

        final List<List<LeafReaderContext>> groupedLeaves = new ArrayList<>();
        for (int i = 0; i < target_slice_count; ++i) {
            groupedLeaves.add(new ArrayList<>());
        }
        // distribute the slices in round-robin fashion
        List<LeafReaderContext> group;
        for (int idx = 0; idx < sortedLeaves.size(); ++idx) {
            int currentGroup = idx % target_slice_count;
            group = groupedLeaves.get(currentGroup);
            group.add(sortedLeaves.get(idx));
        }
        IndexSearcher.LeafSlice[] slices = new IndexSearcher.LeafSlice[target_slice_count];
        int upto = 0;
        for (List<LeafReaderContext> currentLeaf : groupedLeaves) {
            slices[upto] = new IndexSearcher.LeafSlice(currentLeaf);
            ++upto;
        }
        return slices;
[review comment on lines +53 to +59] Can you replace this with the following?
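The suggested replacement did not survive the page capture. Given the manual upto counter above, a stream-based construction is one plausible reading (an assumption, not the reviewer's actual suggestion):

// Hypothetical reconstruction of the suggested simplification:
// map each group of leaves directly to a LeafSlice.
return groupedLeaves.stream().map(IndexSearcher.LeafSlice::new).toArray(IndexSearcher.LeafSlice[]::new);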
    }
}
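To make the round-robin behavior concrete, here is a minimal, self-contained sketch. The demo class name is illustrative, and ByteBuffersDirectory stands in for the test framework's newDirectory() so it runs outside a Lucene test context; MaxTargetSliceSupplier from this PR is assumed to be on the classpath. Five single-document segments with a target of 2 yield two slices, of three and two leaves:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.ByteBuffersDirectory;

public class MaxTargetSliceSupplierDemo {
    public static void main(String[] args) throws Exception {
        try (ByteBuffersDirectory dir = new ByteBuffersDirectory()) {
            IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer()).setMergePolicy(NoMergePolicy.INSTANCE);
            try (IndexWriter writer = new IndexWriter(dir, cfg)) {
                for (int i = 0; i < 5; i++) {
                    Document doc = new Document();
                    doc.add(new StringField("field1", "value" + i, Field.Store.NO));
                    writer.addDocument(doc);
                    writer.commit(); // one commit per doc -> five one-doc segments
                }
            }
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                IndexSearcher.LeafSlice[] slices = MaxTargetSliceSupplier.getSlices(reader.leaves(), 2);
                System.out.println("slices = " + slices.length); // 2, i.e. min(target=2, segments=5)
            }
        }
    }
}

Since every segment here has the same maxDoc, the descending sort is a no-op and the round-robin split alone determines the grouping of three and two leaves.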
File: org/opensearch/search/internal/IndexReaderUtils.java (new test utility)

@@ -0,0 +1,51 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

package org.opensearch.search.internal;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.store.Directory;

import java.util.List;

import static org.apache.lucene.tests.util.LuceneTestCase.newDirectory;

public class IndexReaderUtils {

    /**
     * Utility to create leafCount number of {@link LeafReaderContext}
     * @param leafCount count of leaves to create
     * @return created leaves
     */
    public static List<LeafReaderContext> getLeaves(int leafCount) throws Exception {
        final Directory directory = newDirectory();
        IndexWriter iw = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()).setMergePolicy(NoMergePolicy.INSTANCE));
        for (int i = 0; i < leafCount; ++i) {
            Document document = new Document();
            final String fieldValue = "value" + i;
            document.add(new StringField("field1", fieldValue, Field.Store.NO));
            document.add(new StringField("field2", fieldValue, Field.Store.NO));
            iw.addDocument(document);
            iw.commit();
        }
        iw.close();
        DirectoryReader directoryReader = DirectoryReader.open(directory);
        List<LeafReaderContext> leaves = directoryReader.leaves();
        directoryReader.close();
        directory.close();
        return leaves;
    }
}
[review comment] Please add @opensearch.internal.