Merge branch 'main' into pointrange_optimization
Signed-off-by: Harsha Vamsi Kalluri <[email protected]>
harshavamsi authored Jul 24, 2024
2 parents 612998e + fcc231d commit 46f9587
Showing 157 changed files with 10,582 additions and 1,935 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/add-performance-comment.yml
@@ -6,7 +6,10 @@ on:

jobs:
add-comment:
if: github.event.label.name == 'Performance'
if: |
github.event.label.name == 'Performance' ||
github.event.label.name == 'Search:Performance' ||
github.event.label.name == 'Indexing:Performance'
runs-on: ubuntu-latest
permissions:
pull-requests: write
38 changes: 19 additions & 19 deletions .github/workflows/benchmark-pull-request.yml
@@ -13,7 +13,7 @@ jobs:
pull-requests: write
steps:
- name: Checkout Repository
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Set up required env vars
run: |
echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
@@ -77,18 +77,6 @@ jobs:
run: |
echo "Invalid comment format detected. Failing the workflow."
exit 1
- id: get_approvers
run: |
echo "approvers=$(cat .github/CODEOWNERS | grep '^\*' | tr -d '* ' | sed 's/@/,/g' | sed 's/,//1')" >> $GITHUB_OUTPUT
- uses: trstringer/manual-approval@v1
if: (!contains(steps.get_approvers.outputs.approvers, github.event.comment.user.login))
with:
secret: ${{ github.TOKEN }}
approvers: ${{ steps.get_approvers.outputs.approvers }}
minimum-approvals: 1
issue-title: 'Request to approve/deny benchmark run for PR #${{ env.PR_NUMBER }}'
issue-body: "Please approve or deny the benchmark run for PR #${{ env.PR_NUMBER }}"
exclude-workflow-initiator-as-approver: false
- name: Get PR Details
id: get_pr
uses: actions/github-script@v7
@@ -106,21 +94,33 @@ jobs:
return {
"headRepoFullName": pull_request.head.repo.full_name,
"headRef": pull_request.head.ref,
"headRefSha": pull_request.head.sha
};
- name: Set pr details env vars
run: |
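# Parse the JSON emitted by the get_pr step with jq and export the PR's
# head repo, ref, and sha so later steps can check out the exact commit.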
echo '${{ steps.get_pr.outputs.result }}' | jq -r '.headRepoFullName'
echo '${{ steps.get_pr.outputs.result }}' | jq -r '.headRef'
echo '${{ steps.get_pr.outputs.result }}' | jq -r '.headRefSha'
headRepo=$(echo '${{ steps.get_pr.outputs.result }}' | jq -r '.headRepoFullName')
headRef=$(echo '${{ steps.get_pr.outputs.result }}' | jq -r '.headRef')
headRefSha=$(echo '${{ steps.get_pr.outputs.result }}' | jq -r '.headRefSha')
echo "prHeadRepo=$headRepo" >> $GITHUB_ENV
echo "prHeadRef=$headRef" >> $GITHUB_ENV
echo "prHeadRefSha=$headRefSha" >> $GITHUB_ENV
- id: get_approvers
run: |
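# Build a comma-separated list of the global (*) owners in CODEOWNERS,
# e.g. "* @userA @userB" -> "userA,userB".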
echo "approvers=$(cat .github/CODEOWNERS | grep '^\*' | tr -d '* ' | sed 's/@/,/g' | sed 's/,//1')" >> $GITHUB_OUTPUT
- uses: trstringer/manual-approval@v1
if: (!contains(steps.get_approvers.outputs.approvers, github.event.comment.user.login))
with:
secret: ${{ github.TOKEN }}
approvers: ${{ steps.get_approvers.outputs.approvers }}
minimum-approvals: 1
issue-title: 'Request to approve/deny benchmark run for PR #${{ env.PR_NUMBER }}'
issue-body: "Please approve or deny the benchmark run for PR #${{ env.PR_NUMBER }}"
exclude-workflow-initiator-as-approver: false
- name: Checkout PR Repo
uses: actions/checkout@v2
uses: actions/checkout@v4
with:
repository: ${{ env.prHeadRepo }}
ref: ${{ env.prHeadRef }}
ref: ${{ env.prHeadRefSha }}
token: ${{ secrets.GITHUB_TOKEN }}
- name: Setup Java
uses: actions/setup-java@v1
20 changes: 18 additions & 2 deletions CHANGELOG.md
@@ -20,11 +20,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Add matchesPluginSystemIndexPattern to SystemIndexRegistry ([#14750](https://github.com/opensearch-project/OpenSearch/pull/14750))
- Add Plugin interface for loading application based configuration templates ([#14659](https://github.com/opensearch-project/OpenSearch/issues/14659))
- Refactor remote-routing-table service inline with remote state interfaces ([#14668](https://github.com/opensearch-project/OpenSearch/pull/14668))
- Add shard-diff path to diff manifest to reduce the number of read calls to remote store ([#14684](https://github.com/opensearch-project/OpenSearch/pull/14684))
- Add SortResponseProcessor to Search Pipelines ([#14785](https://github.com/opensearch-project/OpenSearch/issues/14785))
- Add prefix mode verification setting for repository verification ([#14790](https://github.com/opensearch-project/OpenSearch/pull/14790))
- Add SplitResponseProcessor to Search Pipelines ([#14800](https://github.com/opensearch-project/OpenSearch/issues/14800))
- Optimize TransportNodesAction to not send DiscoveryNodes for NodeStats, NodesInfo and ClusterStats calls ([#14749](https://github.com/opensearch-project/OpenSearch/pull/14749))
- Reduce logging in DEBUG for MasterService:run ([#14795](https://github.com/opensearch-project/OpenSearch/pull/14795))
- Enable term version check on local state for all ClusterManager Read Transport Actions ([#14273](https://github.com/opensearch-project/OpenSearch/pull/14273))
- Add persian_stem filter ([#14847](https://github.com/opensearch-project/OpenSearch/pull/14847))
- Create listener to refresh search thread resource usage ([#14832](https://github.com/opensearch-project/OpenSearch/pull/14832))
- Add REST and transport layer changes for hot to warm tiering - dedicated setup ([#13980](https://github.com/opensearch-project/OpenSearch/pull/13980))
- Optimize Cluster Stats Indices to precompute node level stats ([#14426](https://github.com/opensearch-project/OpenSearch/pull/14426))
- Add logic to create index templates (v2) using context field ([#14811](https://github.com/opensearch-project/OpenSearch/pull/14811))
- [Range Queries] Add new approximable query framework to short-circuit range queries ([#13788](https://github.com/opensearch-project/OpenSearch/pull/13788))

### Dependencies
@@ -37,16 +44,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Bump `commons-net:commons-net` from 3.10.0 to 3.11.1 ([#14396](https://github.com/opensearch-project/OpenSearch/pull/14396))
- Bump `com.nimbusds:nimbus-jose-jwt` from 9.37.3 to 9.40 ([#14398](https://github.com/opensearch-project/OpenSearch/pull/14398))
- Bump `org.apache.commons:commons-configuration2` from 2.10.1 to 2.11.0 ([#14399](https://github.com/opensearch-project/OpenSearch/pull/14399))
- Bump `com.gradle.develocity` from 3.17.4 to 3.17.5 ([#14397](https://github.com/opensearch-project/OpenSearch/pull/14397))
- Bump `com.gradle.develocity` from 3.17.4 to 3.17.6 ([#14397](https://github.com/opensearch-project/OpenSearch/pull/14397), [#14856](https://github.com/opensearch-project/OpenSearch/pull/14856))
- Bump `opentelemetry` from 1.36.0 to 1.40.0 ([#14457](https://github.com/opensearch-project/OpenSearch/pull/14457), [#14674](https://github.com/opensearch-project/OpenSearch/pull/14674))
- Bump `opentelemetry-semconv` from 1.25.0-alpha to 1.26.0-alpha ([#14674](https://github.com/opensearch-project/OpenSearch/pull/14674))
- Bump `azure-identity` from 1.11.4 to 1.13.0, Bump `msal4j` from 1.14.3 to 1.15.1, Bump `msal4j-persistence-extension` from 1.2.0 to 1.3.0 ([#14673](https://github.com/opensearch-project/OpenSearch/pull/14673))
- Bump `com.azure:azure-storage-common` from 12.21.2 to 12.25.1 ([#14517](https://github.com/opensearch-project/OpenSearch/pull/14517))
- Bump `com.microsoft.azure:msal4j` from 1.15.1 to 1.16.0 ([#14610](https://github.com/opensearch-project/OpenSearch/pull/14610))
- Bump `com.microsoft.azure:msal4j` from 1.15.1 to 1.16.1 ([#14610](https://github.com/opensearch-project/OpenSearch/pull/14610), [#14857](https://github.com/opensearch-project/OpenSearch/pull/14857))
- Bump `com.github.spullara.mustache.java:compiler` from 0.9.13 to 0.9.14 ([#14672](https://github.com/opensearch-project/OpenSearch/pull/14672))
- Bump `net.minidev:accessors-smart` from 2.5.0 to 2.5.1 ([#14673](https://github.com/opensearch-project/OpenSearch/pull/14673))
- Bump `jackson` from 2.17.1 to 2.17.2 ([#14687](https://github.com/opensearch-project/OpenSearch/pull/14687))
- Bump `net.minidev:json-smart` from 2.5.0 to 2.5.1 ([#14748](https://github.com/opensearch-project/OpenSearch/pull/14748))
- Bump `actions/checkout` from 2 to 4 ([#14858](https://github.com/opensearch-project/OpenSearch/pull/14858))
- Bump `org.apache.commons:commons-lang3` from 3.14.0 to 3.15.0 ([#14861](https://github.com/opensearch-project/OpenSearch/pull/14861))

### Changed
- [Tiered Caching] Move query recomputation logic outside write lock ([#14187](https://github.com/opensearch-project/OpenSearch/pull/14187))
@@ -56,13 +65,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Allow @InternalApi annotation on classes not meant to be constructed outside of the OpenSearch core ([#14575](https://github.com/opensearch-project/OpenSearch/pull/14575))
- Add @InternalApi annotation to japicmp exclusions ([#14597](https://github.com/opensearch-project/OpenSearch/pull/14597))
- Allow system index warning in OpenSearchRestTestCase.refreshAllIndices ([#14635](https://github.com/opensearch-project/OpenSearch/pull/14635))
- Make reroute iteration time-bound for large shard allocations ([#14848](https://github.com/opensearch-project/OpenSearch/pull/14848))

### Deprecated
- Deprecate batch_size parameter on bulk API ([#14725](https://github.com/opensearch-project/OpenSearch/pull/14725))

### Removed
- Remove query categorization changes ([#14759](https://github.com/opensearch-project/OpenSearch/pull/14759))

### Fixed
- Fix allowUnmappedFields, mapUnmappedFieldAsString settings are not applied when parsing certain types of query string query ([#13957](https://github.com/opensearch-project/OpenSearch/pull/13957))
- Fix bug in SBP cancellation logic ([#13474](https://github.com/opensearch-project/OpenSearch/pull/13474))
- Fix handling of Short and Byte data types in ScriptProcessor ingest pipeline ([#14379](https://github.com/opensearch-project/OpenSearch/issues/14379))
- Switch to iterative version of WKT format parser ([#14086](https://github.com/opensearch-project/OpenSearch/pull/14086))
@@ -79,11 +91,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Fix create or update alias API not throwing an exception for unsupported parameters ([#14719](https://github.com/opensearch-project/OpenSearch/pull/14719))
- Refactor FilterPath.parse by using an iterative approach ([#14200](https://github.com/opensearch-project/OpenSearch/pull/14200))
- Refactor Grok.validatePatternBank by using an iterative approach ([#14206](https://github.com/opensearch-project/OpenSearch/pull/14206))
- Fix NPE when creating index with index.number_of_replicas set to null ([#14812](https://github.com/opensearch-project/OpenSearch/pull/14812))
- Update help output for _cat ([#14722](https://github.com/opensearch-project/OpenSearch/pull/14722))
- Fix bulk upsert ignoring the default_pipeline and final_pipeline when the auto-created index matches an index template ([#12891](https://github.com/opensearch-project/OpenSearch/pull/12891))
- Fix NPE in ReplicaShardAllocator ([#14385](https://github.com/opensearch-project/OpenSearch/pull/14385))
- Fix constant_keyword field type used when creating index ([#14807](https://github.com/opensearch-project/OpenSearch/pull/14807))
- Use circuit breaker in InternalHistogram when adding empty buckets ([#14754](https://github.com/opensearch-project/OpenSearch/pull/14754))
- Create new IndexInput for multi part upload ([#14888](https://github.com/opensearch-project/OpenSearch/pull/14888))
- Fix searchable snapshot failure with scripted fields ([#14411](https://github.com/opensearch-project/OpenSearch/pull/14411))
- Fix the visit of inner query for NestedQueryBuilder ([#14739](https://github.com/opensearch-project/OpenSearch/pull/14739))

### Security

@@ -75,6 +75,7 @@
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.ga.IrishAnalyzer;
@@ -315,6 +316,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
filters.put("persian_stem", PersianStemTokenFilterFactory::new);
filters.put("porter_stem", PorterStemTokenFilterFactory::new);
filters.put(
"predicate_token_filter",
@@ -558,6 +560,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
);
}));
filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("persian_stem", true, PersianStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
@@ -0,0 +1,52 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;

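/**
 * Factory for the {@code persian_stem} token filter; creates Lucene's
 * {@link PersianStemFilter} over the given token stream.
 */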
public class PersianStemTokenFilterFactory extends AbstractTokenFilterFactory {

PersianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

@Override
public TokenStream create(TokenStream tokenStream) {
return new PersianStemFilter(tokenStream);
}
}
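The factory is a thin wrapper: all stemming happens inside Lucene's `PersianStemFilter`. As a quick illustration — a minimal standalone sketch, not part of this commit, assuming `lucene-core` and `lucene-analysis-common` on the classpath (the `PersianStemFilterDemo` class name is hypothetical) — the filter stems the same Persian token the YAML REST test below asserts on:

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical demo class, not part of the commit.
public class PersianStemFilterDemo {
    public static void main(String[] args) throws IOException {
        // Emit the whole input as a single token, like the keyword tokenizer.
        Tokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader("جامدات")); // "solids"
        // Wrap the stream the same way PersianStemTokenFilterFactory#create does.
        try (TokenStream stream = new PersianStemFilter(tokenizer)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // prints the stem: جامد
            }
            stream.end();
        }
    }
}
```

Running this prints جامد, the singular stem that the `persian_stem` REST tests further down expect.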
@@ -47,6 +47,7 @@
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.es.SpanishLightStemFilter;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.apache.lucene.analysis.fi.FinnishLightStemFilter;
import org.apache.lucene.analysis.fr.FrenchLightStemFilter;
import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter;
@@ -239,6 +240,8 @@ public TokenStream create(TokenStream tokenStream) {
return new NorwegianLightStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);
} else if ("minimal_nynorsk".equalsIgnoreCase(language) || "minimalNynorsk".equalsIgnoreCase(language)) {
return new NorwegianMinimalStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);
} else if ("persian".equalsIgnoreCase(language)) {
return new PersianStemFilter(tokenStream);

// Portuguese stemmers
} else if ("portuguese".equalsIgnoreCase(language)) {
@@ -158,6 +158,7 @@ protected Map<String, Class<?>> getTokenFilters() {
filters.put("brazilianstem", BrazilianStemTokenFilterFactory.class);
filters.put("czechstem", CzechStemTokenFilterFactory.class);
filters.put("germanstem", GermanStemTokenFilterFactory.class);
filters.put("persianstem", PersianStemTokenFilterFactory.class);
filters.put("telugunormalization", TeluguNormalizationFilterFactory.class);
filters.put("telugustem", TeluguStemFilterFactory.class);
// this filter is not exposed and should only be used internally
@@ -220,6 +221,7 @@ protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
filters.put("ngram", null);
filters.put("nGram", null);
filters.put("persian_normalization", null);
filters.put("persian_stem", null);
filters.put("porter_stem", null);
filters.put("reverse", ReverseStringFilterFactory.class);
filters.put("russian_stem", SnowballPorterFilterFactory.class);
@@ -1781,6 +1781,37 @@
- length: { tokens: 1 }
- match: { tokens.0.token: abschliess }

---
"persian_stem":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_persian_stem:
type: persian_stem
- do:
indices.analyze:
index: test
body:
text: جامدات
tokenizer: keyword
filter: [my_persian_stem]
- length: { tokens: 1 }
- match: { tokens.0.token: جامد }

# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: جامدات
tokenizer: keyword
filter: [persian_stem]
- length: { tokens: 1 }
- match: { tokens.0.token: جامد }

---
"russian_stem":
- do:
@@ -207,15 +207,14 @@ teardown:
- match: { _source: {"f1": "v2", "f2": 47, "field1": "value1", "field2": "value2"}}

---
"Test bulk API with batch enabled happy case":
"Test bulk API with default batch size":
- skip:
version: " - 2.13.99"
reason: "Added in 2.14.0"

- do:
bulk:
refresh: true
batch_size: 2
pipeline: "pipeline1"
body:
- '{"index": {"_index": "test_index", "_id": "test_id1"}}'
@@ -245,36 +244,6 @@ teardown:
id: test_id3
- match: { _source: { "text": "text3", "field1": "value1" } }

---
"Test bulk API with batch_size missing":
- skip:
version: " - 2.13.99"
reason: "Added in 2.14.0"

- do:
bulk:
refresh: true
pipeline: "pipeline1"
body:
- '{"index": {"_index": "test_index", "_id": "test_id1"}}'
- '{"text": "text1"}'
- '{"index": {"_index": "test_index", "_id": "test_id2"}}'
- '{"text": "text2"}'

- match: { errors: false }

- do:
get:
index: test_index
id: test_id1
- match: { _source: { "text": "text1", "field1": "value1" } }

- do:
get:
index: test_index
id: test_id2
- match: { _source: { "text": "text2", "field1": "value1" } }

---
"Test bulk API with invalid batch_size":
- skip: