Merge branch 'main' into carlosdelest/esql-match-function
elasticmachine authored Oct 8, 2024
2 parents c803d1b + b5d6fa0 commit 75161bb
Showing 98 changed files with 2,927 additions and 1,291 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/113897.yaml
@@ -0,0 +1,6 @@
pr: 113897
summary: "Add chunking settings configuration to `CohereService,` `AmazonBedrockService,`\
\ and `AzureOpenAiService`"
area: Machine Learning
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/114002.yaml
@@ -0,0 +1,5 @@
pr: 114002
summary: Add a `mustache.max_output_size_bytes` setting to limit the length of results from mustache scripts
area: Infra/Scripting
type: enhancement
issues: []
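
By way of illustration, the new cap would bound the rendered output of calls such as the following search template render (the render API itself is standard; treating the limit as applying to the rendered result is an assumption drawn from this changelog entry):

[source,console]
----
POST _render/template
{
  "source": "{ \"query\": { \"match\": { \"message\": \"{{query_string}}\" } } }",
  "params": {
    "query_string": "hello world"
  }
}
----

A template whose rendered output exceeded `mustache.max_output_size_bytes` would presumably fail with an error rather than return an unbounded result.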
17 changes: 17 additions & 0 deletions docs/changelog/114231.yaml
@@ -0,0 +1,17 @@
pr: 114231
summary: Remove cluster state from `/_cluster/reroute` response
area: Allocation
type: breaking
issues:
- 88978
breaking:
title: Remove cluster state from `/_cluster/reroute` response
area: REST API
details: >-
The `POST /_cluster/reroute` API no longer returns the cluster state in its
response. The `?metric` query parameter to this API now has no effect and
its use will be forbidden in a future version.
impact: >-
Cease usage of the `?metric` query parameter when calling the
`POST /_cluster/reroute` API.
notable: false
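
For illustration, a reroute request under the new behavior keeps the same shape; only the response shrinks (the index and node names below are placeholders):

[source,console]
----
POST /_cluster/reroute
{
  "commands": [
    {
      "move": {
        "index": "my-index",
        "shard": 0,
        "from_node": "node-1",
        "to_node": "node-2"
      }
    }
  ]
}
----

The response now carries the acknowledgement and any command explanations rather than the full cluster state.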
4 changes: 2 additions & 2 deletions docs/reference/cluster/reroute.asciidoc
@@ -10,7 +10,7 @@ Changes the allocation of shards in a cluster.
[[cluster-reroute-api-request]]
==== {api-request-title}

- `POST /_cluster/reroute?metric=none`
+ `POST /_cluster/reroute`

[[cluster-reroute-api-prereqs]]
==== {api-prereq-title}
@@ -193,7 +193,7 @@ This is a short example of a simple reroute API call:

[source,console]
--------------------------------------------------
- POST /_cluster/reroute?metric=none
+ POST /_cluster/reroute
{
"commands": [
{
…
2 changes: 1 addition & 1 deletion docs/reference/commands/shard-tool.asciidoc
@@ -95,7 +95,7 @@ Changing allocation id V8QXk-QXSZinZMT-NvEq4w to tjm9Ve6uTBewVFAlfUMWjA
You should run the following command to allocate this shard:
- POST /_cluster/reroute?metric=none
+ POST /_cluster/reroute
{
"commands" : [
{
…
4 changes: 2 additions & 2 deletions docs/reference/intro.asciidoc
@@ -204,7 +204,7 @@ For general content, you have the following options for adding data to {es} indi
If you're building a website or app, then you can call Elasticsearch APIs using an https://www.elastic.co/guide/en/elasticsearch/client/index.html[{es} client] in the programming language of your choice. If you use the Python client, then check out the `elasticsearch-labs` repo for various https://github.com/elastic/elasticsearch-labs/tree/main/notebooks/search/python-examples[example notebooks].
* {kibana-ref}/connect-to-elasticsearch.html#upload-data-kibana[File upload]: Use the {kib} file uploader to index single files for one-off testing and exploration. The GUI guides you through setting up your index and field mappings.
* https://github.com/elastic/crawler[Web crawler]: Extract and index web page content into {es} documents.
- * {enterprise-search-ref}/connectors.html[Connectors]: Sync data from various third-party data sources to create searchable, read-only replicas in {es}.
+ * <<es-connectors,Connectors>>: Sync data from various third-party data sources to create searchable, read-only replicas in {es}.

[discrete]
[[es-ingestion-overview-timestamped]]
@@ -492,4 +492,4 @@ and restrictions. You can review the following guides to learn how to tune your
* <<use-elasticsearch-for-time-series-data,Tune for time series data>>

Many {es} options come with different performance considerations and trade-offs. The best way to determine the
optimal configuration for your use case is through https://www.elastic.co/elasticon/conf/2016/sf/quantitative-cluster-sizing[testing with your own data and queries].
89 changes: 89 additions & 0 deletions docs/reference/ml/trained-models/apis/infer-trained-model.asciidoc
@@ -225,6 +225,17 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
@@ -301,6 +312,17 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
@@ -397,6 +419,21 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`span`::::
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]

`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
@@ -517,6 +554,21 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`span`::::
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]

`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
@@ -608,6 +660,17 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
@@ -687,6 +750,21 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]

`with_special_tokens`::::
(Optional, boolean)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`span`::::
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]

`with_special_tokens`::::
(Optional, boolean)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
@@ -790,6 +868,17 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
…
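
Taken together, these additions expose the DeBERTa v2 tokenizer in each task type's `tokenization` object. A minimal sketch of an inference call overriding it at request time (the model ID is hypothetical, and `balanced` is an assumption about the truncation values the `truncate-deberta-v2` tag documents):

[source,console]
----
POST _ml/trained_models/my-deberta-v2-model/_infer
{
  "docs": [
    { "text_field": "Elasticsearch is a distributed search engine." }
  ],
  "inference_config": {
    "text_similarity": {
      "text": "What is Elasticsearch?",
      "tokenization": {
        "deberta_v2": {
          "truncate": "balanced"
        }
      }
    }
  }
}
----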
14 changes: 7 additions & 7 deletions docs/reference/reranking/index.asciidoc
@@ -1,12 +1,12 @@
[[re-ranking-overview]]
= Re-ranking

- Many search systems are built on two-stage retrieval pipelines.
+ Many search systems are built on multi-stage retrieval pipelines.

- The first stage uses cheap, fast algorithms to find a broad set of possible matches.
+ Earlier stages use cheap, fast algorithms to find a broad set of possible matches.

- The second stage uses a more powerful model, often machine learning-based, to reorder the documents.
- This second step is called re-ranking.
+ Later stages use more powerful models, often machine learning-based, to reorder the documents.
+ This step is called re-ranking.
Because the resource-intensive model is only applied to the smaller set of pre-filtered results, this approach returns more relevant results while still optimizing for search performance and computational costs.

{es} supports various ranking and re-ranking techniques to optimize search relevance and performance.
@@ -18,7 +18,7 @@ Because the resource-intensive model is only applied to the smaller set of pre-f

[float]
[[re-ranking-first-stage-pipeline]]
- === First stage: initial retrieval
+ === Initial retrieval

[float]
[[re-ranking-ranking-overview-bm25]]
@@ -45,7 +45,7 @@ Hybrid search techniques combine results from full-text and vector search pipeli

[float]
[[re-ranking-overview-second-stage]]
- === Second stage: Re-ranking
+ === Re-ranking

When using the following advanced re-ranking pipelines, first-stage retrieval mechanisms effectively generate a set of candidates.
These candidates are funneled into the re-ranker to perform more computationally expensive re-ranking tasks.
@@ -67,4 +67,4 @@ Learning To Rank involves training a machine learning model to build a ranking f
LTR is best suited for when you have ample training data and need highly customized relevance tuning.

include::semantic-reranking.asciidoc[]
include::learning-to-rank.asciidoc[]
@@ -89,6 +89,16 @@ PUT semantic-embeddings
It will be used to generate the embeddings based on the input text.
Every time you ingest data into the related `semantic_text` field, this endpoint will be used for creating the vector representation of the text.

[NOTE]
====
If you're using web crawlers or connectors to generate indices, you have to
<<indices-put-mapping,update the index mappings>> for these indices to
include the `semantic_text` field. Once the mapping is updated, you'll need to run
a full web crawl or a full connector sync. This ensures that all existing
documents are reprocessed and updated with the new semantic embeddings,
enabling semantic search on the updated data.
====
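
For example, adding a `semantic_text` field to an existing connector-backed index before running the full sync might look like the following (the index, field, and {infer} endpoint names here are hypothetical):

[source,console]
----
PUT my-connector-index/_mapping
{
  "properties": {
    "content_semantic": {
      "type": "semantic_text",
      "inference_id": "my-embedding-endpoint"
    }
  }
}
----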


[discrete]
[[semantic-text-load-data]]
@@ -118,6 +128,13 @@ Create the embeddings from the text by reindexing the data from the `test-data`
The data in the `content` field will be reindexed into the `content` semantic text field of the destination index.
The reindexed data will be processed by the {infer} endpoint associated with the `content` semantic text field.

[NOTE]
====
This step uses the reindex API to simulate data ingestion. If you are working with data that has already been indexed,
rather than with the test-data set, reindexing is still required to ensure that the data is processed by the {infer}
endpoint and the necessary embeddings are generated.
====

[source,console]
------------------------------------------------------------
POST _reindex?wait_for_completion=false
…
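
The reindex request above is truncated; for reference, a reconstruction of its complete shape (the `size` value is illustrative; a small batch keeps each {infer} call manageable):

[source,console]
----
POST _reindex?wait_for_completion=false
{
  "source": {
    "index": "test-data",
    "size": 10
  },
  "dest": {
    "index": "semantic-embeddings"
  }
}
----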
@@ -2,12 +2,12 @@
=== Red or yellow cluster health status

A red or yellow cluster health status indicates one or more shards are not assigned to
a node.

* **Red health status**: The cluster has some unassigned primary shards, which
means that some operations such as searches and indexing may fail.
* **Yellow health status**: The cluster has no unassigned primary shards but some
unassigned replica shards. This increases your risk of data loss and can degrade
cluster performance.

When your cluster has a red or yellow health status, it will continue to process
@@ -16,8 +16,8 @@ cleanup activities until the cluster returns to green health status. For instanc
some <<index-lifecycle-management,{ilm-init}>> actions require the index on which they
operate to have a green health status.

In many cases, your cluster will recover to green health status automatically.
If the cluster doesn't automatically recover, then you must <<fix-red-yellow-cluster-status,manually address>>
the remaining problems so management and cleanup activities can proceed.

[discrete]
@@ -107,7 +107,7 @@ asynchronously in the background.

[source,console]
----
- POST _cluster/reroute?metric=none
+ POST _cluster/reroute
----

[discrete]
@@ -231,10 +231,10 @@ unassigned. See <<high-jvm-memory-pressure>>.

If a node containing a primary shard is lost, {es} can typically replace it
using a replica on another node. If you can't recover the node and replicas
don't exist or are irrecoverable, <<cluster-allocation-explain,Allocation
Explain>> will report `no_valid_shard_copy` and you'll need to do one of the following:

* restore the missing data from <<snapshot-restore,snapshot>>
* index the missing data from its original data source
* accept data loss on the index-level by running <<indices-delete-index,Delete Index>>
* accept data loss on the shard-level by executing <<cluster-reroute,Cluster Reroute>> allocate_stale_primary or allocate_empty_primary command with `accept_data_loss: true`
@@ -246,7 +246,7 @@ resulting in data loss.
+
[source,console]
----
- POST _cluster/reroute?metric=none
+ POST _cluster/reroute
{
"commands": [
{
…
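
The command body above is truncated; for completeness, a sketch of the full shape of this last-resort call (index and node names are placeholders):

[source,console]
----
POST _cluster/reroute
{
  "commands": [
    {
      "allocate_stale_primary": {
        "index": "my-index",
        "shard": 0,
        "node": "node-1",
        "accept_data_loss": true
      }
    }
  ]
}
----

Setting `accept_data_loss` to `true` acknowledges that any documents that existed only on the lost copies are gone for good.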
@@ -53,7 +53,7 @@ private static ScriptService getScriptService(final Settings settings, final Lon
PainlessScriptEngine.NAME,
new PainlessScriptEngine(settings, scriptContexts),
MustacheScriptEngine.NAME,
- new MustacheScriptEngine()
+ new MustacheScriptEngine(settings)
);
return new ScriptService(settings, scriptEngines, ScriptModule.CORE_CONTEXTS, timeProvider);
}
…
3 changes: 2 additions & 1 deletion libs/simdvec/build.gradle
@@ -7,6 +7,7 @@
* License v3.0 only", or the "Server Side Public License, v 1".
*/

import org.elasticsearch.gradle.internal.info.BuildParams
import org.elasticsearch.gradle.internal.precommit.CheckForbiddenApisTask

apply plugin: 'elasticsearch.publish'
Expand All @@ -32,7 +33,7 @@ tasks.matching { it.name == "compileMain21Java" }.configureEach {
}

tasks.named('test').configure {
- if (JavaVersion.current().majorVersion.toInteger() >= 21) {
+ if (BuildParams.getRuntimeJavaVersion().majorVersion.toInteger() >= 21) {
jvmArgs '--add-modules=jdk.incubator.vector'
}
}
…
@@ -44,7 +44,7 @@ public class MustachePlugin extends Plugin implements ScriptPlugin, ActionPlugin

@Override
public ScriptEngine getScriptEngine(Settings settings, Collection<ScriptContext<?>> contexts) {
- return new MustacheScriptEngine();
+ return new MustacheScriptEngine(settings);
}

@Override
…