Skip to content

Commit

Permalink
Merge branch 'main' into text_similarity_reranker_rework
Browse files Browse the repository at this point in the history
  • Loading branch information
pmpailis authored Oct 7, 2024
2 parents c1d8987 + 1292580 commit d82cfcc
Show file tree
Hide file tree
Showing 53 changed files with 483 additions and 252 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/113251.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113251
summary: Span term query to convert to match no docs when unmapped field is targeted
area: Search
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ PUT _connector/my-connector
"name": "My Connector",
"description": "My Connector to sync data to Elastic index from Google Drive",
"service_type": "google_drive",
"language": "english"
"language": "en"
}
----

Expand Down
81 changes: 81 additions & 0 deletions docs/reference/ingest/processors/inference.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down Expand Up @@ -224,6 +236,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down Expand Up @@ -304,6 +328,23 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`span`::::
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]

`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down Expand Up @@ -363,6 +404,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down Expand Up @@ -424,6 +477,22 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`span`::::
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]

`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down Expand Up @@ -515,6 +584,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down
2 changes: 0 additions & 2 deletions docs/reference/mapping/runtime.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -821,8 +821,6 @@ address.
[[lookup-runtime-fields]]
==== Retrieve fields from related indices

experimental[]

The <<search-fields,`fields`>> parameter on the `_search` API can also be used to retrieve fields from
the related indices via runtime fields with a type of `lookup`.

Expand Down
3 changes: 1 addition & 2 deletions docs/reference/mapping/types/date.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,7 @@ The following parameters are accepted by `date` fields:
`locale`::

The locale to use when parsing dates since months do not have the same names
and/or abbreviations in all languages. The default is the
https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html#ROOT[`ROOT` locale].
and/or abbreviations in all languages. The default is `ENGLISH`.

<<ignore-malformed,`ignore_malformed`>>::

Expand Down
27 changes: 27 additions & 0 deletions docs/reference/ml/ml-shared.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,7 @@ values are
+
--
* `bert`: Use for BERT-style models
* `deberta_v2`: Use for DeBERTa v2 and v3-style models
* `mpnet`: Use for MPNet-style models
* `roberta`: Use for RoBERTa-style and BART-style models
* experimental:[] `xlm_roberta`: Use for XLMRoBERTa-style models
Expand Down Expand Up @@ -1037,6 +1038,19 @@ sequence. Therefore, do not use `second` in this case.

end::inference-config-nlp-tokenization-truncate[]

tag::inference-config-nlp-tokenization-truncate-deberta-v2[]
Indicates how tokens are truncated when they exceed `max_sequence_length`.
The default value is `first`.
+
--
* `balanced`: One or both of the first and second sequences may be truncated so as to balance the tokens included from both sequences.
* `none`: No truncation occurs; the inference request receives an error.
* `first`: Only the first sequence is truncated.
* `second`: Only the second sequence is truncated. If there is just one sequence, that sequence is truncated.
--

end::inference-config-nlp-tokenization-truncate-deberta-v2[]

tag::inference-config-nlp-tokenization-bert-with-special-tokens[]
Tokenize with special tokens. The tokens typically included in BERT-style tokenization are:
+
Expand All @@ -1050,10 +1064,23 @@ tag::inference-config-nlp-tokenization-bert-ja-with-special-tokens[]
Tokenize with special tokens if `true`.
end::inference-config-nlp-tokenization-bert-ja-with-special-tokens[]

tag::inference-config-nlp-tokenization-deberta-v2[]
DeBERTa-style tokenization is to be performed with the enclosed settings.
end::inference-config-nlp-tokenization-deberta-v2[]

tag::inference-config-nlp-tokenization-max-sequence-length[]
Specifies the maximum number of tokens allowed to be output by the tokenizer.
end::inference-config-nlp-tokenization-max-sequence-length[]

tag::inference-config-nlp-tokenization-deberta-v2-with-special-tokens[]
Tokenize with special tokens. The tokens typically included in DeBERTa-style tokenization are:
+
--
* `[CLS]`: The first token of the sequence being classified.
* `[SEP]`: Indicates sequence separation and sequence end.
--
end::inference-config-nlp-tokenization-deberta-v2-with-special-tokens[]

tag::inference-config-nlp-tokenization-roberta[]
RoBERTa-style tokenization is to be performed with the enclosed settings.
end::inference-config-nlp-tokenization-roberta[]
Expand Down
12 changes: 12 additions & 0 deletions docs/reference/ml/trained-models/apis/infer-trained-model.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down
31 changes: 31 additions & 0 deletions docs/reference/ml/trained-models/apis/put-trained-models.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,37 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
(Optional, boolean)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
====
`deberta_v2`::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
====
`do_lower_case`:::
(Optional, boolean)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-do-lower-case]
+
--
Defaults to `false`.
--
`max_sequence_length`:::
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-max-sequence-length]
`span`:::
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]
`truncate`:::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
`with_special_tokens`:::
(Optional, boolean)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2-with-special-tokens]
====
`roberta`::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down
44 changes: 23 additions & 21 deletions docs/reference/setup/install.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -76,27 +76,29 @@ Docker container images may be downloaded from the Elastic Docker Registry.
[[jvm-version]]
=== Java (JVM) Version

{es} is built using Java, and includes a bundled version of
https://openjdk.java.net[OpenJDK] from the JDK maintainers (GPLv2+CE) within
each distribution. The bundled JVM is the recommended JVM.

To use your own version of Java, set the `ES_JAVA_HOME` environment variable.
If you must use a version of Java that is different from the bundled JVM, it is
best to use the latest release of a link:/support/matrix[supported]
https://www.oracle.com/technetwork/java/eol-135779.html[LTS version of Java].
{es} is closely coupled to certain OpenJDK-specific features, so it may not
work correctly with other JVMs. {es} will refuse to start if a known-bad
version of Java is used.

If you use a JVM other than the bundled one, you are responsible for reacting
to announcements related to its security issues and bug fixes, and must
yourself determine whether each update is necessary or not. In contrast, the
bundled JVM is treated as an integral part of {es}, which means that Elastic
takes responsibility for keeping it up to date. Security issues and bugs within
the bundled JVM are treated as if they were within {es} itself.

The bundled JVM is located within the `jdk` subdirectory of the {es} home
directory. You may remove this directory if using your own JVM.
{es} is built using Java, and includes a bundled version of https://openjdk.java.net[OpenJDK] within each distribution. We strongly
recommend using the bundled JVM in all installations of {es}.

The bundled JVM is treated the same as any other dependency of {es} in terms of support and maintenance. This means that Elastic takes
responsibility for keeping it up to date, and reacts to security issues and bug reports as needed to address vulnerabilities and other bugs
in {es}. Elastic's support of the bundled JVM is subject to Elastic's https://www.elastic.co/support_policy[support policy] and
https://www.elastic.co/support/eol[end-of-life schedule] and is independent of the support policy and end-of-life schedule offered by the
original supplier of the JVM. Elastic does not support using the bundled JVM for purposes other than running {es}.

TIP: {es} uses only a subset of the features offered by the JVM. Bugs and security issues in the bundled JVM often relate to features that
{es} does not use. Such issues do not apply to {es}. Elastic analyzes reports of security vulnerabilities in all its dependencies, including
in the bundled JVM, and will issue an https://www.elastic.co/community/security[Elastic Security Advisory] if such an advisory is needed.

If you decide to run {es} using a version of Java that is different from the bundled one, prefer to use the latest release of a
https://www.oracle.com/technetwork/java/eol-135779.html[LTS version of Java] which is link:/support/matrix[listed in the support matrix].
Although such a configuration is supported, if you encounter a security issue or other bug in your chosen JVM then Elastic may not be able
to help unless the issue is also present in the bundled JVM. Instead, you must seek assistance directly from the supplier of your chosen
JVM. You must also take responsibility for reacting to security and bug announcements from the supplier of your chosen JVM. {es} may not
perform optimally if using a JVM other than the bundled one. {es} is closely coupled to certain OpenJDK-specific features, so it may not
work correctly with JVMs that are not OpenJDK. {es} will refuse to start if you attempt to use a known-bad JVM version.

To use your own version of Java, set the `ES_JAVA_HOME` environment variable to the path to your own JVM installation. The bundled JVM is
located within the `jdk` subdirectory of the {es} home directory. You may remove this directory if using your own JVM.

[discrete]
[[jvm-agents]]
Expand Down
40 changes: 28 additions & 12 deletions libs/h3/src/main/java/org/elasticsearch/h3/CellBoundary.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,36 +22,52 @@
*/
package org.elasticsearch.h3;

import java.util.Arrays;
import java.util.Objects;

/**
* cell boundary points as {@link LatLng}
*/
public final class CellBoundary {

/** Maximum number of cell boundary vertices; worst case is pentagon:
* 5 original verts + 5 edge crossings
*/
private static final int MAX_CELL_BNDRY_VERTS = 10;
static final int MAX_CELL_BNDRY_VERTS = 10;
/** How many points it holds */
private int numVertext;
private final int numPoints;
/** The actual points */
private final LatLng[] points = new LatLng[MAX_CELL_BNDRY_VERTS];

CellBoundary() {}
private final LatLng[] points;

void add(LatLng point) {
points[numVertext++] = point;
CellBoundary(LatLng[] points, int numPoints) {
this.points = points;
this.numPoints = numPoints;
}

/** Number of points in this boundary */
public int numPoints() {
return numVertext;
return numPoints;
}

/** Return the point at the given position*/
public LatLng getLatLon(int i) {
if (i >= numVertext) {
throw new IndexOutOfBoundsException();
}
assert i >= 0 && i < numPoints;
return points[i];
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
final CellBoundary that = (CellBoundary) o;
return numPoints == that.numPoints && Arrays.equals(points, that.points);
}

@Override
public int hashCode() {
return Objects.hash(numPoints, Arrays.hashCode(points));
}
}
4 changes: 0 additions & 4 deletions libs/h3/src/main/java/org/elasticsearch/h3/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,6 @@ final class Constants {
* 2.0 * PI
*/
public static final double M_2PI = 2.0 * Math.PI;
/**
* max H3 resolution; H3 version 1 has 16 resolutions, numbered 0 through 15
*/
public static int MAX_H3_RES = 15;
/**
* The number of H3 base cells
*/
Expand Down
Loading

0 comments on commit d82cfcc

Please sign in to comment.