Skip to content

Commit

Permalink
Merge branch 'main' into text_similarity_reranker_rework
Browse files Browse the repository at this point in the history
  • Loading branch information
pmpailis authored Oct 7, 2024
2 parents c1d8987 + 1292580 commit d82cfcc
Show file tree
Hide file tree
Showing 53 changed files with 483 additions and 252 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/113251.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113251
summary: Span term query to convert to match no docs when unmapped field is targeted
area: Search
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ PUT _connector/my-connector
"name": "My Connector",
"description": "My Connector to sync data to Elastic index from Google Drive",
"service_type": "google_drive",
"language": "english"
"language": "en"
}
----

Expand Down
81 changes: 81 additions & 0 deletions docs/reference/ingest/processors/inference.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down Expand Up @@ -224,6 +236,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down Expand Up @@ -304,6 +328,23 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`span`::::
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]

`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down Expand Up @@ -363,6 +404,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down Expand Up @@ -424,6 +477,22 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`span`::::
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]

`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down Expand Up @@ -515,6 +584,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down
2 changes: 0 additions & 2 deletions docs/reference/mapping/runtime.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -821,8 +821,6 @@ address.
[[lookup-runtime-fields]]
==== Retrieve fields from related indices

experimental[]

The <<search-fields,`fields`>> parameter on the `_search` API can also be used to retrieve fields from
the related indices via runtime fields with a type of `lookup`.

Expand Down
3 changes: 1 addition & 2 deletions docs/reference/mapping/types/date.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,7 @@ The following parameters are accepted by `date` fields:
`locale`::

The locale to use when parsing dates since months do not have the same names
and/or abbreviations in all languages. The default is the
https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html#ROOT[`ROOT` locale].
and/or abbreviations in all languages. The default is `ENGLISH`.

<<ignore-malformed,`ignore_malformed`>>::

Expand Down
27 changes: 27 additions & 0 deletions docs/reference/ml/ml-shared.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,7 @@ values are
+
--
* `bert`: Use for BERT-style models
* `deberta_v2`: Use for DeBERTa v2 and v3-style models
* `mpnet`: Use for MPNet-style models
* `roberta`: Use for RoBERTa-style and BART-style models
* experimental:[] `xlm_roberta`: Use for XLMRoBERTa-style models
Expand Down Expand Up @@ -1037,6 +1038,19 @@ sequence. Therefore, do not use `second` in this case.

end::inference-config-nlp-tokenization-truncate[]

tag::inference-config-nlp-tokenization-truncate-deberta-v2[]
Indicates how tokens are truncated when they exceed `max_sequence_length`.
The default value is `first`.
+
--
* `balanced`: One or both of the first and second sequences may be truncated so as to balance the tokens included from both sequences.
* `none`: No truncation occurs; the inference request receives an error.
* `first`: Only the first sequence is truncated.
* `second`: Only the second sequence is truncated. If there is just one sequence, that sequence is truncated.
--

end::inference-config-nlp-tokenization-truncate-deberta-v2[]

tag::inference-config-nlp-tokenization-bert-with-special-tokens[]
Tokenize with special tokens. The tokens typically included in BERT-style tokenization are:
+
Expand All @@ -1050,10 +1064,23 @@ tag::inference-config-nlp-tokenization-bert-ja-with-special-tokens[]
Tokenize with special tokens if `true`.
end::inference-config-nlp-tokenization-bert-ja-with-special-tokens[]

tag::inference-config-nlp-tokenization-deberta-v2[]
DeBERTa-style tokenization is to be performed with the enclosed settings.
end::inference-config-nlp-tokenization-deberta-v2[]

tag::inference-config-nlp-tokenization-max-sequence-length[]
Specifies the maximum number of tokens allowed to be output by the tokenizer.
end::inference-config-nlp-tokenization-max-sequence-length[]

tag::inference-config-nlp-tokenization-deberta-v2-with-special-tokens[]
Tokenize with special tokens. The tokens typically included in DeBERTa-style tokenization are:
+
--
* `[CLS]`: The first token of the sequence being classified.
* `[SEP]`: Indicates sequence separation and sequence end.
--
end::inference-config-nlp-tokenization-deberta-v2-with-special-tokens[]

tag::inference-config-nlp-tokenization-roberta[]
RoBERTa-style tokenization is to be performed with the enclosed settings.
end::inference-config-nlp-tokenization-roberta[]
Expand Down
12 changes: 12 additions & 0 deletions docs/reference/ml/trained-models/apis/infer-trained-model.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
=======
`deberta_v2`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
=======
`truncate`::::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
=======
`roberta`::::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down
31 changes: 31 additions & 0 deletions docs/reference/ml/trained-models/apis/put-trained-models.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,37 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
(Optional, boolean)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
====
`deberta_v2`::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
+
.Properties of deberta_v2
[%collapsible%open]
====
`do_lower_case`:::
(Optional, boolean)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-do-lower-case]
+
--
Defaults to `false`.
--
`max_sequence_length`:::
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-max-sequence-length]
`span`:::
(Optional, integer)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]
`truncate`:::
(Optional, string)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
`with_special_tokens`:::
(Optional, boolean)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2-with-special-tokens]
====
`roberta`::
(Optional, object)
include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
Expand Down
44 changes: 23 additions & 21 deletions docs/reference/setup/install.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -76,27 +76,29 @@ Docker container images may be downloaded from the Elastic Docker Registry.
[[jvm-version]]
=== Java (JVM) Version

{es} is built using Java, and includes a bundled version of
https://openjdk.java.net[OpenJDK] from the JDK maintainers (GPLv2+CE) within
each distribution. The bundled JVM is the recommended JVM.

To use your own version of Java, set the `ES_JAVA_HOME` environment variable.
If you must use a version of Java that is different from the bundled JVM, it is
best to use the latest release of a link:/support/matrix[supported]
https://www.oracle.com/technetwork/java/eol-135779.html[LTS version of Java].
{es} is closely coupled to certain OpenJDK-specific features, so it may not
work correctly with other JVMs. {es} will refuse to start if a known-bad
version of Java is used.

If you use a JVM other than the bundled one, you are responsible for reacting
to announcements related to its security issues and bug fixes, and must
yourself determine whether each update is necessary or not. In contrast, the
bundled JVM is treated as an integral part of {es}, which means that Elastic
takes responsibility for keeping it up to date. Security issues and bugs within
the bundled JVM are treated as if they were within {es} itself.

The bundled JVM is located within the `jdk` subdirectory of the {es} home
directory. You may remove this directory if using your own JVM.
{es} is built using Java, and includes a bundled version of https://openjdk.java.net[OpenJDK] within each distribution. We strongly
recommend using the bundled JVM in all installations of {es}.

The bundled JVM is treated the same as any other dependency of {es} in terms of support and maintenance. This means that Elastic takes
responsibility for keeping it up to date, and reacts to security issues and bug reports as needed to address vulnerabilities and other bugs
in {es}. Elastic's support of the bundled JVM is subject to Elastic's https://www.elastic.co/support_policy[support policy] and
https://www.elastic.co/support/eol[end-of-life schedule] and is independent of the support policy and end-of-life schedule offered by the
original supplier of the JVM. Elastic does not support using the bundled JVM for purposes other than running {es}.

TIP: {es} uses only a subset of the features offered by the JVM. Bugs and security issues in the bundled JVM often relate to features that
{es} does not use. Such issues do not apply to {es}. Elastic analyzes reports of security vulnerabilities in all its dependencies, including
in the bundled JVM, and will issue an https://www.elastic.co/community/security[Elastic Security Advisory] if such an advisory is needed.

If you decide to run {es} using a version of Java that is different from the bundled one, prefer to use the latest release of a
https://www.oracle.com/technetwork/java/eol-135779.html[LTS version of Java] which is link:/support/matrix[listed in the support matrix].
Although such a configuration is supported, if you encounter a security issue or other bug in your chosen JVM then Elastic may not be able
to help unless the issue is also present in the bundled JVM. Instead, you must seek assistance directly from the supplier of your chosen
JVM. You must also take responsibility for reacting to security and bug announcements from the supplier of your chosen JVM. {es} may not
perform optimally if using a JVM other than the bundled one. {es} is closely coupled to certain OpenJDK-specific features, so it may not
work correctly with JVMs that are not OpenJDK. {es} will refuse to start if you attempt to use a known-bad JVM version.

To use your own version of Java, set the `ES_JAVA_HOME` environment variable to the path to your own JVM installation. The bundled JVM is
located within the `jdk` subdirectory of the {es} home directory. You may remove this directory if using your own JVM.

[discrete]
[[jvm-agents]]
Expand Down
40 changes: 28 additions & 12 deletions libs/h3/src/main/java/org/elasticsearch/h3/CellBoundary.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,36 +22,52 @@
*/
package org.elasticsearch.h3;

import java.util.Arrays;
import java.util.Objects;

/**
* cell boundary points as {@link LatLng}
*/
public final class CellBoundary {

/** Maximum number of cell boundary vertices; worst case is pentagon:
* 5 original verts + 5 edge crossings
*/
private static final int MAX_CELL_BNDRY_VERTS = 10;
static final int MAX_CELL_BNDRY_VERTS = 10;
/** How many points it holds */
private int numVertext;
private final int numPoints;
/** The actual points */
private final LatLng[] points = new LatLng[MAX_CELL_BNDRY_VERTS];

CellBoundary() {}
private final LatLng[] points;

void add(LatLng point) {
points[numVertext++] = point;
CellBoundary(LatLng[] points, int numPoints) {
this.points = points;
this.numPoints = numPoints;
}

/** Number of points in this boundary */
public int numPoints() {
return numVertext;
return numPoints;
}

/** Return the point at the given position*/
public LatLng getLatLon(int i) {
if (i >= numVertext) {
throw new IndexOutOfBoundsException();
}
assert i >= 0 && i < numPoints;
return points[i];
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
final CellBoundary that = (CellBoundary) o;
return numPoints == that.numPoints && Arrays.equals(points, that.points);
}

@Override
public int hashCode() {
return Objects.hash(numPoints, Arrays.hashCode(points));
}
}
4 changes: 0 additions & 4 deletions libs/h3/src/main/java/org/elasticsearch/h3/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,6 @@ final class Constants {
* 2.0 * PI
*/
public static final double M_2PI = 2.0 * Math.PI;
/**
* max H3 resolution; H3 version 1 has 16 resolutions, numbered 0 through 15
*/
public static int MAX_H3_RES = 15;
/**
* The number of H3 base cells
*/
Expand Down
Loading

0 comments on commit d82cfcc

Please sign in to comment.