From 3735481aacdcc6a115e5df005eaf99e4af0239e7 Mon Sep 17 00:00:00 2001
From: Liam Thompson <leemthompo@gmail.com>
Date: Wed, 27 Nov 2024 11:58:23 +0100
Subject: [PATCH] [DOCS] Add Elastic Rerank usage docs

---
 .../inference/service-elasticsearch.asciidoc  | 31 +++++++-
 .../reranking/semantic-reranking.asciidoc     | 14 ++--
 docs/reference/search/retriever.asciidoc      | 70 ++++++++++++++++++-
 3 files changed, 106 insertions(+), 9 deletions(-)

diff --git a/docs/reference/inference/service-elasticsearch.asciidoc b/docs/reference/inference/service-elasticsearch.asciidoc
index 0103b425faefe..f859d5fb39d16 100644
--- a/docs/reference/inference/service-elasticsearch.asciidoc
+++ b/docs/reference/inference/service-elasticsearch.asciidoc
@@ -119,7 +119,6 @@ include::inference-shared.asciidoc[tag=task-settings]
 Returns the document instead of only the index. Defaults to `true`.
 =====
 
-
 [discrete]
 [[inference-example-elasticsearch-elser]]
 ==== ELSER via the `elasticsearch` service
@@ -150,6 +149,34 @@ PUT _inference/sparse_embedding/my-elser-model
 Valid values are `.elser_model_2` and `.elser_model_2_linux-x86_64`.
 For further details, refer to the {ml-docs}/ml-nlp-elser.html[ELSER model documentation].
 
+[discrete]
+[[inference-example-elastic-reranker]]
+==== Elastic Rerank via the `elasticsearch` service
+
+The following example shows how to create an {infer} endpoint called `my-elastic-rerank` to perform a `rerank` task type using the built-in Elastic Rerank cross-encoder model.
+
+The API request below will automatically download the Elastic Rerank model if it isn't already downloaded and then deploy the model.
+Once deployed, the model can be used for semantic re-ranking with a <<text-similarity-reranker-retriever-example-elastic-rerank,`text_similarity_reranker` retriever>>.
+
+[source,console]
+------------------------------------------------------------
+PUT _inference/rerank/my-elastic-rerank
+{
+  "service": "elasticsearch",
+  "service_settings": {
+    "model_id": ".rerank-v1", <1>
+    "num_threads": 1,
+    "adaptive_allocations": { <2>
+      "enabled": true,
+      "min_number_of_allocations": 1,
+      "max_number_of_allocations": 10
+    }
+  }
+}
+------------------------------------------------------------
+// TEST[skip:uses ML]
+<1> The `model_id` must be the ID of the built-in Elastic Rerank model: `.rerank-v1`.
+<2> {ml-docs}/ml-nlp-auto-scale.html#nlp-model-adaptive-allocations[Adaptive allocations] will be enabled with a minimum of 1 and a maximum of 10 allocations.
 
 [discrete]
 [[inference-example-elasticsearch]]
@@ -186,7 +213,7 @@ If using the Python client, you can set the `timeout` parameter to a higher valu
 
 [discrete]
 [[inference-example-eland]]
-==== Models uploaded by Eland via the elasticsearch service
+==== Models uploaded by Eland via the `elasticsearch` service
 
 The following example shows how to create an {infer} endpoint called
 `my-msmarco-minilm-model` to perform a `text_embedding` task type.
diff --git a/docs/reference/reranking/semantic-reranking.asciidoc b/docs/reference/reranking/semantic-reranking.asciidoc
index 4ebe90e44708e..2179e07481661 100644
--- a/docs/reference/reranking/semantic-reranking.asciidoc
+++ b/docs/reference/reranking/semantic-reranking.asciidoc
@@ -86,12 +86,14 @@ In {es}, semantic re-rankers are implemented using the {es} <<inference-apis,Inf
 To use semantic re-ranking in {es}, you need to:
 
 . *Choose a re-ranking model*.
-Currently you can:
+Currently you have the following options:
+.. Use the built-in <<inference-example-elastic-reranker,Elastic Rerank>> cross-encoder model via the inference API's {es} service. 
+.. Integrate directly with the <<infer-service-cohere,Cohere Rerank inference endpoint>> using the `rerank` task type
+.. Integrate directly with the <<infer-service-google-vertex-ai,Google Vertex AI inference endpoint>> using the `rerank` task type
+.. Upload a model to {es} from Hugging Face with {eland-docs}/machine-learning.html#ml-nlp-pytorch[Eland]. You'll need to use the `text_similarity` NLP task type when loading the model using Eland. Then set up an <<inference-example-eland,{es} service inference endpoint>> with the `rerank` task type.
++
+Refer to {ml-docs}/ml-nlp-model-ref.html#ml-nlp-model-ref-text-similarity[the Elastic NLP model reference] for a list of third-party text similarity models supported by {es} for semantic re-ranking.
 
-** Integrate directly with the <<infer-service-cohere,Cohere Rerank inference endpoint>> using the `rerank` task type
-** Integrate directly with the <<infer-service-google-vertex-ai,Google Vertex AI inference endpoint>> using the `rerank` task type
-** Upload a model to {es} from Hugging Face with {eland-docs}/machine-learning.html#ml-nlp-pytorch[Eland]. You'll need to use the `text_similarity` NLP task type when loading the model using Eland. Refer to {ml-docs}/ml-nlp-model-ref.html#ml-nlp-model-ref-text-similarity[the Elastic NLP model reference] for a list of third party text similarity models supported by {es} for semantic re-ranking.
-*** Then set up an <<inference-example-eland,{es} service inference endpoint>> with the `rerank` task type
 . *Create a `rerank` task using the <<put-inference-api,{es} Inference API>>*.
 The Inference API creates an inference endpoint and configures your chosen machine learning model to perform the re-ranking task.
 . *Define a `text_similarity_reranker` retriever in your search request*.
@@ -117,7 +119,7 @@ POST _search
         }
       },
       "field": "text",
-      "inference_id": "my-cohere-rerank-model",
+      "inference_id": "my-elastic-rerank",
       "inference_text": "How often does the moon hide the sun?",
       "rank_window_size": 100,
       "min_score": 0.5
diff --git a/docs/reference/search/retriever.asciidoc b/docs/reference/search/retriever.asciidoc
index 86a81f1d155d2..4a381b8c5a713 100644
--- a/docs/reference/search/retriever.asciidoc
+++ b/docs/reference/search/retriever.asciidoc
@@ -11,6 +11,7 @@ This allows for complex behavior to be depicted in a tree-like structure, called
 [TIP]
 ====
 Refer to <<retrievers-overview>> for a high level overview of the retrievers abstraction.
+Refer to <<retrievers-examples, Retrievers examples>> for additional examples.
 ====
 
 The following retrievers are available:
@@ -386,8 +387,9 @@ To use `text_similarity_reranker` you must first set up a `rerank` task using th
 The `rerank` task should be set up with a machine learning model that can compute text similarity.
 Refer to {ml-docs}/ml-nlp-model-ref.html#ml-nlp-model-ref-text-similarity[the Elastic NLP model reference] for a list of third-party text similarity models supported by {es}.
 
-Currently you can:
+You have the following options:
 
+* Use the built-in <<inference-example-elastic-reranker,Elastic Rerank>> cross-encoder model via the inference API's {es} service.
 * Integrate directly with the <<infer-service-cohere,Cohere Rerank inference endpoint>> using the `rerank` task type
 * Integrate directly with the <<infer-service-google-vertex-ai,Google Vertex AI inference endpoint>> using the `rerank` task type
 * Upload a model to {es} with {eland-docs}/machine-learning.html#ml-nlp-pytorch[Eland] using the `text_similarity` NLP task type.
@@ -436,6 +438,62 @@ Note that score calculations vary depending on the model used.
 Applies the specified <<query-dsl-bool-query, boolean query filter>> to the child  <<retriever, retriever>>.
 If the child retriever already specifies any filters, then this top-level filter is applied in conjuction with the filter defined in the child retriever.
 
+[discrete]
+[[text-similarity-reranker-retriever-example-elastic-rerank]]
+==== Example: Elastic Rerank
+
+This example demonstrates how to deploy the Elastic Rerank model and use it to re-rank search results using the `text_similarity_reranker` retriever.
+
+Follow these steps:
+
+. Create an inference endpoint for the `rerank` task using the <<put-inference-api, Create {infer} API>>. 
++
+[source,console]
+----
+PUT _inference/rerank/my-elastic-rerank
+{
+  "service": "elasticsearch",
+  "service_settings": {
+    "model_id": ".rerank-v1", 
+    "num_threads": 1,
+    "adaptive_allocations": { <1>
+      "enabled": true,
+      "min_number_of_allocations": 1,
+      "max_number_of_allocations": 10
+    }
+  }
+}
+----
+// TEST[skip:uses ML]
+<1> {ml-docs}/ml-nlp-auto-scale.html#nlp-model-adaptive-allocations[Adaptive allocations] will be enabled with a minimum of 1 and a maximum of 10 allocations.
++
+. Define a `text_similarity_reranker` retriever:
+[source,console]
+----
+POST _search
+{
+  "retriever": {
+    "text_similarity_reranker": {
+      "retriever": {
+        "standard": {
+          "query": {
+            "match": {
+              "text": "How often does the moon hide the sun?"
+            }
+          }
+        }
+      },
+      "field": "text",
+      "inference_id": "my-elastic-rerank",
+      "inference_text": "How often does the moon hide the sun?",
+      "rank_window_size": 100,
+      "min_score": 0.5
+    }
+  }
+}
+----
+// TEST[skip:uses ML]
+
 [discrete]
 [[text-similarity-reranker-retriever-example-cohere]]
 ==== Example: Cohere Rerank
@@ -680,6 +738,12 @@ GET movies/_search
 <1> The `rule` retriever is the outermost retriever, applying rules to the search results that were previously reranked using the `rrf` retriever.
 <2> The `rrf` retriever returns results from all of its sub-retrievers, and the output of the `rrf` retriever is used as input to the `rule` retriever.
 
+[discrete]
+[[retriever-common-parameters]]
+=== Common usage guidelines
+
+[discrete]
+[[retriever-size-pagination]]
 ==== Using `from` and `size` with a retriever tree
 
 The <<search-from-param, `from`>> and <<search-size-param, `size`>>
@@ -688,12 +752,16 @@ parameters are provided globally as part of the general
 They are applied to all retrievers in a retriever tree, unless a specific retriever overrides the `size` parameter using a different parameter such as `rank_window_size`.
 Though, the final search hits are always limited to `size`.
 
+[discrete]
+[[retriever-aggregations]]
 ==== Using aggregations with a retriever tree
 
 <<search-aggregations, Aggregations>> are globally specified as part of a search request.
 The query used for an aggregation is the combination of all leaf retrievers as `should`
 clauses in a <<query-dsl-bool-query, boolean query>>.
 
+[discrete]
+[[retriever-restrictions]]
 ==== Restrictions on search parameters when specifying a retriever
 
 When a retriever is specified as part of a search, the following elements are not allowed at the top-level.